| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | """Ernie model configuration""" |
import copy
import logging
from typing import List, Optional, Tuple, Union

from transformers import PretrainedConfig
| |
|
| |
|
# Names exported by a star-import of this module.
__all__ = [
    "ERNIE_PRETRAINED_INIT_CONFIGURATION",
    "Ernie4_5_Config",
    "Ernie4_5_MoEConfig",
    "Ernie4_5_VLMoEConfig",
]
| |
|
| |
|
class DFNRopeVisionTransformerConfig(PretrainedConfig):
    """
    Settings container for the DFNRopeVisionTransformer model.

    Derives from [`PretrainedConfig`]; every constructor argument is stored unchanged as an
    attribute of the same name. Refer to the [`PretrainedConfig`] documentation for the
    behaviour shared by all configuration objects.
    """

    model_type = "DFNRope_vision_transformer"
    base_model_tp_plan = {}

    def __init__(
        self,
        depth=32,
        embed_dim=1280,
        hidden_size=3584,
        hidden_act="quick_gelu",
        mlp_ratio=4,
        num_heads=16,
        in_channels=3,
        patch_size=14,
        spatial_merge_size=2,
        attn_implementation="eager",
        pp_data_balance=False,
        recompute=False,
        attn_sep=False,
        vit_first_fwd_bsz=128,
        vit_num_recompute_layers=10000,
        **kwargs,
    ):
        """
        Build a DFNRopeVisionTransformer configuration.

        Args:
            depth (int): Number of transformer layers in the model. Defaults to 32.
            embed_dim (int): Dimensionality of the embedding layer. Defaults to 1280.
            hidden_size (int): Dimensionality of the feedforward network. Defaults to 3584.
            hidden_act (str): Activation function for the feedforward network.
                Defaults to "quick_gelu".
            mlp_ratio (float): Ratio between the number of input features and the number
                of output features in the feedforward network. Defaults to 4.
            num_heads (int): Number of attention heads in each attention layer. Defaults to 16.
            in_channels (int): Number of channels in the input image. Defaults to 3.
            patch_size (int): Size of patches in the input image. Defaults to 14.
            spatial_merge_size (int): Spatial merge size for the spatial transformer module.
                Defaults to 2.
            attn_implementation (str): Attention implementation type. Defaults to "eager".
            pp_data_balance (bool): Whether to balance data during preprocessing.
                Defaults to False.
            recompute (bool): Whether to use recompute. Defaults to False.
            attn_sep (bool): Whether to separate attention computation into two stages.
                Defaults to False.
            vit_first_fwd_bsz (int): First forward batch size for ViT. Defaults to 128.
            vit_num_recompute_layers (int): Number of recomputed layers for ViT.
                Defaults to 10000.
            **kwargs: Forwarded untouched to `PretrainedConfig.__init__`.
        """
        super().__init__(**kwargs)

        # Backbone geometry and activation.
        self.depth = depth
        self.embed_dim = embed_dim
        self.hidden_size = hidden_size
        self.hidden_act = hidden_act
        self.mlp_ratio = mlp_ratio
        self.num_heads = num_heads

        # Input image / patch layout.
        self.in_channels = in_channels
        self.patch_size = patch_size
        self.spatial_merge_size = spatial_merge_size

        # Runtime and memory-related switches.
        self.attn_implementation = attn_implementation
        self.pp_data_balance = pp_data_balance
        self.recompute = recompute
        self.attn_sep = attn_sep
        self.vit_first_fwd_bsz = vit_first_fwd_bsz
        self.vit_num_recompute_layers = vit_num_recompute_layers

    def get(self, key, default=None):
        """Return the attribute named `key`, or `default` when it does not exist."""
        return getattr(self, key, default)
| |
|
| |
|
# Preset initialisation values, keyed by pretrained model name.
# NOTE(review): the preset uses the key "use_flash_attn" while Ernie4_5_Config's
# constructor parameter is named `use_flash_attention` - confirm which is intended.
ERNIE_PRETRAINED_INIT_CONFIGURATION = {
    "ernie/tiny-random-ernie": {
        # Model dimensions.
        "hidden_size": 768,
        "initializer_range": 0.02,
        "intermediate_size": 11008,
        "max_position_embeddings": 2048,
        "model_type": "ernie",
        "num_attention_heads": 2,
        "num_hidden_layers": 2,
        "rms_norm_eps": 1e-06,
        "vocab_size": 32000,
        # Special token ids.
        "bos_token_id": 1,
        "eos_token_id": 2,
        "pad_token_id": 0,
        # Runtime switches.
        "use_cache": False,
        "recompute": False,
        "use_flash_attn": True,
        "use_pure_fp16": False,
    },
}
| |
|
| |
|
class Ernie4_5_Config(PretrainedConfig):
    """
    Configuration class for ERNIE model.

    This class stores the configuration of an ERNIE model, defining the model architecture.
    It inherits from PretrainedConfig and can be used to control model outputs.
    """

    model_type = "ernie"
    pretrained_init_configuration = ERNIE_PRETRAINED_INIT_CONFIGURATION
    base_model_tp_plan = {}

    def __init__(
        self,
        vocab_size=32000,
        hidden_size=768,
        intermediate_size=11008,
        max_position_embeddings=32768,
        num_hidden_layers=2,
        num_attention_heads=2,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=False,
        use_flash_attention=True,
        use_sparse_flash_attn=True,
        use_var_len_flash_attn=False,
        recompute=False,
        recompute_granularity="core_attn",
        recompute_use_reentrant=False,
        use_rmsnorm=True,
        fuse_rms_norm=False,
        fuse_ln=False,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        fuse_swiglu=False,
        use_bias=False,
        rope_theta=10000,
        fuse_rope=False,
        fuse_softmax_mask=False,
        use_fast_ln=False,
        weight_share_add_bias=True,
        fuse_linear=False,
        max_sequence_length=None,
        ignored_index=-100,
        add_tail_layers=False,
        use_recompute_lm_head=False,
        use_recompute_loss_fn=False,
        refined_recompute=None,
        attention_probs_dropout_prob=0.0,
        hidden_dropout_prob=0.0,
        compression_ratio: float = 1.0,
        num_key_value_heads=None,
        use_sparse_head_and_loss_fn=False,
        micro_batch_size=-1,
        use_ep_comm_overlap=False,
        use_fused_head_and_loss_fn=False,
        token_balance_loss=False,
        token_balance_seqlen=False,
        cachekv_quant: bool = False,
        pp_seg_method="layer:ErnieDecoderLayer|EmptyLayer",
        **kwargs,
    ):
        """
        Initialize ERNIE model configuration with default or specified parameters.

        Args:
            vocab_size (int): Size of the vocabulary (number of unique tokens)
            hidden_size (int): Dimensionality of the encoder layers and the pooler layer
            intermediate_size (int): Dimensionality of the "intermediate" (feed-forward) layer
            max_position_embeddings (int): Maximum sequence length the model can handle
            num_hidden_layers (int): Number of hidden layers in the Transformer encoder
            num_attention_heads (int): Number of attention heads for each attention layer
            initializer_range (float): Standard deviation of the truncated_normal_initializer
                used for initializing all weight matrices
            rms_norm_eps (float): The epsilon used by the RMS normalization layers
            use_cache (bool): Whether to use caching for faster generation (decoding)
            use_flash_attention (bool): Whether to use FlashAttention for optimized attention computation
            use_sparse_flash_attn (bool): Whether to use sparse FlashAttention
            use_var_len_flash_attn (bool): Whether to use variable-length FlashAttention
            recompute (bool): Whether to use gradient checkpointing to save memory
            recompute_granularity (str): Granularity of recomputation ("core_attn", "full", etc.)
            recompute_use_reentrant (bool): Whether to use reentrant checkpointing
            use_rmsnorm (bool): Whether to use RMSNorm instead of LayerNorm
            fuse_rms_norm (bool): Whether to fuse RMSNorm operations for optimization
            fuse_ln (bool): Whether to fuse LayerNorm operations
            pad_token_id (int): Token ID used for padding sequences
            bos_token_id (int): Token ID used for beginning-of-sequence
            eos_token_id (int): Token ID used for end-of-sequence
            fuse_swiglu (bool): Whether to fuse SwiGLU operations
            use_bias (bool): Whether to use bias terms in linear layers
            rope_theta (float): The base period of the RoPE embeddings
            fuse_rope (bool): Whether to fuse RoPE operations
            fuse_softmax_mask (bool): Stored unchanged; presumably toggles a fused
                softmax/mask kernel (confirm against the modeling code)
            use_fast_ln (bool): Whether to use optimized LayerNorm implementation
            weight_share_add_bias (bool): Whether to share bias weights in certain layers
            fuse_linear (bool): Whether to fuse linear operations
            max_sequence_length (int): Maximum sequence length for positional embeddings
            ignored_index (int): Target value that is ignored during loss computation
            add_tail_layers (bool): Whether to add additional layers at the end
            use_recompute_lm_head (bool): Whether to recompute gradients for language model head
            use_recompute_loss_fn (bool): Whether to recompute gradients for loss function
            refined_recompute (dict, optional): Dictionary specifying refined recomputation
                settings. `None` (the default) yields a fresh empty dict for this instance.
            attention_probs_dropout_prob (float): Dropout probability for attention weights
            hidden_dropout_prob (float): Dropout probability for hidden layers
            compression_ratio (float): Ratio for KV cache compression (1.0 = no compression)
            num_key_value_heads (int): Number of key/value heads (for Grouped Query Attention)
            use_sparse_head_and_loss_fn (bool): Whether to use sparse attention head and loss function
            micro_batch_size (int): Size of micro batches (-1 for automatic)
            use_ep_comm_overlap (bool): Whether to overlap communication with computation
            use_fused_head_and_loss_fn (bool): Whether to use fused head and loss function
            token_balance_loss (bool): Whether to balance loss by token count
            token_balance_seqlen (bool): Whether to balance sequence lengths
            cachekv_quant (bool): Whether to quantize key-value cache
            pp_seg_method (str): Method for pipeline parallel segmentation
            **kwargs: Additional keyword arguments passed to parent class
        """
        # Word embeddings are untied unless the caller explicitly asks otherwise.
        kwargs.setdefault("tie_word_embeddings", False)
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.max_position_embeddings = max_position_embeddings
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.recompute = recompute
        self.recompute_granularity = recompute_granularity
        self.use_flash_attention = use_flash_attention
        self.use_sparse_flash_attn = use_sparse_flash_attn
        self.recompute_use_reentrant = recompute_use_reentrant
        self.use_var_len_flash_attn = use_var_len_flash_attn
        self.pad_token_id = pad_token_id
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.fuse_swiglu = fuse_swiglu
        self.fuse_rms_norm = fuse_rms_norm
        self.fuse_ln = fuse_ln
        self.use_rmsnorm = use_rmsnorm
        self.micro_batch_size = micro_batch_size

        self.max_sequence_length = max_sequence_length
        self.use_bias = use_bias
        self.weight_share_add_bias = weight_share_add_bias
        self.rope_theta = rope_theta
        self.fuse_rope = fuse_rope
        self.fuse_softmax_mask = fuse_softmax_mask
        self.use_fast_ln = use_fast_ln

        self.fuse_linear = fuse_linear
        self.ignored_index = ignored_index
        self.add_tail_layers = add_tail_layers
        self.use_recompute_lm_head = use_recompute_lm_head
        self.use_recompute_loss_fn = use_recompute_loss_fn

        # A fresh dict per instance: a dict default in the signature would be one
        # shared object mutated by every config that relied on the default.
        self.refined_recompute = {} if refined_recompute is None else refined_recompute
        self.skip_recompute_ops = {}
        # `refined_recompute` is a dictionary that specifies fine-grained gradient
        # recomputation settings, which currently only takes effect in Pipeline
        # Parallel (PP) mode.
        #
        # In PP mode, this dictionary populates `self.skip_recompute_ops` with the
        # following structure:
        #   - Key (`op_name`): the operation name to configure, with possible values:
        #       * "mlp_row_ln"          - MLP row-wise layer normalization
        #       * "flash_attn"          - Flash attention operation
        #       * "attention_row_ln"    - Attention row-wise layer normalization
        #       * "attention_column_ln" - Attention column-wise layer normalization
        #       * "mlp_column_ln"       - MLP column-wise layer normalization
        #   - Value (`skip_num`): controls how many times to skip recomputation:
        #       * 0: never skip recomputation (minimum memory usage)
        #       * -1: always skip recomputation (maximum memory usage)
        #       * [0,1,...,12]: skip recomputation for the specified number of times
        #       * >=12: equivalent to -1 (always skip recomputation)
        #
        # This allows precise control over memory/computation tradeoffs for
        # different operations.
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.hidden_dropout_prob = hidden_dropout_prob
        self.compression_ratio = compression_ratio
        self.num_key_value_heads = num_key_value_heads
        self.use_sparse_head_and_loss_fn = use_sparse_head_and_loss_fn
        self.use_ep_comm_overlap = use_ep_comm_overlap
        self.use_fused_head_and_loss_fn = use_fused_head_and_loss_fn
        self.token_balance_loss = token_balance_loss
        self.token_balance_seqlen = token_balance_seqlen
        self.cachekv_quant = cachekv_quant
        self.pp_seg_method = pp_seg_method

    def get(self, key, default=None):
        """Return the attribute named `key`, or `default` when it does not exist."""
        return getattr(self, key, default)
| |
|
| |
|
class Ernie4_5_MoEConfig(Ernie4_5_Config):
    r"""
    Configuration class for ErnieMoE model architecture.

    This class stores the configuration for a [`~ErnieModel`] and is used to instantiate
    an ErnieMoE model according to the specified arguments. Inherits from [`PretrainedConfig`]
    and can control model outputs.

    Attributes:
        Inherits all attributes from Ernie4_5_Config and adds MoE-specific configurations.
    """

    model_type = "ernie"
    attribute_map = {
        "n_positions": "max_position_embeddings",
        "n_embd": "hidden_size",
        "n_layer": "num_hidden_layers",
        "n_head": "num_attention_heads",
        "n_inner": "intermediate_size",
        "activation_function": "hidden_act",
    }
    pretrained_init_configuration = ERNIE_PRETRAINED_INIT_CONFIGURATION
    base_model_tp_plan = {}

    def __init__(
        self,
        moe_num_experts: Union[int, list] = 0,
        use_recompute_moe=False,
        moe_capacity=(),
        moe_layer_interval=2,
        moe_layer_start_index=0,
        moe_layer_end_index=-1,
        moe_aux_loss_lambda=1e-2,
        moe_z_loss_lambda=1e-4,
        moe_orthogonal_loss_lambda=1e-2,
        sinkhorn_2gate=True,
        sinkhorn_temp=3e-2,
        global_aux_loss=False,
        moe_dropout_prob=0.0,
        moe_group="world",
        moe_gate="top2",
        moe_intermediate_size: Union[int, list] = 0,
        moe_num_shared_experts: int = 0,
        moe_reverse_token_drop: bool = False,
        moe_gate_act: str = "softmax",
        moe_norm_gate_logits=True,
        moe_all_to_all_dropout: float = 0.0,
        moe_k=2,
        moe_use_aux_free: bool = False,
        moe_group_experts: bool = False,
        moe_group_orthogonal_loss: bool = True,
        enable_delay_scale_loss: bool = True,
        num_acc_steps: int = 1,
        fuse_gate_detach_matmul: bool = False,
        dpo_config=None,
        moe_multimodal_dispatch_use_allgather: str = "",
        moe_use_hard_gate=False,
        moe_dense_experts_token_type_id=3,
        **kwargs,
    ):
        """
        Initialize ErnieMoE configuration with MoE-specific parameters.

        Args:
            moe_num_experts: Number of experts in MoE layers (an int, or a list of ints)
            use_recompute_moe: Whether to use recomputation for MoE layers
            moe_capacity: Capacity configuration for MoE layers
            moe_layer_interval: Interval between MoE layers
            moe_layer_start_index: Starting layer index for MoE
            moe_layer_end_index: Ending layer index for MoE (-1 means last layer, i.e. it
                is stored as `num_hidden_layers - 1`)
            moe_aux_loss_lambda: Weight for auxiliary loss
            moe_z_loss_lambda: Weight for z-loss
            moe_orthogonal_loss_lambda: Weight for orthogonal loss
            sinkhorn_2gate: Whether to use sinkhorn 2-gate routing
            sinkhorn_temp: Temperature for sinkhorn routing
            global_aux_loss: Whether to use global auxiliary loss
            moe_dropout_prob: Dropout probability for MoE layers
            moe_group: Group configuration for MoE experts
            moe_gate: Type of gating mechanism ('top2', etc.)
            moe_intermediate_size: Intermediate size for MoE layers
            moe_num_shared_experts: Number of shared experts
            moe_reverse_token_drop: Whether to use reverse token dropping
            moe_gate_act: Activation function for gating
            moe_norm_gate_logits: Whether to normalize gate logits
            moe_all_to_all_dropout: Dropout for all-to-all communication
            moe_k: Number of experts to route to
            moe_use_aux_free: Whether to use auxiliary-free routing
            moe_group_experts: Whether to group experts (requires hard gating)
            moe_group_orthogonal_loss: Whether to use group orthogonal loss
            enable_delay_scale_loss: Whether to enable delayed loss scaling
            num_acc_steps: Number of accumulation steps
            fuse_gate_detach_matmul: Whether to fuse gate detach matmul
            dpo_config: Stored unchanged as `self.dpo_config`
            moe_multimodal_dispatch_use_allgather: Stored unchanged on the instance
            moe_use_hard_gate: Stored unchanged on the instance
            moe_dense_experts_token_type_id: Stored unchanged on the instance
            **kwargs: Additional base model configuration parameters

        Note:
            When `use_recompute_moe` is True a warning is logged, and if `recompute` is
            truthy and `recompute_granularity` is "full" (both as passed in `kwargs`),
            `recompute_granularity` is rewritten to "full_attn" before it reaches the
            base class.
        """
        if use_recompute_moe:
            logging.getLogger(__name__).warning(
                "set `use_recompute_moe`=True, disabling `recompute_granularity=full`, change to full_attn."
            )
            # Both keys are optional keyword arguments, so read them with .get():
            # indexing raised KeyError whenever the caller relied on the defaults.
            if kwargs.get("recompute") and kwargs.get("recompute_granularity") == "full":
                kwargs["recompute_granularity"] = "full_attn"
        super().__init__(**kwargs)

        self.moe_num_experts = moe_num_experts
        self.use_recompute_moe = use_recompute_moe
        self.moe_capacity = moe_capacity
        self.moe_aux_loss_lambda = moe_aux_loss_lambda
        self.moe_z_loss_lambda = moe_z_loss_lambda
        self.moe_orthogonal_loss_lambda = moe_orthogonal_loss_lambda
        self.global_aux_loss = global_aux_loss
        self.sinkhorn_2gate = sinkhorn_2gate
        self.sinkhorn_temp = sinkhorn_temp
        self.moe_layer_interval = moe_layer_interval
        self.moe_dropout_prob = moe_dropout_prob
        self.moe_group = moe_group
        self.moe_gate = moe_gate
        self.moe_intermediate_size = moe_intermediate_size
        self.moe_num_shared_experts = moe_num_shared_experts
        self.moe_reverse_token_drop = moe_reverse_token_drop
        self.moe_k = moe_k
        self.moe_all_to_all_dropout = moe_all_to_all_dropout
        self.moe_group_experts = moe_group_experts
        self.moe_group_orthogonal_loss = moe_group_orthogonal_loss
        self.enable_delay_scale_loss = enable_delay_scale_loss
        self.num_acc_steps = num_acc_steps
        self.moe_layer_start_index = moe_layer_start_index
        # -1 is a sentinel for "the last hidden layer".
        self.moe_layer_end_index = (
            self.num_hidden_layers - 1
            if moe_layer_end_index == -1
            else moe_layer_end_index
        )
        self.moe_gate_act = moe_gate_act
        self.moe_norm_gate_logits = moe_norm_gate_logits
        self.moe_use_aux_free = moe_use_aux_free
        self.fuse_gate_detach_matmul = fuse_gate_detach_matmul
        self.dpo_config = dpo_config
        self.moe_multimodal_dispatch_use_allgather = (
            moe_multimodal_dispatch_use_allgather
        )
        self.moe_use_hard_gate = moe_use_hard_gate
        self.moe_dense_experts_token_type_id = moe_dense_experts_token_type_id

    @property
    def multimodel_experts(self) -> bool:
        """Return True when `moe_num_experts` is a list/tuple with more than one entry."""
        return (
            isinstance(self.moe_num_experts, (tuple, list))
            and len(self.moe_num_experts) > 1
        )

    @property
    def use_moe(self) -> bool:
        """
        Check if model is using MoE architecture.

        Returns:
            bool: True if the expert count is positive. For a list/tuple of counts the
            entries are summed first; comparing the sequence itself with 0 would raise
            TypeError.
        """
        experts = self.moe_num_experts
        if isinstance(experts, (tuple, list)):
            return sum(experts) > 0
        return experts > 0
| |
|
| |
|
class Ernie4_5_VLMoEConfig(Ernie4_5_MoEConfig):
    """
    This is the configuration class to store the configuration of a [`~ErnieModel`]. It is used to instantiate an Ernie
    model according to the specified arguments, defining the model architecture.
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    The arguments below are not declared by this class; they travel through `**kwargs` to the
    parent configuration classes, and the defaults quoted are the ones declared by `Ernie4_5_Config`.

    Args:
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the Ernie model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`~ErnieModel`] or [`~TFErnieModel`].
        hidden_size (`int`, *optional*, defaults to 768):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 11008):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 2):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 2):
            Number of attention heads for each attention layer in the Transformer encoder.
        hidden_act (`str` or `function`, *optional*):
            The non-linear activation function (function or string) in the decoder. No class in this
            file declares a default for it; it is only forwarded through `**kwargs`.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-6):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `False`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        tie_word_embeddings(`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings
    """

    model_type = "ernie4_5_moe_vl"
    attribute_map = {
        "n_positions": "max_position_embeddings",
        "n_embd": "hidden_size",
        "n_layer": "num_hidden_layers",
        "n_head": "num_attention_heads",
        "n_inner": "intermediate_size",
        "activation_function": "hidden_act",
    }
    base_model_tp_plan = {
        "model.layers.*.self_attn.q_proj": "colwise_rep",
        "model.layers.*.self_attn.k_proj": "colwise_rep",
        "model.layers.*.self_attn.v_proj": "colwise_rep",
        "model.layers.*.self_attn.o_proj": "rowwise_rep",
        "model.layers.*.mlp.experts.*.gate_proj": "colwise",
        "model.layers.*.mlp.experts.*.up_proj": "colwise",
        "model.layers.*.mlp.experts.*.down_proj": "rowwise",
        "model.layers.*.mlp_text.experts.*.gate_proj": "colwise",
        "model.layers.*.mlp_text.experts.*.up_proj": "colwise",
        "model.layers.*.mlp_text.experts.*.down_proj": "rowwise",
        "model.layers.*.mlp.gate_proj": "colwise",
        "model.layers.*.mlp.up_proj": "colwise",
        "model.layers.*.mlp.down_proj": "rowwise",
    }

    def __init__(
        self,
        vision_config=None,
        im_patch_id=None,
        pixel_hidden_size=None,
        modality_detach=False,
        temporal_conv_size=2,
        spatial_conv_size=2,
        mm_vocab_size=0,
        max_text_id=None,
        use_temporal_conv=True,
        moe_use_size_all2all=False,
        moe_num_attn_experts=False,
        moe_dense_experts_token_type_id: int = 3,
        moe_use_hard_gate: bool = True,
        moe_fuse_experts: bool = False,
        moe_use_token_type_bias: bool = False,
        disable_ffn_model_parallel=False,
        fuse_attn_ffn=True,
        rope_3d=True,
        freq_allocation=20,
        using_precision_check=False,
        use_recompute_resampler=False,
        resampler_fuse_rms_norm=False,
        moe_layer_feed_fake_token=False,
        tensor_parallel_degree=1,
        **kwargs,
    ):
        """
        Initialize the vision-language MoE configuration.

        Args:
            vision_config (dict or DFNRopeVisionTransformerConfig, optional): Settings of the
                vision tower. A dict is expanded into a `DFNRopeVisionTransformerConfig`, an
                existing `DFNRopeVisionTransformerConfig` instance is used as given, and any
                other value (including `None`) yields a default `DFNRopeVisionTransformerConfig`.
            moe_dense_experts_token_type_id (int): Declared here as well as on the parent
                class; the value given here is the one left on the instance. Defaults to 3.
            moe_use_hard_gate (bool): Declared here as well as on the parent class; the value
                given here is the one left on the instance. Defaults to True.
            **kwargs: Forwarded to `Ernie4_5_MoEConfig.__init__`.

        All remaining named parameters (`im_patch_id`, `pixel_hidden_size`, `modality_detach`,
        `temporal_conv_size`, `spatial_conv_size`, `mm_vocab_size`, `max_text_id`,
        `use_temporal_conv`, `moe_use_size_all2all`, `moe_num_attn_experts`,
        `moe_fuse_experts`, `moe_use_token_type_bias`, `disable_ffn_model_parallel`,
        `fuse_attn_ffn`, `rope_3d`, `freq_allocation`, `using_precision_check`,
        `use_recompute_resampler`, `resampler_fuse_rms_norm`, `moe_layer_feed_fake_token`,
        `tensor_parallel_degree`) are stored unchanged as attributes of the same name.
        """
        super().__init__(**kwargs)
        if isinstance(vision_config, DFNRopeVisionTransformerConfig):
            # Keep a ready-made config object instead of silently replacing it
            # with an all-defaults instance.
            self.vision_config = vision_config
        elif isinstance(vision_config, dict):
            self.vision_config = DFNRopeVisionTransformerConfig(**vision_config)
        else:
            self.vision_config = DFNRopeVisionTransformerConfig()
        self.im_patch_id = im_patch_id
        self.pixel_hidden_size = pixel_hidden_size
        self.modality_detach = modality_detach
        self.temporal_conv_size = temporal_conv_size
        self.spatial_conv_size = spatial_conv_size
        self.mm_vocab_size = mm_vocab_size
        self.max_text_id = max_text_id
        self.use_temporal_conv = use_temporal_conv

        self.moe_use_size_all2all = moe_use_size_all2all
        self.moe_num_attn_experts = moe_num_attn_experts
        # These two overwrite the values the parent constructor stored from its own defaults.
        self.moe_dense_experts_token_type_id = moe_dense_experts_token_type_id
        self.moe_use_hard_gate = moe_use_hard_gate
        self.moe_fuse_experts = moe_fuse_experts
        self.moe_use_token_type_bias = moe_use_token_type_bias
        self.disable_ffn_model_parallel = disable_ffn_model_parallel

        self.fuse_attn_ffn = fuse_attn_ffn
        self.rope_3d = rope_3d
        self.freq_allocation = freq_allocation
        self.using_precision_check = using_precision_check
        self.use_recompute_resampler = use_recompute_resampler
        self.resampler_fuse_rms_norm = resampler_fuse_rms_norm
        self.moe_layer_feed_fake_token = moe_layer_feed_fake_token

        self.tensor_parallel_degree = tensor_parallel_degree

    @property
    def multimodel_experts(self) -> bool:
        """Check if model is using more than 1 multimodel experts."""
        return (
            isinstance(self.moe_num_experts, (tuple, list))
            and len(self.moe_num_experts) > 1
        )

    @property
    def use_moe(self) -> bool:
        """
        Check if model is using MoE architecture.

        Returns:
            bool: True if the expert count is positive. A list/tuple of counts of any
            length is summed first, so a one-element or empty sequence no longer raises
            TypeError by being compared with 0.
        """
        experts = self.moe_num_experts
        if isinstance(experts, (tuple, list)):
            return sum(experts) > 0
        return experts > 0

    def to_dict(self, saving_file=False):
        """
        Serialize this configuration into a plain dictionary.

        Args:
            saving_file (bool): Accepted for call compatibility; not used by this method.

        Returns:
            dict: A deep copy of the instance `__dict__`, with `vision_config` replaced by
            its own `to_dict()` output when it is a `DFNRopeVisionTransformerConfig`, and
            `model_type` set to the class-level `model_type`.
        """
        output = copy.deepcopy(self.__dict__)
        vision = self.vision_config
        if isinstance(vision, DFNRopeVisionTransformerConfig):
            output["vision_config"] = vision.to_dict()
        # Any other value stays as the deep copy made above, so the returned
        # dict never aliases state owned by this config.

        output["model_type"] = self.__class__.model_type
        return output
| |
|