Update run.py

This version annotates the advanced configuration in run.py with inline comments: the Transformer model setup with its large embedding size, the use of a Mixture of Experts (MoE) for increased model capacity, and the distributed computing setup for inference (local mesh and between-hosts configuration). The change is meant to be documentation-only; no configuration values are intended to change.
2026-07-24 12:17:56 +03:00 · 2024-03-24 20:28:45 -04:00
parent 7050ed204b
commit 2dd6511150
1 changed file with 19 additions and 37 deletions
@@ -1,72 +1,54 @@
 # Copyright 2024 X.AI Corp.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
 from model import LanguageModelConfig, TransformerConfig, QuantizedWeight8bit as QW8Bit
 from runners import InferenceRunner, ModelRunner, sample_from_model
 CKPT_PATH = "./checkpoints/"
 def main():
+    """Configure Grok-1, start a local inference runner, and print one sampled completion."""
    # Advanced model configuration with quantized weights and MoE (Mixture of Experts).
    grok_1_model = LanguageModelConfig(
-        vocab_size=128 * 1024,
+        vocab_size=128 * 1024,  # Large vocabulary size.
        pad_token=0,
        eos_token=2,
-        sequence_len=8192,
+        sequence_len=8192,  # Long sequence length.
        embedding_init_scale=1.0,
        output_multiplier_scale=0.5773502691896257,
        embedding_multiplier_scale=78.38367176906169,
        model=TransformerConfig(
-            emb_size=48 * 128,
+            emb_size=48 * 128,  # Large embedding size.
-            widening_factor=8,
+            widening_factor=8,  # Increases the model width.
-            key_size=128,
+            key_size=128,  # Key size for attention mechanism.
-            num_q_heads=48,
+            num_q_heads=48,  # High number of query heads in multi-head attention.
-            num_kv_heads=8,
+            num_kv_heads=8,  # Number of key/value heads.
-            num_layers=64,
+            num_layers=64,  # Deep transformer with many layers.
            attn_output_multiplier=0.08838834764831845,
-            shard_activations=True,
+            shard_activations=True,  # Activation sharding for memory efficiency.
            # MoE.
-            num_experts=8,
+            num_experts=8,  # MoE configuration: total experts.
-            num_selected_experts=2,
+            num_selected_experts=2,  # MoE configuration: experts used per token.
            # Activation sharding.
            data_axis="data",
            model_axis="model",
        ),
    )
    # Advanced inference runner configuration with support for distributed computation.
    inference_runner = InferenceRunner(
-        pad_sizes=(1024,),
+        pad_sizes=(1024,),  # Padding sizes for batching.
        runner=ModelRunner(
            model=grok_1_model,
-            bs_per_device=0.125,
+            bs_per_device=0.125,  # Batch size per device; fractional, so one sample presumably spans 8 devices (confirm).
            checkpoint_path=CKPT_PATH,
        ),
        name="local",
        load=CKPT_PATH,
        tokenizer_path="./tokenizer.model",
-        local_mesh_config=(1, 8),
+        local_mesh_config=(1, 8),  # Configuration for running the model on a local mesh.
-        between_hosts_config=(1, 1),
+        between_hosts_config=(1, 1),  # Configuration for distributed computing across hosts.
    )
    inference_runner.initialize()
    gen = inference_runner.run()
    # Sampling from the model with a given prompt.
    inp = "The answer to life the universe and everything is of course"
    print(f"Output for prompt: {inp}", sample_from_model(gen, inp, max_len=100, temperature=0.01))
 if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    main()