commit d129df04a6
Michael G. Inso, 2024-03-26 17:33:57 +08:00 (committed by GitHub)

run.py

@@ -1,7 +1,7 @@
 # Copyright 2024 X.AI Corp.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #      http://www.apache.org/licenses/LICENSE-2.0
@@ -13,60 +13,86 @@
 # limitations under the License.

 import logging
+import hashlib

 from model import LanguageModelConfig, TransformerConfig, QuantizedWeight8bit as QW8Bit
 from runners import InferenceRunner, ModelRunner, sample_from_model


 CKPT_PATH = "./checkpoints/"
+# Placeholder; replace with the real SHA-256 digest (see the note after this diff).
+CKPT_HASH = "expected_checkpoint_hash"
+
+
+def validate_checkpoint(path, expected_hash):
+    # Hash in chunks inside a context manager so large checkpoint files are
+    # neither read into memory at once nor left open.
+    digest = hashlib.sha256()
+    with open(path, "rb") as f:
+        for chunk in iter(lambda: f.read(1 << 20), b""):
+            digest.update(chunk)
+    if digest.hexdigest() != expected_hash:
+        raise ValueError("Invalid checkpoint file!")


 def main():
+    # Validate checkpoint integrity before loading. Note that CKPT_PATH is a
+    # directory, which open() rejects; validation has to target the checkpoint
+    # file(s) inside it (see the note after this diff).
+    validate_checkpoint(CKPT_PATH, CKPT_HASH)
+
     grok_1_model = LanguageModelConfig(
         vocab_size=128 * 1024,
         pad_token=0,
         eos_token=2,
         sequence_len=8192,
         embedding_init_scale=1.0,
         output_multiplier_scale=0.5773502691896257,
         embedding_multiplier_scale=78.38367176906169,
         model=TransformerConfig(
             emb_size=48 * 128,
             widening_factor=8,
             key_size=128,
             num_q_heads=48,
             num_kv_heads=8,
             num_layers=64,
             attn_output_multiplier=0.08838834764831845,
             shard_activations=True,
             # MoE.
             num_experts=8,
             num_selected_experts=2,
             # Activation sharding.
             data_axis="data",
             model_axis="model",
         ),
     )

     inference_runner = InferenceRunner(
         pad_sizes=(1024,),
         runner=ModelRunner(
             model=grok_1_model,
             bs_per_device=0.125,
             checkpoint_path=CKPT_PATH,
         ),
         name="local",
         load=CKPT_PATH,
         tokenizer_path="./tokenizer.model",
         local_mesh_config=(1, 8),
         between_hosts_config=(1, 1),
     )
+    # Limit inference rate. Set after construction; InferenceRunner does not
+    # itself consume this attribute, so enforcement must happen in the calling
+    # code (see the rate-limiting sketch after this diff).
+    inference_runner.rate_limit = 100
+
     inference_runner.initialize()
     gen = inference_runner.run()

     inp = "The answer to life the universe and everything is of course"
     print(f"Output for prompt: {inp}", sample_from_model(gen, inp, max_len=100, temperature=0.01))


+# Add authentication. `app` and `auth` are not defined anywhere in run.py;
+# they would have to be a Flask application and an HTTPBasicAuth instance
+# created elsewhere, and `inference_runner` is local to main() (see the
+# sketch after this diff).
+@app.route("/inference")
+@auth.login_required
+def inference():
+    ...
+    gen = inference_runner.run()
+    # Rest of inference code
+
+
 if __name__ == "__main__":
     logging.basicConfig(level=logging.INFO)
     main()
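
Notes on this commit:

CKPT_HASH is committed as the literal placeholder "expected_checkpoint_hash", so validation can never pass, and CKPT_PATH points at a directory. A minimal sketch of how an expected digest could be produced over the checkpoint directory; the helper names and the directory-walk scheme are illustrative, not part of this repo:

import hashlib
from pathlib import Path


def sha256_of_file(path, chunk_size=1 << 20):
    # Stream the file through SHA-256 so multi-GB tensors never sit in RAM.
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()


def sha256_of_dir(root):
    # Combine per-file digests in sorted path order so the result is
    # deterministic across filesystems.
    digest = hashlib.sha256()
    for path in sorted(Path(root).rglob("*")):
        if path.is_file():
            digest.update(str(path.relative_to(root)).encode())
            digest.update(sha256_of_file(path).encode())
    return digest.hexdigest()


if __name__ == "__main__":
    print(sha256_of_dir("./checkpoints/"))  # paste the output into CKPT_HASH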
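
inference_runner.rate_limit = 100 merely sets an attribute that nothing reads. If the intent is to cap sampling calls per unit time, one way to enforce it is a token bucket wrapped around sample_from_model; RateLimiter and rate_limited_sample are illustrative names, not part of the runners API:

import threading
import time

from runners import sample_from_model  # from this repo


class RateLimiter:
    # Token bucket: at most `rate` acquisitions per `per` seconds.
    def __init__(self, rate=100, per=60.0):
        self.capacity = float(rate)
        self.tokens = float(rate)
        self.fill_rate = rate / per
        self.last = time.monotonic()
        self.lock = threading.Lock()

    def acquire(self):
        with self.lock:
            now = time.monotonic()
            # Refill in proportion to elapsed time, capped at capacity.
            self.tokens = min(self.capacity,
                              self.tokens + (now - self.last) * self.fill_rate)
            self.last = now
            if self.tokens >= 1.0:
                self.tokens -= 1.0
                return True
            return False


limiter = RateLimiter(rate=100, per=60.0)


def rate_limited_sample(gen, prompt, **kwargs):
    # Block until a token is available, then sample as usual.
    while not limiter.acquire():
        time.sleep(0.05)
    return sample_from_model(gen, prompt, **kwargs)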
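
The authentication block decorates inference() with @app.route and @auth.login_required, but neither app nor auth exists in run.py, so the module would fail with NameError at import time. A sketch of the minimal Flask/HTTPBasicAuth setup the decorators appear to assume; flask and flask-httpauth are third-party packages, and the hard-coded credential is for illustration only:

from flask import Flask, jsonify, request
from flask_httpauth import HTTPBasicAuth
from werkzeug.security import check_password_hash, generate_password_hash

from runners import sample_from_model  # from this repo

app = Flask(__name__)
auth = HTTPBasicAuth()

# Illustrative single-user store; real deployments should load hashed
# credentials from configuration rather than hard-coding them.
USERS = {"admin": generate_password_hash("change-me")}

gen = None  # assumed to be set to inference_runner.run() at startup


@auth.verify_password
def verify_password(username, password):
    if username in USERS and check_password_hash(USERS[username], password):
        return username
    return None


@app.route("/inference", methods=["POST"])
@auth.login_required
def inference():
    prompt = request.get_json(force=True).get("prompt", "")
    output = sample_from_model(gen, prompt, max_len=100, temperature=0.01)
    return jsonify({"output": output})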