This commit is contained in:
Michael G. Inso 2024-03-26 17:33:57 +08:00 committed by GitHub
commit d129df04a6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

114
run.py
View File

@ -13,60 +13,86 @@
# limitations under the License. # limitations under the License.
import logging import logging
import hashlib
from model import LanguageModelConfig, TransformerConfig, QuantizedWeight8bit as QW8Bit from model import LanguageModelConfig, TransformerConfig, QuantizedWeight8bit as QW8Bit
from runners import InferenceRunner, ModelRunner, sample_from_model from runners import InferenceRunner, ModelRunner, sample_from_model
# Checkpoint location; main() passes it as both checkpoint_path= and load=.
# NOTE(review): this row appears twice because the diff view prints the
# pre-change and post-change columns side by side.
CKPT_PATH = "./checkpoints/" CKPT_PATH = "./checkpoints/"
# Digest that validate_checkpoint() compares against a SHA-256 hexdigest.
# NOTE(review): the current value is a placeholder string, not a 64-character
# hex digest, so the comparison cannot succeed until a real digest is supplied.
CKPT_HASH = "expected_checkpoint_hash"
def validate_checkpoint(path, expected_hash):
    """Check that the file at ``path`` has the expected SHA-256 digest.

    The file is opened in binary mode inside a ``with`` block so the handle
    is always closed, and it is hashed in fixed-size chunks so memory use
    stays bounded regardless of the file's size.

    Args:
        path: Path of the file to hash.
        expected_hash: Expected SHA-256 digest as a hexadecimal string, in the
            form produced by ``hashlib``'s ``hexdigest()``.

    Raises:
        ValueError: If the computed digest differs from ``expected_hash``.
        OSError: If ``path`` cannot be opened or read.
    """
    chunk_size = 1024 * 1024
    digest = hashlib.sha256()
    with open(path, "rb") as checkpoint_file:
        # iter() with a sentinel stops once read() returns the empty bytes
        # object, i.e. at end of file.
        for chunk in iter(lambda: checkpoint_file.read(chunk_size), b""):
            digest.update(chunk)
    if digest.hexdigest() != expected_hash:
        raise ValueError("Invalid checkpoint file!")
# NOTE(review): this span appears to be a side-by-side diff rendering of
# main(), not runnable Python. Rows that carry two statements hold the
# pre-change line followed by the post-change line, and indentation is lost.
# Both columns build a LanguageModelConfig, wrap it in an InferenceRunner,
# initialize and run it, then print a sample for a fixed prompt.
def main(): def main():
grok_1_model = LanguageModelConfig( # Validate checkpoint integrity
# NOTE(review): CKPT_PATH is "./checkpoints/" (trailing slash), presumably a
# directory; validate_checkpoint() passes it straight to open() — confirm the
# intended target is a single file.
vocab_size=128 * 1024, validate_checkpoint(CKPT_PATH, CKPT_HASH)
pad_token=0,
eos_token=2,
sequence_len=8192,
embedding_init_scale=1.0,
output_multiplier_scale=0.5773502691896257,
embedding_multiplier_scale=78.38367176906169,
model=TransformerConfig(
emb_size=48 * 128,
widening_factor=8,
key_size=128,
num_q_heads=48,
num_kv_heads=8,
num_layers=64,
attn_output_multiplier=0.08838834764831845,
shard_activations=True,
# MoE.
num_experts=8,
num_selected_experts=2,
# Activation sharding.
data_axis="data",
model_axis="model",
),
)
inference_runner = InferenceRunner(
pad_sizes=(1024,),
runner=ModelRunner(
model=grok_1_model,
bs_per_device=0.125,
checkpoint_path=CKPT_PATH,
),
name="local",
load=CKPT_PATH,
tokenizer_path="./tokenizer.model",
local_mesh_config=(1, 8),
between_hosts_config=(1, 1),
)
inference_runner.initialize()
gen = inference_runner.run()
# From this row on, the second statement on each row (and every row after the
# print) belongs to the post-change column, which repeats the same setup.
inp = "The answer to life the universe and everything is of course" grok_1_model = LanguageModelConfig(
print(f"Output for prompt: {inp}", sample_from_model(gen, inp, max_len=100, temperature=0.01)) vocab_size=128 * 1024,
pad_token=0,
eos_token=2,
sequence_len=8192,
embedding_init_scale=1.0,
output_multiplier_scale=0.5773502691896257,
embedding_multiplier_scale=78.38367176906169,
model=TransformerConfig(
emb_size=48 * 128,
widening_factor=8,
key_size=128,
num_q_heads=48,
num_kv_heads=8,
num_layers=64,
attn_output_multiplier=0.08838834764831845,
shard_activations=True,
# MoE.
num_experts=8,
num_selected_experts=2,
# Activation sharding.
data_axis="data",
model_axis="model",
),
)
inference_runner = InferenceRunner(
pad_sizes=(1024,),
runner=ModelRunner(
model=grok_1_model,
bs_per_device=0.125,
checkpoint_path=CKPT_PATH,
# Limit inference rate
# NOTE(review): the assignment below sits between `ModelRunner(` and its
# closing parenthesis, so it is not a valid argument, and it names
# inference_runner before the enclosing assignment has completed.
inference_runner.rate_limit = 100
),
name="local",
load=CKPT_PATH,
tokenizer_path="./tokenizer.model",
local_mesh_config=(1, 8),
between_hosts_config=(1, 1),
)
inference_runner.initialize()
gen = inference_runner.run()
inp = "The answer to life the universe and everything is of course"
print(f"Output for prompt: {inp}", sample_from_model(gen, inp, max_len=100, temperature=0.01))
# Add authentication
# NOTE(review): `app` and `auth` are not imported or defined anywhere in the
# visible source, and the handler body below is a stub (`...`).
@app.route("/inference")
@auth.login_required
def inference():
...
gen = inference_runner.run()
# Rest of inference code
# Script entry point: configure INFO-level logging, then call main().
# NOTE(review): each row appears twice because the diff view prints the
# pre-change and post-change columns side by side.
if __name__ == "__main__": if __name__ == "__main__":
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)
main() main()