# Copyright 2024 X.AI Corp.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

import contextlib
import logging
import math
import os
import pickle
import re
import shutil
import sys
import tempfile
from concurrent.futures import ThreadPoolExecutor, wait
from typing import Any, Optional

import jax
import numpy as np
from jax.experimental import multihost_utils

from model import QuantizedWeight8bit

logger = logging.getLogger(__name__)
rank_logger = logging.getLogger("rank")

# Needed for loading the checkpoint with pickle.
sys.modules['__main__'].QuantizedWeight8bit = QuantizedWeight8bit


@contextlib.contextmanager
def copy_to_shm(file: str):
    """
    Context manager for copying a file to shared memory. If the file is already in
    shared memory (/dev/shm), yields the same file path. Otherwise, copies the file to
    a temporary file in shared memory, yields the path to the temporary file, and cleans
    up by removing the temporary file after use.

    Parameters:
    - file (str): The path to the file to be copied.

    Yields:
    - str: The path to the file in shared memory.
    """
    if file.startswith("/dev/shm/"):
        # Nothing to do, the file is already in shared memory.
        yield file
        return

    tmp_dir = "/dev/shm/"
    fd, tmp_path = tempfile.mkstemp(dir=tmp_dir)
    try:
        shutil.copyfile(file, tmp_path)
        yield tmp_path
    finally:
        os.remove(tmp_path)
        os.close(fd)


@contextlib.contextmanager
def copy_from_shm(file: str):
    """
    Context manager for copying a file from shared memory to a specified path. It creates
    a temporary file in shared memory, yields the path to this temporary file for
    operations, and then copies the temporary file to the specified path, cleaning up
    the temporary file afterwards.

    Parameters:
    - file (str): The target path for the file to be copied to from shared memory.

    Yields:
    - str: The path to the temporary file in shared memory.
    """
    tmp_dir = "/dev/shm/"
    fd, tmp_path = tempfile.mkstemp(dir=tmp_dir)
    try:
        yield tmp_path
        shutil.copyfile(tmp_path, file)
    finally:
        os.remove(tmp_path)
        os.close(fd)


def fast_unpickle(path: str) -> Any:
    """
    Unpickles and loads an object from a file, optionally using shared memory for faster access.

    Parameters:
    - path (str): The path to the pickle file to load.

    Returns:
    - Any: The object loaded from the pickle file.
    """
    with copy_to_shm(path) as tmp_path:
        with open(tmp_path, "rb") as f:
            return pickle.load(f)


def fast_pickle(obj: Any, path: str) -> None:
    """
    Pickles and saves an object to a file, optionally using shared memory for faster access.

    Parameters:
    - obj (Any): The object to be pickled and saved.
    - path (str): The path where the pickle file should be saved.
    """
    with copy_from_shm(path) as tmp_path:
        with open(tmp_path, "wb") as f:
            pickle.dump(obj, f)
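
# Illustrative usage sketch (not part of the original module; the path below is made up):
# round-tripping an object through fast_pickle/fast_unpickle stages the data in /dev/shm,
# so large checkpoint files are read and written through RAM-backed storage.
#
#   weights = {"w": np.ones((1024, 1024), dtype=np.float32)}
#   fast_pickle(weights, "/tmp/example.pkl")      # written to /dev/shm, then copied out
#   restored = fast_unpickle("/tmp/example.pkl")  # copied into /dev/shm, then unpickled
#   assert np.array_equal(weights["w"], restored["w"])
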
def load_tensors(shaped_arrays, directory, mesh_config, tensor_indices=None):
    """
    Loads tensors from files in parallel using a ThreadPoolExecutor. This function is
    intended for use with arrays that have a predefined shape and dtype, loading them
    from a directory where each tensor is saved in a separate file.

    Parameters:
    - shaped_arrays: A sequence of arrays providing the shapes and dtypes of the tensors to load.
    - directory (str): The directory from which to load the tensors.
    - mesh_config: Configuration for data parallelism across processes or devices.
    - tensor_indices (Optional): Specific indices of tensors to load. If None, all tensors are loaded.

    Returns:
    - List of numpy arrays or zeros depending on the process's role in data parallelism.
    """
    pool = ThreadPoolExecutor(max_workers=32)
    fs = list()
    num_tensors = 0
    num_replicas = 1
    data_model_shards = math.prod(mesh_config)
    if tensor_indices is None:
        iterator = enumerate(shaped_arrays)
    else:
        iterator = zip(tensor_indices, shaped_arrays)
    for i, t in iterator:
        if (i % num_replicas) == ((jax.process_index() // data_model_shards) % num_replicas):
            idx = (
                jax.process_index() // (num_replicas * data_model_shards) * data_model_shards
                + jax.process_index() % data_model_shards
            )
            fs.append(
                pool.submit(fast_unpickle, os.path.join(directory, f"tensor{i:05d}_{idx:03d}"))
            )
            num_tensors += 1
        else:
            fs.append(pool.submit(np.zeros, t.shape, dtype=t.dtype))
    wait(fs)
    return [f.result() for f in fs]


def path_tuple_to_string(path: tuple) -> str:
    """
    Converts a tuple representing a path in a nested structure to a string. This function
    is specifically used for handling paths within the structure of saved states or
    parameters, making it easier to identify and manipulate specific elements.

    Parameters:
    - path (tuple): A tuple representing a path in a nested structure.

    Returns:
    - str: A string representation of the path, suitable for logging or identification.
    """
    pieces = []
    for elem in path:
        if isinstance(elem, jax.tree_util.DictKey):
            pieces.append(elem.key)
        elif isinstance(elem, jax.tree_util.GetAttrKey):
            pieces.append(elem.name)
        else:
            assert isinstance(elem, (jax.tree_util.FlattenedIndexKey, jax.tree_util.SequenceKey))
    return "/".join(pieces)


def get_load_path_str(
    init_path_str: str,
    load_rename_rules: Optional[list[tuple[str, str]]] = None,
    load_exclude_rules: Optional[list[str]] = None,
) -> Optional[str]:
    """
    Determines the load path for a given initial path string, applying exclusion and
    renaming rules. This function is used in the context of loading states or parameters
    from files, allowing for flexible mapping or exclusion based on pattern matching.

    Parameters:
    - init_path_str (str): The initial path string to process.
    - load_rename_rules (Optional[list[tuple[str, str]]]): Rules for renaming paths based
      on pattern matching.
    - load_exclude_rules (Optional[list[str]]): Patterns for paths to exclude from loading.

    Returns:
    - Optional[str]: The processed load path string, or None if excluded.
    """
    # Exclusion
    if load_exclude_rules is not None:
        for search_pattern in load_exclude_rules:
            if re.search(search_pattern, init_path_str):
                return None

    # Renaming
    load_path_str = init_path_str
    if load_rename_rules is not None:
        for search_pattern, replacement_pattern in load_rename_rules:
            if re.search(search_pattern, load_path_str):
                load_path_str = re.sub(search_pattern, replacement_pattern, load_path_str)
                break

    return load_path_str
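
# Illustrative behaviour of get_load_path_str (the parameter paths and rules below are
# hypothetical examples, not the model's actual naming): exclusion is checked first, and
# renaming stops after the first matching rule.
#
#   get_load_path_str(
#       "transformer/decoder_layer_0/linear/w",
#       load_rename_rules=[(r"decoder_layer_(\d+)", r"layer_\1")],
#   )
#   # -> "transformer/layer_0/linear/w"
#
#   get_load_path_str(
#       "transformer/decoder_layer_0/linear/w",
#       load_exclude_rules=[r"linear/w$"],
#   )
#   # -> None (excluded from loading)
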
def replace_with_load_state(
    init_state: Any,
    load_state: Any,
    load_rename_rules: Optional[list[tuple[str, str]]] = None,
    load_exclude_rules: Optional[list[str]] = None,
    mesh_config: tuple = (1, 1),
) -> Any:
    """
    Replaces elements of an initial state with elements from a loaded state, applying
    renaming and exclusion rules. This function supports conditional inclusion and
    transformation of state elements based on complex criteria, facilitating flexible
    state restoration.

    Parameters:
    - init_state (Any): The initial state before replacement.
    - load_state (Any): The state from which to load replacements.
    - load_rename_rules (Optional[list[tuple[str, str]]]): Rules for renaming paths.
    - load_exclude_rules (Optional[list[str]]): Patterns for paths to exclude from loading.
    - mesh_config (tuple): Configuration for data parallelism.

    Returns:
    - Any: The initial state with elements replaced from the load state.
    """
    flatten_load, _ = jax.tree_util.tree_flatten_with_path(load_state)
    flatten_init, structure_init = jax.tree_util.tree_flatten_with_path(init_state)
    load_map = {path_tuple_to_string(path): tensor for path, tensor in flatten_load}

    replaced = []
    num_replicas = 1
    data_model_shards = math.prod(mesh_config)
    for i, (init_path, tensor) in enumerate(flatten_init):
        init_path_str = path_tuple_to_string(init_path)
        load_path_str = get_load_path_str(init_path_str, load_rename_rules, load_exclude_rules)
        if load_path_str is None:
            rank_logger.info(f"Excluded from restore: {init_path_str}.")
            replaced.append(tensor)
        elif load_path_str in load_map:
            if load_path_str == init_path_str:
                rank_logger.info(f"Restored from ckpt: {init_path_str}.")
            else:
                rank_logger.info(f"Restored from ckpt: {init_path_str} <-- {load_path_str}.")
            replaced.append(load_map[load_path_str])
        else:
            rank_logger.info(f"Not found in ckpt: {init_path_str}.")
            if (i % num_replicas) == ((jax.process_index() // data_model_shards) % num_replicas):
                replaced.append(tensor)
            else:
                replaced.append(np.zeros_like(tensor))

    return jax.tree_util.tree_unflatten(structure_init, replaced)
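
# Illustrative sketch of replace_with_load_state on toy pytrees (shapes and key names are
# hypothetical): leaves found in the load state replace the initialized values, while
# excluded leaves keep their initialization.
#
#   init = {"params": {"w": np.zeros((2,)), "b": np.zeros((2,))}}
#   ckpt = {"params": {"w": np.ones((2,))}}
#   merged = replace_with_load_state(init, ckpt, load_exclude_rules=["params/b"])
#   # merged["params"]["w"] comes from ckpt; merged["params"]["b"] keeps its zeros.
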
def restore(
    checkpoint_path: str,
    state_shapes: Any,
    mesh,
    between_hosts_config,
    params_only,
    state_sharding,
    init_state: Optional[Any] = None,
) -> Any:
    """
    Restores the state from a checkpoint, optionally focusing on parameters only, and
    applies sharding configurations. This function is designed for restoring model states
    from disk with support for distributed environments, handling the intricacies of
    partitioning and host-specific configurations.

    Parameters:
    - checkpoint_path (str): The path to the checkpoint directory.
    - state_shapes (Any): The expected shapes of the state to restore.
    - mesh: The mesh configuration for distributed environments.
    - between_hosts_config: Configuration for data exchange between hosts.
    - params_only (bool): Whether to restore parameters only, excluding other state parts.
    - state_sharding: Sharding configuration for the state.
    - init_state (Optional[Any]): The initial state to which the checkpoint data is applied.

    Returns:
    - Any: The restored state, potentially sharded across the distributed environment.
    """
    ckpt_path = os.path.join(checkpoint_path, "ckpt-0")
    rank_logger.info("Loading checkpoint at {}".format(ckpt_path))
    ckpt_shapes = state_shapes
    ckpt_shapes_with_path, structure = jax.tree_util.tree_flatten_with_path(ckpt_shapes)
    ckpt_shapes_flat = [elem[1] for elem in ckpt_shapes_with_path]
    loaded_tensors = load_tensors(ckpt_shapes_flat, ckpt_path, between_hosts_config)

    state = jax.tree_util.tree_unflatten(structure, loaded_tensors)

    # Sanity check to give a better error message.
    ckpt_keys = set(state.params.keys())
    code_keys = set(state_sharding.params.keys())

    if ckpt_keys != code_keys and init_state is None:
        missing_in_ckpt = code_keys - ckpt_keys
        missing_locally = ckpt_keys - code_keys
        raise ValueError(
            "Parameters in the code are not matching checkpoint parameters.\n"
            "Params missing in checkpoint: {}\nParams missing in code: {}".format(
                missing_in_ckpt, missing_locally
            )
        )

    state_sharding = jax.tree_util.tree_map(
        lambda x: jax.sharding.PartitionSpec() if x is None else x,
        state_sharding,
        is_leaf=lambda x: x is None,
    )
    state = multihost_utils.host_local_array_to_global_array(state, mesh, state_sharding)
    if params_only:
        state = state.params
    return state
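
# Illustrative sketch of the on-disk layout restore() expects (file counts are hypothetical;
# the naming pattern follows the f"tensor{i:05d}_{idx:03d}" format used by load_tensors):
#
#   <checkpoint_path>/ckpt-0/tensor00000_000
#   <checkpoint_path>/ckpt-0/tensor00001_000
#   ...
#
# Each file holds one pickled leaf of the state pytree (a numpy array or a
# QuantizedWeight8bit), indexed by its position in the flattened tree (i) and by the
# data/model shard index (idx) derived from the reading process's position in the mesh.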