mirror of https://github.com/xai-org/grok-1.git
synced 2024-11-26 21:49:53 +03:00

Merge fb9d433aa9 into 7050ed204b
This commit is contained in commit 11b95e7a26

checkpoint.py (109 lines changed)
@@ -41,6 +41,17 @@ sys.modules['__main__'].QuantizedWeight8bit = QuantizedWeight8bit


@contextlib.contextmanager
def copy_to_shm(file: str):
    """
    Context manager for copying a file to shared memory. If the file is already in shared memory (/dev/shm),
    yields the same file path. Otherwise, copies the file to a temporary file in shared memory, yields the path
    to the temporary file, and cleans up by removing the temporary file after use.

    Parameters:
    - file (str): The path to the file to be copied.

    Yields:
    - str: The path to the file in shared memory.
    """
    if file.startswith("/dev/shm/"):
        # Nothing to do, the file is already in shared memory.
        yield file
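The hunk ends before the fallback branch. As a rough sketch of how such a context manager can be completed, assuming only the standard library's tempfile, shutil, and os (the actual upstream body may differ):

import contextlib
import os
import shutil
import tempfile

@contextlib.contextmanager
def copy_to_shm_sketch(file: str):
    if file.startswith("/dev/shm/"):
        # Already backed by shared memory; hand back the path unchanged.
        yield file
        return
    # Stage a copy in /dev/shm, then remove it once the caller is done.
    fd, tmp_path = tempfile.mkstemp(dir="/dev/shm/")
    os.close(fd)
    try:
        shutil.copyfile(file, tmp_path)
        yield tmp_path
    finally:
        os.remove(tmp_path)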
@@ -58,6 +69,17 @@ def copy_to_shm(file: str):


@contextlib.contextmanager
def copy_from_shm(file: str):
    """
    Context manager for copying a file from shared memory to a specified path. It creates a temporary file
    in shared memory, yields the path to this temporary file for operations, and then copies the temporary
    file to the specified path, cleaning up the temporary file afterwards.

    Parameters:
    - file (str): The target path for the file to be copied to from shared memory.

    Yields:
    - str: The path to the temporary file in shared memory.
    """
    tmp_dir = "/dev/shm/"
    fd, tmp_path = tempfile.mkstemp(dir=tmp_dir)
    try:
@@ -69,19 +91,48 @@ def copy_from_shm(file: str):


def fast_unpickle(path: str) -> Any:
    """
    Unpickles and loads an object from a file, optionally using shared memory for faster access.

    Parameters:
    - path (str): The path to the pickle file to load.

    Returns:
    - Any: The object loaded from the pickle file.
    """
    with copy_to_shm(path) as tmp_path:
        with open(tmp_path, "rb") as f:
            return pickle.load(f)


def fast_pickle(obj: Any, path: str) -> None:
    """
    Pickles and saves an object to a file, optionally using shared memory for faster access.

    Parameters:
    - obj (Any): The object to be pickled and saved.
    - path (str): The path where the pickle file should be saved.
    """
    with copy_from_shm(path) as tmp_path:
        with open(tmp_path, "wb") as f:
            pickle.dump(obj, f)


def load_tensors(shaped_arrays, directory, mesh_config, tensor_indices=None):
    """
    Loads tensors from files in parallel using a ThreadPoolExecutor. This function is intended for use with
    arrays that have a predefined shape and dtype, loading them from a directory where each tensor is saved
    in a separate file.

    Parameters:
    - shaped_arrays: A sequence of arrays providing the shapes and dtypes of the tensors to load.
    - directory (str): The directory from which to load the tensors.
    - mesh_config: Configuration for data parallelism across processes or devices.
    - tensor_indices (Optional): Specific indices of tensors to load. If None, all tensors are loaded.

    Returns:
    - List of numpy arrays or zeros depending on the process's role in data parallelism.
    """
    pool = ThreadPoolExecutor(max_workers=32)
    fs = list()
    num_tensors = 0
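Taken together, the two helpers give a shared-memory-staged pickle round trip. An illustrative use, with a hypothetical path and object (assumes the helpers above are in scope):

# Write: the object is pickled into /dev/shm, then copied to the target.
state = {"step": 0, "params": [1.0, 2.0]}
fast_pickle(state, "/tmp/ckpt.pkl")

# Read: the file is first copied into /dev/shm, then unpickled from there.
assert fast_unpickle("/tmp/ckpt.pkl") == state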
@@ -108,6 +159,17 @@ def load_tensors(shaped_arrays, directory, mesh_config, tensor_indices=None):


def path_tuple_to_string(path: tuple) -> str:
    """
    Converts a tuple representing a path in a nested structure to a string. This function is specifically
    used for handling paths within the structure of saved states or parameters, making it easier to
    identify and manipulate specific elements.

    Parameters:
    - path (tuple): A tuple representing a path in a nested structure.

    Returns:
    - str: A string representation of the path, suitable for logging or identification.
    """
    pieces = []
    for elem in path:
        if isinstance(elem, jax.tree_util.DictKey):
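For context, this is the kind of key-path tuple jax.tree_util produces; a standalone illustration (not the upstream code) of turning such paths into strings:

import jax

params = {"decoder": {"layer_0": {"w": 1.0}}}
flat, _ = jax.tree_util.tree_flatten_with_path(params)
for path, leaf in flat:
    # Each path element here is a DictKey whose .key is the dictionary key.
    name = "/".join(e.key for e in path if isinstance(e, jax.tree_util.DictKey))
    print(name, leaf)  # decoder/layer_0/w 1.0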
@@ -124,6 +186,19 @@ def get_load_path_str(
    load_rename_rules: Optional[list[tuple[str, str]]] = None,
    load_exclude_rules: Optional[list[str]] = None,
) -> Optional[str]:
    """
    Determines the load path for a given initial path string, applying exclusion and renaming rules. This
    function is used in the context of loading states or parameters from files, allowing for flexible
    mapping or exclusion based on pattern matching.

    Parameters:
    - init_path_str (str): The initial path string to process.
    - load_rename_rules (Optional[list[tuple[str, str]]]): Rules for renaming paths based on pattern matching.
    - load_exclude_rules (Optional[list[str]]): Patterns for paths to exclude from loading.

    Returns:
    - Optional[str]: The processed load path string, or None if excluded.
    """
    # Exclusion
    if load_exclude_rules is not None:
        for search_pattern in load_exclude_rules:
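The hunk is truncated; a minimal sketch of the exclude-then-rename idea it describes, assuming regex matching with re.search/re.sub (the upstream details may differ):

import re
from typing import Optional

def get_load_path_str_sketch(
    init_path_str: str,
    load_rename_rules: Optional[list[tuple[str, str]]] = None,
    load_exclude_rules: Optional[list[str]] = None,
) -> Optional[str]:
    # Exclusion: any matching pattern drops the path entirely.
    if load_exclude_rules is not None:
        for search_pattern in load_exclude_rules:
            if re.search(search_pattern, init_path_str):
                return None
    # Renaming: the first matching rule rewrites the path.
    if load_rename_rules is not None:
        for search_pattern, replacement in load_rename_rules:
            if re.search(search_pattern, init_path_str):
                return re.sub(search_pattern, replacement, init_path_str)
    return init_path_str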
@@ -148,6 +223,21 @@ def replace_with_load_state(
    load_exclude_rules: Optional[list[str]] = None,
    mesh_config: tuple = (1, 1),
) -> Any:
    """
    Replaces elements of an initial state with elements from a loaded state, applying renaming and exclusion
    rules. This function supports conditional inclusion and transformation of state elements based on complex
    criteria, facilitating flexible state restoration.

    Parameters:
    - init_state (Any): The initial state before replacement.
    - load_state (Any): The state from which to load replacements.
    - load_rename_rules (Optional[list[tuple[str, str]]]): Rules for renaming paths.
    - load_exclude_rules (Optional[list[str]]): Patterns for paths to exclude from loading.
    - mesh_config (tuple): Configuration for data parallelism.

    Returns:
    - Any: The initial state with elements replaced from the load state.
    """
    flatten_load, _ = jax.tree_util.tree_flatten_with_path(load_state)
    flatten_init, structure_init = jax.tree_util.tree_flatten_with_path(init_state)
    load_map = {path_tuple_to_string(path): tensor for path, tensor in flatten_load}
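A toy illustration of the flatten / look up / unflatten pattern those first three lines set up (simplified: no rename or exclude rules, plain dict keys):

import jax

init = {"a": 0.0, "b": 0.0}
load = {"a": 1.0}

flat_init, treedef = jax.tree_util.tree_flatten_with_path(init)
flat_load, _ = jax.tree_util.tree_flatten_with_path(load)

as_str = lambda path: "/".join(e.key for e in path)
load_map = {as_str(path): leaf for path, leaf in flat_load}

# Keep the loaded value where one exists, else the initial value.
merged = [load_map.get(as_str(path), leaf) for path, leaf in flat_init]
print(jax.tree_util.tree_unflatten(treedef, merged))  # {'a': 1.0, 'b': 0.0}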
@@ -186,6 +276,23 @@ def restore(
    state_sharding,
    init_state: Optional[Any] = None,
) -> Any:
    """
    Restores the state from a checkpoint, optionally focusing on parameters only, and applies sharding
    configurations. This function is designed for restoring model states from disk with support for distributed
    environments, handling the intricacies of partitioning and host-specific configurations.

    Parameters:
    - checkpoint_path (str): The path to the checkpoint directory.
    - state_shapes (Any): The expected shapes of the state to restore.
    - mesh: The mesh configuration for distributed environments.
    - between_hosts_config: Configuration for data exchange between hosts.
    - params_only (bool): Whether to restore parameters only, excluding other state parts.
    - state_sharding: Sharding configuration for the state.
    - init_state (Optional[Any]): The initial state to which the checkpoint data is applied.

    Returns:
    - Any: The restored state, potentially sharded across the distributed environment.
    """
    ckpt_path = os.path.join(checkpoint_path, "ckpt-0")

    rank_logger.info("Loading checkpoint at {}".format(ckpt_path))
run.py (21 lines changed)
@@ -22,6 +22,27 @@ CKPT_PATH = "./checkpoints/"


def main():
    """
    Initializes and runs a text generation model using predefined model configurations and inference settings.

    This function sets up a language model with specific configurations, including model architecture details
    (e.g., embedding sizes, number of layers, attention heads, and MoE settings) and text generation settings
    (e.g., vocabulary size, token identifiers). It initializes an inference runner with the model, checkpoint
    path, tokenizer, and mesh configuration. The inference runner is then used to generate text based on a
    given prompt and output the result.

    The process involves:
    - Creating a `LanguageModelConfig` instance with specified model parameters, including transformer
      configurations and quantization settings for weights.
    - Initializing an `InferenceRunner` with the model configuration, batch size per device, checkpoint path,
      and other relevant settings.
    - Calling the `initialize` method on the inference runner to prepare the model and tokenizer for inference.
    - Generating text based on a provided prompt using the `sample_from_model` function, which internally
      manages the sampling process through the inference runner.

    Output:
    - Prints the generated text continuation for a prompt to the standard output.
    """
    grok_1_model = LanguageModelConfig(
        vocab_size=128 * 1024,
        pad_token=0,
runners.py (252 lines changed)
@@ -48,6 +48,20 @@ TOP_K = 8


class SampleSettings(NamedTuple):
    """
    A NamedTuple for storing settings used during the sampling process.

    Attributes:
    - temperature (ArrayLike): The temperature controls the randomness of the output. Lower values make
      the model outputs more deterministic, while higher values increase diversity.
    - nucleus_p (ArrayLike): The nucleus probability controls the cumulative distribution function of
      token probabilities, effectively truncating low-probability tokens to focus sampling on a subset
      of plausible tokens.
    - mask (ArrayLike): A binary mask indicating which tokens are allowed (1) and which are disallowed (0)
      for sampling in each batch element.
    - active (ArrayLike): A binary indicator for whether a given batch element is actively being used for
      generation. Elements not in use are ignored in computations to save resources.
    """
    temperature: ArrayLike
    nucleus_p: ArrayLike
    mask: ArrayLike
@@ -56,6 +70,15 @@ class SampleSettings(NamedTuple):


class SampleOutput(NamedTuple):
    """
    A NamedTuple for storing the output from the sampling process.

    Attributes:
    - token_id (ArrayLike): The sampled token ID for each batch element.
    - prob (ArrayLike): The probability associated with the sampled token for each batch element.
    - top_k_token_ids (ArrayLike): The IDs of the top-k most probable tokens for each batch element.
    - top_k_probs (ArrayLike): The probabilities associated with the top-k token IDs for each batch element.
    """
    token_id: ArrayLike
    prob: ArrayLike
    top_k_token_ids: ArrayLike
@@ -63,6 +86,19 @@ class SampleOutput(NamedTuple):


def insert_slice(memory: Memory, slice, length, i):
    """
    Inserts a slice of KVMemory into a Memory object at a specified index. This function updates the Memory
    object with a new slice that includes updated steps for each layer based on the provided length.

    Parameters:
    - memory (Memory): The original memory object to be updated.
    - slice: A slice of the memory to be inserted, typically representing a new or modified piece of information.
    - length (int): The step length to set for each KVMemory layer in the inserted slice.
    - i (int): The index at which the slice is to be inserted into the memory.

    Returns:
    - Memory: The updated memory object with the new slice inserted at the specified index.
    """
    slice = Memory(
        layers=[
            KVMemory(layer.k, layer.v, step=jnp.array([length]))
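The per-layer update the docstring describes amounts to writing one batch row into a larger buffer. A toy illustration of that primitive (hypothetical shapes, not the upstream code):

import jax
import jax.numpy as jnp

buffer = jnp.zeros((8, 4))   # e.g. a [batch, feature] slot for KV state
update = jnp.ones((1, 4))    # a single batch element to insert

# Write `update` into row i of `buffer` without touching the other rows.
i = 3
buffer = jax.lax.dynamic_update_index_in_dim(buffer, update[0], i, axis=0)
print(buffer[3])  # [1. 1. 1. 1.]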
@@ -75,6 +111,17 @@ def insert_slice(memory: Memory, slice, length, i):


def pad_to_size(x, size):
    """
    Pads or truncates an array to a specified size. If the array is longer than the target size, it will be
    truncated from the left. If it is shorter, it will be right-padded with zeros.

    Parameters:
    - x (ArrayLike): The array to be padded or truncated.
    - size (int): The target size for the array.

    Returns:
    - ArrayLike: The array adjusted to the specified size.
    """
    if x.shape[0] > size:
        # Left truncate if the context is too long.
        x = x[-size:]
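The padding branch falls outside the hunk; a plausible completion, assuming numpy-style padding (a sketch, not the verbatim upstream code):

import numpy as np

def pad_to_size_sketch(x, size):
    if x.shape[0] > size:
        # Left-truncate: keep the most recent `size` context tokens.
        x = x[-size:]
    # Right-pad the leading axis with zeros up to the target size.
    return np.pad(x, [(0, size - x.shape[0])] + [(0, 0)] * (x.ndim - 1))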
@@ -82,7 +129,20 @@ def pad_to_size(x, size):


def top_p_filter(logits: jax.Array, top_p: jax.Array) -> jax.Array:
    """
    Performs nucleus filtering on logits.

    Filters logits to retain only a subset that corresponds to the top-p cumulative probability. This
    nucleus filtering method is used to focus the sampling process on a subset of plausible tokens, improving
    the quality of generated text.

    Parameters:
    - logits (jax.Array): The logits to filter.
    - top_p (jax.Array): The cumulative probability threshold for filtering logits.

    Returns:
    - jax.Array: The filtered logits with low-probability tokens set to -inf, effectively removing them
      from consideration during sampling.
    """
    assert logits.ndim == top_p.ndim, f"Expected {logits.ndim} equal {top_p.ndim}"
    sorted_logits = jax.lax.sort(logits, is_stable=False)
    sorted_probs = jax.nn.softmax(sorted_logits)
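A self-contained sketch of the sort / softmax / cumulative-sum recipe the hunk begins; the threshold step is an assumption about how the truncated remainder behaves, not the verbatim code:

import jax
import jax.numpy as jnp

def top_p_filter_sketch(logits: jax.Array, top_p: float) -> jax.Array:
    sorted_logits = jnp.sort(logits, axis=-1)              # ascending
    sorted_probs = jax.nn.softmax(sorted_logits, axis=-1)
    cum_probs = jnp.cumsum(sorted_probs, axis=-1)
    # First sorted position whose suffix (the top of the distribution) fits in top_p.
    idx = jnp.argmax(cum_probs >= 1 - top_p, axis=-1)
    threshold = jnp.take_along_axis(sorted_logits, idx[..., None], axis=-1)
    # Tokens below the nucleus threshold are removed from consideration.
    return jnp.where(logits < threshold, -jnp.inf, logits)

print(jax.nn.softmax(top_p_filter_sketch(jnp.array([1.0, 2.0, 5.0]), 0.5)))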
@@ -102,6 +162,21 @@ def sample_token(
    lm_outputs: LanguageModelOutput,
    settings: SampleSettings,
) -> SampleOutput:
    """
    Samples a token from the language model outputs using specified settings, including temperature and
    nucleus probability filtering. This function also computes the probability of the sampled token and
    identifies the top-k tokens and their probabilities.

    Parameters:
    - rngs (jax.random.PRNGKey): The random number generator state for sampling.
    - lm_outputs (LanguageModelOutput): The outputs from the language model, including logits.
    - settings (SampleSettings): The settings controlling the sampling process, including temperature,
      nucleus probability, token masking, and active batch elements indicator.

    Returns:
    - SampleOutput: The results of the sampling process, including the sampled token ID, its probability,
      and the top-k tokens and their probabilities for each batch element.
    """
    # Expand the settings shape to match the logit shape.
    settings = SampleSettings(
        temperature=jnp.expand_dims(settings.temperature, (1, 2)),  # Input [B], output [B, 1, 1].
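Stripped of masking and the top-k bookkeeping, the core step is temperature scaling followed by a categorical draw; a toy version with illustrative shapes:

import jax
import jax.numpy as jnp

logits = jnp.array([[2.0, 1.0, 0.1]])   # [B=1, vocab=3]
temperature = 0.7

rng = jax.random.PRNGKey(0)
token = jax.random.categorical(rng, logits / temperature, axis=-1)   # [B]
prob = jax.nn.softmax(logits / temperature, axis=-1)[0, token[0]]
print(int(token[0]), float(prob))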
@@ -135,6 +210,25 @@


@dataclass
class ModelRunner:
    """
    Manages the execution of a language model, including initialization, forward passes, and state management.

    Attributes:
    - model (LanguageModelConfig): The configuration for the language model.
    - bs_per_device (float): The batch size per device.
    - load_rename_rules (Optional[list[tuple[str, str]]]): Rules for renaming parameters during model loading.
    - load_exclude_rules (Optional[list[str]]): Rules for excluding parameters during model loading.
    - rng_seed (int): The initial random number generator seed.
    - transform_forward (bool): Flag to determine whether to transform the forward function with Haiku.
    - checkpoint_path (str): The path to the directory containing model checkpoints for loading.

    Methods:
    - make_forward_fn: Creates the forward function for the model, potentially transforming it with Haiku.
    - initialize: Initializes the model runner with data and mesh configuration, preparing it for execution.
    - init: Initializes the model parameters using provided data.
    - get_state_sharding: Determines the sharding configuration for model parameters based on the initialization data.
    - load_or_init: Loads the model from a checkpoint or initializes it if the checkpoint is not available or not specified.
    """
    model: LanguageModelConfig

    bs_per_device: float = 2.0
@@ -148,6 +242,18 @@ class ModelRunner:
    checkpoint_path: str = ""

    def make_forward_fn(self, mesh: Any):
        """
        Creates and optionally transforms the forward function of the model. This method constructs a forward
        function that takes input tokens and produces model outputs. If `transform_forward` is set to True, the
        method also transforms this function using Haiku's transform method to prepare it for JIT compilation
        and execution on the mesh.

        Parameters:
        mesh (Any): The mesh configuration to be used for distributing the computation across devices.

        Returns:
        Callable: The forward function, potentially transformed by Haiku, ready for execution.
        """
        def forward(tokens):
            out = self.model.make(mesh=mesh)(tokens)
            return out, None
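For readers unfamiliar with the Haiku pattern referenced here, a minimal standalone example of hk.transform with a toy stand-in for the model:

import haiku as hk
import jax
import jax.numpy as jnp

def forward(tokens):
    return hk.Linear(4)(tokens)       # toy stand-in for the real model

forward_t = hk.transform(forward)     # pure init/apply pair

rng = jax.random.PRNGKey(0)
tokens = jnp.ones((1, 8))
params = forward_t.init(rng, tokens)          # explicit parameter creation
out = forward_t.apply(params, rng, tokens)    # functional forward pass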
@@ -162,6 +268,20 @@
        local_mesh_config: tuple[int, int],
        between_hosts_config: tuple[int, int],
    ):
        """
        Initializes the model runner with necessary configurations and data. This includes setting up the mesh
        for distributed computation, calculating batch sizes based on the provided configuration, and preparing
        the forward function for execution.

        Parameters:
        init_data: Initial data used for model initialization. This is typically dummy data matching the
            expected input format and dimensions of the model.
        local_mesh_config (tuple[int, int]): The configuration of the local mesh, specifying how devices
            are organized locally for parallel computation.
        between_hosts_config (tuple[int, int]): The configuration for communication between hosts in a
            distributed setup, important for multi-host environments.
        """
        num_replicas = math.prod(between_hosts_config)
        self.model.initialize()
        self.model.fprop_dtype = jnp.bfloat16
@@ -191,12 +311,37 @@
        self.init_fn = pjit.pjit(self.init, out_shardings=self.state_sharding)

    def init(self, rng: jax.Array, data) -> TrainingState:
        """
        Initializes the model parameters using provided data and a random number generator state. This method
        is only called when `transform_forward` is True, indicating that the forward function has been transformed
        by Haiku and requires explicit initialization.

        Parameters:
        rng (jax.Array): The random number generator state for initializing model parameters.
        data: The initial data used for parameter initialization, usually including input tokens and
            possibly target tokens for supervised learning tasks.

        Returns:
        TrainingState: An instance of TrainingState containing the initialized model parameters.
        """
        assert self.transform_forward
        rng, init_rng = jax.random.split(rng)
        params = self.forward.init(init_rng, data["inputs"])
        return TrainingState(params=params)

    def get_state_sharding(self, init_data):
        """
        Determines the sharding configuration for model parameters based on initialization data. This method
        evaluates the shape of model parameters during initialization to apply the partition rules specified
        in the model's configuration, facilitating efficient distributed computation.

        Parameters:
        init_data: The initial data used for model initialization, which helps determine the appropriate
            sharding configuration based on model parameter shapes.

        Returns:
        A sharding configuration that specifies how model parameters should be partitioned across the mesh.
        """
        assert self.transform_forward
        rng = jax.random.PRNGKey(self.rng_seed)
        rank_logger.info(f"partition rules: {self.model.partition_rules}")
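get_state_sharding evaluates parameter shapes without committing memory; the standard mechanism for that is jax.eval_shape, illustrated here on a toy initializer (not the upstream code):

import jax

def init_params(rng):
    return {"w": jax.random.normal(rng, (1024, 1024))}

# No 1024x1024 buffer is ever allocated; only shapes/dtypes come back,
# and partition rules can then be matched against them.
shapes = jax.eval_shape(init_params, jax.random.PRNGKey(0))
print(shapes)  # {'w': ShapeDtypeStruct(shape=(1024, 1024), dtype=float32)}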
@@ -215,6 +360,20 @@
        from_checkpoint: bool = True,
        init_fn: Optional[Callable] = None,
    ):
        """
        Loads model parameters from a checkpoint or initializes them if the checkpoint is not available. This
        method provides flexibility in starting the model from a known state or freshly initializing parameters
        based on the model configuration and provided initial data.

        Parameters:
        init_data: Initial data used for model parameter initialization if needed.
        from_checkpoint (bool): Flag indicating whether to attempt loading parameters from a checkpoint.
        init_fn (Optional[Callable]): An optional initialization function that overrides the default
            initialization method if provided.

        Returns:
        The model state, either loaded from a checkpoint or newly initialized.
        """
        rng = jax.random.PRNGKey(self.rng_seed)

        if not self.checkpoint_path or not from_checkpoint:
@@ -251,6 +410,16 @@ class ModelRunner:


@dataclass
class Request:
    """
    Data class for storing information about a single sampling request to the model.

    Attributes:
    prompt (str): The text prompt to generate text from.
    temperature (float): Controls the randomness of the output. Lower values result in more deterministic outputs.
    nucleus_p (float): The cumulative probability threshold for nucleus sampling, focusing generation on high-probability tokens.
    rng_seed (int): Seed for the random number generator to ensure reproducibility of the results.
    max_len (int): The maximum length of the generated text sequence.
    """
    prompt: str
    temperature: float
    nucleus_p: float
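Constructing a request is then straightforward; the field values below are illustrative, not defaults from the repository:

request = Request(
    prompt="The answer to life, the universe, and everything is ",
    temperature=0.01,   # near-greedy decoding
    nucleus_p=0.95,     # keep the top 95% of probability mass
    rng_seed=42,
    max_len=100,
)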
@@ -260,6 +429,24 @@ class Request:


@dataclass
class InferenceRunner:
    """
    Manages inference operations for generating text based on prompts, including initializing the model and tokenizer,
    prefilling the model memory, and running the actual text generation.

    Attributes:
    name (str): A name identifier for the inference runner.
    runner (Any): An instance of ModelRunner that manages the underlying language model.
    load (str): A path to the model checkpoint for loading.
    tokenizer_path (str): Path to the SentencePiece tokenizer model file.
    local_mesh_config (Tuple[int, int]): Configuration for the local device mesh.
    between_hosts_config (Tuple[int, int]): Configuration for communication between hosts in a distributed setup.
    pad_sizes (tuple[int]): A tuple of padding sizes for processing different lengths of prompts.

    Methods:
    get_pad_bucket: Determines the appropriate padding size for a given input length.
    initialize: Initializes the model runner, tokenizer, and pre-compiles model functions for different input sizes.
    run: A generator method that accepts prompts and returns generated text, managing batch slots and sampling steps.
    """
    name: str
    runner: Any
    load: str
@@ -269,10 +456,25 @@ class InferenceRunner:
    pad_sizes: tuple[int] = (1024,)

    def get_pad_bucket(self, size):
        """
        Determines the appropriate padding size for a given input length. This method uses the predefined
        pad sizes to find the smallest pad size that is larger than or equal to the given size.

        Parameters:
        size (int): The length of the input sequence to be padded.

        Returns:
        int: The selected pad size from the predefined pad sizes that best fits the input length.
        """
        i = bisect.bisect_left(self.pad_sizes, size)
        return self.pad_sizes[min(i, len(self.pad_sizes) - 1)]

    def initialize(self):
        """
        Initializes the inference runner by setting up the model runner, loading the tokenizer, pre-compiling
        model functions for different input sizes, and loading or initializing model parameters. This method
        ensures that the inference runner is ready to generate text based on prompts.
        """
        runner = self.runner
        self.runner.transform_forward = True
        dummy_data = dict(
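The bucket lookup is plain bisect; with a multi-size configuration (illustrative, since the default above is just (1024,)) it behaves like this:

import bisect

pad_sizes = (256, 512, 1024)
for n in (100, 256, 700, 5000):
    i = bisect.bisect_left(pad_sizes, n)
    # Inputs longer than every bucket clamp to the largest size.
    print(n, "->", pad_sizes[min(i, len(pad_sizes) - 1)])
# 100 -> 256, 256 -> 256, 700 -> 1024, 5000 -> 1024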
@@ -440,7 +642,15 @@ class InferenceRunner:
        )

    def run(self):
        """
        Generator method that accepts prompts and manages the text generation process. This method handles
        tokenization of prompts, setup of sampling settings, and orchestration of the sampling steps to
        generate and return the generated text.

        Yields:
            str: The generated text for each prompt received. This method operates as a coroutine, yielding
            generated text back to the caller and awaiting new prompts for further generation.
        """
        runner = self.runner
        mesh = runner.mesh
        max_len = runner.model.sequence_len
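The coroutine protocol this describes is the standard generator send/yield exchange; a toy stand-in for the server:

def toy_server():
    request = yield            # primed by next(); waits for a request
    while True:
        # Exchange each incoming request for a "generated" response.
        request = yield f"echo: {request}"

server = toy_server()
next(server)                   # prime the generator
print(server.send("hello"))    # -> echo: hello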
@@ -580,6 +790,18 @@ class InferenceRunner:
def make_mesh(
    local_mesh_config: tuple[int, ...], between_hosts_config: tuple[int, ...]
) -> jax.sharding.Mesh:
    """
    Creates a JAX mesh from the provided configuration, which is used for parallel computation across devices.

    Parameters:
    local_mesh_config (tuple[int, ...]): A tuple specifying the local mesh configuration. This usually corresponds
        to how local devices are arranged and partitioned for computation.
    between_hosts_config (tuple[int, ...]): A tuple specifying the mesh configuration for communication between
        different hosts, relevant in multi-host distributed setups.

    Returns:
    jax.sharding.Mesh: A mesh object that encapsulates the device topology and partitioning for parallel computation.
    """
    assert len(local_mesh_config) == 2
    assert len(between_hosts_config) == 2
    rank_logger.info("Detected %s devices in mesh", jax.device_count())
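A single-host illustration of building such a mesh; the axis names are assumptions for the example, not the names used upstream:

import jax
from jax.experimental import mesh_utils

# Arrange all local devices into a 2-D grid: 1 data replica x N model shards.
devices = mesh_utils.create_device_mesh((1, jax.device_count()))
mesh = jax.sharding.Mesh(devices, axis_names=("data", "model"))
print(mesh.shape)  # e.g. {'data': 1, 'model': 8}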
@@ -594,6 +816,32 @@ def make_mesh(


def sample_from_model(server, prompt, max_len, temperature):
    """
    Samples tokens from a trained model given a prompt and sampling settings. This function is designed
    to interact with a generator-based server that manages the state of the model and performs the actual
    sampling. The server is expected to yield control back to this function, which then sends a request
    object containing the prompt and settings, and waits for the generated output.

    The function initializes the server (if not already done), constructs the request with the provided
    parameters, and sends this request to the server. The sampled output, typically a continuation of the
    prompt, is then received from the server.

    Parameters:
    - server (generator): The generator-based server that handles model inference. This server is expected
      to manage the lifecycle of the model, including loading parameters, setting up the environment, and
      performing the token sampling.
    - prompt (str): The initial text prompt to which the model generates a continuation. This prompt is
      encoded and fed into the model as part of the request.
    - max_len (int): The maximum length of the generated sequence, including the length of the prompt.
      This controls how long the model's output can be.
    - temperature (float): A parameter controlling the randomness of the output. Lower values make the
      model's outputs more deterministic (and potentially more repetitive), while higher values increase
      diversity at the cost of coherence.

    Returns:
    - str: The generated text as a continuation of the provided prompt, up to a maximum length specified
      by `max_len`.
    """
    next(server)
    inp = Request(
        prompt=prompt,
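Putting the pieces together, a hypothetical end-to-end call; the runner construction is elided, and the names simply mirror the classes above:

# Assumes an InferenceRunner instance built from a ModelRunner config.
inference_runner.initialize()
gen = inference_runner.run()            # the generator-based server

output = sample_from_model(
    gen,
    prompt="The answer to life, the universe, and everything is ",
    max_len=100,
    temperature=0.01,
)
print(f"Output for prompt: {output}")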