diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
old mode 100644
new mode 100755
index 17cf517db..e487258d1
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -41,19 +41,7 @@ jobs:
 
     - name: Run tests
       run: |
-        pytest \
-          tests/test_entropy_parameters.py \
-          tests/test_context_model.py \
-          tests/test_channel_context.py \
-          tests/test_attention_context.py \
-          tests/test_model_transforms.py \
-          tests/test_integration.py \
-          tests/test_performance.py \
-          tests/test_parallel_process.py \
-          tests/test_colorbar.py \
-          tests/test_entropy_model.py \
-          tests/test_octree_coding.py \
-          -v --cov=src --cov-report=xml -m "not gpu and not slow"
+        pytest tests/ -v --cov=src --cov-report=xml -m "not gpu and not slow"
 
     - name: Upload coverage
       uses: codecov/codecov-action@v4
diff --git a/.python-version b/.python-version
old mode 100644
new mode 100755
index eee6392d5..c8cfe3959
--- a/.python-version
+++ b/.python-version
@@ -1 +1 @@
-3.8.16
+3.10
diff --git a/CLAUDE.md b/CLAUDE.md
old mode 100644
new mode 100755
index 2f7cab7aa..f638e0645
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -68,7 +68,7 @@ else:
 
 **Rules:**
 - Model-level `call()` methods and any layer that branches on training mode **must** accept `training=None` and pass it through to sub-layers that need it.
-- Leaf layers that do not use `training` internally (e.g., `CENICGDN`, `SpatialSeparableConv`, `MaskedConv3D`, `SliceTransform`) currently omit it from their signatures. This is the established convention — do not add `training` to these unless they gain training-dependent behavior.
+- Leaf layers that do not use `training` internally (e.g., `GDN`, `SpatialSeparableConv`, `MaskedConv3D`, `SliceTransform`) currently omit it from their signatures. This is the established convention — do not add `training` to these unless they gain training-dependent behavior.
 - **Never remove the training conditional** from methods that have it. Never replace noise injection with unconditional `tf.round()`.
 - When adding new layers: include `training=None` if the layer has any training-dependent behavior. Omit it for pure computation layers.
 
@@ -77,7 +77,7 @@ else:
 All model tensors are 5D: `(batch, depth, height, width, channels)` — channels-last.
 
 - Convolutions are `Conv3D`, never `Conv2D`. Kernels are 3-tuples: `(3, 3, 3)`.
-- Channel axis is axis 4 (see `CENICGDN.call()` which does `tf.tensordot(norm, self.gamma, [[4], [0]])`).
+- Channel axis is axis 4 (see `GDN.call()` which uses `tf.einsum('...c,cd->...d', ...)`).
 - Input voxel grids have 1 channel: shape `(B, D, H, W, 1)`.
 - Do not flatten spatial dimensions to use 2D ops. The 3D structure is load-bearing.
 
diff --git a/pyproject.toml b/pyproject.toml
old mode 100644
new mode 100755
index 0d00de1cd..2dad2a53e
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.ruff]
 line-length = 120
-target-version = "py38"
+target-version = "py310"
 
 [tool.ruff.lint]
 select = ["F", "I", "E", "W"]
@@ -23,5 +23,5 @@ known-first-party = [
     "ev_compare", "ev_run_render", "mp_report", "mp_run",
     "quick_benchmark", "benchmarks", "parallel_process",
     "point_cloud_metrics", "map_color", "colorbar",
-    "cli_train", "test_utils",
+    "cli_train", "file_io", "test_utils",
 ]
diff --git a/setup.py b/setup.py
old mode 100644
new mode 100755
index 77ceefe46..8f01136e1
--- a/setup.py
+++ b/setup.py
@@ -2,12 +2,20 @@
 
 setup(
     name="deepcompress",
-    version="0.1",
+    version="2.0.0",
     package_dir={"": "src"},
     packages=find_namespace_packages(include=["*"], where="src"),
+    python_requires=">=3.10",
     install_requires=[
         'numpy',
-        'pytest',
-        'numba'
+        'tensorflow>=2.11',
+        'tensorflow-probability~=0.19',
+        'matplotlib',
+        'pandas',
+        'tqdm',
+        'pyyaml',
+        'scipy',
+        'numba',
+        'keras-tuner',
     ],
-)
\ No newline at end of file
+)
diff --git a/src/__init__.py b/src/__init__.py
old mode 100644
new mode 100755
index e69de29bb..8c0d5d5bb
--- a/src/__init__.py
+++ b/src/__init__.py
@@ -0,0 +1 @@
+__version__ = "2.0.0"
diff --git a/src/__pycache__/__init__.cpython-38.pyc b/src/__pycache__/__init__.cpython-38.pyc
deleted file mode 100644
index 6cb68d69b..000000000
Binary files a/src/__pycache__/__init__.cpython-38.pyc and /dev/null differ
diff --git a/src/__pycache__/colorbar.cpython-38.pyc b/src/__pycache__/colorbar.cpython-38.pyc
deleted file mode 100644
index a35abf980..000000000
Binary files a/src/__pycache__/colorbar.cpython-38.pyc and /dev/null differ
diff --git a/src/__pycache__/compress_octree.cpython-38.pyc b/src/__pycache__/compress_octree.cpython-38.pyc
deleted file mode 100644
index 98c63d914..000000000
Binary files a/src/__pycache__/compress_octree.cpython-38.pyc and /dev/null differ
diff --git a/src/__pycache__/ds_mesh_to_pc.cpython-38.pyc b/src/__pycache__/ds_mesh_to_pc.cpython-38.pyc
deleted file mode 100644
index fecbee4d3..000000000
Binary files a/src/__pycache__/ds_mesh_to_pc.cpython-38.pyc and /dev/null differ
diff --git a/src/__pycache__/ds_pc_octree_blocks.cpython-38.pyc b/src/__pycache__/ds_pc_octree_blocks.cpython-38.pyc
deleted file mode 100644
index 4037320d6..000000000
Binary files a/src/__pycache__/ds_pc_octree_blocks.cpython-38.pyc and /dev/null differ
diff --git a/src/__pycache__/ev_run_experiment.cpython-38.pyc b/src/__pycache__/ev_run_experiment.cpython-38.pyc
deleted file mode 100644
index 242ce49e7..000000000
Binary files a/src/__pycache__/ev_run_experiment.cpython-38.pyc and /dev/null differ
diff --git a/src/__pycache__/ev_run_render.cpython-38.pyc b/src/__pycache__/ev_run_render.cpython-38.pyc
deleted file mode 100644
index dae019466..000000000
Binary files a/src/__pycache__/ev_run_render.cpython-38.pyc and /dev/null differ
diff --git a/src/__pycache__/experiment.cpython-38.pyc b/src/__pycache__/experiment.cpython-38.pyc
deleted file mode 100644
index 861f805c5..000000000
Binary files a/src/__pycache__/experiment.cpython-38.pyc and /dev/null differ
diff --git a/src/__pycache__/map_color.cpython-38.pyc b/src/__pycache__/map_color.cpython-38.pyc
deleted file mode 100644
index 8305c96e9..000000000
Binary files a/src/__pycache__/map_color.cpython-38.pyc and /dev/null differ
diff --git a/src/__pycache__/model_opt.cpython-38.pyc b/src/__pycache__/model_opt.cpython-38.pyc
deleted file mode 100644
index 9970df534..000000000
Binary files a/src/__pycache__/model_opt.cpython-38.pyc and /dev/null differ
diff --git a/src/__pycache__/model_transforms.cpython-38.pyc b/src/__pycache__/model_transforms.cpython-38.pyc
deleted file mode 100644
index de9c5b972..000000000
Binary files a/src/__pycache__/model_transforms.cpython-38.pyc and /dev/null differ
diff --git a/src/__pycache__/octree_coding.cpython-38.pyc b/src/__pycache__/octree_coding.cpython-38.pyc
deleted file mode 100644
index 1be3bcbcf..000000000
Binary files a/src/__pycache__/octree_coding.cpython-38.pyc and /dev/null differ
diff --git a/src/__pycache__/parallel_process.cpython-38.pyc b/src/__pycache__/parallel_process.cpython-38.pyc
deleted file mode 100644
index c8b2259ce..000000000
Binary files a/src/__pycache__/parallel_process.cpython-38.pyc and /dev/null differ
diff --git a/src/__pycache__/patch_gaussian_conditional.cpython-38.pyc b/src/__pycache__/patch_gaussian_conditional.cpython-38.pyc
deleted file mode 100644
index 3f56e8634..000000000
Binary files a/src/__pycache__/patch_gaussian_conditional.cpython-38.pyc and /dev/null differ
diff --git a/src/__pycache__/pc_metric.cpython-38.pyc b/src/__pycache__/pc_metric.cpython-38.pyc
deleted file mode 100644
index ca28ae5c9..000000000
Binary files a/src/__pycache__/pc_metric.cpython-38.pyc and /dev/null differ
diff --git a/src/attention_context.py b/src/attention_context.py
old mode 100644
new mode 100755
index 5cc6da477..cb5702211
--- a/src/attention_context.py
+++ b/src/attention_context.py
@@ -15,7 +15,7 @@
 
 import tensorflow as tf
 
-from constants import LOG_2_RECIPROCAL
+from .constants import LOG_2_RECIPROCAL
 
 
 class WindowedAttention3D(tf.keras.layers.Layer):
@@ -669,8 +669,8 @@ def __init__(self,
         self.num_attention_layers = num_attention_layers
 
         # Import here to avoid circular dependency
-        from entropy_model import ConditionalGaussian
-        from entropy_parameters import EntropyParameters
+        from .entropy_model import ConditionalGaussian, PatchedGaussianConditional
+        from .entropy_parameters import EntropyParameters
 
         # Hyperprior-based parameter prediction
         self.entropy_parameters = EntropyParameters(
@@ -714,6 +714,9 @@ def __init__(self,
         # Conditional Gaussian for entropy coding
         self.conditional = ConditionalGaussian()
 
+        # Hyperprior entropy model (for z)
+        self.hyper_entropy = PatchedGaussianConditional()
+
         self.scale_min = 0.01
 
     def _split_params(self, params: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
@@ -723,6 +726,7 @@ def _split_params(self, params: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
         return mean, scale
 
     def call(self, y: tf.Tensor, z_hat: tf.Tensor,
+             z: Optional[tf.Tensor] = None,
              training: Optional[bool] = None) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
         """
         Process latent y using hyperprior and attention context.
@@ -730,6 +734,7 @@ def call(self, y: tf.Tensor, z_hat: tf.Tensor,
         Args:
             y: Main latent representation.
             z_hat: Decoded hyperprior.
+            z: Quantized/noised hyper-latent for computing z rate.
             training: Whether in training mode.
 
         Returns:
@@ -758,10 +763,18 @@ def call(self, y: tf.Tensor, z_hat: tf.Tensor,
         # Process through conditional Gaussian
         y_hat, y_likelihood = self.conditional(y, scale, mean, training=training)
 
-        # Compute total bits
-        # Using pre-computed reciprocal: multiplication is faster than division
-        bits_per_element = -y_likelihood * LOG_2_RECIPROCAL
-        total_bits = tf.reduce_sum(bits_per_element)
+        # Compute y bits from discretized likelihood
+        y_bits = tf.reduce_sum(-tf.math.log(y_likelihood) * LOG_2_RECIPROCAL)
+
+        # Compute z bits if z is provided
+        z_bits = tf.constant(0.0)
+        if z is not None:
+            if not self.hyper_entropy.built:
+                self.hyper_entropy.build(z.shape)
+            z_likelihood = self.hyper_entropy.likelihood(z)
+            z_bits = tf.reduce_sum(-tf.math.log(z_likelihood) * LOG_2_RECIPROCAL)
+
+        total_bits = y_bits + z_bits
 
         return y_hat, y_likelihood, total_bits
 
@@ -804,9 +817,9 @@ def __init__(self,
         self.num_channel_groups = num_channel_groups
         self.num_attention_layers = num_attention_layers
 
-        from channel_context import ChannelContext
-        from entropy_model import ConditionalGaussian
-        from entropy_parameters import EntropyParameters
+        from .channel_context import ChannelContext
+        from .entropy_model import ConditionalGaussian, PatchedGaussianConditional
+        from .entropy_parameters import EntropyParameters
 
         # Hyperprior parameters
         self.entropy_parameters = EntropyParameters(
@@ -819,6 +832,8 @@ def __init__(self,
             num_groups=num_channel_groups
         )
 
+        self.channels_per_group = latent_channels // num_channel_groups
+
         # Attention context (applied per channel group)
         self.attention_contexts = [
             BidirectionalMaskTransformer(
@@ -830,6 +845,17 @@ def __init__(self,
             for i in range(num_channel_groups)
         ]
 
+        # Attention output to parameters (replaces concat hack)
+        self.attention_to_params = [
+            tf.keras.layers.Conv3D(
+                filters=self.channels_per_group * 2,  # mean and scale
+                kernel_size=1,
+                padding='same',
+                name=f'attn_to_params_{i}'
+            )
+            for i in range(num_channel_groups)
+        ]
+
         # Parameter fusion per group
         self.param_fusions = [
             tf.keras.layers.Conv3D(
@@ -847,7 +873,9 @@ def __init__(self,
             for i in range(num_channel_groups)
         ]
 
-        self.channels_per_group = latent_channels // num_channel_groups
+        # Hyperprior entropy model (for z)
+        self.hyper_entropy = PatchedGaussianConditional()
+
         self.scale_min = 0.01
 
     def _split_params(self, params: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
@@ -856,6 +884,7 @@ def _split_params(self, params: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
         return mean, scale
 
     def call(self, y: tf.Tensor, z_hat: tf.Tensor,
+             z: Optional[tf.Tensor] = None,
              training: Optional[bool] = None) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
         """Process with all context types combined."""
         # Get hyperprior parameters
@@ -883,9 +912,9 @@ def call(self, y: tf.Tensor, z_hat: tf.Tensor,
             combined_mean = hyper_mean_slice + context_mean
             combined_scale = hyper_scale_slice * (1.0 + context_scale)
 
-            # Add attention refinement
+            # Project attention features to mean/scale parameters
             hyper_params = tf.concat([combined_mean, combined_scale], axis=-1)
-            attn_params = tf.concat([attn_features, attn_features], axis=-1)  # Use features for both
+            attn_params = self.attention_to_params[i](attn_features)
             combined = tf.concat([hyper_params, attn_params], axis=-1)
             fused_params = self.param_fusions[i](combined)
 
@@ -902,9 +931,18 @@ def call(self, y: tf.Tensor, z_hat: tf.Tensor,
         y_hat = tf.concat(y_hat_parts, axis=-1)
         y_likelihood = tf.concat(likelihood_parts, axis=-1)
 
-        # Using pre-computed reciprocal: multiplication is faster than division
-        bits_per_element = -y_likelihood * LOG_2_RECIPROCAL
-        total_bits = tf.reduce_sum(bits_per_element)
+        # Compute y bits from discretized likelihood
+        y_bits = tf.reduce_sum(-tf.math.log(y_likelihood) * LOG_2_RECIPROCAL)
+
+        # Compute z bits if z is provided
+        z_bits = tf.constant(0.0)
+        if z is not None:
+            if not self.hyper_entropy.built:
+                self.hyper_entropy.build(z.shape)
+            z_likelihood = self.hyper_entropy.likelihood(z)
+            z_bits = tf.reduce_sum(-tf.math.log(z_likelihood) * LOG_2_RECIPROCAL)
+
+        total_bits = y_bits + z_bits
 
         return y_hat, y_likelihood, total_bits
 
diff --git a/src/benchmarks.py b/src/benchmarks.py
old mode 100644
new mode 100755
index 709a47a5c..36797e900
--- a/src/benchmarks.py
+++ b/src/benchmarks.py
@@ -391,7 +391,7 @@ def create_mask_vectorized(kernel_size, mask_type, in_channels, filters):
 
 def benchmark_attention():
     """Benchmark attention implementations."""
-    from attention_context import SparseAttention3D, WindowedAttention3D
+    from .attention_context import SparseAttention3D, WindowedAttention3D
 
     dim = 64
     input_shape = (1, 16, 16, 16, dim)  # Smaller for testing
diff --git a/src/channel_context.py b/src/channel_context.py
old mode 100644
new mode 100755
index 60b1ba76f..7fcb06c4e
--- a/src/channel_context.py
+++ b/src/channel_context.py
@@ -11,7 +11,7 @@
 
 import tensorflow as tf
 
-from constants import LOG_2_RECIPROCAL
+from .constants import LOG_2_RECIPROCAL
 
 
 class SliceTransform(tf.keras.layers.Layer):
@@ -231,8 +231,8 @@ def __init__(self,
         self.channels_per_group = latent_channels // num_groups
 
         # Import here to avoid circular dependency
-        from entropy_model import ConditionalGaussian
-        from entropy_parameters import EntropyParameters
+        from .entropy_model import ConditionalGaussian, PatchedGaussianConditional
+        from .entropy_parameters import EntropyParameters
 
         # Hyperprior-based parameter prediction
         self.entropy_parameters = EntropyParameters(
@@ -251,6 +251,9 @@ def __init__(self,
             for i in range(num_groups)
         ]
 
+        # Hyperprior entropy model (for z)
+        self.hyper_entropy = PatchedGaussianConditional()
+
         self.scale_min = 0.01
 
     def _fuse_params(self,
@@ -269,6 +272,7 @@ def _fuse_params(self,
         return mean, scale
 
     def call(self, y: tf.Tensor, z_hat: tf.Tensor,
+             z: Optional[tf.Tensor] = None,
              training: Optional[bool] = None) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
         """
         Process latent y using hyperprior and channel-wise context.
@@ -279,6 +283,7 @@ def call(self, y: tf.Tensor, z_hat: tf.Tensor,
         Args:
             y: Main latent representation.
             z_hat: Decoded hyperprior.
+            z: Quantized/noised hyper-latent for computing z rate.
             training: Whether in training mode.
 
         Returns:
@@ -305,22 +310,17 @@ def call(self, y: tf.Tensor, z_hat: tf.Tensor,
 
             # Get context params (using y for training, y_hat for inference)
             # Note: Use .call() to pass non-tensor group_idx as keyword argument
-            if training:
+            if i == 0:
+                # First group: no context available, channel_context returns zeros
+                context_mean, context_scale = self.channel_context.call(y, group_idx=0)
+            elif training:
                 # Training: use ground truth y for context (teacher forcing)
                 context_mean, context_scale = self.channel_context.call(y, group_idx=i)
             else:
-                # Inference: use only already decoded groups (no padding needed!)
-                # The channel_context only uses channels 0..group_idx-1, so we
-                # only need to concatenate the decoded parts without padding.
-                # This optimization reduces memory allocations by ~25%.
-                if i == 0:
-                    # First group has no context - channel_context handles this
-                    y_hat_partial = y_hat_parts[0] if y_hat_parts else None
-                else:
-                    # Concatenate only the decoded parts (no zero padding)
-                    y_hat_partial = tf.concat(y_hat_parts, axis=-1)
+                # Inference: use already decoded groups for context
+                y_hat_partial = tf.concat(y_hat_parts, axis=-1)
                 context_mean, context_scale = self.channel_context.call(
-                    y_hat_partial if y_hat_partial is not None else y, group_idx=i
+                    y_hat_partial, group_idx=i
                 )
 
             # Fuse parameters
@@ -341,10 +341,18 @@ def call(self, y: tf.Tensor, z_hat: tf.Tensor,
         y_hat = tf.concat(y_hat_parts, axis=-1)
         y_likelihood = tf.concat(likelihood_parts, axis=-1)
 
-        # Compute total bits
-        # Using pre-computed reciprocal: multiplication is faster than division
-        bits_per_element = -y_likelihood * LOG_2_RECIPROCAL
-        total_bits = tf.reduce_sum(bits_per_element)
+        # Compute y bits from discretized likelihood
+        y_bits = tf.reduce_sum(-tf.math.log(y_likelihood) * LOG_2_RECIPROCAL)
+
+        # Compute z bits if z is provided
+        z_bits = tf.constant(0.0)
+        if z is not None:
+            if not self.hyper_entropy.built:
+                self.hyper_entropy.build(z.shape)
+            z_likelihood = self.hyper_entropy.likelihood(z)
+            z_bits = tf.reduce_sum(-tf.math.log(z_likelihood) * LOG_2_RECIPROCAL)
+
+        total_bits = y_bits + z_bits
 
         return y_hat, y_likelihood, total_bits
 
diff --git a/src/cli_train.py b/src/cli_train.py
old mode 100644
new mode 100755
index 9eb21a00e..f873b0300
--- a/src/cli_train.py
+++ b/src/cli_train.py
@@ -5,7 +5,7 @@
 import keras_tuner as kt
 import tensorflow as tf
 
-from ds_mesh_to_pc import read_off
+from .file_io import read_point_cloud
 
 
 def create_model(hp):
@@ -32,8 +32,10 @@ def load_and_preprocess_data(input_dir, batch_size):
     file_paths = glob.glob(os.path.join(input_dir, "*.ply"))
 
     def parse_ply_file(file_path):
-        mesh_data = read_off(file_path)
-        return mesh_data.vertices
+        vertices = read_point_cloud(file_path)
+        if vertices is None:
+            raise ValueError(f"Failed to read point cloud: {file_path}")
+        return vertices
 
     def data_generator():
         for file_path in file_paths:
@@ -70,7 +72,7 @@ def tune_hyperparameters(input_dir, output_dir, num_epochs=10):
     best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
 
     print("Best Hyperparameters:", best_hps.values)
-    best_model.save(os.path.join(output_dir, 'best_model'))
+    best_model.save_weights(os.path.join(output_dir, 'best_model.weights.h5'))
 
 def main():
     parser = argparse.ArgumentParser(description="Train a point cloud compression model with hyperparameter tuning.")
@@ -94,7 +96,7 @@ def main():
         model.compile(optimizer='adam', loss='mean_squared_error')
         dataset = load_and_preprocess_data(args.input_dir, args.batch_size)
         model.fit(dataset, epochs=args.num_epochs)
-        model.save(os.path.join(args.output_dir, 'trained_model'))
+        model.save_weights(os.path.join(args.output_dir, 'trained_model.weights.h5'))
 
 if __name__ == "__main__":
     main()
diff --git a/src/compress_octree.py b/src/compress_octree.py
old mode 100644
new mode 100755
index dd738ac38..8fa5471c3
--- a/src/compress_octree.py
+++ b/src/compress_octree.py
@@ -153,13 +153,15 @@ def partition_recursive(points: np.ndarray, bounds: Tuple[np.ndarray, np.ndarray
                     mid[i] if octant[i] == 0 else bounds[1][i] for i in range(3)
                 ])
 
-                # Find points in this octant with epsilon for stability
-                epsilon = 1e-10
-                mask = np.all(
-                    (points >= min_corner - epsilon) &
-                    (points <= max_corner + epsilon),
-                    axis=1
-                )
+                # Half-open intervals: [min, mid) for lower, [mid, max] for upper
+                lower_cond = points >= min_corner
+                upper_cond = np.array([
+                    points[:, i] <= max_corner[i]
+                    if octant[i] == 1  # upper half: inclusive
+                    else points[:, i] < max_corner[i]  # lower half: exclusive
+                    for i in range(3)
+                ]).T
+                mask = np.all(lower_cond & upper_cond, axis=1)
                 if np.any(mask):
                     partition_recursive(points[mask], (min_corner, max_corner))
 
@@ -181,19 +183,47 @@ def _save_debug_info(self, stage: str, data: Dict[str, Any]) -> None:
         os.makedirs(debug_dir, exist_ok=True)
 
         for name, array in data.items():
-            if isinstance(array, (np.ndarray, dict)):
+            if isinstance(array, np.ndarray):
                 np.save(os.path.join(debug_dir, f"{name}.npy"), array)
 
     def save_compressed(self, grid: np.ndarray, metadata: Dict[str, Any], filename: str) -> None:
         """Save compressed data with metadata."""
-        os.makedirs(os.path.dirname(os.path.abspath(filename)), exist_ok=True)
-        np.savez_compressed(filename, grid=grid, metadata=metadata)
+        import json
+        import math
 
-        if self.debug_output:
-            debug_path = f"{filename}.debug.npz"
-            np.savez_compressed(debug_path, **metadata)
+        os.makedirs(os.path.dirname(os.path.abspath(filename)), exist_ok=True)
+        # Save grid without pickle (bool array, no object dtype)
+        np.savez_compressed(filename, grid=grid)
+        # Save metadata as JSON sidecar (safe, no arbitrary code execution)
+        meta_path = filename + '.meta.json'
+        serializable = {}
+        for k, v in metadata.items():
+            if isinstance(v, np.ndarray):
+                serializable[k] = v.tolist()
+            elif isinstance(v, (np.floating, np.integer)):
+                val = v.item()
+                if isinstance(val, float) and (math.isnan(val) or math.isinf(val)):
+                    serializable[k] = None
+                else:
+                    serializable[k] = val
+            elif isinstance(v, float) and (math.isnan(v) or math.isinf(v)):
+                serializable[k] = None
+            else:
+                serializable[k] = v
+        with open(meta_path, 'w') as f:
+            json.dump(serializable, f)
 
     def load_compressed(self, filename: str) -> Tuple[np.ndarray, Dict[str, Any]]:
         """Load compressed data with metadata."""
-        data = np.load(filename, allow_pickle=True)
-        return data['grid'], data['metadata'].item()
+        import json
+
+        data = np.load(filename, allow_pickle=False)
+        grid = data['grid']
+        meta_path = filename + '.meta.json'
+        with open(meta_path, 'r') as f:
+            metadata = json.load(f)
+        # Convert lists back to numpy arrays for known array fields
+        for key in ('min_bounds', 'max_bounds', 'ranges', 'normal_grid'):
+            if key in metadata:
+                metadata[key] = np.array(metadata[key])
+        return grid, metadata
diff --git a/src/context_model.py b/src/context_model.py
old mode 100644
new mode 100755
index 7fea40322..10703fa07
--- a/src/context_model.py
+++ b/src/context_model.py
@@ -11,7 +11,7 @@
 import numpy as np
 import tensorflow as tf
 
-from constants import LOG_2_RECIPROCAL
+from .constants import LOG_2_RECIPROCAL
 
 
 class MaskedConv3D(tf.keras.layers.Layer):
@@ -265,8 +265,8 @@ def __init__(self,
         self.num_context_layers = num_context_layers
 
         # Import here to avoid circular dependency
-        from entropy_model import ConditionalGaussian
-        from entropy_parameters import EntropyParameters
+        from .entropy_model import ConditionalGaussian, PatchedGaussianConditional
+        from .entropy_parameters import EntropyParameters
 
         # Hyperprior-based parameter prediction
         self.entropy_parameters = EntropyParameters(
@@ -299,6 +299,9 @@ def __init__(self,
         # Conditional Gaussian for entropy coding
         self.conditional = ConditionalGaussian()
 
+        # Hyperprior entropy model (for z)
+        self.hyper_entropy = PatchedGaussianConditional()
+
         self.scale_min = 0.01
 
     def _split_params(self, params: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
@@ -308,6 +311,7 @@ def _split_params(self, params: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
         return mean, scale
 
     def call(self, y: tf.Tensor, z_hat: tf.Tensor,
+             z: Optional[tf.Tensor] = None,
              training: Optional[bool] = None) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
         """
         Process latent y using hyperprior and autoregressive context.
@@ -318,6 +322,7 @@ def call(self, y: tf.Tensor, z_hat: tf.Tensor,
         Args:
             y: Main latent representation.
             z_hat: Decoded hyperprior.
+            z: Quantized/noised hyper-latent for computing z rate.
             training: Whether in training mode.
 
         Returns:
@@ -342,10 +347,18 @@ def call(self, y: tf.Tensor, z_hat: tf.Tensor,
         # Process through conditional Gaussian
         y_hat, y_likelihood = self.conditional(y, scale, mean, training=training)
 
-        # Compute total bits
-        # Using pre-computed reciprocal: multiplication is faster than division
-        bits_per_element = -y_likelihood * LOG_2_RECIPROCAL
-        total_bits = tf.reduce_sum(bits_per_element)
+        # Compute y bits from discretized likelihood
+        y_bits = tf.reduce_sum(-tf.math.log(y_likelihood) * LOG_2_RECIPROCAL)
+
+        # Compute z bits if z is provided
+        z_bits = tf.constant(0.0)
+        if z is not None:
+            if not self.hyper_entropy.built:
+                self.hyper_entropy.build(z.shape)
+            z_likelihood = self.hyper_entropy.likelihood(z)
+            z_bits = tf.reduce_sum(-tf.math.log(z_likelihood) * LOG_2_RECIPROCAL)
+
+        total_bits = y_bits + z_bits
 
         return y_hat, y_likelihood, total_bits
 
diff --git a/src/data_loader.py b/src/data_loader.py
old mode 100644
new mode 100755
index d48127d26..a9a7914ed
--- a/src/data_loader.py
+++ b/src/data_loader.py
@@ -5,8 +5,8 @@
 import numpy as np
 import tensorflow as tf
 
-from ds_mesh_to_pc import read_off
-from ds_pc_octree_blocks import PointCloudProcessor
+from .ds_pc_octree_blocks import PointCloudProcessor
+from .file_io import read_point_cloud
 
 
 class DataLoader:
@@ -22,8 +22,10 @@ def __init__(self, config: Dict[str, Any]):
     def process_point_cloud(self, file_path: str) -> tf.Tensor:
         """Process a single point cloud file."""
         # Read point cloud
-        mesh_data = read_off(file_path.numpy().decode())
-        points = tf.convert_to_tensor(mesh_data.vertices, dtype=tf.float32)
+        vertices = read_point_cloud(file_path.numpy().decode())
+        if vertices is None:
+            raise ValueError(f"Failed to read point cloud: {file_path}")
+        points = tf.convert_to_tensor(vertices, dtype=tf.float32)
 
         # Normalize points to unit cube
         points = self._normalize_points(points)
@@ -43,7 +45,7 @@ def _normalize_points(self, points: tf.Tensor) -> tf.Tensor:
         """Normalize points to unit cube."""
         center = tf.reduce_mean(points, axis=0)
         points = points - center
-        scale = tf.reduce_max(tf.abs(points))
+        scale = tf.maximum(tf.reduce_max(tf.abs(points)), 1e-8)
         points = points / scale
         return points
 
diff --git a/src/deepcompress.egg-info/PKG-INFO b/src/deepcompress.egg-info/PKG-INFO
deleted file mode 100644
index 8fa1abe9b..000000000
--- a/src/deepcompress.egg-info/PKG-INFO
+++ /dev/null
@@ -1,10 +0,0 @@
-Metadata-Version: 1.0
-Name: deepcompress
-Version: 0.1
-Summary: UNKNOWN
-Home-page: UNKNOWN
-Author: UNKNOWN
-Author-email: UNKNOWN
-License: UNKNOWN
-Description: UNKNOWN
-Platform: UNKNOWN
diff --git a/src/deepcompress.egg-info/SOURCES.txt b/src/deepcompress.egg-info/SOURCES.txt
deleted file mode 100644
index 64c7685f4..000000000
--- a/src/deepcompress.egg-info/SOURCES.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-LICENSE
-README.md
-setup.py
-src/deepcompress.egg-info/PKG-INFO
-src/deepcompress.egg-info/SOURCES.txt
-src/deepcompress.egg-info/dependency_links.txt
-src/deepcompress.egg-info/requires.txt
-src/deepcompress.egg-info/top_level.txt
-src/utils/patch_gaussian_conditional.py
\ No newline at end of file
diff --git a/src/deepcompress.egg-info/dependency_links.txt b/src/deepcompress.egg-info/dependency_links.txt
deleted file mode 100644
index 8b1378917..000000000
--- a/src/deepcompress.egg-info/dependency_links.txt
+++ /dev/null
@@ -1 +0,0 @@
-
diff --git a/src/deepcompress.egg-info/requires.txt b/src/deepcompress.egg-info/requires.txt
deleted file mode 100644
index f039a2fae..000000000
--- a/src/deepcompress.egg-info/requires.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-numpy
-pytest
-numba
diff --git a/src/deepcompress.egg-info/top_level.txt b/src/deepcompress.egg-info/top_level.txt
deleted file mode 100644
index 9487075c0..000000000
--- a/src/deepcompress.egg-info/top_level.txt
+++ /dev/null
@@ -1 +0,0 @@
-utils
diff --git a/src/ds_mesh_to_pc.py b/src/ds_mesh_to_pc.py
old mode 100644
new mode 100755
index 72f19d0b8..2aa67ac42
--- a/src/ds_mesh_to_pc.py
+++ b/src/ds_mesh_to_pc.py
@@ -42,14 +42,19 @@ def read_off(file_path: str) -> Optional[MeshData]:
                 vertices.append(vertex)
             vertices = np.array(vertices, dtype=np.float32)
 
-            # Read faces if present
+            # Read faces if present, triangulating n-gons via fan triangulation
             faces = None
             if n_faces > 0:
-                faces = []
+                triangles = []
                 for _ in range(n_faces):
-                    face = list(map(int, file.readline().strip().split()[1:]))  # Skip count
-                    faces.append(face)
-                faces = np.array(faces, dtype=np.int32)
+                    indices = list(map(int, file.readline().strip().split()[1:]))
+                    if len(indices) < 3:
+                        continue
+                    # Fan triangulation: (v0, v1, v2), (v0, v2, v3), ...
+                    for i in range(1, len(indices) - 1):
+                        triangles.append([indices[0], indices[i], indices[i + 1]])
+                if triangles:
+                    faces = np.array(triangles, dtype=np.int32)
 
             # Compute face normals if faces are present
             face_normals = None
@@ -90,23 +95,25 @@ def sample_points_from_mesh(
         Tuple of points array and optionally normals array.
     """
     if mesh_data.faces is not None and len(mesh_data.faces) > 0:
-        # Sample from faces using area weighting
-        areas = []
-        centroids = []
-        for face in mesh_data.faces:
-            v1, v2, v3 = mesh_data.vertices[face]
-            area = np.linalg.norm(np.cross(v2 - v1, v3 - v1)) / 2
-            centroid = (v1 + v2 + v3) / 3
-            areas.append(area)
-            centroids.append(centroid)
-
-        # Normalize areas for probability distribution
-        areas = np.array(areas)
+        # Sample from faces using area-weighted barycentric sampling
+        v1s = mesh_data.vertices[mesh_data.faces[:, 0]]
+        v2s = mesh_data.vertices[mesh_data.faces[:, 1]]
+        v3s = mesh_data.vertices[mesh_data.faces[:, 2]]
+
+        areas = np.linalg.norm(np.cross(v2s - v1s, v3s - v1s), axis=1) / 2
         probabilities = areas / areas.sum()
 
-        # Sample points
+        # Sample faces by area
         indices = np.random.choice(len(areas), num_points, p=probabilities)
-        points = np.array(centroids)[indices]
+
+        # Generate random barycentric coordinates for uniform sampling
+        r1 = np.sqrt(np.random.random(num_points))
+        r2 = np.random.random(num_points)
+        points = (
+            (1 - r1)[:, None] * v1s[indices]
+            + (r1 * (1 - r2))[:, None] * v2s[indices]
+            + (r1 * r2)[:, None] * v3s[indices]
+        )
 
         # Get corresponding normals if requested
         normals = None
diff --git a/src/ds_pc_octree_blocks.py b/src/ds_pc_octree_blocks.py
old mode 100644
new mode 100755
index d0791888f..2d712de2b
--- a/src/ds_pc_octree_blocks.py
+++ b/src/ds_pc_octree_blocks.py
@@ -4,6 +4,8 @@
 
 import tensorflow as tf
 
+from .file_io import read_point_cloud as _read_point_cloud
+
 
 class PointCloudProcessor:
     """Point cloud processing with TF 2.x operations."""
@@ -12,22 +14,12 @@ def __init__(self, block_size: float = 1.0, min_points: int = 10):
         self.block_size = block_size
         self.min_points = min_points
 
-    @tf.function
     def read_point_cloud(self, file_path: str) -> tf.Tensor:
-        """Read point cloud using TF file operations."""
-        raw_data = tf.io.read_file(file_path)
-        lines = tf.strings.split(raw_data, '\n')[1:]  # Skip header
-
-        def parse_line(line):
-            values = tf.strings.split(line)
-            return tf.strings.to_number(values[:3], out_type=tf.float32)
-
-        points = tf.map_fn(
-            parse_line,
-            lines,
-            fn_output_signature=tf.float32
-        )
-        return points
+        """Read point cloud from PLY or OFF file."""
+        vertices = _read_point_cloud(file_path)
+        if vertices is None:
+            raise ValueError(f"Failed to read point cloud: {file_path}")
+        return tf.convert_to_tensor(vertices, dtype=tf.float32)
 
     def partition_point_cloud(self, points: tf.Tensor) -> List[tf.Tensor]:
         """Partition point cloud into blocks using TF operations."""
@@ -67,31 +59,18 @@ def save_blocks(self, blocks: List[tf.Tensor], output_dir: str, base_name: str):
 
         for i, block in enumerate(blocks):
             file_path = output_dir / f"{base_name}_block_{i}.ply"
-
-            header = [
-                "ply",
-                "format ascii 1.0",
-                f"element vertex {block.shape[0]}",
-                "property float x",
-                "property float y",
-                "property float z",
-                "end_header"
-            ]
+            points = block.numpy() if isinstance(block, tf.Tensor) else block
 
             with open(file_path, 'w') as f:
-                f.write('\n'.join(header) + '\n')
-
-                # Convert points to strings and write
-                points_str = tf.strings.reduce_join(
-                    tf.strings.as_string(block),
-                    axis=1,
-                    separator=' '
-                )
-                points_str = tf.strings.join([points_str, tf.constant('\n')], '')
-                tf.io.write_file(
-                    str(file_path),
-                    tf.strings.join([tf.strings.join(header, '\n'), points_str])
-                )
+                f.write("ply\n")
+                f.write("format ascii 1.0\n")
+                f.write(f"element vertex {len(points)}\n")
+                f.write("property float x\n")
+                f.write("property float y\n")
+                f.write("property float z\n")
+                f.write("end_header\n")
+                for point in points:
+                    f.write(f"{point[0]} {point[1]} {point[2]}\n")
 
 def main():
     parser = argparse.ArgumentParser(
diff --git a/src/entropy_model.py b/src/entropy_model.py
old mode 100644
new mode 100755
index 47bcbd185..e1d32fe36
--- a/src/entropy_model.py
+++ b/src/entropy_model.py
@@ -1,9 +1,33 @@
 from typing import Any, Dict, Optional, Tuple
 
 import tensorflow as tf
-import tensorflow_probability as tfp
 
-from constants import LOG_2_RECIPROCAL
+from .constants import EPSILON, LOG_2_RECIPROCAL
+
+
+def _discretized_gaussian_likelihood(inputs, mean, scale):
+    """Compute probability mass for quantized inputs under Gaussian model.
+
+    P(x) = CDF((x - mean + 0.5) / scale) - CDF((x - mean - 0.5) / scale)
+
+    This is the correct discretized likelihood for entropy coding, replacing
+    the continuous log-PDF which does not integrate to 1 over integers.
+
+    Args:
+        inputs: Input tensor (quantized or noise-added values).
+        mean: Mean of the Gaussian distribution.
+        scale: Scale (std dev) of the Gaussian distribution.
+
+    Returns:
+        Per-element probability mass, floored at EPSILON to prevent log(0).
+    """
+    scale = tf.maximum(scale, 1e-6)
+    centered = inputs - mean
+    upper = (centered + 0.5) / scale
+    lower = (centered - 0.5) / scale
+    likelihood = 0.5 * (1 + tf.math.erf(upper / tf.sqrt(2.0))) - \
+        0.5 * (1 + tf.math.erf(lower / tf.sqrt(2.0)))
+    return tf.maximum(likelihood, EPSILON)
 
 
 class PatchedGaussianConditional(tf.keras.layers.Layer):
@@ -115,32 +139,33 @@ def quantize_scale(self, scale: tf.Tensor) -> tf.Tensor:
         return tf.reshape(quantized_flat, original_shape)
 
     def compress(self, inputs: tf.Tensor) -> tf.Tensor:
-        scale = self.quantize_scale(self.scale)
+        """Quantize inputs relative to learned mean."""
         centered = inputs - self.mean
-        normalized = centered / scale
-        quantized = tf.round(normalized)
+        quantized = tf.round(centered)
 
         self._debug_tensors.update({
             'compress_inputs': inputs,
-            'compress_scale': scale,
             'compress_outputs': quantized
         })
 
         return quantized
 
     def decompress(self, inputs: tf.Tensor) -> tf.Tensor:
-        scale = self.quantize_scale(self.scale)
-        denormalized = inputs * scale
-        decompressed = denormalized + self.mean
+        """Reconstruct from integer symbols."""
+        decompressed = inputs + self.mean
 
         self._debug_tensors.update({
             'decompress_inputs': inputs,
-            'decompress_scale': scale,
             'decompress_outputs': decompressed
         })
 
         return decompressed
 
+    def likelihood(self, inputs: tf.Tensor) -> tf.Tensor:
+        """Compute discretized Gaussian likelihood for inputs."""
+        scale = tf.maximum(tf.abs(self.scale), 1e-6)
+        return _discretized_gaussian_likelihood(inputs, self.mean, scale)
+
     def call(self, inputs: tf.Tensor, training: Optional[bool] = None) -> tf.Tensor:
         self._debug_tensors['inputs'] = inputs
         compressed = self.compress(inputs)
@@ -172,10 +197,7 @@ def call(self, inputs, training=None):
             self.gaussian.build(inputs.shape)
 
         compressed = self.gaussian.compress(inputs)
-        likelihood = tfp.distributions.Normal(
-            loc=self.gaussian.mean,
-            scale=self.gaussian.scale
-        ).log_prob(inputs)
+        likelihood = self.gaussian.likelihood(inputs)
         return compressed, likelihood
 
 
@@ -208,21 +230,11 @@ def _add_noise(self, inputs: tf.Tensor, training: bool) -> tf.Tensor:
         return tf.round(inputs)
 
     def compress(self, inputs: tf.Tensor, scale: tf.Tensor, mean: tf.Tensor) -> tf.Tensor:
-        """
-        Compress inputs using provided scale and mean.
+        """Quantize inputs relative to the learned mean.
 
-        Args:
-            inputs: Input tensor to compress.
-            scale: Scale parameter for the Gaussian distribution.
-            mean: Mean parameter for the Gaussian distribution.
-
-        Returns:
-            Quantized (compressed) tensor.
+        The scale parameter affects entropy coding probability, not
+        the quantization grid. This is correct per the standard formulation.
         """
-        # Ensure scale is positive
-        scale = tf.maximum(scale, self.scale_min)
-
-        # Center and normalize
         centered = inputs - mean
         quantized = tf.round(centered)
 
@@ -236,18 +248,7 @@ def compress(self, inputs: tf.Tensor, scale: tf.Tensor, mean: tf.Tensor) -> tf.T
         return quantized
 
     def decompress(self, inputs: tf.Tensor, scale: tf.Tensor, mean: tf.Tensor) -> tf.Tensor:
-        """
-        Decompress inputs using provided scale and mean.
-
-        Args:
-            inputs: Quantized tensor to decompress.
-            scale: Scale parameter for the Gaussian distribution.
-            mean: Mean parameter for the Gaussian distribution.
-
-        Returns:
-            Decompressed (reconstructed) tensor.
-        """
-        # Add back the mean
+        """Reconstruct from integer symbols."""
         decompressed = inputs + mean
 
         self._debug_tensors.update({
@@ -272,7 +273,7 @@ def call(self, inputs: tf.Tensor, scale: tf.Tensor, mean: tf.Tensor,
 
         Returns:
             Tuple of (outputs, likelihood) where outputs are the reconstructed
-            values and likelihood is the log-probability under the distribution.
+            values and likelihood is the discretized probability mass.
         """
         self._debug_tensors['inputs'] = inputs
 
@@ -288,9 +289,8 @@ def call(self, inputs: tf.Tensor, scale: tf.Tensor, mean: tf.Tensor,
         # Reconstruct
         outputs = quantized + mean
 
-        # Compute likelihood using the Gaussian distribution
-        distribution = tfp.distributions.Normal(loc=mean, scale=scale)
-        likelihood = distribution.log_prob(inputs)
+        # Compute discretized likelihood on the output values
+        likelihood = _discretized_gaussian_likelihood(outputs, mean, scale)
 
         self._debug_tensors['outputs'] = outputs
         self._debug_tensors['likelihood'] = likelihood
@@ -335,7 +335,7 @@ def __init__(self,
         self.hidden_channels = hidden_channels or latent_channels * 2
 
         # Import here to avoid circular dependency
-        from entropy_parameters import EntropyParameters
+        from .entropy_parameters import EntropyParameters
 
         # Network to predict mean/scale from hyperprior
         self.entropy_parameters = EntropyParameters(
@@ -350,6 +350,7 @@ def __init__(self,
         self.hyper_entropy = PatchedGaussianConditional()
 
     def call(self, y: tf.Tensor, z_hat: tf.Tensor,
+             z: Optional[tf.Tensor] = None,
              training: Optional[bool] = None) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
         """
         Process latent y using hyperprior z_hat.
@@ -357,13 +358,14 @@ def call(self, y: tf.Tensor, z_hat: tf.Tensor,
         Args:
             y: Main latent representation.
             z_hat: Decoded hyperprior (typically from hyper_synthesis(z)).
+            z: Quantized/noised hyper-latent for computing z rate.
             training: Whether in training mode.
 
         Returns:
             Tuple of (y_hat, y_likelihood, total_bits) where:
                 - y_hat: Reconstructed latent
-                - y_likelihood: Log-probability of y under the predicted distribution
-                - total_bits: Estimated total bits for encoding
+                - y_likelihood: Discretized probability mass of y
+                - total_bits: Estimated total bits (y_bits + z_bits)
         """
         # Predict distribution parameters from hyperprior
         mean, scale = self.entropy_parameters(z_hat)
@@ -371,10 +373,18 @@ def call(self, y: tf.Tensor, z_hat: tf.Tensor,
         # Process through conditional Gaussian
         y_hat, y_likelihood = self.conditional(y, scale, mean, training=training)
 
-        # Estimate bits (negative log-likelihood converted to bits)
-        # Using pre-computed reciprocal: multiplication is faster than division
-        bits_per_element = -y_likelihood * LOG_2_RECIPROCAL
-        total_bits = tf.reduce_sum(bits_per_element)
+        # Compute y bits from discretized likelihood
+        y_bits = tf.reduce_sum(-tf.math.log(y_likelihood) * LOG_2_RECIPROCAL)
+
+        # Compute z bits if z is provided
+        z_bits = tf.constant(0.0)
+        if z is not None:
+            if not self.hyper_entropy.built:
+                self.hyper_entropy.build(z.shape)
+            z_likelihood = self.hyper_entropy.likelihood(z)
+            z_bits = tf.reduce_sum(-tf.math.log(z_likelihood) * LOG_2_RECIPROCAL)
+
+        total_bits = y_bits + z_bits
 
         return y_hat, y_likelihood, total_bits
 
diff --git a/src/evaluation_pipeline.py b/src/evaluation_pipeline.py
old mode 100644
new mode 100755
index a1787110b..f74623f89
--- a/src/evaluation_pipeline.py
+++ b/src/evaluation_pipeline.py
@@ -5,10 +5,10 @@
 
 import tensorflow as tf
 
-from data_loader import DataLoader
-from ev_compare import PointCloudMetrics
-from model_transforms import DeepCompressModel, TransformConfig
-from mp_report import ExperimentReporter
+from .data_loader import DataLoader
+from .ev_compare import PointCloudMetrics
+from .model_transforms import DeepCompressModel, TransformConfig
+from .mp_report import ExperimentReporter
 
 
 @dataclass
@@ -24,10 +24,13 @@ class EvaluationResult:
 class EvaluationPipeline:
     """Pipeline for evaluating DeepCompress model."""
 
-    def __init__(self, config_path: str):
+    def __init__(self, config_path: str, checkpoint_override: str = None):
         self.config = self._load_config(config_path)
         self.logger = logging.getLogger(__name__)
 
+        if checkpoint_override:
+            self.config['checkpoint_path'] = checkpoint_override
+
         # Initialize components
         self.data_loader = DataLoader(self.config)
         self.metrics = PointCloudMetrics()
@@ -52,7 +55,10 @@ def _load_model(self) -> DeepCompressModel:
         # Load weights if checkpoint provided
         checkpoint_path = self.config.get('checkpoint_path')
         if checkpoint_path:
-            model.load_weights(checkpoint_path)
+            resolved = Path(checkpoint_path).resolve()
+            if not resolved.exists():
+                raise FileNotFoundError(f"Checkpoint not found: {resolved}")
+            model.load_weights(str(resolved))
 
         return model
 
@@ -60,7 +66,7 @@ def _evaluate_single(self,
                         point_cloud) -> Dict[str, float]:
         """Evaluate model on single point cloud."""
         # Forward pass through model
-        x_hat, y, y_hat, z = self.model(point_cloud, training=False)
+        x_hat, y, z_hat, z_noisy = self.model(point_cloud, training=False)
 
         # Compute metrics
         results = {}
@@ -136,7 +142,7 @@ def main():
     )
 
     # Run evaluation
-    pipeline = EvaluationPipeline(args.config)
+    pipeline = EvaluationPipeline(args.config, checkpoint_override=args.checkpoint)
     results = pipeline.evaluate()
     pipeline.generate_report(results)
 
diff --git a/src/file_io.py b/src/file_io.py
new file mode 100755
index 000000000..1271c30c3
--- /dev/null
+++ b/src/file_io.py
@@ -0,0 +1,91 @@
+import logging
+from pathlib import Path
+from typing import Optional
+
+import numpy as np
+
+
+def read_off(file_path: str) -> Optional[np.ndarray]:
+    """Read vertex coordinates from an OFF file.
+
+    Args:
+        file_path: Path to the OFF file.
+
+    Returns:
+        Numpy array of shape (N, 3) with vertex positions, or None on error.
+    """
+    try:
+        with open(file_path, 'r') as f:
+            header = f.readline().strip()
+            if header != "OFF":
+                raise ValueError("Not a valid OFF file")
+
+            n_verts, _, _ = map(int, f.readline().strip().split())
+
+            vertices = []
+            for _ in range(n_verts):
+                values = f.readline().strip().split()
+                vertices.append([float(values[0]), float(values[1]), float(values[2])])
+
+            return np.array(vertices, dtype=np.float32)
+    except Exception as e:
+        logging.error(f"Error reading OFF file {file_path}: {e}")
+        return None
+
+
+def read_ply(file_path: str) -> Optional[np.ndarray]:
+    """Read vertex coordinates from an ASCII PLY file.
+
+    Args:
+        file_path: Path to the PLY file.
+
+    Returns:
+        Numpy array of shape (N, 3) with vertex positions, or None on error.
+    """
+    try:
+        with open(file_path, 'r') as f:
+            line = f.readline().strip()
+            if line != "ply":
+                raise ValueError("Not a valid PLY file")
+
+            n_verts = 0
+            while True:
+                line = f.readline().strip()
+                if line == "end_header":
+                    break
+                if line.startswith("element vertex"):
+                    n_verts = int(line.split()[-1])
+
+            if n_verts == 0:
+                return np.array([], dtype=np.float32).reshape(0, 3)
+
+            vertices = []
+            for _ in range(n_verts):
+                values = f.readline().strip().split()
+                vertices.append([float(values[0]), float(values[1]), float(values[2])])
+
+            return np.array(vertices, dtype=np.float32)
+    except Exception as e:
+        logging.error(f"Error reading PLY file {file_path}: {e}")
+        return None
+
+
+def read_point_cloud(file_path: str) -> Optional[np.ndarray]:
+    """Read a point cloud from a file, dispatching by extension.
+
+    Supports .off and .ply formats.
+
+    Args:
+        file_path: Path to the point cloud file.
+
+    Returns:
+        Numpy array of shape (N, 3) with vertex positions, or None on error.
+    """
+    ext = Path(file_path).suffix.lower()
+    if ext == '.off':
+        return read_off(file_path)
+    elif ext == '.ply':
+        return read_ply(file_path)
+    else:
+        logging.error(f"Unsupported file format: {ext}")
+        return None
diff --git a/src/model_transforms.py b/src/model_transforms.py
old mode 100644
new mode 100755
index ab38c7ec3..d79873bcd
--- a/src/model_transforms.py
+++ b/src/model_transforms.py
@@ -3,7 +3,7 @@
 
 import tensorflow as tf
 
-from constants import EPSILON, LOG_2_RECIPROCAL
+from .constants import LOG_2_RECIPROCAL
 
 
 @dataclass
@@ -16,40 +16,57 @@ class TransformConfig:
     conv_type: str = 'separable'
 
 
-class CENICGDN(tf.keras.layers.Layer):
-    """CENIC-GDN activation function implementation."""
+class GDN(tf.keras.layers.Layer):
+    """Generalized Divisive Normalization (Balle et al., 2016).
 
-    def __init__(self, channels: int, **kwargs):
+    y_i = x_i / sqrt(beta_i + sum_j(gamma_ij * x_j^2))
+
+    When inverse=True, computes IGDN (inverse GDN) for the synthesis path:
+    y_i = x_i * sqrt(beta_i + sum_j(gamma_ij * x_j^2))
+
+    Args:
+        inverse: If True, compute IGDN instead of GDN.
+    """
+
+    def __init__(self, inverse: bool = False, **kwargs):
         super().__init__(**kwargs)
-        self.channels = channels
+        self.inverse = inverse
 
     def build(self, input_shape):
+        num_channels = input_shape[-1]
         self.beta = self.add_weight(
             name='beta',
-            shape=[self.channels],
-            initializer='ones',
+            shape=[num_channels],
+            initializer=tf.initializers.Ones(),
+            constraint=tf.keras.constraints.NonNeg(),
             trainable=True
         )
         self.gamma = self.add_weight(
             name='gamma',
-            shape=[self.channels, self.channels],
-            initializer='zeros',
+            shape=[num_channels, num_channels],
+            initializer=tf.initializers.Identity(gain=0.1),
             trainable=True
         )
         super().build(input_shape)
 
-    def call(self, x):
-        # Note: XLA compilation removed as it breaks gradient flow when layers are composed
-        norm = tf.abs(x)
-        # Use axis 4 (channel dimension) for 5D tensors (batch, D, H, W, C)
-        norm = tf.tensordot(norm, self.gamma, [[4], [0]])
-        norm = tf.nn.bias_add(norm, self.beta)
-        return x / tf.maximum(norm, EPSILON)
+    def call(self, inputs):
+        # Ensure gamma is non-negative and symmetric
+        gamma = tf.nn.relu(self.gamma)
+        gamma = (gamma + tf.transpose(gamma)) / 2.0
+
+        # Compute normalization: beta_i + sum_j(gamma_ij * x_j^2)
+        norm = tf.einsum('...c,cd->...d', inputs ** 2, gamma)
+        norm = tf.sqrt(self.beta + norm)
+
+        if self.inverse:
+            return inputs * norm  # IGDN
+        else:
+            return inputs / norm  # GDN
 
     def get_config(self):
         config = super().get_config()
         config.update({
-            'channels': self.channels
+            'inverse': self.inverse
         })
         return config
 
@@ -126,8 +143,8 @@ def __init__(self, config: TransformConfig, **kwargs):
 
             self.conv_layers.append(conv)
 
-            if config.activation == 'cenic_gdn':
-                self.conv_layers.append(CENICGDN(current_filters))
+            if config.activation in ('gdn', 'cenic_gdn'):
+                self.conv_layers.append(GDN(inverse=False))
             else:
                 self.conv_layers.append(tf.keras.layers.ReLU())
 
@@ -160,24 +177,19 @@ def __init__(self, config: TransformConfig, **kwargs):
         current_filters = config.filters * 4  # Start with max channels
 
         for i in range(3):  # Three blocks as per paper
-            if config.conv_type == 'separable':
-                conv = SpatialSeparableConv(
-                    filters=current_filters,
-                    kernel_size=config.kernel_size,
-                    strides=config.strides
-                )
-            else:
-                conv = tf.keras.layers.Conv3DTranspose(
-                    filters=current_filters,
-                    kernel_size=config.kernel_size,
-                    strides=config.strides,
-                    padding='same'
-                )
+            # Synthesis always needs Conv3DTranspose for upsampling
+            # SpatialSeparableConv only supports forward (downsampling) convolution
+            conv = tf.keras.layers.Conv3DTranspose(
+                filters=current_filters,
+                kernel_size=config.kernel_size,
+                strides=config.strides,
+                padding='same'
+            )
 
             self.conv_layers.append(conv)
 
-            if config.activation == 'cenic_gdn':
-                self.conv_layers.append(CENICGDN(current_filters))
+            if config.activation in ('gdn', 'cenic_gdn'):
+                self.conv_layers.append(GDN(inverse=True))  # IGDN for synthesis
             else:
                 self.conv_layers.append(tf.keras.layers.ReLU())
 
@@ -209,9 +221,9 @@ def __init__(self, config: TransformConfig, **kwargs):
         self.analysis = AnalysisTransform(config)
         self.synthesis = SynthesisTransform(config)
 
-        # Final projection: map from synthesis channels back to 1-channel occupancy
+        # Final projection: outputs raw logits (no activation) for stable loss
         self.output_projection = tf.keras.layers.Conv3D(
-            filters=1, kernel_size=(1, 1, 1), activation='sigmoid', padding='same'
+            filters=1, kernel_size=(1, 1, 1), padding='same'
         )
 
         # Hyperprior
@@ -231,16 +243,19 @@ def call(self, inputs, training=None):
         y = self.analysis(inputs)
         z = self.hyper_analysis(y)
 
-        # Add uniform noise for training
+        # Add uniform noise for training, hard rounding for inference
         if training:
-            y = y + tf.random.uniform(tf.shape(y), -0.5, 0.5)
-            z = z + tf.random.uniform(tf.shape(z), -0.5, 0.5)
+            y_hat = y + tf.random.uniform(tf.shape(y), -0.5, 0.5)
+            z_noisy = z + tf.random.uniform(tf.shape(z), -0.5, 0.5)
+        else:
+            y_hat = tf.round(y)
+            z_noisy = tf.round(z)
 
-        # Synthesis
-        y_hat = self.hyper_synthesis(z)
-        x_hat = self.output_projection(self.synthesis(y))
+        # Synthesis — decode from quantized latent (y_hat), not raw encoder output
+        z_hat = self.hyper_synthesis(z_noisy)
+        x_hat = tf.sigmoid(self.output_projection(self.synthesis(y_hat)))
 
-        return x_hat, y, y_hat, z
+        return x_hat, y, z_hat, z_noisy
 
     def get_config(self):
         config = super().get_config()
@@ -294,9 +309,9 @@ def __init__(self,
         self.analysis = AnalysisTransform(config)
         self.synthesis = SynthesisTransform(config)
 
-        # Final projection: map from synthesis channels back to 1-channel occupancy
+        # Final projection: outputs raw logits (no activation) for stable loss
         self.output_projection = tf.keras.layers.Conv3D(
-            filters=1, kernel_size=(1, 1, 1), activation='sigmoid', padding='same'
+            filters=1, kernel_size=(1, 1, 1), padding='same'
         )
 
         # Hyperprior transforms
@@ -311,10 +326,10 @@ def __init__(self,
             activation='relu'
         ))
 
-        # Compute channel dimensions
-        # Analysis progressively doubles channels 3 times
-        self.latent_channels = config.filters * 4  # After 3 blocks of doubling
-        self.hyper_channels = (config.filters // 2) * 4
+        # Compute latent channel dimensions dynamically from analysis transforms
+        # Analysis doubles channels each block: filters -> 2*filters -> 4*filters
+        self.latent_channels = config.filters * (2 ** 2)  # After 3 conv blocks with doubling
+        self.hyper_channels = (config.filters // 2) * (2 ** 2)
 
         # Create entropy model based on selection
         self._create_entropy_model()
@@ -322,25 +337,25 @@ def __init__(self,
     def _create_entropy_model(self):
         """Create the selected entropy model."""
         if self.entropy_model_type == 'gaussian':
-            from entropy_model import EntropyModel
+            from .entropy_model import EntropyModel
             self.entropy_module = EntropyModel()
 
         elif self.entropy_model_type == 'hyperprior':
-            from entropy_model import MeanScaleHyperprior
+            from .entropy_model import MeanScaleHyperprior
             self.entropy_module = MeanScaleHyperprior(
                 latent_channels=self.latent_channels,
                 hyper_channels=self.hyper_channels
             )
 
         elif self.entropy_model_type == 'context':
-            from context_model import ContextualEntropyModel
+            from .context_model import ContextualEntropyModel
             self.entropy_module = ContextualEntropyModel(
                 latent_channels=self.latent_channels,
                 hyper_channels=self.hyper_channels
             )
 
         elif self.entropy_model_type == 'channel':
-            from channel_context import ChannelContextEntropyModel
+            from .channel_context import ChannelContextEntropyModel
             self.entropy_module = ChannelContextEntropyModel(
                 latent_channels=self.latent_channels,
                 hyper_channels=self.hyper_channels,
@@ -348,7 +363,7 @@ def _create_entropy_model(self):
             )
 
         elif self.entropy_model_type == 'attention':
-            from attention_context import AttentionEntropyModel
+            from .attention_context import AttentionEntropyModel
             self.entropy_module = AttentionEntropyModel(
                 latent_channels=self.latent_channels,
                 hyper_channels=self.hyper_channels,
@@ -356,7 +371,7 @@ def _create_entropy_model(self):
             )
 
         elif self.entropy_model_type == 'hybrid':
-            from attention_context import HybridAttentionEntropyModel
+            from .attention_context import HybridAttentionEntropyModel
             self.entropy_module = HybridAttentionEntropyModel(
                 latent_channels=self.latent_channels,
                 hyper_channels=self.hyper_channels,
@@ -374,7 +389,7 @@ def call(self, inputs, training=None):
 
         Returns:
             Tuple of (x_hat, y, y_hat, z, rate_info) where:
-                - x_hat: Reconstructed input
+                - x_hat: Reconstructed input (sigmoid of logits)
                 - y: Latent representation
                 - y_hat: Quantized latent (or reconstructed)
                 - z: Hyper-latent
@@ -395,29 +410,48 @@ def call(self, inputs, training=None):
 
         # Entropy model processing
         if self.entropy_model_type == 'gaussian':
-            # Original behavior
+            # Original behavior with discretized likelihood
             if training:
                 y_noisy = y + tf.random.uniform(tf.shape(y), -0.5, 0.5)
             else:
                 y_noisy = tf.round(y)
             compressed, likelihood = self.entropy_module(y_noisy)
             y_hat = y_noisy
-            # Using pre-computed reciprocal: multiplication is faster than division
-            total_bits = -tf.reduce_sum(likelihood) * LOG_2_RECIPROCAL
+            y_bits = tf.reduce_sum(-tf.math.log(likelihood) * LOG_2_RECIPROCAL)
         else:
-            # Advanced entropy models
-            y_hat, likelihood, total_bits = self.entropy_module(
-                y, z_hat, training=training
+            # Advanced entropy models — pass z for hyper-latent rate
+            y_hat, likelihood, y_bits = self.entropy_module(
+                y, z_hat, z=z, training=training
             )
 
-        # Synthesis
-        x_hat = self.output_projection(self.synthesis(y_hat))
+        # Compute z bits under learned prior
+        if self.entropy_model_type == 'gaussian':
+            # For gaussian, compute z bits directly
+            if not hasattr(self, '_z_entropy') or not self._z_entropy.built:
+                from .entropy_model import PatchedGaussianConditional
+                self._z_entropy = PatchedGaussianConditional()
+                self._z_entropy.build(z.shape)
+            z_likelihood = self._z_entropy.likelihood(z)
+            z_bits = tf.reduce_sum(-tf.math.log(z_likelihood) * LOG_2_RECIPROCAL)
+        else:
+            # z_bits already included in y_bits (via MeanScaleHyperprior)
+            z_bits = tf.constant(0.0)
+
+        total_bits = y_bits + z_bits
+
+        # Synthesis — apply sigmoid to logits for output
+        logits = self.output_projection(self.synthesis(y_hat))
+        x_hat = tf.sigmoid(logits)
 
         # Rate information
+        num_voxels = tf.cast(tf.reduce_prod(tf.shape(inputs)[1:4]), tf.float32)
         rate_info = {
             'likelihood': likelihood,
             'total_bits': total_bits,
-            'bpp': total_bits / tf.cast(tf.reduce_prod(tf.shape(inputs)[1:4]), tf.float32)
+            'y_bits': y_bits,
+            'z_bits': z_bits,
+            'bpp': total_bits / num_voxels,
+            'logits': logits,
         }
 
         return x_hat, y, y_hat, z, rate_info
@@ -430,7 +464,7 @@ def compress(self, inputs):
             inputs: Input voxel grid.
 
         Returns:
-            Tuple of (compressed_data, metadata) for storage/transmission.
+            Dict with compressed symbols and metadata.
         """
         # Analysis
         y = self.analysis(inputs)
@@ -444,12 +478,17 @@ def compress(self, inputs):
             y_quantized = tf.round(y)
             compressed_y = y_quantized
             side_info = {}
-        elif self.entropy_model_type in ['hyperprior', 'context']:
+        elif self.entropy_model_type in ('hyperprior', 'context'):
             compressed_y, side_info = self.entropy_module.compress(y, z_hat)
         elif self.entropy_model_type == 'channel':
             compressed_y, side_info = self.entropy_module.compress(y, z_hat)
+        elif self.entropy_model_type in ('attention', 'hybrid'):
+            # Attention/hybrid: use hyperprior mean for centered quantization
+            # TODO: implement actual arithmetic coding for attention/hybrid models
+            mean, scale = self.entropy_module.entropy_parameters(z_hat)
+            compressed_y = tf.round(y - mean)
+            side_info = {'mean': mean, 'scale': scale}
         else:
-            # For attention models, use basic quantization
             compressed_y = tf.round(y)
             side_info = {}
 
@@ -467,7 +506,7 @@ def decompress(self, compressed_data):
             compressed_data: Dict with compressed data from compress().
 
         Returns:
-            Reconstructed voxel grid.
+            Reconstructed voxel grid (sigmoid-applied probabilities).
         """
         y_compressed = compressed_data['y']
         z = compressed_data['z']
@@ -481,11 +520,15 @@ def decompress(self, compressed_data):
             y_hat = self.entropy_module.decompress(y_compressed, z_hat)
         elif self.entropy_model_type == 'channel':
             y_hat = self.entropy_module.decode_parallel(z_hat, y_compressed)
+        elif self.entropy_model_type in ('attention', 'hybrid'):
+            # TODO: implement actual arithmetic coding for attention/hybrid models
+            mean, _ = self.entropy_module.entropy_parameters(z_hat)
+            y_hat = y_compressed + mean
         else:
             y_hat = y_compressed
 
-        # Synthesis
-        x_hat = self.output_projection(self.synthesis(y_hat))
+        # Synthesis — apply sigmoid to logits
+        x_hat = tf.sigmoid(self.output_projection(self.synthesis(y_hat)))
 
         return x_hat
 
diff --git a/src/mp_report.py b/src/mp_report.py
old mode 100644
new mode 100755
index b2e6aa22b..9e1d28992
--- a/src/mp_report.py
+++ b/src/mp_report.py
@@ -69,7 +69,7 @@ def _compute_best_metrics(self) -> Dict[str, Any]:
             'psnr': float('-inf'),
             'bd_rate': float('inf'),
             'bitrate': float('inf'),
-            'compression_ratio': float('inf'),
+            'compression_ratio': float('-inf'),
             'compression_time': float('inf'),
             'decompression_time': float('inf')
         }
@@ -91,7 +91,7 @@ def _compute_best_metrics(self) -> Dict[str, Any]:
             for metric in best_metrics.keys():
                 if metric in results:
                     value = results[metric]
-                    if metric == 'psnr':  # Higher is better
+                    if metric in ('psnr', 'compression_ratio'):  # Higher is better
                         if value > best_metrics[metric]:
                             best_metrics[metric] = value
                             best_models[metric] = file_name
diff --git a/src/octree_coding.py b/src/octree_coding.py
old mode 100644
new mode 100755
index 1e9fc33b9..fdb9e1de7
--- a/src/octree_coding.py
+++ b/src/octree_coding.py
@@ -114,18 +114,33 @@ def partition_octree(
         ]
 
         for x_range, y_range, z_range in ranges:
-            # Compute conditions
+            # Half-open intervals: [min, mid) for lower half, [mid, max] for upper
+            x_upper_cond = (
+                point_cloud[:, 0] <= x_range[1]
+                if x_range[1] == xmax
+                else point_cloud[:, 0] < x_range[1]
+            )
             x_cond = tf.logical_and(
-                point_cloud[:, 0] >= x_range[0] - self.config.epsilon,
-                point_cloud[:, 0] <= x_range[1] + self.config.epsilon
+                point_cloud[:, 0] >= x_range[0],
+                x_upper_cond
+            )
+            y_upper_cond = (
+                point_cloud[:, 1] <= y_range[1]
+                if y_range[1] == ymax
+                else point_cloud[:, 1] < y_range[1]
             )
             y_cond = tf.logical_and(
-                point_cloud[:, 1] >= y_range[0] - self.config.epsilon,
-                point_cloud[:, 1] <= y_range[1] + self.config.epsilon
+                point_cloud[:, 1] >= y_range[0],
+                y_upper_cond
+            )
+            z_upper_cond = (
+                point_cloud[:, 2] <= z_range[1]
+                if z_range[1] == zmax
+                else point_cloud[:, 2] < z_range[1]
             )
             z_cond = tf.logical_and(
-                point_cloud[:, 2] >= z_range[0] - self.config.epsilon,
-                point_cloud[:, 2] <= z_range[1] + self.config.epsilon
+                point_cloud[:, 2] >= z_range[0],
+                z_upper_cond
             )
 
             # Combine conditions
diff --git a/src/parallel_process.py b/src/parallel_process.py
old mode 100644
new mode 100755
index 135d74189..663acf033
--- a/src/parallel_process.py
+++ b/src/parallel_process.py
@@ -71,7 +71,13 @@ def __enter__(self):
         return self
 
     def __exit__(self, exc_type, exc_val, exc_tb):
-        self.terminate()
+        if self.process.poll() is None:
+            self.terminate()
+        else:
+            if hasattr(self.process, 'stdout') and self.process.stdout:
+                self.process.stdout.close()
+            if hasattr(self.process, 'stderr') and self.process.stderr:
+                self.process.stderr.close()
 
 def parallel_process(
     func: Callable[[Any], Any],
diff --git a/src/quick_benchmark.py b/src/quick_benchmark.py
old mode 100644
new mode 100755
index 4331ab7f4..5db26e5b6
--- a/src/quick_benchmark.py
+++ b/src/quick_benchmark.py
@@ -28,7 +28,7 @@
 
 sys.path.insert(0, os.path.dirname(__file__))
 
-from model_transforms import DeepCompressModel, DeepCompressModelV2, TransformConfig
+from .model_transforms import DeepCompressModel, DeepCompressModelV2, TransformConfig
 
 
 @dataclass
@@ -165,26 +165,35 @@ def benchmark_model(
     decode_times = []
 
     for _ in range(timed_runs):
-        # Encode
-        start = time.perf_counter()
-        outputs = model(input_tensor, training=False)
-        encode_time = time.perf_counter() - start
-        encode_times.append(encode_time)
+        if isinstance(model, DeepCompressModelV2):
+            # V2: measure encode and decode separately
+            start = time.perf_counter()
+            compressed = model.compress(input_tensor)
+            encode_time = time.perf_counter() - start
+
+            start = time.perf_counter()
+            _ = model.decompress(compressed)
+            decode_time = time.perf_counter() - start
+        else:
+            # V1: full forward pass (no separate encode/decode)
+            start = time.perf_counter()
+            _ = model(input_tensor, training=False)
+            encode_time = time.perf_counter() - start
+            decode_time = 0
 
-        # For decode timing, we'd need separate encode/decode methods
-        # For now, we include it in encode time
-        decode_times.append(0)
+        encode_times.append(encode_time)
+        decode_times.append(decode_time)
 
     # Average times
     avg_encode_ms = np.mean(encode_times) * 1000
     avg_decode_ms = np.mean(decode_times) * 1000
 
     # Get final outputs for metrics
-    # V1 returns (x_hat, y, y_hat, z)
+    # V1 returns (x_hat, y, z_hat, z_noisy)
     # V2 returns (x_hat, y, y_hat, z, rate_info)
     outputs = model(input_tensor, training=False)
     if len(outputs) == 4:
-        x_hat, y, y_hat, z = outputs
+        x_hat, y, z_hat, z_noisy = outputs
         rate_info = None
     else:
         x_hat, y, y_hat, z, rate_info = outputs
@@ -202,12 +211,13 @@ def benchmark_model(
         # Use actual bits from entropy model
         estimated_bits = float(rate_info['total_bits'])
     else:
-        # Approximate - actual bits depend on entropy coding
-        # We use the entropy of the quantized latent
-        y_quantized = tf.round(y_hat)
-        unique_values = len(np.unique(y_quantized.numpy()))
-        entropy_estimate = np.log2(max(unique_values, 1))
-        estimated_bits = latent_elements * entropy_estimate
+        # Approximate using Shannon entropy of quantized latent
+        y_quantized = tf.round(y)
+        y_flat = y_quantized.numpy().flatten()
+        _, counts = np.unique(y_flat, return_counts=True)
+        probs = counts / counts.sum()
+        entropy_per_symbol = -np.sum(probs * np.log2(probs))
+        estimated_bits = latent_elements * entropy_per_symbol
 
     bits_per_voxel = estimated_bits / input_elements
 
diff --git a/src/training_pipeline.py b/src/training_pipeline.py
old mode 100644
new mode 100755
index ffdb1070f..852132468
--- a/src/training_pipeline.py
+++ b/src/training_pipeline.py
@@ -10,9 +10,9 @@ class TrainingPipeline:
     def __init__(self, config_path: str):
         import yaml
 
-        from data_loader import DataLoader
-        from entropy_model import EntropyModel
-        from model_transforms import DeepCompressModel, TransformConfig
+        from .data_loader import DataLoader
+        from .entropy_model import EntropyModel
+        from .model_transforms import DeepCompressModel, TransformConfig
 
         self.config_path = config_path
         with open(config_path, 'r') as f:
@@ -32,6 +32,9 @@ def __init__(self, config_path: str):
         self.model = DeepCompressModel(model_config)
         self.entropy_model = EntropyModel()
 
+        # Rate-distortion trade-off weight
+        self.lambda_rd = self.config['training'].get('lambda_rd', 0.01)
+
         # Initialize optimizers
         lrs = self.config['training']['learning_rates']
         self.optimizers = {
@@ -39,6 +42,9 @@ def __init__(self, config_path: str):
             'entropy': tf.keras.optimizers.Adam(learning_rate=lrs['entropy']),
         }
 
+        # Gradient clipping for training stability
+        self.grad_clip_norm = self.config['training'].get('grad_clip_norm', 1.0)
+
         # Checkpoint directory
         self.checkpoint_dir = Path(self.config['training']['checkpoint_dir'])
         self.checkpoint_dir.mkdir(parents=True, exist_ok=True)
@@ -49,40 +55,47 @@ def __init__(self, config_path: str):
         self.summary_writer = tf.summary.create_file_writer(str(log_dir))
 
     def _train_step(self, batch: tf.Tensor, training: bool = True) -> Dict[str, tf.Tensor]:
-        """Run a single training step."""
-        with tf.GradientTape(persistent=True) as tape:
+        """Run a single training step with joint rate-distortion optimization."""
+        with tf.GradientTape() as tape:
             inputs = batch[..., tf.newaxis] if len(batch.shape) == 4 else batch
-            x_hat, y, y_hat, z = self.model(inputs, training=training)
+            x_hat, y, z_hat, z_noisy = self.model(inputs, training=training)
 
-            # Compute focal loss on reconstruction
+            # Compute focal loss on reconstruction (distortion term)
             focal_loss = self.compute_focal_loss(
                 batch[..., tf.newaxis] if len(batch.shape) == 4 else batch,
                 x_hat,
             )
 
-            # Compute entropy loss
-            # EntropyModel returns log-probabilities, so use them directly
-            _, log_likelihood = self.entropy_model(y, training=training)
-            entropy_loss = -tf.reduce_mean(log_likelihood)
+            # Compute entropy loss (rate term)
+            # EntropyModel returns discretized probability mass
+            _, likelihood = self.entropy_model(y, training=training)
+            entropy_loss = -tf.reduce_mean(tf.math.log(likelihood))
 
-            total_loss = focal_loss + entropy_loss
+            # Joint rate-distortion loss
+            total_loss = focal_loss + self.lambda_rd * entropy_loss
 
         if training:
-            # Update reconstruction model
-            model_grads = tape.gradient(focal_loss, self.model.trainable_variables)
+            # Joint gradient computation over all trainable variables
+            all_vars = self.model.trainable_variables + self.entropy_model.trainable_variables
+            grads = tape.gradient(total_loss, all_vars)
+
+            # Clip gradients for stability
+            grads, _ = tf.clip_by_global_norm(grads, self.grad_clip_norm)
+
+            # Split gradients and apply to respective optimizers
+            model_var_count = len(self.model.trainable_variables)
+            model_grads = grads[:model_var_count]
+            entropy_grads = grads[model_var_count:]
+
             self.optimizers['reconstruction'].apply_gradients(
                 zip(model_grads, self.model.trainable_variables)
             )
 
-            # Update entropy model
-            entropy_grads = tape.gradient(entropy_loss, self.entropy_model.trainable_variables)
             if entropy_grads and any(g is not None for g in entropy_grads):
                 self.optimizers['entropy'].apply_gradients(
                     zip(entropy_grads, self.entropy_model.trainable_variables)
                 )
 
-        del tape
-
         return {
             'focal_loss': focal_loss,
             'entropy_loss': entropy_loss,
@@ -141,6 +154,9 @@ def _validate(self, val_dataset: tf.data.Dataset) -> Dict[str, float]:
             losses = self._train_step(batch, training=False)
             val_losses.append({k: v.numpy() for k, v in losses.items()})
 
+        if not val_losses:
+            return {'focal_loss': 0.0, 'entropy_loss': 0.0, 'total_loss': float('inf')}
+
         avg_losses = {}
         for metric in val_losses[0].keys():
             avg_losses[metric] = float(tf.reduce_mean([x[metric] for x in val_losses]))
@@ -155,26 +171,29 @@ def save_checkpoint(self, name: str):
 
         for opt_name, optimizer in self.optimizers.items():
             if optimizer.variables:
-                opt_weights = [v.numpy() for v in optimizer.variables]
-                np.save(
-                    str(checkpoint_path / f'{opt_name}_optimizer.npy'),
-                    np.array(opt_weights, dtype=object),
-                    allow_pickle=True,
-                )
+                opt_dir = checkpoint_path / f'{opt_name}_optimizer'
+                opt_dir.mkdir(parents=True, exist_ok=True)
+                for i, v in enumerate(optimizer.variables):
+                    np.save(str(opt_dir / f'{i}.npy'), v.numpy())
 
         self.logger.info(f"Saved checkpoint: {name}")
 
     def load_checkpoint(self, name: str):
-        checkpoint_path = self.checkpoint_dir / name
+        checkpoint_path = (self.checkpoint_dir / name).resolve()
+        try:
+            checkpoint_path.relative_to(self.checkpoint_dir.resolve())
+        except ValueError:
+            raise ValueError(f"Checkpoint path escapes checkpoint directory: {name}")
         self.model.load_weights(str(checkpoint_path / 'model.weights.h5'))
         self.entropy_model.load_weights(str(checkpoint_path / 'entropy.weights.h5'))
 
         for opt_name, optimizer in self.optimizers.items():
-            opt_path = checkpoint_path / f'{opt_name}_optimizer.npy'
-            if opt_path.exists() and optimizer.variables:
-                opt_weights = np.load(str(opt_path), allow_pickle=True)
-                for var, w in zip(optimizer.variables, opt_weights):
-                    var.assign(w)
+            opt_dir = checkpoint_path / f'{opt_name}_optimizer'
+            if opt_dir.exists() and optimizer.variables:
+                for i, var in enumerate(optimizer.variables):
+                    path = opt_dir / f'{i}.npy'
+                    if path.exists():
+                        var.assign(np.load(str(path), allow_pickle=False))
 
         self.logger.info(f"Loaded checkpoint: {name}")
 
diff --git a/src/utils/__pycache__/experiment.cpython-38.pyc b/src/utils/__pycache__/experiment.cpython-38.pyc
deleted file mode 100644
index 66434b236..000000000
Binary files a/src/utils/__pycache__/experiment.cpython-38.pyc and /dev/null differ
diff --git a/src/utils/__pycache__/pc_metric.cpython-38.pyc b/src/utils/__pycache__/pc_metric.cpython-38.pyc
deleted file mode 100644
index 04997eed3..000000000
Binary files a/src/utils/__pycache__/pc_metric.cpython-38.pyc and /dev/null differ
diff --git a/tests/__pycache__/test_colorbar.cpython-38-pytest-8.3.4.pyc b/tests/__pycache__/test_colorbar.cpython-38-pytest-8.3.4.pyc
deleted file mode 100644
index 7bfe97fbf..000000000
Binary files a/tests/__pycache__/test_colorbar.cpython-38-pytest-8.3.4.pyc and /dev/null differ
diff --git a/tests/__pycache__/test_compress_octree.cpython-38-pytest-8.3.4.pyc b/tests/__pycache__/test_compress_octree.cpython-38-pytest-8.3.4.pyc
deleted file mode 100644
index 1656f4b7e..000000000
Binary files a/tests/__pycache__/test_compress_octree.cpython-38-pytest-8.3.4.pyc and /dev/null differ
diff --git a/tests/__pycache__/test_ds_mesh_to_pc.cpython-38-pytest-8.3.4.pyc b/tests/__pycache__/test_ds_mesh_to_pc.cpython-38-pytest-8.3.4.pyc
deleted file mode 100644
index 6eb4a0255..000000000
Binary files a/tests/__pycache__/test_ds_mesh_to_pc.cpython-38-pytest-8.3.4.pyc and /dev/null differ
diff --git a/tests/__pycache__/test_ds_pc_octree_blocks.cpython-38-pytest-8.3.4.pyc b/tests/__pycache__/test_ds_pc_octree_blocks.cpython-38-pytest-8.3.4.pyc
deleted file mode 100644
index 5423206ac..000000000
Binary files a/tests/__pycache__/test_ds_pc_octree_blocks.cpython-38-pytest-8.3.4.pyc and /dev/null differ
diff --git a/tests/__pycache__/test_ev_run_experiment.cpython-38-pytest-8.3.4.pyc b/tests/__pycache__/test_ev_run_experiment.cpython-38-pytest-8.3.4.pyc
deleted file mode 100644
index cf2b47cd2..000000000
Binary files a/tests/__pycache__/test_ev_run_experiment.cpython-38-pytest-8.3.4.pyc and /dev/null differ
diff --git a/tests/__pycache__/test_ev_run_render.cpython-38-pytest-8.3.4.pyc b/tests/__pycache__/test_ev_run_render.cpython-38-pytest-8.3.4.pyc
deleted file mode 100644
index 9f4236143..000000000
Binary files a/tests/__pycache__/test_ev_run_render.cpython-38-pytest-8.3.4.pyc and /dev/null differ
diff --git a/tests/__pycache__/test_experiment.cpython-38-pytest-8.3.4.pyc b/tests/__pycache__/test_experiment.cpython-38-pytest-8.3.4.pyc
deleted file mode 100644
index 6fcc10f4f..000000000
Binary files a/tests/__pycache__/test_experiment.cpython-38-pytest-8.3.4.pyc and /dev/null differ
diff --git a/tests/__pycache__/test_map_color.cpython-38-pytest-8.3.4.pyc b/tests/__pycache__/test_map_color.cpython-38-pytest-8.3.4.pyc
deleted file mode 100644
index a24185741..000000000
Binary files a/tests/__pycache__/test_map_color.cpython-38-pytest-8.3.4.pyc and /dev/null differ
diff --git a/tests/__pycache__/test_model_opt.cpython-38-pytest-8.3.4.pyc b/tests/__pycache__/test_model_opt.cpython-38-pytest-8.3.4.pyc
deleted file mode 100644
index 17543e433..000000000
Binary files a/tests/__pycache__/test_model_opt.cpython-38-pytest-8.3.4.pyc and /dev/null differ
diff --git a/tests/__pycache__/test_model_transforms.cpython-38-pytest-8.3.4.pyc b/tests/__pycache__/test_model_transforms.cpython-38-pytest-8.3.4.pyc
deleted file mode 100644
index 15681daee..000000000
Binary files a/tests/__pycache__/test_model_transforms.cpython-38-pytest-8.3.4.pyc and /dev/null differ
diff --git a/tests/__pycache__/test_octree_coding.cpython-38-pytest-8.3.4.pyc b/tests/__pycache__/test_octree_coding.cpython-38-pytest-8.3.4.pyc
deleted file mode 100644
index 962eabbe1..000000000
Binary files a/tests/__pycache__/test_octree_coding.cpython-38-pytest-8.3.4.pyc and /dev/null differ
diff --git a/tests/__pycache__/test_parallel_process.cpython-38-pytest-8.3.4.pyc b/tests/__pycache__/test_parallel_process.cpython-38-pytest-8.3.4.pyc
deleted file mode 100644
index 520cbdf63..000000000
Binary files a/tests/__pycache__/test_parallel_process.cpython-38-pytest-8.3.4.pyc and /dev/null differ
diff --git a/tests/__pycache__/test_parallel_process.cpython-38.pyc b/tests/__pycache__/test_parallel_process.cpython-38.pyc
deleted file mode 100644
index 7590b28b0..000000000
Binary files a/tests/__pycache__/test_parallel_process.cpython-38.pyc and /dev/null differ
diff --git a/tests/__pycache__/test_patch_gaussian_conditional.cpython-38-pytest-8.3.4.pyc b/tests/__pycache__/test_patch_gaussian_conditional.cpython-38-pytest-8.3.4.pyc
deleted file mode 100644
index b76f9adbe..000000000
Binary files a/tests/__pycache__/test_patch_gaussian_conditional.cpython-38-pytest-8.3.4.pyc and /dev/null differ
diff --git a/tests/__pycache__/test_pc_metric.cpython-38-pytest-8.3.4.pyc b/tests/__pycache__/test_pc_metric.cpython-38-pytest-8.3.4.pyc
deleted file mode 100644
index b6b4505c0..000000000
Binary files a/tests/__pycache__/test_pc_metric.cpython-38-pytest-8.3.4.pyc and /dev/null differ
diff --git a/tests/conftest.py b/tests/conftest.py
old mode 100644
new mode 100755
index 285a46f5d..f184a977a
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,9 +1,53 @@
+import importlib
+import sys
 from pathlib import Path
 
 import numpy as np
 import pytest
 import tensorflow as tf
 
+# ---------------------------------------------------------------------------
+# Import hook: redirect bare module imports to src.X so relative imports work.
+#
+# Test files use `sys.path.insert(0, src/)` then `from model_transforms import X`.
+# Source modules now use relative imports (`from .constants import X`), which
+# require loading as part of the `src` package.  This meta-path finder
+# intercepts bare imports of src modules and loads them as `src.<name>`,
+# making both conventions compatible without changing any test files.
+# ---------------------------------------------------------------------------
+_project_root = str(Path(__file__).parent.parent)
+if _project_root not in sys.path:
+    sys.path.insert(0, _project_root)
+
+_src_dir = Path(__file__).parent.parent / 'src'
+
+
+class _SrcPackageFinder:
+    """Meta-path finder that loads bare src module imports as src.X."""
+
+    _loading = set()
+
+    def find_module(self, fullname, path=None):
+        if fullname in self._loading:
+            return None
+        if '.' not in fullname and (_src_dir / f'{fullname}.py').exists():
+            return self
+        return None
+
+    def load_module(self, fullname):
+        if fullname in sys.modules:
+            return sys.modules[fullname]
+        self._loading.add(fullname)
+        try:
+            mod = importlib.import_module(f'src.{fullname}')
+            sys.modules[fullname] = mod
+            return mod
+        finally:
+            self._loading.discard(fullname)
+
+
+sys.meta_path.insert(0, _SrcPackageFinder())
+
 
 def pytest_collection_modifyitems(items):
     """Filter out tf.test.TestCase.test_session, which is a deprecated
diff --git a/tests/test_attention_context.py b/tests/test_attention_context.py
old mode 100644
new mode 100755
index a1e001bb6..dd5dddb5f
--- a/tests/test_attention_context.py
+++ b/tests/test_attention_context.py
@@ -191,13 +191,12 @@ def test_attention_entropy_improvement(self):
         # Basic sanity checks for untrained models
         avg_likelihood_attn = tf.reduce_mean(likelihood_attn)
 
-        # Likelihood should be finite and reasonable for Gaussian
+        # Likelihood should be finite and reasonable
         self.assertFalse(tf.math.is_nan(avg_likelihood_attn))
         self.assertFalse(tf.math.is_inf(avg_likelihood_attn))
-        # Log-likelihood for Gaussian should be negative (probability < 1)
-        self.assertLess(avg_likelihood_attn, 0.0)
-        # But not catastrophically negative (which would indicate numerical issues)
-        self.assertGreater(avg_likelihood_attn, -100.0)
+        # Discretized probability mass should be in (0, 1]
+        self.assertGreater(avg_likelihood_attn, 0.0)
+        self.assertLessEqual(avg_likelihood_attn, 1.0)
 
     def test_attention_entropy_gradient_flow(self):
         """Gradients flow through the entire model."""
diff --git a/tests/test_benchmarks.py b/tests/test_benchmarks.py
new file mode 100755
index 000000000..44b26993b
--- /dev/null
+++ b/tests/test_benchmarks.py
@@ -0,0 +1,213 @@
+"""
+Tests for benchmark utilities and methodology.
+
+Validates that Benchmark, MemoryProfiler, and benchmark_function produce
+sensible results and the comparison utilities work correctly.
+"""
+
+import sys
+import time
+from pathlib import Path
+
+import tensorflow as tf
+
+sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))
+
+from benchmarks import (
+    Benchmark,
+    BenchmarkResult,
+    benchmark_function,
+    compare_implementations,
+    create_test_input,
+)
+
+
+class TestBenchmarkResult(tf.test.TestCase):
+    """Tests for BenchmarkResult dataclass."""
+
+    def test_ms_per_iteration(self):
+        """ms_per_iteration should correctly compute milliseconds."""
+        result = BenchmarkResult(
+            name="test",
+            elapsed_seconds=1.0,
+            iterations=10
+        )
+        self.assertAlmostEqual(result.ms_per_iteration, 100.0)
+
+    def test_ms_per_iteration_single(self):
+        """Single iteration should report total time in ms."""
+        result = BenchmarkResult(
+            name="test",
+            elapsed_seconds=0.5,
+            iterations=1
+        )
+        self.assertAlmostEqual(result.ms_per_iteration, 500.0)
+
+    def test_str_representation(self):
+        """String representation should include name and timing."""
+        result = BenchmarkResult(
+            name="my_op",
+            elapsed_seconds=1.0,
+            iterations=10,
+            memory_mb=256.0
+        )
+        s = str(result)
+        self.assertIn("my_op", s)
+        self.assertIn("100.00", s)
+        self.assertIn("256.0", s)
+
+
+class TestBenchmarkContextManager(tf.test.TestCase):
+    """Tests for Benchmark context manager."""
+
+    def test_measures_time(self):
+        """Should measure elapsed time > 0."""
+        with Benchmark("sleep_test") as b:
+            time.sleep(0.01)
+
+        self.assertGreater(b.result.elapsed_seconds, 0.0)
+
+    def test_result_has_correct_name(self):
+        """Result should carry the benchmark name."""
+        with Benchmark("named_op") as b:
+            pass
+
+        self.assertEqual(b.result.name, "named_op")
+
+    def test_result_has_correct_iterations(self):
+        """Result should record iteration count."""
+        with Benchmark("iter_test", iterations=5) as b:
+            pass
+
+        self.assertEqual(b.result.iterations, 5)
+
+    def test_timing_is_reasonable(self):
+        """Measured time should be within order of magnitude of actual work."""
+        with Benchmark("timed_op") as b:
+            time.sleep(0.05)
+
+        # Should be at least ~50ms but less than 1s
+        self.assertGreater(b.result.elapsed_seconds, 0.01)
+        self.assertLess(b.result.elapsed_seconds, 1.0)
+
+
+class TestBenchmarkFunction(tf.test.TestCase):
+    """Tests for benchmark_function utility."""
+
+    def test_returns_benchmark_result(self):
+        """Should return a BenchmarkResult."""
+        def noop():
+            return 42
+
+        result = benchmark_function(noop, warmup=1, iterations=3)
+
+        self.assertIsInstance(result, BenchmarkResult)
+        self.assertEqual(result.iterations, 3)
+        self.assertGreater(result.elapsed_seconds, 0.0)
+
+    def test_warmup_not_timed(self):
+        """Warmup iterations should not be included in timing."""
+        call_count = [0]
+
+        def counting_fn():
+            call_count[0] += 1
+            return call_count[0]
+
+        result = benchmark_function(counting_fn, warmup=5, iterations=3)
+
+        # Total calls = warmup + iterations = 8
+        self.assertEqual(call_count[0], 8)
+        # But result should say 3 iterations
+        self.assertEqual(result.iterations, 3)
+
+    def test_custom_name(self):
+        """Should use custom name when provided."""
+        result = benchmark_function(lambda: None, name="custom_name")
+        self.assertEqual(result.name, "custom_name")
+
+    def test_default_name_from_function(self):
+        """Should use function name by default."""
+        def my_function():
+            return None
+
+        result = benchmark_function(my_function, warmup=0, iterations=1)
+        self.assertEqual(result.name, "my_function")
+
+    def test_passes_args_and_kwargs(self):
+        """Should pass args and kwargs to benchmarked function."""
+        def add(a, b, c=0):
+            return a + b + c
+
+        # Should not raise
+        result = benchmark_function(add, args=(1, 2), kwargs={'c': 3})
+        self.assertGreater(result.elapsed_seconds, 0.0)
+
+
+class TestCompareImplementations(tf.test.TestCase):
+    """Tests for compare_implementations utility."""
+
+    def test_returns_all_results(self):
+        """Should return one result per implementation."""
+        impls = {
+            'fast': lambda: 1 + 1,
+            'slow': lambda: sum(range(100)),
+        }
+
+        results = compare_implementations(impls, warmup=1, iterations=3)
+
+        self.assertEqual(len(results), 2)
+        self.assertIn('fast', results)
+        self.assertIn('slow', results)
+
+    def test_faster_impl_is_faster(self):
+        """Faster implementation should measure less time (with tolerance)."""
+        def fast():
+            return 1 + 1
+
+        def slow():
+            total = 0
+            for i in range(10000):
+                total += i
+            return total
+
+        results = compare_implementations(
+            {'fast': fast, 'slow': slow},
+            warmup=2,
+            iterations=10
+        )
+
+        # Fast should be faster (or at least not 10x slower)
+        self.assertLess(
+            results['fast'].elapsed_seconds,
+            results['slow'].elapsed_seconds * 10
+        )
+
+
+class TestCreateTestInput(tf.test.TestCase):
+    """Tests for test input tensor creation."""
+
+    def test_default_shape(self):
+        """Default shape should be (1, 32, 32, 32, 64)."""
+        tensor = create_test_input()
+        self.assertEqual(tensor.shape, (1, 32, 32, 32, 64))
+
+    def test_custom_shape(self):
+        """Should respect custom dimensions."""
+        tensor = create_test_input(
+            batch_size=2, depth=8, height=16, width=4, channels=32
+        )
+        self.assertEqual(tensor.shape, (2, 8, 16, 4, 32))
+
+    def test_default_dtype(self):
+        """Default dtype should be float32."""
+        tensor = create_test_input()
+        self.assertEqual(tensor.dtype, tf.float32)
+
+    def test_custom_dtype(self):
+        """Should respect custom dtype."""
+        tensor = create_test_input(dtype=tf.float16)
+        self.assertEqual(tensor.dtype, tf.float16)
+
+
+if __name__ == '__main__':
+    tf.test.main()
diff --git a/tests/test_causality.py b/tests/test_causality.py
new file mode 100755
index 000000000..91092716e
--- /dev/null
+++ b/tests/test_causality.py
@@ -0,0 +1,315 @@
+"""
+Tests for masked convolution causality and autoregressive ordering.
+
+Validates that MaskedConv3D enforces correct causal masks in raster-scan
+order (depth, height, width), type A excludes center, type B includes it,
+and the AutoregressiveContext model maintains causality.
+"""
+
+import sys
+from pathlib import Path
+
+import numpy as np
+import pytest
+import tensorflow as tf
+
+sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))
+
+from context_model import AutoregressiveContext, MaskedConv3D
+
+
+class TestMaskedConv3DCausality(tf.test.TestCase):
+    """Tests for MaskedConv3D mask correctness."""
+
+    def test_mask_type_a_excludes_center(self):
+        """Type A mask should be 0 at the center position."""
+        conv = MaskedConv3D(filters=4, kernel_size=3, mask_type='A')
+        conv.build((None, 8, 8, 8, 1))
+
+        mask = conv.mask.numpy()
+        # Center of 3x3x3 kernel is (1, 1, 1)
+        center_vals = mask[1, 1, 1, :, :]
+        np.testing.assert_array_equal(
+            center_vals, 0.0,
+            err_msg="Type A mask should exclude center position"
+        )
+
+    def test_mask_type_b_includes_center(self):
+        """Type B mask should be 1 at the center position."""
+        conv = MaskedConv3D(filters=4, kernel_size=3, mask_type='B')
+        conv.build((None, 8, 8, 8, 1))
+
+        mask = conv.mask.numpy()
+        center_vals = mask[1, 1, 1, :, :]
+        np.testing.assert_array_equal(
+            center_vals, 1.0,
+            err_msg="Type B mask should include center position"
+        )
+
+    def test_future_positions_masked(self):
+        """All future positions in raster-scan order should be 0."""
+        conv = MaskedConv3D(filters=4, kernel_size=5, mask_type='A')
+        conv.build((None, 8, 8, 8, 1))
+
+        mask = conv.mask.numpy()
+        kd, kh, kw = 5, 5, 5
+        center_d, center_h, center_w = 2, 2, 2
+
+        for d in range(kd):
+            for h in range(kh):
+                for w in range(kw):
+                    is_future = (
+                        (d > center_d) or
+                        (d == center_d and h > center_h) or
+                        (d == center_d and h == center_h and w > center_w)
+                    )
+                    is_center = (d == center_d and h == center_h and w == center_w)
+
+                    if is_future or is_center:
+                        np.testing.assert_array_equal(
+                            mask[d, h, w, :, :], 0.0,
+                            err_msg=f"Position ({d},{h},{w}) should be masked"
+                        )
+                    else:
+                        np.testing.assert_array_equal(
+                            mask[d, h, w, :, :], 1.0,
+                            err_msg=f"Position ({d},{h},{w}) should be unmasked"
+                        )
+
+    def test_past_positions_unmasked_type_b(self):
+        """All past + center positions in raster-scan order should be 1 for type B."""
+        conv = MaskedConv3D(filters=4, kernel_size=5, mask_type='B')
+        conv.build((None, 8, 8, 8, 1))
+
+        mask = conv.mask.numpy()
+        kd, kh, kw = 5, 5, 5
+        center_d, center_h, center_w = 2, 2, 2
+
+        for d in range(kd):
+            for h in range(kh):
+                for w in range(kw):
+                    is_future = (
+                        (d > center_d) or
+                        (d == center_d and h > center_h) or
+                        (d == center_d and h == center_h and w > center_w)
+                    )
+
+                    if is_future:
+                        np.testing.assert_array_equal(
+                            mask[d, h, w, :, :], 0.0,
+                            err_msg=f"Position ({d},{h},{w}) should be masked (future)"
+                        )
+                    else:
+                        np.testing.assert_array_equal(
+                            mask[d, h, w, :, :], 1.0,
+                            err_msg=f"Position ({d},{h},{w}) should be unmasked (past/center)"
+                        )
+
+    def test_mask_shape_matches_kernel(self):
+        """Mask shape should match (kd, kh, kw, in_channels, filters)."""
+        in_channels = 8
+        filters = 16
+        conv = MaskedConv3D(filters=filters, kernel_size=3, mask_type='A')
+        conv.build((None, 8, 8, 8, in_channels))
+
+        self.assertEqual(conv.mask.shape, (3, 3, 3, in_channels, filters))
+
+    def test_mask_broadcast_across_channels(self):
+        """Mask should be the same across all input/output channel pairs."""
+        conv = MaskedConv3D(filters=8, kernel_size=3, mask_type='A')
+        conv.build((None, 8, 8, 8, 4))
+
+        mask = conv.mask.numpy()
+        # All channel slices should be identical
+        reference = mask[:, :, :, 0, 0]
+        for ic in range(4):
+            for oc in range(8):
+                np.testing.assert_array_equal(
+                    mask[:, :, :, ic, oc], reference,
+                    err_msg=f"Channel ({ic},{oc}) mask differs from reference"
+                )
+
+    def test_invalid_mask_type_raises(self):
+        """Invalid mask type should raise ValueError."""
+        with self.assertRaises(ValueError):
+            MaskedConv3D(filters=4, kernel_size=3, mask_type='C')
+
+    def test_output_shape_same_padding(self):
+        """Output should have same spatial dims with 'same' padding."""
+        conv = MaskedConv3D(filters=8, kernel_size=3, mask_type='A', padding='same')
+        inputs = tf.random.normal((1, 8, 8, 8, 4))
+        output = conv(inputs)
+
+        self.assertEqual(output.shape, (1, 8, 8, 8, 8))
+
+    def test_kernel_size_1_type_a_all_zero(self):
+        """Kernel size 1 with type A should have all-zero mask (no past)."""
+        conv = MaskedConv3D(filters=4, kernel_size=1, mask_type='A')
+        conv.build((None, 8, 8, 8, 2))
+
+        mask = conv.mask.numpy()
+        np.testing.assert_array_equal(mask, 0.0)
+
+    def test_kernel_size_1_type_b_all_one(self):
+        """Kernel size 1 with type B should have all-one mask (center only)."""
+        conv = MaskedConv3D(filters=4, kernel_size=1, mask_type='B')
+        conv.build((None, 8, 8, 8, 2))
+
+        mask = conv.mask.numpy()
+        np.testing.assert_array_equal(mask, 1.0)
+
+
+class TestCausalOutputDependence(tf.test.TestCase):
+    """Tests that masked conv output at position (d,h,w) depends only on past."""
+
+    def test_type_a_output_independent_of_current_position(self):
+        """With type A, changing center input should not affect center output."""
+        tf.random.set_seed(42)
+        conv = MaskedConv3D(filters=1, kernel_size=3, mask_type='A')
+
+        # Create two inputs that differ only at center position (4,4,4)
+        input1 = tf.random.normal((1, 8, 8, 8, 1))
+        input2 = tf.identity(input1)
+        # Modify center position
+        input2_np = input2.numpy()
+        input2_np[0, 4, 4, 4, 0] = 999.0
+        input2 = tf.constant(input2_np)
+
+        out1 = conv(input1)
+        out2 = conv(input2)
+
+        # Output at (4,4,4) should be the same (center is masked for type A)
+        self.assertAllClose(
+            out1[0, 4, 4, 4, :], out2[0, 4, 4, 4, :],
+            atol=1e-5,
+            msg="Type A output should not depend on current position"
+        )
+
+    def test_type_b_output_depends_on_current_position(self):
+        """With type B, changing center input should affect center output."""
+        tf.random.set_seed(42)
+        conv = MaskedConv3D(filters=1, kernel_size=3, mask_type='B')
+
+        input1 = tf.random.normal((1, 8, 8, 8, 1))
+        input2_np = input1.numpy().copy()
+        input2_np[0, 4, 4, 4, 0] = 999.0
+        input2 = tf.constant(input2_np)
+
+        out1 = conv(input1)
+        out2 = conv(input2)
+
+        # Output at (4,4,4) should differ (center is unmasked for type B)
+        diff = tf.abs(out1[0, 4, 4, 4, :] - out2[0, 4, 4, 4, :])
+        self.assertGreater(float(tf.reduce_max(diff)), 0.01)
+
+    def test_future_change_does_not_affect_past_output(self):
+        """Changing a future position should not affect any past output."""
+        tf.random.set_seed(42)
+        conv = MaskedConv3D(filters=1, kernel_size=3, mask_type='A')
+
+        input1 = tf.random.normal((1, 8, 8, 8, 1))
+        input2_np = input1.numpy().copy()
+        # Modify a "future" position (7,7,7)
+        input2_np[0, 7, 7, 7, 0] = 999.0
+        input2 = tf.constant(input2_np)
+
+        out1 = conv(input1)
+        out2 = conv(input2)
+
+        # All positions before (7,7,7) should be identical
+        # Check positions 0..6 in depth (all are strictly before depth=7)
+        self.assertAllClose(
+            out1[0, :7, :, :, :], out2[0, :7, :, :, :],
+            atol=1e-5,
+            msg="Past outputs should not change when future input changes"
+        )
+
+
+class TestAutoregressiveContext(tf.test.TestCase):
+    """Tests for the AutoregressiveContext model."""
+
+    @pytest.fixture(autouse=True)
+    def setup(self):
+        tf.random.set_seed(42)
+        self.channels = 16
+        self.resolution = 8
+
+    def test_output_shape(self):
+        """Output should have shape (B, D, H, W, channels)."""
+        ctx = AutoregressiveContext(channels=self.channels, num_layers=3)
+        inputs = tf.random.normal((1, self.resolution, self.resolution, self.resolution, 4))
+        output = ctx(inputs)
+
+        self.assertEqual(output.shape, (1, self.resolution, self.resolution, self.resolution, self.channels))
+
+    def test_first_layer_is_type_a(self):
+        """First conv layer should use mask type A."""
+        ctx = AutoregressiveContext(channels=self.channels, num_layers=3)
+        self.assertEqual(ctx.conv_layers[0].mask_type, 'A')
+
+    def test_subsequent_layers_are_type_b(self):
+        """All subsequent conv layers should use mask type B."""
+        ctx = AutoregressiveContext(channels=self.channels, num_layers=3)
+        for conv in ctx.conv_layers[1:]:
+            self.assertEqual(conv.mask_type, 'B')
+
+    def test_causal_output(self):
+        """Changing future input should not affect past outputs."""
+        tf.random.set_seed(42)
+        ctx = AutoregressiveContext(channels=self.channels, num_layers=2, kernel_size=3)
+
+        input1 = tf.random.normal((1, self.resolution, self.resolution, self.resolution, 4))
+        input2_np = input1.numpy().copy()
+        # Modify last depth slice (future)
+        input2_np[0, -1, :, :, :] = 999.0
+        input2 = tf.constant(input2_np)
+
+        out1 = ctx(input1)
+        out2 = ctx(input2)
+
+        # Outputs at depth 0 should be identical (far from modified depth)
+        # With kernel_size=3 and 2 layers, receptive field is at most 4
+        self.assertAllClose(
+            out1[0, 0, :, :, :], out2[0, 0, :, :, :],
+            atol=1e-5,
+            msg="Early depth outputs should not depend on last depth slice"
+        )
+
+
+class TestMaskCountProperties(tf.test.TestCase):
+    """Tests for statistical properties of the mask."""
+
+    def test_type_a_has_fewer_ones_than_type_b(self):
+        """Type A (excludes center) should have fewer 1s than type B."""
+        conv_a = MaskedConv3D(filters=1, kernel_size=3, mask_type='A')
+        conv_b = MaskedConv3D(filters=1, kernel_size=3, mask_type='B')
+        conv_a.build((None, 8, 8, 8, 1))
+        conv_b.build((None, 8, 8, 8, 1))
+
+        ones_a = np.sum(conv_a.mask.numpy())
+        ones_b = np.sum(conv_b.mask.numpy())
+
+        self.assertLess(ones_a, ones_b)
+
+    def test_type_a_count_3x3x3(self):
+        """3x3x3 type A should have 13 unmasked positions (half minus center)."""
+        conv = MaskedConv3D(filters=1, kernel_size=3, mask_type='A')
+        conv.build((None, 8, 8, 8, 1))
+
+        mask = conv.mask.numpy()[:, :, :, 0, 0]
+        # In 3x3x3=27 positions, 13 are past, 1 is center, 13 are future
+        # Type A: 13 past = unmasked
+        self.assertEqual(int(np.sum(mask)), 13)
+
+    def test_type_b_count_3x3x3(self):
+        """3x3x3 type B should have 14 unmasked positions (half + center)."""
+        conv = MaskedConv3D(filters=1, kernel_size=3, mask_type='B')
+        conv.build((None, 8, 8, 8, 1))
+
+        mask = conv.mask.numpy()[:, :, :, 0, 0]
+        # 13 past + 1 center = 14
+        self.assertEqual(int(np.sum(mask)), 14)
+
+
+if __name__ == '__main__':
+    tf.test.main()
diff --git a/tests/test_compress_octree.py b/tests/test_compress_octree.py
index 87ae819f7..d66347908 100644
--- a/tests/test_compress_octree.py
+++ b/tests/test_compress_octree.py
@@ -116,6 +116,7 @@ def test_octree_partitioning(self):
     def test_save_and_load(self):
         """Test saving and loading functionality."""
         save_path = Path(self.test_env['tmp_path']) / "test_compressed.npz"
+        meta_path = Path(str(save_path) + '.meta.json')
 
         # Compress and save
         grid, metadata = self.compressor.compress(
@@ -124,8 +125,9 @@ def test_save_and_load(self):
         )
         self.compressor.save_compressed(grid, metadata, str(save_path))
 
-        # Verify file exists
+        # Verify both files exist
         self.assertTrue(save_path.exists())
+        self.assertTrue(meta_path.exists())
 
         # Load and verify
         loaded_grid, loaded_metadata = self.compressor.load_compressed(str(save_path))
@@ -137,6 +139,10 @@ def test_save_and_load(self):
         for key in ['min_bounds', 'max_bounds', 'ranges', 'has_normals']:
             self.assertIn(key, loaded_metadata)
 
+        # Check array fields are numpy arrays after load
+        for key in ['min_bounds', 'max_bounds', 'ranges']:
+            self.assertIsInstance(loaded_metadata[key], np.ndarray)
+
     def test_error_handling(self):
         """Test error handling."""
         # Test empty point cloud
@@ -156,5 +162,218 @@ def test_error_handling(self):
         with self.assertRaisesRegex(ValueError, "shape must match"):
             self.compressor.compress(self.point_cloud, normals=wrong_shape_normals)
 
+    # --- NaN / Inf / degenerate value tests ---
+
+    def test_save_load_metadata_with_nan_and_inf(self):
+        """NaN and Inf scalar values in metadata are converted to None."""
+        save_path = Path(self.test_env['tmp_path']) / "special_values.npz"
+        grid = np.zeros((64, 64, 64), dtype=bool)
+        grid[0, 0, 0] = True
+        metadata = {
+            'min_bounds': np.array([0.0, 0.0, 0.0]),
+            'max_bounds': np.array([1.0, 1.0, 1.0]),
+            'ranges': np.array([1.0, 1.0, 1.0]),
+            'has_normals': False,
+            'nan_value': float('nan'),
+            'inf_value': float('inf'),
+            'neg_inf_value': float('-inf'),
+        }
+        self.compressor.save_compressed(grid, metadata, str(save_path))
+        _, loaded = self.compressor.load_compressed(str(save_path))
+        self.assertIsNone(loaded['nan_value'])
+        self.assertIsNone(loaded['inf_value'])
+        self.assertIsNone(loaded['neg_inf_value'])
+
+    def test_save_load_metadata_with_numpy_nan(self):
+        """NaN from np.floating scalar is also converted to None."""
+        save_path = Path(self.test_env['tmp_path']) / "np_nan.npz"
+        grid = np.zeros((64, 64, 64), dtype=bool)
+        grid[0, 0, 0] = True
+        metadata = {
+            'min_bounds': np.array([0.0, 0.0, 0.0]),
+            'max_bounds': np.array([1.0, 1.0, 1.0]),
+            'ranges': np.array([1.0, 1.0, 1.0]),
+            'has_normals': False,
+            'compression_error': np.float64('nan'),
+        }
+        self.compressor.save_compressed(grid, metadata, str(save_path))
+        _, loaded = self.compressor.load_compressed(str(save_path))
+        self.assertIsNone(loaded['compression_error'])
+
+    def test_compress_all_points_same_voxel(self):
+        """All identical points compress to single occupied voxel."""
+        same_points = np.full((100, 3), 5.0, dtype=np.float32)
+        grid, metadata = self.compressor.compress(same_points, validate=False)
+        self.assertEqual(np.sum(grid), 1)
+        np.testing.assert_allclose(metadata['ranges'], [1e-6, 1e-6, 1e-6])
+
+    # --- Zero / empty / boundary tests ---
+
+    def test_save_load_empty_grid(self):
+        """All-False grid saves and loads correctly."""
+        save_path = Path(self.test_env['tmp_path']) / "empty_grid.npz"
+        grid = np.zeros((64, 64, 64), dtype=bool)
+        metadata = {
+            'min_bounds': np.array([0.0, 0.0, 0.0]),
+            'max_bounds': np.array([1.0, 1.0, 1.0]),
+            'ranges': np.array([1.0, 1.0, 1.0]),
+            'has_normals': False,
+        }
+        self.compressor.save_compressed(grid, metadata, str(save_path))
+        loaded_grid, loaded_metadata = self.compressor.load_compressed(str(save_path))
+        self.assertEqual(np.sum(loaded_grid), 0)
+        self.assertFalse(loaded_metadata['has_normals'])
+
+    def test_save_load_without_normals(self):
+        """Metadata without normal_grid round-trips correctly."""
+        save_path = Path(self.test_env['tmp_path']) / "no_normals.npz"
+        grid, metadata = self.compressor.compress(self.point_cloud, validate=False)
+        self.assertFalse(metadata['has_normals'])
+        self.assertNotIn('normal_grid', metadata)
+
+        self.compressor.save_compressed(grid, metadata, str(save_path))
+        loaded_grid, loaded_metadata = self.compressor.load_compressed(str(save_path))
+        np.testing.assert_array_equal(grid, loaded_grid)
+        self.assertFalse(loaded_metadata['has_normals'])
+        self.assertNotIn('normal_grid', loaded_metadata)
+
+    # --- Negative / error path tests ---
+
+    def test_load_compressed_missing_metadata_file(self):
+        """Missing .meta.json sidecar raises FileNotFoundError."""
+        save_path = Path(self.test_env['tmp_path']) / "partial_write.npz"
+        grid = np.zeros((64, 64, 64), dtype=bool)
+        metadata = {
+            'min_bounds': np.array([0.0, 0.0, 0.0]),
+            'max_bounds': np.array([1.0, 1.0, 1.0]),
+            'ranges': np.array([1.0, 1.0, 1.0]),
+            'has_normals': False,
+        }
+        self.compressor.save_compressed(grid, metadata, str(save_path))
+
+        # Simulate partial write: delete the sidecar
+        meta_path = Path(str(save_path) + '.meta.json')
+        meta_path.unlink()
+
+        with self.assertRaises(FileNotFoundError):
+            self.compressor.load_compressed(str(save_path))
+
+    def test_load_compressed_missing_grid_file(self):
+        """Missing .npz grid file raises error."""
+        missing_path = Path(self.test_env['tmp_path']) / "nonexistent.npz"
+        with self.assertRaises(FileNotFoundError):
+            self.compressor.load_compressed(str(missing_path))
+
+    # --- Debug output security test ---
+
+    def test_debug_info_does_not_pickle_dicts(self):
+        """Debug output skips dict values, only saves numpy arrays."""
+        self.compressor.compress(self.point_cloud, validate=False)
+
+        debug_dir = Path(self.test_env['tmp_path']) / 'debug' / 'grid_creation'
+        self.assertTrue(debug_dir.exists())
+
+        # 'metadata' (a dict) should NOT be saved as .npy
+        self.assertFalse((debug_dir / 'metadata.npy').exists())
+
+        # 'grid' and 'scaled_points' (arrays) SHOULD be saved
+        self.assertTrue((debug_dir / 'grid.npy').exists())
+        self.assertTrue((debug_dir / 'scaled_points.npy').exists())
+
+        # All saved .npy files must be loadable without pickle
+        for npy_file in debug_dir.glob('*.npy'):
+            np.load(str(npy_file), allow_pickle=False)
+
+    # --- Regression / format fidelity tests ---
+
+    def test_save_load_metadata_values_roundtrip(self):
+        """Numeric metadata values are preserved after JSON round-trip."""
+        save_path = Path(self.test_env['tmp_path']) / "fidelity.npz"
+        grid, metadata = self.compressor.compress(self.point_cloud)
+        self.compressor.save_compressed(grid, metadata, str(save_path))
+        _, loaded = self.compressor.load_compressed(str(save_path))
+
+        np.testing.assert_allclose(
+            loaded['min_bounds'], metadata['min_bounds'], rtol=1e-6
+        )
+        np.testing.assert_allclose(
+            loaded['max_bounds'], metadata['max_bounds'], rtol=1e-6
+        )
+        np.testing.assert_allclose(
+            loaded['ranges'], metadata['ranges'], rtol=1e-6
+        )
+        self.assertAlmostEqual(
+            loaded['compression_error'], metadata['compression_error'], places=6
+        )
+
+    def test_save_load_numpy_scalar_metadata(self):
+        """np.float64 and np.int32 scalars survive type conversion."""
+        save_path = Path(self.test_env['tmp_path']) / "scalar_types.npz"
+        grid = np.zeros((64, 64, 64), dtype=bool)
+        grid[0, 0, 0] = True
+        metadata = {
+            'min_bounds': np.array([0.0, 0.0, 0.0]),
+            'max_bounds': np.array([1.0, 1.0, 1.0]),
+            'ranges': np.array([1.0, 1.0, 1.0]),
+            'has_normals': False,
+            'float_scalar': np.float64(3.14),
+            'int_scalar': np.int32(42),
+        }
+        self.compressor.save_compressed(grid, metadata, str(save_path))
+        _, loaded = self.compressor.load_compressed(str(save_path))
+        self.assertAlmostEqual(loaded['float_scalar'], 3.14, places=10)
+        self.assertEqual(loaded['int_scalar'], 42)
+
+    def test_save_load_dtype_after_roundtrip(self):
+        """Documents that float32 arrays become float64 after JSON round-trip."""
+        save_path = Path(self.test_env['tmp_path']) / "dtype_test.npz"
+        grid, metadata = self.compressor.compress(self.point_cloud, validate=False)
+        # Original is float32 from np.min on float32 input
+        self.assertEqual(metadata['min_bounds'].dtype, np.float32)
+
+        self.compressor.save_compressed(grid, metadata, str(save_path))
+        _, loaded = self.compressor.load_compressed(str(save_path))
+        # After JSON round-trip, np.array() defaults to float64
+        self.assertEqual(loaded['min_bounds'].dtype, np.float64)
+
+    def test_decompress_after_save_load_matches_direct(self):
+        """Decompress from loaded metadata produces same points as from original."""
+        save_path = Path(self.test_env['tmp_path']) / "roundtrip_quality.npz"
+        grid, metadata = self.compressor.compress(self.point_cloud, validate=False)
+
+        # Decompress directly from original metadata
+        direct_points, _ = self.compressor.decompress(grid, metadata)
+
+        # Save, load, decompress
+        self.compressor.save_compressed(grid, metadata, str(save_path))
+        loaded_grid, loaded_metadata = self.compressor.load_compressed(str(save_path))
+        loaded_points, _ = self.compressor.decompress(loaded_grid, loaded_metadata)
+
+        # Points should match despite dtype change (float32 vs float64)
+        np.testing.assert_allclose(
+            loaded_points, direct_points.astype(np.float64), rtol=1e-5
+        )
+
+    # --- E2E test ---
+
+    @pytest.mark.e2e
+    def test_compress_save_load_decompress_quality(self):
+        """Full pipeline: compress, save, load, decompress, verify quality."""
+        save_path = Path(self.test_env['tmp_path']) / "e2e.npz"
+
+        grid, metadata = self.compressor.compress(self.point_cloud)
+        original_error = metadata['compression_error']
+        self.compressor.save_compressed(grid, metadata, str(save_path))
+
+        loaded_grid, loaded_metadata = self.compressor.load_compressed(str(save_path))
+        decompressed, _ = self.compressor.decompress(loaded_grid, loaded_metadata)
+
+        # Decompressed point count should be reasonable
+        self.assertGreater(len(decompressed), 0)
+        # Reconstruction error should match original
+        self.assertAlmostEqual(
+            loaded_metadata['compression_error'], original_error, places=6
+        )
+
 if __name__ == "__main__":
     tf.test.main()
diff --git a/tests/test_data_pipeline.py b/tests/test_data_pipeline.py
new file mode 100755
index 000000000..247a8fd82
--- /dev/null
+++ b/tests/test_data_pipeline.py
@@ -0,0 +1,325 @@
+"""
+Tests for data pipeline: OFF/PLY file I/O, mesh-to-point-cloud sampling,
+and point cloud partitioning.
+"""
+
+import sys
+from pathlib import Path
+
+import numpy as np
+import pytest
+import tensorflow as tf
+
+sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))
+
+from ds_mesh_to_pc import (
+    MeshData,
+    compute_face_normals,
+    partition_point_cloud,
+    read_off,
+    sample_points_from_mesh,
+    save_ply,
+)
+
+
+class TestReadOFF(tf.test.TestCase):
+    """Tests for OFF file reading."""
+
+    @pytest.fixture(autouse=True)
+    def inject_tmp_path(self, tmp_path):
+        self.tmp_path = tmp_path
+
+    def test_valid_off_file(self):
+        """Should parse a valid OFF file correctly."""
+        off_path = self.tmp_path / "test.off"
+        off_path.write_text(
+            "OFF\n"
+            "4 2 0\n"
+            "0.0 0.0 0.0\n"
+            "1.0 0.0 0.0\n"
+            "0.0 1.0 0.0\n"
+            "0.0 0.0 1.0\n"
+            "3 0 1 2\n"
+            "3 0 1 3\n"
+        )
+
+        mesh = read_off(str(off_path))
+
+        self.assertEqual(mesh.vertices.shape, (4, 3))
+        self.assertEqual(mesh.faces.shape, (2, 3))
+
+    def test_vertices_only_off(self):
+        """Should handle OFF files with vertices but no faces."""
+        off_path = self.tmp_path / "verts.off"
+        off_path.write_text(
+            "OFF\n"
+            "3 0 0\n"
+            "1.0 2.0 3.0\n"
+            "4.0 5.0 6.0\n"
+            "7.0 8.0 9.0\n"
+        )
+
+        mesh = read_off(str(off_path))
+
+        self.assertIsNotNone(mesh)
+        self.assertEqual(mesh.vertices.shape, (3, 3))
+        self.assertIsNone(mesh.faces)
+
+    def test_invalid_header_returns_none(self):
+        """Non-OFF header should return None."""
+        off_path = self.tmp_path / "bad.off"
+        off_path.write_text("NOT_OFF\n1 0 0\n0.0 0.0 0.0\n")
+
+        mesh = read_off(str(off_path))
+        self.assertIsNone(mesh)
+
+    def test_nonexistent_file_returns_none(self):
+        """Missing file should return None (not raise)."""
+        mesh = read_off(str(self.tmp_path / "nonexistent.off"))
+        self.assertIsNone(mesh)
+
+    def test_ngon_triangulation(self):
+        """N-gons (quads etc.) should be triangulated via fan method."""
+        off_path = self.tmp_path / "quad.off"
+        off_path.write_text(
+            "OFF\n"
+            "4 1 0\n"
+            "0.0 0.0 0.0\n"
+            "1.0 0.0 0.0\n"
+            "1.0 1.0 0.0\n"
+            "0.0 1.0 0.0\n"
+            "4 0 1 2 3\n"  # Quad face
+        )
+
+        mesh = read_off(str(off_path))
+
+        # Quad should become 2 triangles
+        self.assertIsNotNone(mesh.faces)
+        self.assertEqual(mesh.faces.shape[0], 2)
+        self.assertEqual(mesh.faces.shape[1], 3)
+
+
+class TestComputeFaceNormals(tf.test.TestCase):
+    """Tests for face normal computation."""
+
+    def test_unit_triangle_normal(self):
+        """Right triangle in XY plane should have Z-aligned normal."""
+        vertices = np.array([
+            [0, 0, 0],
+            [1, 0, 0],
+            [0, 1, 0],
+        ], dtype=np.float32)
+        faces = np.array([[0, 1, 2]], dtype=np.int32)
+
+        normals = compute_face_normals(vertices, faces)
+
+        self.assertEqual(normals.shape, (1, 3))
+        # Normal should be [0, 0, 1] (Z direction)
+        np.testing.assert_allclose(np.abs(normals[0]), [0, 0, 1], atol=1e-5)
+
+    def test_normals_are_unit_length(self):
+        """Face normals should be unit length."""
+        np.random.seed(42)
+        vertices = np.random.randn(10, 3).astype(np.float32)
+        faces = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8]], dtype=np.int32)
+
+        normals = compute_face_normals(vertices, faces)
+        lengths = np.linalg.norm(normals, axis=1)
+
+        np.testing.assert_allclose(lengths, 1.0, atol=1e-5)
+
+
+class TestSamplePointsFromMesh(tf.test.TestCase):
+    """Tests for mesh-to-point-cloud sampling."""
+
+    @pytest.fixture(autouse=True)
+    def setup(self):
+        np.random.seed(42)
+        self.vertices = np.array([
+            [0, 0, 0],
+            [1, 0, 0],
+            [0, 1, 0],
+            [0, 0, 1],
+        ], dtype=np.float32)
+        self.faces = np.array([[0, 1, 2], [0, 1, 3]], dtype=np.int32)
+        self.mesh = MeshData(
+            vertices=self.vertices,
+            faces=self.faces,
+            face_normals=compute_face_normals(self.vertices, self.faces)
+        )
+
+    def test_correct_num_points(self):
+        """Should return exactly num_points points."""
+        points, normals = sample_points_from_mesh(self.mesh, num_points=100)
+        self.assertEqual(points.shape[0], 100)
+        self.assertEqual(points.shape[1], 3)
+
+    def test_points_within_mesh_bounds(self):
+        """Sampled points should be within vertex bounding box."""
+        points, _ = sample_points_from_mesh(self.mesh, num_points=1000)
+
+        min_bound = self.vertices.min(axis=0)
+        max_bound = self.vertices.max(axis=0)
+
+        for dim in range(3):
+            self.assertAllGreaterEqual(points[:, dim], min_bound[dim] - 1e-5)
+            self.assertAllLessEqual(points[:, dim], max_bound[dim] + 1e-5)
+
+    def test_normals_unit_length(self):
+        """Returned normals should be unit length."""
+        points, normals = sample_points_from_mesh(
+            self.mesh, num_points=100, compute_normals=True
+        )
+
+        self.assertIsNotNone(normals)
+        lengths = np.linalg.norm(normals, axis=1)
+        np.testing.assert_allclose(lengths, 1.0, atol=1e-5)
+
+    def test_no_normals_when_disabled(self):
+        """Should return None normals when compute_normals=False."""
+        mesh_no_normals = MeshData(
+            vertices=self.vertices,
+            faces=self.faces
+        )
+        points, normals = sample_points_from_mesh(
+            mesh_no_normals, num_points=100, compute_normals=False
+        )
+        self.assertIsNone(normals)
+
+    def test_vertex_sampling_when_no_faces(self):
+        """Without faces, should sample directly from vertices."""
+        mesh_no_faces = MeshData(vertices=self.vertices)
+        points, _ = sample_points_from_mesh(
+            mesh_no_faces, num_points=10, compute_normals=False
+        )
+        self.assertEqual(points.shape, (10, 3))
+
+    def test_points_dtype_float32(self):
+        """Sampled points should be float32."""
+        points, _ = sample_points_from_mesh(self.mesh, num_points=50)
+        self.assertEqual(points.dtype, np.float32)
+
+
+class TestPartitionPointCloud(tf.test.TestCase):
+    """Tests for point cloud spatial partitioning."""
+
+    def test_single_block(self):
+        """Points within one block_size should create one block."""
+        points = np.random.uniform(0, 0.5, (200, 3)).astype(np.float32)
+
+        blocks = partition_point_cloud(points, block_size=1.0, min_points=10)
+
+        self.assertEqual(len(blocks), 1)
+        self.assertEqual(blocks[0]['points'].shape[1], 3)
+
+    def test_multiple_blocks(self):
+        """Spread-out points should create multiple blocks."""
+        # Create two clusters far apart
+        cluster1 = np.random.uniform(0, 0.5, (100, 3)).astype(np.float32)
+        cluster2 = np.random.uniform(5, 5.5, (100, 3)).astype(np.float32)
+        points = np.vstack([cluster1, cluster2])
+
+        blocks = partition_point_cloud(points, block_size=1.0, min_points=10)
+
+        self.assertGreaterEqual(len(blocks), 2)
+
+    def test_min_points_filter(self):
+        """Blocks with fewer than min_points should be excluded."""
+        # Create a big cluster and a tiny cluster
+        big_cluster = np.random.uniform(0, 0.5, (200, 3)).astype(np.float32)
+        tiny_cluster = np.random.uniform(10, 10.1, (5, 3)).astype(np.float32)
+        points = np.vstack([big_cluster, tiny_cluster])
+
+        blocks = partition_point_cloud(points, block_size=1.0, min_points=10)
+
+        # Tiny cluster should be filtered out
+        total_points = sum(len(b['points']) for b in blocks)
+        self.assertEqual(total_points, 200)
+
+    def test_normals_partitioned(self):
+        """Normals should be partitioned along with points."""
+        np.random.seed(42)
+        points = np.random.uniform(0, 0.5, (200, 3)).astype(np.float32)
+        normals = np.random.randn(200, 3).astype(np.float32)
+
+        blocks = partition_point_cloud(
+            points, normals=normals, block_size=1.0, min_points=10
+        )
+
+        for block in blocks:
+            self.assertIn('normals', block)
+            self.assertEqual(block['normals'].shape[0], block['points'].shape[0])
+            self.assertEqual(block['normals'].shape[1], 3)
+
+    def test_all_points_accounted_for(self):
+        """All points in valid blocks should appear exactly once."""
+        np.random.seed(42)
+        points = np.random.uniform(0, 3, (500, 3)).astype(np.float32)
+
+        blocks = partition_point_cloud(points, block_size=1.0, min_points=1)
+
+        total = sum(len(b['points']) for b in blocks)
+        self.assertEqual(total, 500)
+
+
+class TestSavePly(tf.test.TestCase):
+    """Tests for PLY file writing."""
+
+    @pytest.fixture(autouse=True)
+    def inject_tmp_path(self, tmp_path):
+        self.tmp_path = tmp_path
+
+    def test_write_points_only(self):
+        """Should write valid PLY with points only."""
+        points = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32)
+        ply_path = str(self.tmp_path / "test.ply")
+
+        save_ply(ply_path, points)
+
+        content = Path(ply_path).read_text()
+        self.assertIn("ply", content)
+        self.assertIn("element vertex 2", content)
+        self.assertNotIn("property float nx", content)
+
+    def test_write_points_with_normals(self):
+        """Should write valid PLY with points and normals."""
+        points = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32)
+        normals = np.array([[0, 0, 1], [0, 1, 0]], dtype=np.float32)
+        ply_path = str(self.tmp_path / "test_normals.ply")
+
+        save_ply(ply_path, points, normals)
+
+        content = Path(ply_path).read_text()
+        self.assertIn("property float nx", content)
+        self.assertIn("property float ny", content)
+        self.assertIn("property float nz", content)
+
+    def test_roundtrip_off_to_ply(self):
+        """OFF → sample → PLY should produce valid output."""
+        # Write OFF
+        off_path = str(self.tmp_path / "mesh.off")
+        with open(off_path, 'w') as f:
+            f.write("OFF\n4 2 0\n")
+            f.write("0 0 0\n1 0 0\n0 1 0\n0 0 1\n")
+            f.write("3 0 1 2\n3 0 1 3\n")
+
+        # Read and sample
+        mesh = read_off(off_path)
+        self.assertIsNotNone(mesh)
+        points, normals = sample_points_from_mesh(mesh, num_points=50)
+
+        # Write PLY
+        ply_path = str(self.tmp_path / "output.ply")
+        save_ply(ply_path, points, normals)
+
+        # Verify file exists and has content
+        content = Path(ply_path).read_text()
+        lines = content.strip().split('\n')
+        # Header + 50 data lines
+        header_end = lines.index('end_header')
+        data_lines = lines[header_end + 1:]
+        self.assertEqual(len(data_lines), 50)
+
+
+if __name__ == '__main__':
+    tf.test.main()
diff --git a/tests/test_entropy_correctness.py b/tests/test_entropy_correctness.py
new file mode 100755
index 000000000..9c9b22ce5
--- /dev/null
+++ b/tests/test_entropy_correctness.py
@@ -0,0 +1,357 @@
+"""
+Tests for entropy model mathematical correctness.
+
+Validates that discretized Gaussian likelihood is a proper probability mass
+function (PMF), rate estimates are non-negative, and quantization behavior
+switches correctly between training and inference.
+"""
+
+import sys
+from pathlib import Path
+
+import numpy as np
+import pytest
+import tensorflow as tf
+
+sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))
+
+from constants import LOG_2_RECIPROCAL
+from entropy_model import (
+    ConditionalGaussian,
+    EntropyModel,
+    MeanScaleHyperprior,
+    PatchedGaussianConditional,
+    _discretized_gaussian_likelihood,
+)
+
+
+class TestDiscretizedLikelihood(tf.test.TestCase):
+    """Tests for the discretized Gaussian likelihood function."""
+
+    def test_pmf_positive(self):
+        """All PMF values must be strictly positive (floored at EPSILON)."""
+        tf.random.set_seed(42)
+        inputs = tf.random.normal((2, 8, 8, 8, 16))
+        mean = tf.zeros_like(inputs)
+        scale = tf.ones_like(inputs)
+
+        likelihood = _discretized_gaussian_likelihood(inputs, mean, scale)
+
+        self.assertAllGreater(likelihood, 0.0)
+
+    def test_pmf_at_most_one(self):
+        """No single bin should have probability > 1."""
+        inputs = tf.constant([0.0, 1.0, -1.0, 5.0, -5.0])
+        mean = tf.zeros_like(inputs)
+        scale = tf.ones_like(inputs)
+
+        likelihood = _discretized_gaussian_likelihood(inputs, mean, scale)
+
+        self.assertAllLessEqual(likelihood, 1.0)
+
+    def test_pmf_sums_approximately_to_one(self):
+        """PMF over a wide range of integers should sum close to 1."""
+        # Evaluate PMF over integers from -50 to +50 for various scales
+        for scale_val in [0.1, 0.5, 1.0, 2.0, 5.0]:
+            integers = tf.cast(tf.range(-50, 51), tf.float32)
+            mean = tf.zeros_like(integers)
+            scale = tf.fill(integers.shape, scale_val)
+
+            likelihood = _discretized_gaussian_likelihood(integers, mean, scale)
+            total = tf.reduce_sum(likelihood)
+
+            # Should be very close to 1.0 (small scale needs wider range)
+            self.assertAllClose(total, 1.0, atol=1e-3,
+                                msg=f"PMF sum={total:.6f} for scale={scale_val}")
+
+    def test_pmf_peaks_at_mean(self):
+        """PMF should be highest at the integer closest to mean."""
+        mean_val = 2.3
+        integers = tf.cast(tf.range(-10, 11), tf.float32)
+        mean = tf.fill(integers.shape, mean_val)
+        scale = tf.ones_like(integers)
+
+        likelihood = _discretized_gaussian_likelihood(integers, mean, scale)
+
+        # The peak should be at integer 2 (closest to 2.3)
+        peak_idx = tf.argmax(likelihood)
+        # integers[12] = 2 (index 0 -> -10, so index 12 -> 2)
+        self.assertEqual(int(integers[peak_idx]), 2)
+
+    def test_pmf_symmetric_around_integer_mean(self):
+        """PMF should be symmetric when mean is an integer."""
+        mean_val = 0.0
+        integers = tf.cast(tf.range(-10, 11), tf.float32)
+        mean = tf.fill(integers.shape, mean_val)
+        scale = tf.ones_like(integers)
+
+        likelihood = _discretized_gaussian_likelihood(integers, mean, scale)
+        likelihood_np = likelihood.numpy()
+
+        # Check symmetry: P(k) == P(-k) for integer mean
+        for k in range(1, 11):
+            idx_pos = 10 + k  # index of +k
+            idx_neg = 10 - k  # index of -k
+            np.testing.assert_allclose(
+                likelihood_np[idx_pos], likelihood_np[idx_neg], rtol=1e-5,
+                err_msg=f"Asymmetry at k={k}"
+            )
+
+    def test_small_scale_concentrates_mass(self):
+        """Very small scale should concentrate mass near mean."""
+        integers = tf.cast(tf.range(-10, 11), tf.float32)
+        mean = tf.zeros_like(integers)
+        scale = tf.fill(integers.shape, 0.01)
+
+        likelihood = _discretized_gaussian_likelihood(integers, mean, scale)
+
+        # Almost all mass at 0
+        self.assertGreater(float(likelihood[10]), 0.99)  # index 10 = integer 0
+
+    def test_large_scale_spreads_mass(self):
+        """Large scale should spread mass more evenly."""
+        integers = tf.cast(tf.range(-10, 11), tf.float32)
+        mean = tf.zeros_like(integers)
+        scale = tf.fill(integers.shape, 10.0)
+
+        likelihood = _discretized_gaussian_likelihood(integers, mean, scale)
+
+        # Mass at 0 should be much less than for small scale
+        self.assertLess(float(likelihood[10]), 0.1)
+
+    def test_scale_clipped_to_minimum(self):
+        """Scale values near zero should not produce NaN."""
+        inputs = tf.constant([0.0, 1.0, -1.0])
+        mean = tf.zeros_like(inputs)
+        scale = tf.constant([1e-10, 0.0, -1.0])  # degenerate scales
+
+        likelihood = _discretized_gaussian_likelihood(inputs, mean, scale)
+
+        self.assertAllGreater(likelihood, 0.0)
+        self.assertFalse(tf.reduce_any(tf.math.is_nan(likelihood)))
+
+
+class TestRateComputation(tf.test.TestCase):
+    """Tests that rate (bits) from likelihood is non-negative and sensible."""
+
+    def test_rate_non_negative(self):
+        """Bits from discretized likelihood must be non-negative."""
+        tf.random.set_seed(42)
+        inputs = tf.random.normal((1, 8, 8, 8, 16))
+        mean = tf.zeros_like(inputs)
+        scale = tf.ones_like(inputs)
+
+        likelihood = _discretized_gaussian_likelihood(inputs, mean, scale)
+        bits = -tf.math.log(likelihood) * LOG_2_RECIPROCAL
+
+        self.assertAllGreaterEqual(bits, 0.0)
+
+    def test_rate_increases_with_surprise(self):
+        """Unlikely values should require more bits than likely values."""
+        mean = tf.constant([0.0])
+        scale = tf.constant([1.0])
+
+        # Value at mean vs far from mean
+        likely = tf.constant([0.0])
+        unlikely = tf.constant([10.0])
+
+        ll_likely = _discretized_gaussian_likelihood(likely, mean, scale)
+        ll_unlikely = _discretized_gaussian_likelihood(unlikely, mean, scale)
+
+        bits_likely = float(-tf.math.log(ll_likely) * LOG_2_RECIPROCAL)
+        bits_unlikely = float(-tf.math.log(ll_unlikely) * LOG_2_RECIPROCAL)
+
+        # Unlikely values should cost more bits
+        self.assertGreater(bits_unlikely, bits_likely)
+
+    def test_total_bits_from_entropy_model(self):
+        """EntropyModel should produce non-negative total bits."""
+        tf.random.set_seed(42)
+        model = EntropyModel()
+        inputs = tf.random.normal((1, 8, 8, 8, 16))
+
+        compressed, likelihood = model(inputs, training=False)
+
+        total_bits = tf.reduce_sum(-tf.math.log(likelihood) * LOG_2_RECIPROCAL)
+        self.assertGreater(float(total_bits), 0.0)
+
+    def test_low_entropy_vs_high_entropy(self):
+        """Small scale (concentrated) should have fewer bits than large scale."""
+        integers = tf.cast(tf.range(-20, 21), tf.float32)
+        mean = tf.zeros_like(integers)
+
+        # Small scale: concentrated distribution -> low entropy
+        scale_small = tf.fill(integers.shape, 0.5)
+        ll_small = _discretized_gaussian_likelihood(integers, mean, scale_small)
+        entropy_small = float(tf.reduce_sum(-ll_small * tf.math.log(ll_small)))
+
+        # Large scale: spread distribution -> high entropy
+        scale_large = tf.fill(integers.shape, 5.0)
+        ll_large = _discretized_gaussian_likelihood(integers, mean, scale_large)
+        entropy_large = float(tf.reduce_sum(-ll_large * tf.math.log(ll_large)))
+
+        self.assertGreater(entropy_large, entropy_small)
+
+
+class TestQuantizationBehavior(tf.test.TestCase):
+    """Tests that quantization switches correctly between training/inference."""
+
+    @pytest.fixture(autouse=True)
+    def setup(self):
+        tf.random.set_seed(42)
+        self.inputs = tf.constant([[[[[1.3, -0.7, 2.5]]]]])  # (1,1,1,1,3)
+
+    def test_conditional_gaussian_training_adds_noise(self):
+        """Training mode should add uniform noise, not round."""
+        cg = ConditionalGaussian()
+        scale = tf.ones_like(self.inputs)
+        mean = tf.zeros_like(self.inputs)
+
+        # Run multiple times to confirm stochasticity
+        outputs = set()
+        for _ in range(10):
+            out, _ = cg(self.inputs, scale, mean, training=True)
+            outputs.add(tuple(out.numpy().flatten()))
+
+        # Should produce different outputs each time
+        self.assertGreater(len(outputs), 1)
+
+    def test_conditional_gaussian_inference_rounds(self):
+        """Inference mode should produce deterministic rounded output."""
+        cg = ConditionalGaussian()
+        scale = tf.ones_like(self.inputs)
+        mean = tf.zeros_like(self.inputs)
+
+        out1, _ = cg(self.inputs, scale, mean, training=False)
+        out2, _ = cg(self.inputs, scale, mean, training=False)
+
+        self.assertAllEqual(out1, out2)
+
+        # Should be rounded (input - mean rounded + mean = rounded input for mean=0)
+        expected = tf.round(self.inputs)
+        self.assertAllClose(out1, expected)
+
+    def test_conditional_gaussian_likelihood_always_positive(self):
+        """Likelihood should be positive in both training and inference."""
+        cg = ConditionalGaussian()
+        scale = tf.ones_like(self.inputs)
+        mean = tf.zeros_like(self.inputs)
+
+        _, ll_train = cg(self.inputs, scale, mean, training=True)
+        _, ll_eval = cg(self.inputs, scale, mean, training=False)
+
+        self.assertAllGreater(ll_train, 0.0)
+        self.assertAllGreater(ll_eval, 0.0)
+
+
+class TestPatchedGaussianConditional(tf.test.TestCase):
+    """Tests for PatchedGaussianConditional layer."""
+
+    def test_compress_decompress_roundtrip(self):
+        """compress → decompress should be identity for integer inputs."""
+        pgc = PatchedGaussianConditional()
+        # Build the layer
+        inputs = tf.constant([[[[[1.0, 2.0, 3.0]]]]])
+        pgc.build(inputs.shape)
+
+        compressed = pgc.compress(inputs)
+        decompressed = pgc.decompress(compressed)
+
+        self.assertAllClose(decompressed, inputs, atol=1e-5)
+
+    def test_scale_quantization_binary_search(self):
+        """Binary search should map scales to nearest table entry."""
+        scale_table = tf.constant([0.1, 0.5, 1.0, 2.0, 5.0])
+        pgc = PatchedGaussianConditional(scale_table=scale_table)
+
+        test_scales = tf.constant([0.05, 0.3, 0.8, 1.5, 3.0, 10.0])
+        quantized = pgc.quantize_scale(test_scales)
+
+        # Each should map to nearest table entry
+        expected = tf.constant([0.1, 0.1, 1.0, 1.0, 2.0, 5.0])
+        self.assertAllClose(quantized, expected)
+
+    def test_scale_quantization_preserves_shape(self):
+        """Quantized scales should have same shape as input."""
+        scale_table = tf.constant([0.1, 0.5, 1.0, 2.0, 5.0])
+        pgc = PatchedGaussianConditional(scale_table=scale_table)
+
+        test_scales = tf.random.uniform((2, 4, 4, 4, 8), 0.1, 5.0)
+        quantized = pgc.quantize_scale(test_scales)
+
+        self.assertEqual(quantized.shape, test_scales.shape)
+
+    def test_likelihood_matches_standalone(self):
+        """Layer likelihood should match standalone function."""
+        pgc = PatchedGaussianConditional()
+        inputs = tf.constant([[[[[0.0, 1.0, -1.0]]]]])
+        pgc.build(inputs.shape)
+
+        layer_ll = pgc.likelihood(inputs)
+
+        # Compare with standalone function
+        scale = tf.maximum(tf.abs(pgc.scale), 1e-6)
+        standalone_ll = _discretized_gaussian_likelihood(inputs, pgc.mean, scale)
+
+        self.assertAllClose(layer_ll, standalone_ll)
+
+
+class TestMeanScaleHyperprior(tf.test.TestCase):
+    """Tests for MeanScaleHyperprior entropy model."""
+
+    @pytest.fixture(autouse=True)
+    def setup(self):
+        tf.random.set_seed(42)
+        self.latent_channels = 32
+        self.hyper_channels = 16
+
+    def test_total_bits_non_negative(self):
+        """Total bits from hyperprior should be non-negative."""
+        model = MeanScaleHyperprior(
+            latent_channels=self.latent_channels,
+            hyper_channels=self.hyper_channels
+        )
+
+        y = tf.random.normal((1, 4, 4, 4, self.latent_channels))
+        z_hat = tf.random.normal((1, 4, 4, 4, self.hyper_channels))
+        z = tf.random.normal((1, 4, 4, 4, self.hyper_channels))
+
+        y_hat, y_likelihood, total_bits = model(y, z_hat, z=z, training=False)
+
+        self.assertGreater(float(total_bits), 0.0)
+        self.assertAllGreater(y_likelihood, 0.0)
+
+    def test_output_shape_matches_input(self):
+        """y_hat should have same shape as y."""
+        model = MeanScaleHyperprior(
+            latent_channels=self.latent_channels,
+            hyper_channels=self.hyper_channels
+        )
+
+        y = tf.random.normal((1, 4, 4, 4, self.latent_channels))
+        z_hat = tf.random.normal((1, 4, 4, 4, self.hyper_channels))
+
+        y_hat, y_likelihood, _ = model(y, z_hat, training=False)
+
+        self.assertEqual(y_hat.shape, y.shape)
+        self.assertEqual(y_likelihood.shape, y.shape)
+
+    def test_compress_decompress_consistency(self):
+        """compress then decompress should recover y_hat."""
+        model = MeanScaleHyperprior(
+            latent_channels=self.latent_channels,
+            hyper_channels=self.hyper_channels
+        )
+
+        y = tf.random.normal((1, 4, 4, 4, self.latent_channels))
+        z_hat = tf.random.normal((1, 4, 4, 4, self.hyper_channels))
+
+        symbols, side_info = model.compress(y, z_hat)
+        y_hat = model.decompress(symbols, z_hat)
+
+        # Symbols + mean should give y_hat
+        self.assertEqual(y_hat.shape, y.shape)
+
+
+if __name__ == '__main__':
+    tf.test.main()
diff --git a/tests/test_evaluation_pipeline.py b/tests/test_evaluation_pipeline.py
index dc254a2f6..1608406ae 100644
--- a/tests/test_evaluation_pipeline.py
+++ b/tests/test_evaluation_pipeline.py
@@ -102,5 +102,63 @@ def test_generate_report(self, pipeline, tmp_path):
         assert 'aggregate_metrics' in report_data
         assert len(report_data['model_performance']) == len(results)
 
+    def test_load_model_no_checkpoint_configured(self, config_path):
+        """Pipeline initializes when config has no checkpoint_path."""
+        pipeline = EvaluationPipeline(config_path)
+        assert pipeline.model is not None
+        assert pipeline.config.get('checkpoint_path') is None
+
+    def test_load_model_empty_string_checkpoint(self, tmp_path):
+        """Empty string checkpoint_path is treated as no checkpoint."""
+        config = {
+            'data': {
+                'modelnet40_path': str(tmp_path / 'modelnet40'),
+                'ivfb_path': str(tmp_path / '8ivfb')
+            },
+            'model': {
+                'filters': 32,
+                'activation': 'cenic_gdn',
+                'conv_type': 'separable'
+            },
+            'evaluation': {
+                'metrics': ['psnr'],
+                'output_dir': str(tmp_path / 'results'),
+                'visualize': True
+            },
+            'checkpoint_path': ''
+        }
+        config_file = tmp_path / 'config_empty_ckpt.yml'
+        with open(config_file, 'w') as f:
+            yaml.dump(config, f)
+
+        pipeline = EvaluationPipeline(str(config_file))
+        assert pipeline.model is not None
+
+    def test_load_model_missing_checkpoint_raises(self, tmp_path):
+        """Non-existent checkpoint_path raises FileNotFoundError."""
+        config = {
+            'data': {
+                'modelnet40_path': str(tmp_path / 'modelnet40'),
+                'ivfb_path': str(tmp_path / '8ivfb')
+            },
+            'model': {
+                'filters': 32,
+                'activation': 'cenic_gdn',
+                'conv_type': 'separable'
+            },
+            'evaluation': {
+                'metrics': ['psnr'],
+                'output_dir': str(tmp_path / 'results'),
+                'visualize': True
+            },
+            'checkpoint_path': str(tmp_path / 'nonexistent' / 'model.weights.h5')
+        }
+        config_file = tmp_path / 'config_missing_ckpt.yml'
+        with open(config_file, 'w') as f:
+            yaml.dump(config, f)
+
+        with pytest.raises(FileNotFoundError, match="Checkpoint not found"):
+            EvaluationPipeline(str(config_file))
+
 if __name__ == '__main__':
     tf.test.main()
diff --git a/tests/test_integration.py b/tests/test_integration.py
index 8714810ab..2f3b02e9a 100644
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@@ -322,5 +322,39 @@ def test_v2_gaussian_backward_compatible(self):
         self.assertEqual(x_hat.shape[:-1], input_tensor.shape[:-1])
 
 
+class TestCheckpointResumeIntegration(tf.test.TestCase):
+    """Integration test for checkpoint save/load through new serialization format."""
+
+    @pytest.fixture(autouse=True)
+    def setup(self, tmp_path):
+        self.test_env = setup_test_environment(tmp_path)
+        self.resolution = 16
+        self.batch_size = 1
+
+    @pytest.mark.integration
+    def test_training_checkpoint_resume_loss_continuity(self):
+        """Model state is preserved through checkpoint save/load cycle."""
+        pipeline = TrainingPipeline(self.test_env['config_path'])
+        batch = create_mock_voxel_grid(self.resolution, self.batch_size)[..., 0]
+
+        # Train a few steps to establish non-trivial model + optimizer state
+        for _ in range(3):
+            pipeline._train_step(batch, training=True)
+
+        pipeline.save_checkpoint('resume_test')
+
+        # Record eval loss at checkpoint
+        checkpoint_loss = pipeline._train_step(batch, training=False)['total_loss']
+
+        # Load into fresh pipeline and verify same eval loss
+        new_pipeline = TrainingPipeline(pipeline.config_path)
+        new_pipeline._train_step(batch, training=True)  # Build optimizer variables
+        new_pipeline.load_checkpoint('resume_test')
+
+        resumed_loss = new_pipeline._train_step(batch, training=False)['total_loss']
+
+        self.assertAllClose(checkpoint_loss, resumed_loss, rtol=1e-4)
+
+
 if __name__ == '__main__':
     tf.test.main()
diff --git a/tests/test_model_transforms.py b/tests/test_model_transforms.py
old mode 100644
new mode 100755
index 03c8c2bf5..6540b7569
--- a/tests/test_model_transforms.py
+++ b/tests/test_model_transforms.py
@@ -8,7 +8,7 @@
 sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))
 
 from model_transforms import (
-    CENICGDN,
+    GDN,
     AnalysisTransform,
     DeepCompressModel,
     DeepCompressModelV2,
@@ -33,10 +33,9 @@ def setup(self):
         self.resolution = 64
         self.input_shape = (self.batch_size, self.resolution, self.resolution, self.resolution, 1)
 
-    def test_cenic_gdn(self):
-        channels = 64
-        activation = CENICGDN(channels)
-        input_tensor = tf.random.uniform((2, 32, 32, 32, channels))
+    def test_gdn(self):
+        activation = GDN(inverse=False)
+        input_tensor = tf.random.uniform((2, 32, 32, 32, 64))
         output = activation(input_tensor)
         self.assertEqual(output.shape, input_tensor.shape)
 
@@ -49,6 +48,13 @@ def test_cenic_gdn(self):
         # Check that gradients are non-zero
         self.assertGreater(tf.reduce_sum(tf.abs(gradients[0])), 0)
 
+    def test_igdn(self):
+        """IGDN (inverse GDN) used in synthesis path."""
+        activation = GDN(inverse=True)
+        input_tensor = tf.random.uniform((2, 8, 8, 8, 64))
+        output = activation(input_tensor)
+        self.assertEqual(output.shape, input_tensor.shape)
+
     def test_spatial_separable_conv(self):
         conv = SpatialSeparableConv(filters=64, kernel_size=(3, 3, 3), strides=(1, 1, 1))
         input_tensor = tf.random.uniform((2, 32, 32, 32, 32))
@@ -64,17 +70,21 @@ def test_analysis_transform(self):
         output = analysis(input_tensor)
         self.assertEqual(len(output.shape), 5)  # 5D tensor (B, D, H, W, C)
         self.assertGreater(output.shape[-1], input_tensor.shape[-1])
-        # Check that CENICGDN layers are present in the conv_layers list
-        has_gdn = any(isinstance(layer, CENICGDN) for layer in analysis.conv_layers)
+        # Check that GDN layers are present in the conv_layers list
+        has_gdn = any(isinstance(layer, GDN) for layer in analysis.conv_layers)
         self.assertTrue(has_gdn)
 
     def test_synthesis_transform(self):
         synthesis = SynthesisTransform(self.config)
-        input_tensor = tf.random.uniform((2, 32, 32, 32, 256))  # Match analysis output channels
+        # Use small spatial dims since Conv3DTranspose upsamples with strides=(2,2,2):
+        # 4 -> 8 -> 16 -> 32
+        input_tensor = tf.random.uniform((1, 4, 4, 4, 256))
         output = synthesis(input_tensor)
         # Synthesis reduces channels progressively
         self.assertEqual(len(output.shape), 5)  # 5D tensor
         self.assertLessEqual(output.shape[-1], input_tensor.shape[-1])
+        # Conv3DTranspose upsamples spatial dims
+        self.assertGreaterEqual(output.shape[1], input_tensor.shape[1])
 
     def test_deep_compress_model(self):
         # Use strides=(1,1,1) to avoid spatial dimension changes
@@ -87,16 +97,19 @@ def test_deep_compress_model(self):
         )
         model = DeepCompressModel(config_no_stride)
         input_tensor = create_mock_voxel_grid(16, 1)  # Smaller for faster test
-        # Model returns (x_hat, y, y_hat, z) tuple
+        # Model returns (x_hat, y, z_hat, z_noisy) tuple
         output = model(input_tensor, training=True)
         self.assertIsInstance(output, tuple)
         self.assertEqual(len(output), 4)
-        x_hat, y, y_hat, z = output
+        x_hat, y, z_hat, z_noisy = output
         # Check that output tensors have correct shapes
         self.assertEqual(x_hat.shape[:-1], input_tensor.shape[:-1])
         self.assertEqual(len(y.shape), 5)
-        self.assertEqual(len(y_hat.shape), 5)
-        self.assertEqual(len(z.shape), 5)
+        self.assertEqual(len(z_hat.shape), 5)
+        self.assertEqual(len(z_noisy.shape), 5)
+        # x_hat should be sigmoid-activated (values in [0, 1])
+        self.assertAllGreaterEqual(x_hat, 0.0)
+        self.assertAllLessEqual(x_hat, 1.0)
 
     def test_gradient_flow(self):
         model = DeepCompressModel(self.config)
diff --git a/tests/test_mp_report.py b/tests/test_mp_report.py
old mode 100644
new mode 100755
index a7e50bfc3..cf62a4904
--- a/tests/test_mp_report.py
+++ b/tests/test_mp_report.py
@@ -113,8 +113,8 @@ def test_best_performance_selection(setup_experiment):
     assert best_performance['best_bd_rate'] == 'original_3.ply'
     # The best bitrate should be from "original_2.ply" (lowest bitrate is best)
     assert best_performance['best_bitrate'] == 'original_2.ply'
-    # The best compression ratio should be from "original_1.ply" (lowest is best: 0.75)
-    assert best_performance['best_compression_ratio'] == 'original_1.ply'
+    # The best compression ratio should be from "original_3.ply" (highest is best: 0.85)
+    assert best_performance['best_compression_ratio'] == 'original_3.ply'
     # The best compression time should be from "original_3.ply" (shorter is better: 2.0)
     assert best_performance['best_compression_time'] == 'original_3.ply'
     # The best decompression time should be from "original_1.ply" (shorter is better: 1.0)
diff --git a/tests/test_numerical.py b/tests/test_numerical.py
new file mode 100755
index 000000000..d2072cf04
--- /dev/null
+++ b/tests/test_numerical.py
@@ -0,0 +1,333 @@
+"""
+Tests for numerical stability of GDN, entropy models, and data pipeline.
+
+Validates that GDN/IGDN handle edge cases (zero, large, negative inputs),
+entropy models remain stable with extreme parameters, and the data pipeline
+produces valid outputs.
+"""
+
+import sys
+from pathlib import Path
+
+import numpy as np
+import pytest
+import tensorflow as tf
+
+sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))
+
+from constants import EPSILON
+from entropy_model import (
+    ConditionalGaussian,
+    PatchedGaussianConditional,
+    _discretized_gaussian_likelihood,
+)
+from model_transforms import GDN
+
+
+class TestGDNStability(tf.test.TestCase):
+    """Tests for GDN numerical stability."""
+
+    @pytest.fixture(autouse=True)
+    def setup(self):
+        tf.random.set_seed(42)
+        self.channels = 8
+        self.shape = (1, 4, 4, 4, self.channels)
+
+    def test_zero_input(self):
+        """GDN should handle zero input without NaN."""
+        gdn = GDN(inverse=False)
+        inputs = tf.zeros(self.shape)
+        output = gdn(inputs)
+
+        self.assertFalse(tf.reduce_any(tf.math.is_nan(output)))
+        self.assertFalse(tf.reduce_any(tf.math.is_inf(output)))
+        # Zero divided by sqrt(beta) should be zero
+        self.assertAllClose(output, tf.zeros_like(output))
+
+    def test_zero_input_igdn(self):
+        """IGDN should handle zero input without NaN."""
+        igdn = GDN(inverse=True)
+        inputs = tf.zeros(self.shape)
+        output = igdn(inputs)
+
+        self.assertFalse(tf.reduce_any(tf.math.is_nan(output)))
+        self.assertFalse(tf.reduce_any(tf.math.is_inf(output)))
+        self.assertAllClose(output, tf.zeros_like(output))
+
+    def test_large_input(self):
+        """GDN should handle large inputs without overflow."""
+        gdn = GDN(inverse=False)
+        inputs = tf.constant(1000.0, shape=self.shape)
+        output = gdn(inputs)
+
+        self.assertFalse(tf.reduce_any(tf.math.is_nan(output)))
+        self.assertFalse(tf.reduce_any(tf.math.is_inf(output)))
+
+    def test_large_input_igdn(self):
+        """IGDN should handle large inputs without overflow."""
+        igdn = GDN(inverse=True)
+        inputs = tf.constant(100.0, shape=self.shape)
+        output = igdn(inputs)
+
+        self.assertFalse(tf.reduce_any(tf.math.is_nan(output)))
+        self.assertFalse(tf.reduce_any(tf.math.is_inf(output)))
+
+    def test_negative_input(self):
+        """GDN should handle negative inputs."""
+        gdn = GDN(inverse=False)
+        inputs = tf.constant(-5.0, shape=self.shape)
+        output = gdn(inputs)
+
+        self.assertFalse(tf.reduce_any(tf.math.is_nan(output)))
+        self.assertFalse(tf.reduce_any(tf.math.is_inf(output)))
+
+    def test_igdn_gdn_approximate_inverse(self):
+        """IGDN(GDN(x)) should approximately recover x for moderate inputs."""
+        tf.random.set_seed(42)
+        gdn = GDN(inverse=False)
+        igdn = GDN(inverse=True)
+
+        # Use moderate values to stay in stable range
+        inputs = tf.random.normal(self.shape) * 2.0
+
+        # Forward through GDN then IGDN
+        encoded = gdn(inputs)
+        decoded = igdn(encoded)
+
+        # GDN and IGDN are not exact inverses with independently initialized
+        # parameters, but with default params (beta=1, gamma=0.1*I) they
+        # should be reasonably close for moderate inputs
+        # We just check no NaN/Inf and shape preservation
+        self.assertFalse(tf.reduce_any(tf.math.is_nan(decoded)))
+        self.assertEqual(decoded.shape, inputs.shape)
+
+    def test_gdn_output_bounded(self):
+        """GDN should reduce magnitude (divisive normalization)."""
+        gdn = GDN(inverse=False)
+        inputs = tf.constant(5.0, shape=self.shape)
+        output = gdn(inputs)
+
+        # GDN divides by sqrt(beta + ...) >= sqrt(1) = 1
+        # So output magnitude should be <= input magnitude
+        max_output = float(tf.reduce_max(tf.abs(output)))
+        max_input = float(tf.reduce_max(tf.abs(inputs)))
+        self.assertLessEqual(max_output, max_input + 1e-5)
+
+    def test_gdn_gradient_no_nan(self):
+        """Gradients through GDN should not contain NaN."""
+        gdn = GDN(inverse=False)
+        inputs = tf.random.normal(self.shape)
+
+        with tf.GradientTape() as tape:
+            tape.watch(inputs)
+            output = gdn(inputs)
+            loss = tf.reduce_mean(output)
+
+        grad = tape.gradient(loss, inputs)
+        self.assertFalse(tf.reduce_any(tf.math.is_nan(grad)))
+
+    def test_igdn_gradient_no_nan(self):
+        """Gradients through IGDN should not contain NaN."""
+        igdn = GDN(inverse=True)
+        inputs = tf.random.normal(self.shape)
+
+        with tf.GradientTape() as tape:
+            tape.watch(inputs)
+            output = igdn(inputs)
+            loss = tf.reduce_mean(output)
+
+        grad = tape.gradient(loss, inputs)
+        self.assertFalse(tf.reduce_any(tf.math.is_nan(grad)))
+
+    def test_gamma_symmetry(self):
+        """Gamma matrix used in GDN should be symmetric after call."""
+        gdn = GDN(inverse=False)
+        inputs = tf.random.normal(self.shape)
+        _ = gdn(inputs)
+
+        # The effective gamma inside call() is (relu(gamma) + relu(gamma)^T) / 2
+        gamma = tf.nn.relu(gdn.gamma)
+        gamma_sym = (gamma + tf.transpose(gamma)) / 2.0
+        self.assertAllClose(gamma_sym, tf.transpose(gamma_sym))
+
+
+class TestEntropyStability(tf.test.TestCase):
+    """Tests for entropy model numerical stability."""
+
+    def test_very_small_scale(self):
+        """Very small scale should not produce NaN likelihood."""
+        inputs = tf.constant([0.0, 1.0, -1.0])
+        mean = tf.zeros_like(inputs)
+        scale = tf.constant([1e-8, 1e-8, 1e-8])
+
+        ll = _discretized_gaussian_likelihood(inputs, mean, scale)
+
+        self.assertFalse(tf.reduce_any(tf.math.is_nan(ll)))
+        self.assertAllGreater(ll, 0.0)
+
+    def test_very_large_scale(self):
+        """Very large scale should not produce NaN likelihood."""
+        inputs = tf.constant([0.0, 100.0, -100.0])
+        mean = tf.zeros_like(inputs)
+        scale = tf.constant([1e6, 1e6, 1e6])
+
+        ll = _discretized_gaussian_likelihood(inputs, mean, scale)
+
+        self.assertFalse(tf.reduce_any(tf.math.is_nan(ll)))
+        self.assertAllGreater(ll, 0.0)
+
+    def test_very_large_input(self):
+        """Very large inputs should produce small but non-NaN likelihood."""
+        inputs = tf.constant([1000.0, -1000.0])
+        mean = tf.zeros_like(inputs)
+        scale = tf.ones_like(inputs)
+
+        ll = _discretized_gaussian_likelihood(inputs, mean, scale)
+
+        self.assertFalse(tf.reduce_any(tf.math.is_nan(ll)))
+        # Should be floored at EPSILON
+        self.assertAllGreaterEqual(ll, float(EPSILON))
+
+    def test_bits_no_nan_extreme_values(self):
+        """Bits computation should not produce NaN even for extreme values."""
+        from constants import LOG_2_RECIPROCAL
+
+        inputs = tf.constant([0.0, 50.0, -50.0, 1000.0])
+        mean = tf.zeros_like(inputs)
+        scale = tf.ones_like(inputs)
+
+        ll = _discretized_gaussian_likelihood(inputs, mean, scale)
+        bits = -tf.math.log(ll) * LOG_2_RECIPROCAL
+
+        self.assertFalse(tf.reduce_any(tf.math.is_nan(bits)))
+        self.assertAllGreaterEqual(bits, 0.0)
+
+    def test_conditional_gaussian_extreme_scale(self):
+        """ConditionalGaussian should be stable with extreme scales."""
+        cg = ConditionalGaussian()
+        inputs = tf.constant([[[[[1.0, 2.0]]]]])
+        mean = tf.zeros_like(inputs)
+
+        for scale_val in [1e-6, 1e-3, 1.0, 1e3, 1e6]:
+            scale = tf.fill(inputs.shape, scale_val)
+            out, ll = cg(inputs, scale, mean, training=False)
+
+            self.assertFalse(tf.reduce_any(tf.math.is_nan(out)),
+                             msg=f"NaN output at scale={scale_val}")
+            self.assertFalse(tf.reduce_any(tf.math.is_nan(ll)),
+                             msg=f"NaN likelihood at scale={scale_val}")
+            self.assertAllGreater(ll, 0.0)
+
+    def test_patched_gaussian_negative_scale(self):
+        """PatchedGaussianConditional should handle negative learned scale."""
+        pgc = PatchedGaussianConditional()
+        inputs = tf.constant([[[[[1.0, 2.0, 3.0]]]]])
+        pgc.build(inputs.shape)
+
+        # Force negative scale
+        pgc.scale.assign(-tf.ones_like(pgc.scale))
+
+        ll = pgc.likelihood(inputs)
+        self.assertFalse(tf.reduce_any(tf.math.is_nan(ll)))
+        self.assertAllGreater(ll, 0.0)
+
+    def test_gradient_through_likelihood(self):
+        """Gradients through discretized likelihood should not be NaN."""
+        inputs = tf.Variable(tf.constant([0.0, 1.0, -1.0, 5.0]))
+        mean = tf.constant([0.0, 0.0, 0.0, 0.0])
+        scale = tf.constant([1.0, 1.0, 1.0, 1.0])
+
+        with tf.GradientTape() as tape:
+            ll = _discretized_gaussian_likelihood(inputs, mean, scale)
+            loss = -tf.reduce_sum(tf.math.log(ll))
+
+        grad = tape.gradient(loss, inputs)
+        self.assertFalse(tf.reduce_any(tf.math.is_nan(grad)))
+
+
+class TestConstantsCorrectness(tf.test.TestCase):
+    """Tests that pre-computed constants are correct."""
+
+    def test_log2_value(self):
+        """LOG_2 should equal ln(2)."""
+        from constants import LOG_2
+        self.assertAllClose(LOG_2, tf.constant(np.log(2.0), dtype=tf.float32))
+
+    def test_log2_reciprocal_value(self):
+        """LOG_2_RECIPROCAL should equal 1/ln(2)."""
+        from constants import LOG_2_RECIPROCAL
+        expected = tf.constant(1.0 / np.log(2.0), dtype=tf.float32)
+        self.assertAllClose(LOG_2_RECIPROCAL, expected)
+
+    def test_log2_reciprocal_identity(self):
+        """LOG_2 * LOG_2_RECIPROCAL should equal 1."""
+        from constants import LOG_2, LOG_2_RECIPROCAL
+        product = LOG_2 * LOG_2_RECIPROCAL
+        self.assertAllClose(product, 1.0, atol=1e-6)
+
+    def test_epsilon_positive(self):
+        """EPSILON should be a small positive value."""
+        self.assertGreater(float(EPSILON), 0.0)
+        self.assertLess(float(EPSILON), 1e-6)
+
+    def test_scale_bounds(self):
+        """SCALE_MIN < SCALE_MAX."""
+        from constants import SCALE_MAX, SCALE_MIN
+        self.assertLess(float(SCALE_MIN), float(SCALE_MAX))
+
+    def test_f16_constants_match(self):
+        """Float16 constants should match float32 values within f16 precision."""
+        from constants import LOG_2, LOG_2_F16, LOG_2_RECIPROCAL, LOG_2_RECIPROCAL_F16
+        self.assertAllClose(
+            tf.cast(LOG_2, tf.float16), LOG_2_F16, atol=1e-3
+        )
+        self.assertAllClose(
+            tf.cast(LOG_2_RECIPROCAL, tf.float16), LOG_2_RECIPROCAL_F16, atol=1e-3
+        )
+
+
+class TestScaleQuantizationNumerics(tf.test.TestCase):
+    """Tests for binary search scale quantization edge cases."""
+
+    @pytest.fixture(autouse=True)
+    def setup(self):
+        self.scale_table = tf.constant([0.1, 0.5, 1.0, 2.0, 5.0, 10.0])
+        self.pgc = PatchedGaussianConditional(scale_table=self.scale_table)
+
+    def test_exact_table_values_preserved(self):
+        """Input values exactly matching table entries should be preserved."""
+        test_scales = tf.constant([0.1, 0.5, 1.0, 2.0, 5.0, 10.0])
+        quantized = self.pgc.quantize_scale(test_scales)
+        self.assertAllClose(quantized, test_scales)
+
+    def test_negative_scales_made_positive(self):
+        """Negative scales should be mapped to positive table entries."""
+        test_scales = tf.constant([-0.5, -1.0, -2.0])
+        quantized = self.pgc.quantize_scale(test_scales)
+
+        self.assertAllGreater(quantized, 0.0)
+
+    def test_below_minimum_clipped(self):
+        """Scales below table minimum should be clipped to minimum."""
+        test_scales = tf.constant([0.001, 0.01, 0.05])
+        quantized = self.pgc.quantize_scale(test_scales)
+
+        self.assertAllGreaterEqual(quantized, 0.1)
+
+    def test_above_maximum_clipped(self):
+        """Scales above table maximum should be clipped to maximum."""
+        test_scales = tf.constant([20.0, 100.0, 1000.0])
+        quantized = self.pgc.quantize_scale(test_scales)
+
+        self.assertAllLessEqual(quantized, 10.0)
+
+    def test_zero_scale(self):
+        """Zero scale should not cause errors."""
+        test_scales = tf.constant([0.0])
+        quantized = self.pgc.quantize_scale(test_scales)
+
+        self.assertFalse(tf.reduce_any(tf.math.is_nan(quantized)))
+
+
+if __name__ == '__main__':
+    tf.test.main()
diff --git a/tests/test_parallel_process.py b/tests/test_parallel_process.py
old mode 100644
new mode 100755
index c57274da4..7abbd01a7
--- a/tests/test_parallel_process.py
+++ b/tests/test_parallel_process.py
@@ -66,18 +66,34 @@ def test_popen_timeout(self, mock_popen):
                 process.wait()
 
     @patch("subprocess.Popen")
-    def test_popen_cleanup(self, mock_popen):
-        """Test proper cleanup of Popen resources."""
+    def test_popen_cleanup_running(self, mock_popen):
+        """Test cleanup terminates a still-running process."""
         mock_process = MagicMock()
+        mock_process.poll.return_value = None  # Process still running
         mock_process.stdout = MagicMock()
         mock_process.stderr = MagicMock()
         mock_popen.return_value = mock_process
 
-        cmd = ["echo", "test"]
+        cmd = ["sleep", "10"]
         with Popen(cmd) as _:
-            pass  # Context manager should handle cleanup
+            pass  # Context manager should terminate
 
         mock_process.terminate.assert_called_once()
+
+    @patch("subprocess.Popen")
+    def test_popen_cleanup_finished(self, mock_popen):
+        """Test cleanup skips terminate for already-finished process."""
+        mock_process = MagicMock()
+        mock_process.poll.return_value = 0  # Process already exited
+        mock_process.stdout = MagicMock()
+        mock_process.stderr = MagicMock()
+        mock_popen.return_value = mock_process
+
+        cmd = ["echo", "test"]
+        with Popen(cmd) as _:
+            pass  # Context manager should just close handles
+
+        mock_process.terminate.assert_not_called()
         mock_process.stdout.close.assert_called_once()
         mock_process.stderr.close.assert_called_once()
 
diff --git a/tests/test_roundtrip.py b/tests/test_roundtrip.py
new file mode 100755
index 000000000..7a1aba66a
--- /dev/null
+++ b/tests/test_roundtrip.py
@@ -0,0 +1,326 @@
+"""
+Tests for end-to-end compress/decompress roundtrip consistency.
+
+Validates that DeepCompressModel and DeepCompressModelV2 produce correct
+output shapes, bounded values, and deterministic inference across entropy
+model configurations.
+"""
+
+import sys
+from pathlib import Path
+
+import pytest
+import tensorflow as tf
+
+sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))
+
+from model_transforms import DeepCompressModel, DeepCompressModelV2, TransformConfig
+from test_utils import create_mock_voxel_grid
+
+# Standard small config for all roundtrip tests
+_SMALL_CONFIG = TransformConfig(
+    filters=32,
+    kernel_size=(3, 3, 3),
+    strides=(1, 1, 1),
+    activation='relu',
+    conv_type='standard'
+)
+
+_RESOLUTION = 16
+_BATCH_SIZE = 1
+
+
+class TestDeepCompressModelV1Roundtrip(tf.test.TestCase):
+    """Roundtrip tests for V1 model."""
+
+    @pytest.fixture(autouse=True)
+    def setup(self):
+        tf.random.set_seed(42)
+        self.model = DeepCompressModel(_SMALL_CONFIG)
+        self.input_tensor = create_mock_voxel_grid(_RESOLUTION, _BATCH_SIZE)
+
+    def test_output_shape_matches_input(self):
+        """x_hat should have same shape as input."""
+        x_hat, y, z_hat, z_noisy = self.model(self.input_tensor, training=False)
+        self.assertEqual(x_hat.shape, self.input_tensor.shape)
+
+    def test_output_bounded_zero_one(self):
+        """x_hat should be in [0, 1] (sigmoid output)."""
+        x_hat, _, _, _ = self.model(self.input_tensor, training=False)
+        self.assertAllGreaterEqual(x_hat, 0.0)
+        self.assertAllLessEqual(x_hat, 1.0)
+
+    def test_inference_deterministic(self):
+        """Inference should be deterministic (tf.round, no noise)."""
+        out1 = self.model(self.input_tensor, training=False)
+        out2 = self.model(self.input_tensor, training=False)
+
+        self.assertAllClose(out1[0], out2[0])  # x_hat
+        self.assertAllClose(out1[1], out2[1])  # y
+        self.assertAllClose(out1[2], out2[2])  # z_hat
+        self.assertAllClose(out1[3], out2[3])  # z_noisy (rounded)
+
+    def test_training_stochastic(self):
+        """Training should be stochastic (uniform noise)."""
+        out1 = self.model(self.input_tensor, training=True)
+        out2 = self.model(self.input_tensor, training=True)
+
+        # y_hat values should differ due to noise (check z_noisy)
+        diff = tf.reduce_sum(tf.abs(out1[3] - out2[3]))
+        self.assertGreater(float(diff), 0.0)
+
+    def test_latent_has_channels(self):
+        """Latent y should have more channels than input."""
+        _, y, _, _ = self.model(self.input_tensor, training=False)
+        self.assertGreater(y.shape[-1], self.input_tensor.shape[-1])
+
+    def test_returns_four_values(self):
+        """V1 model should return exactly 4 values."""
+        outputs = self.model(self.input_tensor, training=False)
+        self.assertEqual(len(outputs), 4)
+
+
+class TestDeepCompressModelV2Gaussian(tf.test.TestCase):
+    """Roundtrip tests for V2 model with gaussian entropy model."""
+
+    @pytest.fixture(autouse=True)
+    def setup(self):
+        tf.random.set_seed(42)
+        self.input_tensor = create_mock_voxel_grid(_RESOLUTION, _BATCH_SIZE)
+        self.model = DeepCompressModelV2(
+            _SMALL_CONFIG, entropy_model='gaussian',
+            num_channel_groups=4, num_attention_layers=1
+        )
+
+    def test_output_shape(self):
+        """x_hat should match input shape."""
+        x_hat, y, y_hat, z, rate_info = self.model(self.input_tensor, training=False)
+        self.assertEqual(x_hat.shape, self.input_tensor.shape)
+
+    def test_output_bounded(self):
+        """x_hat should be in [0, 1]."""
+        x_hat, _, _, _, _ = self.model(self.input_tensor, training=False)
+        self.assertAllGreaterEqual(x_hat, 0.0)
+        self.assertAllLessEqual(x_hat, 1.0)
+
+    def test_returns_five_values(self):
+        """V2 model should return exactly 5 values."""
+        outputs = self.model(self.input_tensor, training=False)
+        self.assertEqual(len(outputs), 5)
+
+    def test_rate_info_keys(self):
+        """rate_info should contain required keys."""
+        _, _, _, _, rate_info = self.model(self.input_tensor, training=False)
+        for key in ['likelihood', 'total_bits', 'y_bits', 'z_bits', 'bpp']:
+            self.assertIn(key, rate_info, msg=f"Missing key: {key}")
+
+    def test_total_bits_positive(self):
+        """Total bits should be positive."""
+        _, _, _, _, rate_info = self.model(self.input_tensor, training=False)
+        self.assertGreater(float(rate_info['total_bits']), 0.0)
+
+    def test_bpp_positive(self):
+        """Bits per point should be positive."""
+        _, _, _, _, rate_info = self.model(self.input_tensor, training=False)
+        self.assertGreater(float(rate_info['bpp']), 0.0)
+
+    def test_inference_deterministic(self):
+        """Inference should be deterministic."""
+        out1 = self.model(self.input_tensor, training=False)
+        out2 = self.model(self.input_tensor, training=False)
+        self.assertAllClose(out1[0], out2[0])
+
+
+class TestDeepCompressModelV2Hyperprior(tf.test.TestCase):
+    """Roundtrip tests for V2 model with hyperprior entropy model."""
+
+    @pytest.fixture(autouse=True)
+    def setup(self):
+        tf.random.set_seed(42)
+        self.input_tensor = create_mock_voxel_grid(_RESOLUTION, _BATCH_SIZE)
+        self.model = DeepCompressModelV2(
+            _SMALL_CONFIG, entropy_model='hyperprior',
+            num_channel_groups=4, num_attention_layers=1
+        )
+
+    def test_output_shape(self):
+        """x_hat should match input shape."""
+        x_hat, y, y_hat, z, rate_info = self.model(self.input_tensor, training=False)
+        self.assertEqual(x_hat.shape, self.input_tensor.shape)
+
+    def test_output_bounded(self):
+        """x_hat should be in [0, 1]."""
+        x_hat, _, _, _, _ = self.model(self.input_tensor, training=False)
+        self.assertAllGreaterEqual(x_hat, 0.0)
+        self.assertAllLessEqual(x_hat, 1.0)
+
+    def test_returns_five_values(self):
+        """V2 model should return exactly 5 values."""
+        outputs = self.model(self.input_tensor, training=False)
+        self.assertEqual(len(outputs), 5)
+
+    def test_rate_info_keys(self):
+        """rate_info should contain required keys."""
+        _, _, _, _, rate_info = self.model(self.input_tensor, training=False)
+        for key in ['likelihood', 'total_bits', 'y_bits', 'z_bits', 'bpp']:
+            self.assertIn(key, rate_info, msg=f"Missing key: {key}")
+
+    def test_total_bits_positive(self):
+        """Total bits should be positive."""
+        _, _, _, _, rate_info = self.model(self.input_tensor, training=False)
+        self.assertGreater(float(rate_info['total_bits']), 0.0)
+
+    def test_inference_deterministic(self):
+        """Inference should be deterministic."""
+        out1 = self.model(self.input_tensor, training=False)
+        out2 = self.model(self.input_tensor, training=False)
+        self.assertAllClose(out1[0], out2[0])
+
+
+class TestV2CompressDecompressGaussian(tf.test.TestCase):
+    """Tests for V2 compress/decompress path with gaussian model."""
+
+    @pytest.fixture(autouse=True)
+    def setup(self):
+        tf.random.set_seed(42)
+        self.input_tensor = create_mock_voxel_grid(_RESOLUTION, _BATCH_SIZE)
+        self.model = DeepCompressModelV2(_SMALL_CONFIG, entropy_model='gaussian')
+        _ = self.model(self.input_tensor, training=False)  # build
+
+    def test_compress_returns_dict(self):
+        """compress() should return a dict with 'y', 'z', 'side_info'."""
+        compressed = self.model.compress(self.input_tensor)
+        self.assertIn('y', compressed)
+        self.assertIn('z', compressed)
+        self.assertIn('side_info', compressed)
+
+    def test_decompress_shape(self):
+        """decompress() output should match input shape."""
+        compressed = self.model.compress(self.input_tensor)
+        x_hat = self.model.decompress(compressed)
+        self.assertEqual(x_hat.shape, self.input_tensor.shape)
+
+    def test_decompress_bounded(self):
+        """Decompressed output should be in [0, 1]."""
+        compressed = self.model.compress(self.input_tensor)
+        x_hat = self.model.decompress(compressed)
+        self.assertAllGreaterEqual(x_hat, 0.0)
+        self.assertAllLessEqual(x_hat, 1.0)
+
+    def test_compress_decompress_deterministic(self):
+        """Compress + decompress should be deterministic."""
+        compressed1 = self.model.compress(self.input_tensor)
+        x_hat1 = self.model.decompress(compressed1)
+        compressed2 = self.model.compress(self.input_tensor)
+        x_hat2 = self.model.decompress(compressed2)
+        self.assertAllClose(x_hat1, x_hat2)
+
+
+class TestV2CompressDecompressHyperprior(tf.test.TestCase):
+    """Tests for V2 compress/decompress path with hyperprior model."""
+
+    @pytest.fixture(autouse=True)
+    def setup(self):
+        tf.random.set_seed(42)
+        self.input_tensor = create_mock_voxel_grid(_RESOLUTION, _BATCH_SIZE)
+        self.model = DeepCompressModelV2(_SMALL_CONFIG, entropy_model='hyperprior')
+        _ = self.model(self.input_tensor, training=False)  # build
+
+    def test_compress_returns_dict(self):
+        """compress() should return a dict with 'y', 'z', 'side_info'."""
+        compressed = self.model.compress(self.input_tensor)
+        self.assertIn('y', compressed)
+        self.assertIn('z', compressed)
+        self.assertIn('side_info', compressed)
+
+    def test_decompress_shape(self):
+        """decompress() output should match input shape."""
+        compressed = self.model.compress(self.input_tensor)
+        x_hat = self.model.decompress(compressed)
+        self.assertEqual(x_hat.shape, self.input_tensor.shape)
+
+    def test_decompress_bounded(self):
+        """Decompressed output should be in [0, 1]."""
+        compressed = self.model.compress(self.input_tensor)
+        x_hat = self.model.decompress(compressed)
+        self.assertAllGreaterEqual(x_hat, 0.0)
+        self.assertAllLessEqual(x_hat, 1.0)
+
+    def test_compress_decompress_deterministic(self):
+        """Compress + decompress should be deterministic."""
+        compressed1 = self.model.compress(self.input_tensor)
+        x_hat1 = self.model.decompress(compressed1)
+        compressed2 = self.model.compress(self.input_tensor)
+        x_hat2 = self.model.decompress(compressed2)
+        self.assertAllClose(x_hat1, x_hat2)
+
+
+class TestGradientFlow(tf.test.TestCase):
+    """Tests that gradients flow through the model during training."""
+
+    @pytest.fixture(autouse=True)
+    def setup(self):
+        tf.random.set_seed(42)
+        self.input_tensor = create_mock_voxel_grid(_RESOLUTION, _BATCH_SIZE)
+
+    def test_v1_gradients_flow(self):
+        """V1 model should produce non-zero gradients."""
+        model = DeepCompressModel(_SMALL_CONFIG)
+
+        with tf.GradientTape() as tape:
+            x_hat, y, z_hat, z_noisy = model(self.input_tensor, training=True)
+            loss = tf.reduce_mean(tf.square(self.input_tensor - x_hat))
+
+        grads = tape.gradient(loss, model.trainable_variables)
+        non_none = [g for g in grads if g is not None]
+
+        self.assertNotEmpty(non_none, "No gradients computed")
+        total_grad_norm = sum(float(tf.reduce_sum(tf.abs(g))) for g in non_none)
+        self.assertGreater(total_grad_norm, 0.0)
+
+    def test_v2_gaussian_gradients_flow(self):
+        """V2 gaussian model should produce non-zero gradients."""
+        model = DeepCompressModelV2(_SMALL_CONFIG, entropy_model='gaussian')
+
+        with tf.GradientTape() as tape:
+            x_hat, y, y_hat, z, rate_info = model(self.input_tensor, training=True)
+            loss = tf.reduce_mean(tf.square(self.input_tensor - x_hat))
+
+        grads = tape.gradient(loss, model.trainable_variables)
+        non_none = [g for g in grads if g is not None]
+
+        self.assertNotEmpty(non_none, "No gradients computed")
+
+    def test_v2_hyperprior_gradients_flow(self):
+        """V2 hyperprior model should produce non-zero gradients."""
+        model = DeepCompressModelV2(_SMALL_CONFIG, entropy_model='hyperprior')
+
+        with tf.GradientTape() as tape:
+            x_hat, y, y_hat, z, rate_info = model(self.input_tensor, training=True)
+            distortion = tf.reduce_mean(tf.square(self.input_tensor - x_hat))
+            rate = rate_info['total_bits']
+            loss = distortion + 0.01 * rate
+
+        grads = tape.gradient(loss, model.trainable_variables)
+        non_none = [g for g in grads if g is not None]
+
+        self.assertNotEmpty(non_none, "No gradients computed")
+
+
+class TestInvalidEntropyModel(tf.test.TestCase):
+    """Tests for invalid entropy model selection."""
+
+    def test_invalid_entropy_model_raises(self):
+        """Invalid entropy model string should raise ValueError."""
+        with self.assertRaises(ValueError):
+            DeepCompressModelV2(_SMALL_CONFIG, entropy_model='invalid')
+
+    def test_valid_entropy_models_accepted(self):
+        """All valid entropy model strings should be accepted."""
+        for name in DeepCompressModelV2.ENTROPY_MODELS:
+            model = DeepCompressModelV2(_SMALL_CONFIG, entropy_model=name)
+            self.assertEqual(model.entropy_model_type, name)
+
+
+if __name__ == '__main__':
+    tf.test.main()
diff --git a/tests/test_training_pipeline.py b/tests/test_training_pipeline.py
index 4a5b4290a..afeb14007 100644
--- a/tests/test_training_pipeline.py
+++ b/tests/test_training_pipeline.py
@@ -1,6 +1,7 @@
 import sys
 from pathlib import Path
 
+import numpy as np
 import pytest
 import tensorflow as tf
 import yaml
@@ -99,6 +100,11 @@ def test_save_load_checkpoint(self, pipeline, tmp_path):
         checkpoint_dir = Path(pipeline.checkpoint_dir) / checkpoint_name
         assert (checkpoint_dir / 'model.weights.h5').exists()
         assert (checkpoint_dir / 'entropy.weights.h5').exists()
+        # Optimizer variables saved as individual .npy files in subdirectories
+        for opt_name in pipeline.optimizers:
+            opt_dir = checkpoint_dir / f'{opt_name}_optimizer'
+            if pipeline.optimizers[opt_name].variables:
+                assert opt_dir.exists()
 
         new_pipeline = TrainingPipeline(pipeline.config_path)
         # Build the new model before loading weights
@@ -129,3 +135,189 @@ def create_sample_batch():
         checkpoint_dir = Path(pipeline.checkpoint_dir)
         assert len(list(checkpoint_dir.glob('epoch_*'))) > 0
         assert (checkpoint_dir / 'best_model').exists()
+
+    # --- Security / path validation tests ---
+
+    def test_load_checkpoint_rejects_path_traversal(self, pipeline):
+        """Path traversal via ../ is rejected."""
+        with pytest.raises(ValueError, match="escapes"):
+            pipeline.load_checkpoint('../../etc/passwd')
+
+    def test_load_checkpoint_rejects_absolute_path(self, pipeline):
+        """Absolute path outside checkpoint dir is rejected."""
+        with pytest.raises(ValueError, match="escapes"):
+            pipeline.load_checkpoint('/tmp/evil_checkpoint')
+
+    def test_load_checkpoint_prefix_collision(self, pipeline, tmp_path):
+        """Sibling directory with prefix-matching name is rejected."""
+        # checkpoint_dir is tmp_path / 'checkpoints'
+        # Create a sibling with a name that is a prefix match
+        evil_dir = tmp_path / 'checkpoints_evil'
+        evil_dir.mkdir()
+
+        # '../checkpoints_evil' resolves outside checkpoint_dir but
+        # starts with the same string prefix — must still be rejected
+        with pytest.raises(ValueError, match="escapes"):
+            pipeline.load_checkpoint('../checkpoints_evil')
+
+    # --- NaN / degenerate value tests ---
+
+    def test_checkpoint_nan_in_optimizer_variable(self, pipeline):
+        """NaN in optimizer variables is preserved through save/load."""
+        dummy = tf.zeros((1, 16, 16, 16, 1))
+        pipeline.model(dummy, training=False)
+        y = pipeline.model.analysis(dummy)
+        pipeline.entropy_model(y, training=False)
+
+        # Train to populate momentum/variance variables
+        batch = tf.zeros((1, 16, 16, 16))
+        pipeline._train_step(batch, training=True)
+
+        opt = pipeline.optimizers['reconstruction']
+        # Find a float variable (skip int64 iteration counter)
+        float_vars = [(i, v) for i, v in enumerate(opt.variables)
+                      if v.dtype == tf.float32]
+        assert len(float_vars) > 0
+        idx, target_var = float_vars[0]
+
+        nan_value = np.full_like(target_var.numpy(), float('nan'))
+        target_var.assign(nan_value)
+
+        pipeline.save_checkpoint('nan_test')
+
+        # Load into fresh pipeline
+        new_pipeline = TrainingPipeline(pipeline.config_path)
+        new_pipeline.model(dummy, training=False)
+        y2 = new_pipeline.model.analysis(dummy)
+        new_pipeline.entropy_model(y2, training=False)
+        new_pipeline._train_step(batch, training=True)
+        new_pipeline.load_checkpoint('nan_test')
+
+        loaded_var = new_pipeline.optimizers['reconstruction'].variables[idx]
+        assert np.all(np.isnan(loaded_var.numpy()))
+
+    # --- Zero / empty / boundary tests ---
+
+    def test_save_checkpoint_before_training(self, pipeline):
+        """Checkpoint saved before training loads without error."""
+        dummy = tf.zeros((1, 16, 16, 16, 1))
+        pipeline.model(dummy, training=False)
+        y = pipeline.model.analysis(dummy)
+        pipeline.entropy_model(y, training=False)
+
+        # No training step — optimizer has only internal state (iteration counter)
+        pipeline.save_checkpoint('untrained')
+
+        checkpoint_dir = Path(pipeline.checkpoint_dir) / 'untrained'
+        assert (checkpoint_dir / 'model.weights.h5').exists()
+        assert (checkpoint_dir / 'entropy.weights.h5').exists()
+
+        # Loading the untrained checkpoint should not crash
+        new_pipeline = TrainingPipeline(pipeline.config_path)
+        new_pipeline.model(dummy, training=False)
+        y2 = new_pipeline.model.analysis(dummy)
+        new_pipeline.entropy_model(y2, training=False)
+        new_pipeline.load_checkpoint('untrained')
+
+    # --- Negative / error path tests ---
+
+    def test_load_checkpoint_missing_weights_file(self, pipeline):
+        """Missing model weights file raises error on load."""
+        dummy = tf.zeros((1, 16, 16, 16, 1))
+        pipeline.model(dummy, training=False)
+        y = pipeline.model.analysis(dummy)
+        pipeline.entropy_model(y, training=False)
+
+        pipeline.save_checkpoint('incomplete')
+
+        # Delete the model weights file
+        weights_path = Path(pipeline.checkpoint_dir) / 'incomplete' / 'model.weights.h5'
+        weights_path.unlink()
+
+        new_pipeline = TrainingPipeline(pipeline.config_path)
+        new_pipeline.model(dummy, training=False)
+        y2 = new_pipeline.model.analysis(dummy)
+        new_pipeline.entropy_model(y2, training=False)
+
+        with pytest.raises(Exception):
+            new_pipeline.load_checkpoint('incomplete')
+
+    def test_checkpoint_partial_optimizer_files(self, pipeline):
+        """Missing optimizer .npy files are silently skipped."""
+        dummy = tf.zeros((1, 16, 16, 16, 1))
+        pipeline.model(dummy, training=False)
+        y = pipeline.model.analysis(dummy)
+        pipeline.entropy_model(y, training=False)
+
+        batch = tf.zeros((1, 16, 16, 16))
+        pipeline._train_step(batch, training=True)
+        pipeline.save_checkpoint('partial_test')
+
+        # Delete the last .npy file from an optimizer dir
+        opt_dir = Path(pipeline.checkpoint_dir) / 'partial_test' / 'reconstruction_optimizer'
+        if opt_dir.exists():
+            npy_files = sorted(opt_dir.glob('*.npy'))
+            if len(npy_files) > 1:
+                npy_files[-1].unlink()
+
+        # Loading should succeed — missing files silently skipped
+        new_pipeline = TrainingPipeline(pipeline.config_path)
+        new_pipeline.model(dummy, training=False)
+        y2 = new_pipeline.model.analysis(dummy)
+        new_pipeline.entropy_model(y2, training=False)
+        new_pipeline._train_step(batch, training=True)
+        new_pipeline.load_checkpoint('partial_test')
+
+    # --- Regression tests ---
+
+    def test_load_old_format_pickle_file_ignored(self, pipeline):
+        """Old-style pickle .npy file at checkpoint level is safely ignored."""
+        dummy = tf.zeros((1, 16, 16, 16, 1))
+        pipeline.model(dummy, training=False)
+        y = pipeline.model.analysis(dummy)
+        pipeline.entropy_model(y, training=False)
+
+        pipeline.save_checkpoint('format_test')
+
+        # Place an old-format pickle file alongside new-format directories
+        checkpoint_dir = Path(pipeline.checkpoint_dir) / 'format_test'
+        old_file = checkpoint_dir / 'stale_optimizer.npy'
+        np.save(str(old_file), np.array([np.zeros(5)], dtype=object),
+                allow_pickle=True)
+
+        # Loading should succeed, ignoring the old file
+        new_pipeline = TrainingPipeline(pipeline.config_path)
+        new_pipeline.model(dummy, training=False)
+        y2 = new_pipeline.model.analysis(dummy)
+        new_pipeline.entropy_model(y2, training=False)
+        new_pipeline.load_checkpoint('format_test')
+
+    # --- Integration test ---
+
+    def test_checkpoint_optimizer_state_values_survive_roundtrip(self, pipeline):
+        """Optimizer variable values are numerically equal after save/load."""
+        dummy = tf.zeros((1, 16, 16, 16, 1))
+        pipeline.model(dummy, training=False)
+        y = pipeline.model.analysis(dummy)
+        pipeline.entropy_model(y, training=False)
+
+        batch = tf.zeros((1, 16, 16, 16))
+        for _ in range(3):
+            pipeline._train_step(batch, training=True)
+
+        opt = pipeline.optimizers['reconstruction']
+        original_values = [v.numpy().copy() for v in opt.variables]
+
+        pipeline.save_checkpoint('opt_fidelity')
+
+        new_pipeline = TrainingPipeline(pipeline.config_path)
+        new_pipeline.model(dummy, training=False)
+        y2 = new_pipeline.model.analysis(dummy)
+        new_pipeline.entropy_model(y2, training=False)
+        new_pipeline._train_step(batch, training=True)
+        new_pipeline.load_checkpoint('opt_fidelity')
+
+        new_opt = new_pipeline.optimizers['reconstruction']
+        for orig, loaded in zip(original_values,
+                                [v.numpy() for v in new_opt.variables]):
+            np.testing.assert_array_equal(orig, loaded)