Implement many kernels from scratch

layerdiffusion
2024-08-06 18:20:34 -07:00
parent 4c8331b806
commit b57573c8da
15 changed files with 209 additions and 100 deletions

@@ -7,23 +7,29 @@ from backend import stream
 stash = {}


-def weights_manual_cast(layer, x):
+def weights_manual_cast(layer, x, skip_dtype=False):
     weight, bias, signal = None, None, None
     non_blocking = True

     if getattr(x.device, 'type', None) == 'mps':
         non_blocking = False

+    target_dtype = x.dtype
+    target_device = x.device
+
+    if skip_dtype:
+        target_dtype = None
+
     if stream.using_stream:
         with stream.stream_context()(stream.mover_stream):
             if layer.bias is not None:
-                bias = layer.bias.to(device=x.device, dtype=x.dtype, non_blocking=non_blocking)
-            weight = layer.weight.to(device=x.device, dtype=x.dtype, non_blocking=non_blocking)
+                bias = layer.bias.to(device=target_device, dtype=target_dtype, non_blocking=non_blocking)
+            weight = layer.weight.to(device=target_device, dtype=target_dtype, non_blocking=non_blocking)
             signal = stream.mover_stream.record_event()
     else:
         if layer.bias is not None:
-            bias = layer.bias.to(device=x.device, dtype=x.dtype, non_blocking=non_blocking)
-        weight = layer.weight.to(device=x.device, dtype=x.dtype, non_blocking=non_blocking)
+            bias = layer.bias.to(device=target_device, dtype=target_dtype, non_blocking=non_blocking)
+        weight = layer.weight.to(device=target_device, dtype=target_dtype, non_blocking=non_blocking)

     return weight, bias, signal
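Note (not part of the diff): the new skip_dtype flag moves parameters to the input's device while leaving their dtype untouched, since passing dtype=None to Tensor.to keeps the current dtype. Judging by the Embedding change later in this diff, which calls weights_manual_cast(self, x, skip_dtype=True), this is presumably for layers whose input is an integer index tensor, where x.dtype would be the wrong target for the weights. A minimal sketch of the effective behavior (the helper name is illustrative, not from this file):

import torch

def cast_params_like_input(layer, x, skip_dtype=False):
    # Device always follows the input; dtype only when skip_dtype is False.
    target_device = x.device
    target_dtype = None if skip_dtype else x.dtype  # dtype=None means "keep current dtype" in Tensor.to()
    bias = getattr(layer, 'bias', None)
    if bias is not None:
        bias = bias.to(device=target_device, dtype=target_dtype)
    weight = layer.weight.to(device=target_device, dtype=target_dtype)
    return weight, bias

# An Embedding fed int64 indices keeps its float32 weights:
emb = torch.nn.Embedding(10, 4)
idx = torch.tensor([1, 2, 3])
w, _ = cast_params_like_input(emb, idx, skip_dtype=True)
assert w.dtype == emb.weight.dtype  # float32, not idx.dtype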
@@ -60,9 +66,19 @@ def cleanup_cache():
     return


+current_device = None
+current_dtype = None
+current_manual_cast_enabled = False
+
+
 class ForgeOperations:
     class Linear(torch.nn.Linear):
-        parameters_manual_cast = False
+        def __init__(self, *args, **kwargs):
+            kwargs['device'] = current_device
+            kwargs['dtype'] = current_dtype
+            super().__init__(*args, **kwargs)
+            self.parameters_manual_cast = current_manual_cast_enabled

         def reset_parameters(self):
             return None
@@ -76,7 +92,12 @@ class ForgeOperations:
                 return super().forward(x)

     class Conv2d(torch.nn.Conv2d):
-        parameters_manual_cast = False
+        def __init__(self, *args, **kwargs):
+            kwargs['device'] = current_device
+            kwargs['dtype'] = current_dtype
+            super().__init__(*args, **kwargs)
+            self.parameters_manual_cast = current_manual_cast_enabled

         def reset_parameters(self):
             return None
@@ -90,7 +111,12 @@ class ForgeOperations:
                 return super().forward(x)

     class Conv3d(torch.nn.Conv3d):
-        parameters_manual_cast = False
+        def __init__(self, *args, **kwargs):
+            kwargs['device'] = current_device
+            kwargs['dtype'] = current_dtype
+            super().__init__(*args, **kwargs)
+            self.parameters_manual_cast = current_manual_cast_enabled

         def reset_parameters(self):
             return None
@@ -103,8 +129,98 @@ class ForgeOperations:
             else:
                 return super().forward(x)

+    class Conv1d(torch.nn.Conv1d):
+        def __init__(self, *args, **kwargs):
+            kwargs['device'] = current_device
+            kwargs['dtype'] = current_dtype
+            super().__init__(*args, **kwargs)
+            self.parameters_manual_cast = current_manual_cast_enabled
+
+        def reset_parameters(self):
+            return None
+
+        def forward(self, x):
+            if self.parameters_manual_cast:
+                weight, bias, signal = weights_manual_cast(self, x)
+                with main_stream_worker(weight, bias, signal):
+                    return self._conv_forward(x, weight, bias)
+            else:
+                return super().forward(x)
+
+    class ConvTranspose2d(torch.nn.ConvTranspose2d):
+        def __init__(self, *args, **kwargs):
+            kwargs['device'] = current_device
+            kwargs['dtype'] = current_dtype
+            super().__init__(*args, **kwargs)
+            self.parameters_manual_cast = current_manual_cast_enabled
+
+        def reset_parameters(self):
+            return None
+
+        def forward(self, x, output_size=None):
+            if self.parameters_manual_cast:
+                num_spatial_dims = 2
+                output_padding = self._output_padding(x, output_size, self.stride, self.padding, self.kernel_size, num_spatial_dims, self.dilation)
+                weight, bias, signal = weights_manual_cast(self, x)
+                with main_stream_worker(weight, bias, signal):
+                    return torch.nn.functional.conv_transpose2d(x, weight, bias, self.stride, self.padding, output_padding, self.groups, self.dilation)
+            else:
+                return super().forward(x, output_size)
+
+    class ConvTranspose1d(torch.nn.ConvTranspose1d):
+        def __init__(self, *args, **kwargs):
+            kwargs['device'] = current_device
+            kwargs['dtype'] = current_dtype
+            super().__init__(*args, **kwargs)
+            self.parameters_manual_cast = current_manual_cast_enabled
+
+        def reset_parameters(self):
+            return None
+
+        def forward(self, x, output_size=None):
+            if self.parameters_manual_cast:
+                num_spatial_dims = 1
+                output_padding = self._output_padding(x, output_size, self.stride, self.padding, self.kernel_size, num_spatial_dims, self.dilation)
+                weight, bias, signal = weights_manual_cast(self, x)
+                with main_stream_worker(weight, bias, signal):
+                    return torch.nn.functional.conv_transpose1d(x, weight, bias, self.stride, self.padding, output_padding, self.groups, self.dilation)
+            else:
+                return super().forward(x, output_size)
+
+    class ConvTranspose3d(torch.nn.ConvTranspose3d):
+        def __init__(self, *args, **kwargs):
+            kwargs['device'] = current_device
+            kwargs['dtype'] = current_dtype
+            super().__init__(*args, **kwargs)
+            self.parameters_manual_cast = current_manual_cast_enabled
+
+        def reset_parameters(self):
+            return None
+
+        def forward(self, x, output_size=None):
+            if self.parameters_manual_cast:
+                num_spatial_dims = 3
+                output_padding = self._output_padding(x, output_size, self.stride, self.padding, self.kernel_size, num_spatial_dims, self.dilation)
+                weight, bias, signal = weights_manual_cast(self, x)
+                with main_stream_worker(weight, bias, signal):
+                    return torch.nn.functional.conv_transpose3d(x, weight, bias, self.stride, self.padding, output_padding, self.groups, self.dilation)
+            else:
+                return super().forward(x, output_size)
+
     class GroupNorm(torch.nn.GroupNorm):
-        parameters_manual_cast = False
+        def __init__(self, *args, **kwargs):
+            kwargs['device'] = current_device
+            kwargs['dtype'] = current_dtype
+            super().__init__(*args, **kwargs)
+            self.parameters_manual_cast = current_manual_cast_enabled

         def reset_parameters(self):
             return None
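Note (not part of the diff): all of the new kernels share the same forward path: when parameters_manual_cast is set, the weights are copied to the compute device on a separate mover stream, an event (the `signal`) is recorded, and main_stream_worker presumably makes the compute stream wait on that event before the functional call runs. A generic sketch of that pattern, independent of this file and assuming a CUDA build (only record_event/wait_event/Stream are real PyTorch APIs; the helper name is illustrative):

import torch

def copy_on_side_stream(tensor, device, dtype, side_stream):
    # Launch the copy on a side stream and record an event that the
    # compute stream can wait on before it actually uses the tensor.
    with torch.cuda.stream(side_stream):
        moved = tensor.to(device=device, dtype=dtype, non_blocking=True)
        event = side_stream.record_event()
    return moved, event

# Usage (requires a CUDA device):
# side = torch.cuda.Stream()
# w, ev = copy_on_side_stream(w_cpu.pin_memory(), 'cuda', torch.float16, side)
# torch.cuda.current_stream().wait_event(ev)  # compute waits only for this copy
# y = torch.nn.functional.linear(x, w)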
@@ -118,7 +234,12 @@ class ForgeOperations:
                 return super().forward(x)

     class LayerNorm(torch.nn.LayerNorm):
-        parameters_manual_cast = False
+        def __init__(self, *args, **kwargs):
+            kwargs['device'] = current_device
+            kwargs['dtype'] = current_dtype
+            super().__init__(*args, **kwargs)
+            self.parameters_manual_cast = current_manual_cast_enabled

         def reset_parameters(self):
             return None
@@ -131,34 +252,37 @@ class ForgeOperations:
             else:
                 return super().forward(x)

-
-class ForgeOperationsWithManualCast(ForgeOperations):
-    class Linear(ForgeOperations.Linear):
-        parameters_manual_cast = True
-
-    class Conv2d(ForgeOperations.Conv2d):
-        parameters_manual_cast = True
-
-    class Conv3d(ForgeOperations.Conv3d):
-        parameters_manual_cast = True
-
-    class GroupNorm(ForgeOperations.GroupNorm):
-        parameters_manual_cast = True
-
-    class LayerNorm(ForgeOperations.LayerNorm):
-        parameters_manual_cast = True
+    class Embedding(torch.nn.Embedding):
+        def __init__(self, *args, **kwargs):
+            kwargs['device'] = current_device
+            super().__init__(*args, **kwargs)
+            self.parameters_manual_cast = current_manual_cast_enabled
+            self.bias = None
+
+        def reset_parameters(self):
+            self.bias = None
+            return None
+
+        def forward(self, x):
+            if self.parameters_manual_cast:
+                weight, bias, signal = weights_manual_cast(self, x, skip_dtype=True)
+                with main_stream_worker(weight, bias, signal):
+                    return torch.nn.functional.embedding(x, weight, self.padding_idx, self.max_norm, self.norm_type, self.scale_grad_by_freq, self.sparse)
+            else:
+                return super().forward(x)


 @contextlib.contextmanager
-def using_forge_operations(parameters_manual_cast=False, operations=None):
+def using_forge_operations(operations=None, device=None, dtype=None, manual_cast_enabled=False):
+    global current_device, current_dtype, current_manual_cast_enabled
+    current_device, current_dtype, current_manual_cast_enabled = device, dtype, manual_cast_enabled
+
     if operations is None:
         operations = ForgeOperations

-    if parameters_manual_cast:
-        operations = ForgeOperationsWithManualCast
-
-    op_names = ['Linear', 'Conv2d', 'Conv3d', 'GroupNorm', 'LayerNorm']
+    op_names = ['Linear', 'Conv1d', 'Conv2d', 'Conv3d', 'ConvTranspose1d', 'ConvTranspose2d', 'ConvTranspose3d', 'GroupNorm', 'LayerNorm', 'Embedding']
     backups = {op_name: getattr(torch.nn, op_name) for op_name in op_names}

     try:
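Note (not part of the diff): the reworked context manager now takes the target device, dtype, and the manual-cast switch directly, stores them in the new module-level globals, and swaps the listed torch.nn classes for the Forge ones while the block is active; judging by the `backups` dict and the `try:` block, the originals are presumably restored afterwards. A hedged usage sketch, assuming the patched file is importable (the module path is illustrative):

import torch
from backend.operations import using_forge_operations  # illustrative import path

with using_forge_operations(device=torch.device('cpu'), dtype=torch.float16, manual_cast_enabled=True):
    # Inside the block torch.nn.Linear / Conv* / ConvTranspose* / GroupNorm /
    # LayerNorm / Embedding resolve to the Forge classes, so parameters are
    # created directly with the requested device and dtype and each module is
    # tagged for the streamed manual cast at forward time.
    model = torch.nn.Sequential(
        torch.nn.Conv2d(3, 8, kernel_size=3, padding=1),
        torch.nn.GroupNorm(4, 8),
    )

# next(model.parameters()).dtype is now torch.float16 (assuming the swap above);
# leaving the block presumably restores the original torch.nn classes from `backups`
# (the restore code is outside the hunks shown here).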