Implement many kernels from scratch

layerdiffusion
2024-08-06 18:20:34 -07:00
parent 4c8331b806
commit b57573c8da
15 changed files with 209 additions and 100 deletions

@@ -7,23 +7,29 @@ from backend import stream
 stash = {}


-def weights_manual_cast(layer, x):
+def weights_manual_cast(layer, x, skip_dtype=False):
     weight, bias, signal = None, None, None
     non_blocking = True

     if getattr(x.device, 'type', None) == 'mps':
         non_blocking = False

+    target_dtype = x.dtype
+    target_device = x.device
+
+    if skip_dtype:
+        target_dtype = None
+
     if stream.using_stream:
         with stream.stream_context()(stream.mover_stream):
             if layer.bias is not None:
-                bias = layer.bias.to(device=x.device, dtype=x.dtype, non_blocking=non_blocking)
-            weight = layer.weight.to(device=x.device, dtype=x.dtype, non_blocking=non_blocking)
+                bias = layer.bias.to(device=target_device, dtype=target_dtype, non_blocking=non_blocking)
+            weight = layer.weight.to(device=target_device, dtype=target_dtype, non_blocking=non_blocking)
             signal = stream.mover_stream.record_event()
     else:
         if layer.bias is not None:
-            bias = layer.bias.to(device=x.device, dtype=x.dtype, non_blocking=non_blocking)
-        weight = layer.weight.to(device=x.device, dtype=x.dtype, non_blocking=non_blocking)
+            bias = layer.bias.to(device=target_device, dtype=target_dtype, non_blocking=non_blocking)
+        weight = layer.weight.to(device=target_device, dtype=target_dtype, non_blocking=non_blocking)

     return weight, bias, signal
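Note (not part of the diff): the new skip_dtype flag moves parameters to the input's device while leaving their dtype untouched, since passing dtype=None to Tensor.to keeps the current dtype. Judging by the Embedding change later in this diff, which calls weights_manual_cast(self, x, skip_dtype=True), this is presumably for layers whose input is an integer index tensor, where x.dtype would be the wrong target for the weights. A minimal sketch of the effective behavior (the helper name is illustrative, not from this file):

import torch

def cast_params_like_input(layer, x, skip_dtype=False):
    # Device always follows the input; dtype only when skip_dtype is False.
    target_device = x.device
    target_dtype = None if skip_dtype else x.dtype  # dtype=None means "keep current dtype" in Tensor.to()
    bias = getattr(layer, 'bias', None)
    if bias is not None:
        bias = bias.to(device=target_device, dtype=target_dtype)
    weight = layer.weight.to(device=target_device, dtype=target_dtype)
    return weight, bias

# An Embedding fed int64 indices keeps its float32 weights:
emb = torch.nn.Embedding(10, 4)
idx = torch.tensor([1, 2, 3])
w, _ = cast_params_like_input(emb, idx, skip_dtype=True)
assert w.dtype == emb.weight.dtype  # float32, not idx.dtype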
@@ -60,9 +66,19 @@ def cleanup_cache():
     return


+current_device = None
+current_dtype = None
+current_manual_cast_enabled = False
+
+
 class ForgeOperations:
     class Linear(torch.nn.Linear):
-        parameters_manual_cast = False
+        def __init__(self, *args, **kwargs):
+            kwargs['device'] = current_device
+            kwargs['dtype'] = current_dtype
+            super().__init__(*args, **kwargs)
+            self.parameters_manual_cast = current_manual_cast_enabled

         def reset_parameters(self):
             return None
@@ -76,7 +92,12 @@ class ForgeOperations:
                 return super().forward(x)

     class Conv2d(torch.nn.Conv2d):
-        parameters_manual_cast = False
+        def __init__(self, *args, **kwargs):
+            kwargs['device'] = current_device
+            kwargs['dtype'] = current_dtype
+            super().__init__(*args, **kwargs)
+            self.parameters_manual_cast = current_manual_cast_enabled

         def reset_parameters(self):
             return None
@@ -90,7 +111,12 @@ class ForgeOperations:
                 return super().forward(x)

     class Conv3d(torch.nn.Conv3d):
-        parameters_manual_cast = False
+        def __init__(self, *args, **kwargs):
+            kwargs['device'] = current_device
+            kwargs['dtype'] = current_dtype
+            super().__init__(*args, **kwargs)
+            self.parameters_manual_cast = current_manual_cast_enabled

         def reset_parameters(self):
             return None
@@ -103,8 +129,98 @@ class ForgeOperations:
             else:
                 return super().forward(x)

+    class Conv1d(torch.nn.Conv1d):
+        def __init__(self, *args, **kwargs):
+            kwargs['device'] = current_device
+            kwargs['dtype'] = current_dtype
+            super().__init__(*args, **kwargs)
+            self.parameters_manual_cast = current_manual_cast_enabled
+
+        def reset_parameters(self):
+            return None
+
+        def forward(self, x):
+            if self.parameters_manual_cast:
+                weight, bias, signal = weights_manual_cast(self, x)
+                with main_stream_worker(weight, bias, signal):
+                    return self._conv_forward(x, weight, bias)
+            else:
+                return super().forward(x)
+
+    class ConvTranspose2d(torch.nn.ConvTranspose2d):
+        def __init__(self, *args, **kwargs):
+            kwargs['device'] = current_device
+            kwargs['dtype'] = current_dtype
+            super().__init__(*args, **kwargs)
+            self.parameters_manual_cast = current_manual_cast_enabled
+
+        def reset_parameters(self):
+            return None
+
+        def forward(self, x, output_size=None):
+            if self.parameters_manual_cast:
+                num_spatial_dims = 2
+                output_padding = self._output_padding(x, output_size, self.stride, self.padding, self.kernel_size, num_spatial_dims, self.dilation)
+                weight, bias, signal = weights_manual_cast(self, x)
+                with main_stream_worker(weight, bias, signal):
+                    return torch.nn.functional.conv_transpose2d(x, weight, bias, self.stride, self.padding, output_padding, self.groups, self.dilation)
+            else:
+                return super().forward(x, output_size)
+
+    class ConvTranspose1d(torch.nn.ConvTranspose1d):
+        def __init__(self, *args, **kwargs):
+            kwargs['device'] = current_device
+            kwargs['dtype'] = current_dtype
+            super().__init__(*args, **kwargs)
+            self.parameters_manual_cast = current_manual_cast_enabled
+
+        def reset_parameters(self):
+            return None
+
+        def forward(self, x, output_size=None):
+            if self.parameters_manual_cast:
+                num_spatial_dims = 1
+                output_padding = self._output_padding(x, output_size, self.stride, self.padding, self.kernel_size, num_spatial_dims, self.dilation)
+                weight, bias, signal = weights_manual_cast(self, x)
+                with main_stream_worker(weight, bias, signal):
+                    return torch.nn.functional.conv_transpose1d(x, weight, bias, self.stride, self.padding, output_padding, self.groups, self.dilation)
+            else:
+                return super().forward(x, output_size)
+
+    class ConvTranspose3d(torch.nn.ConvTranspose3d):
+        def __init__(self, *args, **kwargs):
+            kwargs['device'] = current_device
+            kwargs['dtype'] = current_dtype
+            super().__init__(*args, **kwargs)
+            self.parameters_manual_cast = current_manual_cast_enabled
+
+        def reset_parameters(self):
+            return None
+
+        def forward(self, x, output_size=None):
+            if self.parameters_manual_cast:
+                num_spatial_dims = 3
+                output_padding = self._output_padding(x, output_size, self.stride, self.padding, self.kernel_size, num_spatial_dims, self.dilation)
+                weight, bias, signal = weights_manual_cast(self, x)
+                with main_stream_worker(weight, bias, signal):
+                    return torch.nn.functional.conv_transpose3d(x, weight, bias, self.stride, self.padding, output_padding, self.groups, self.dilation)
+            else:
+                return super().forward(x, output_size)
+
     class GroupNorm(torch.nn.GroupNorm):
-        parameters_manual_cast = False
+        def __init__(self, *args, **kwargs):
+            kwargs['device'] = current_device
+            kwargs['dtype'] = current_dtype
+            super().__init__(*args, **kwargs)
+            self.parameters_manual_cast = current_manual_cast_enabled

         def reset_parameters(self):
             return None
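Note (not part of the diff): all of the new kernels share the same forward path: when parameters_manual_cast is set, the weights are copied to the compute device on a separate mover stream, an event (the `signal`) is recorded, and main_stream_worker presumably makes the compute stream wait on that event before the functional call runs. A generic sketch of that pattern, independent of this file and assuming a CUDA build (only record_event/wait_event/Stream are real PyTorch APIs; the helper name is illustrative):

import torch

def copy_on_side_stream(tensor, device, dtype, side_stream):
    # Launch the copy on a side stream and record an event that the
    # compute stream can wait on before it actually uses the tensor.
    with torch.cuda.stream(side_stream):
        moved = tensor.to(device=device, dtype=dtype, non_blocking=True)
        event = side_stream.record_event()
    return moved, event

# Usage (requires a CUDA device):
# side = torch.cuda.Stream()
# w, ev = copy_on_side_stream(w_cpu.pin_memory(), 'cuda', torch.float16, side)
# torch.cuda.current_stream().wait_event(ev)  # compute waits only for this copy
# y = torch.nn.functional.linear(x, w)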
@@ -118,7 +234,12 @@ class ForgeOperations:
                 return super().forward(x)

     class LayerNorm(torch.nn.LayerNorm):
-        parameters_manual_cast = False
+        def __init__(self, *args, **kwargs):
+            kwargs['device'] = current_device
+            kwargs['dtype'] = current_dtype
+            super().__init__(*args, **kwargs)
+            self.parameters_manual_cast = current_manual_cast_enabled

         def reset_parameters(self):
             return None
@@ -131,34 +252,37 @@ class ForgeOperations:
             else:
                 return super().forward(x)

-
-class ForgeOperationsWithManualCast(ForgeOperations):
-    class Linear(ForgeOperations.Linear):
-        parameters_manual_cast = True
-
-    class Conv2d(ForgeOperations.Conv2d):
-        parameters_manual_cast = True
-
-    class Conv3d(ForgeOperations.Conv3d):
-        parameters_manual_cast = True
-
-    class GroupNorm(ForgeOperations.GroupNorm):
-        parameters_manual_cast = True
-
-    class LayerNorm(ForgeOperations.LayerNorm):
-        parameters_manual_cast = True
+    class Embedding(torch.nn.Embedding):
+        def __init__(self, *args, **kwargs):
+            kwargs['device'] = current_device
+            super().__init__(*args, **kwargs)
+            self.parameters_manual_cast = current_manual_cast_enabled
+            self.bias = None
+
+        def reset_parameters(self):
+            self.bias = None
+            return None
+
+        def forward(self, x):
+            if self.parameters_manual_cast:
+                weight, bias, signal = weights_manual_cast(self, x, skip_dtype=True)
+                with main_stream_worker(weight, bias, signal):
+                    return torch.nn.functional.embedding(x, weight, self.padding_idx, self.max_norm, self.norm_type, self.scale_grad_by_freq, self.sparse)
+            else:
+                return super().forward(x)


 @contextlib.contextmanager
-def using_forge_operations(parameters_manual_cast=False, operations=None):
+def using_forge_operations(operations=None, device=None, dtype=None, manual_cast_enabled=False):
+    global current_device, current_dtype, current_manual_cast_enabled
+    current_device, current_dtype, current_manual_cast_enabled = device, dtype, manual_cast_enabled
+
     if operations is None:
         operations = ForgeOperations

-    if parameters_manual_cast:
-        operations = ForgeOperationsWithManualCast
-
-    op_names = ['Linear', 'Conv2d', 'Conv3d', 'GroupNorm', 'LayerNorm']
+    op_names = ['Linear', 'Conv1d', 'Conv2d', 'Conv3d', 'ConvTranspose1d', 'ConvTranspose2d', 'ConvTranspose3d', 'GroupNorm', 'LayerNorm', 'Embedding']
     backups = {op_name: getattr(torch.nn, op_name) for op_name in op_names}

     try:
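Note (not part of the diff): the reworked context manager now takes the target device, dtype, and the manual-cast switch directly, stores them in the new module-level globals, and swaps the listed torch.nn classes for the Forge ones while the block is active; judging by the `backups` dict and the `try:` block, the originals are presumably restored afterwards. A hedged usage sketch, assuming the patched file is importable (the module path is illustrative):

import torch
from backend.operations import using_forge_operations  # illustrative import path

with using_forge_operations(device=torch.device('cpu'), dtype=torch.float16, manual_cast_enabled=True):
    # Inside the block torch.nn.Linear / Conv* / ConvTranspose* / GroupNorm /
    # LayerNorm / Embedding resolve to the Forge classes, so parameters are
    # created directly with the requested device and dtype and each module is
    # tagged for the streamed manual cast at forward time.
    model = torch.nn.Sequential(
        torch.nn.Conv2d(3, 8, kernel_size=3, padding=1),
        torch.nn.GroupNorm(4, 8),
    )

# next(model.parameters()).dtype is now torch.float16 (assuming the swap above);
# leaving the block presumably restores the original torch.nn classes from `backups`
# (the restore code is outside the hunks shown here).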