import enum
import os.path
import re
import shutil
import functools
import operator
import collections
import subprocess

import numpy as np

from library import *
from dispatch import *


################################################################################
class MatmulOperation:
  """Data structure describing a matrix multiplication operation.

  This includes the shape, datatype, and layout of the operands. This data
  structure is *independent* of the compilation passes and tiling
  configuration; it strictly contains the parameters that change the
  functionality of the matmul operation.
  """

  def __init__(self, problem_shape, lhs, rhs, result):
    """Initializes a matrix multiplication operation.

    Matrix multiplication operation: `result[M, N] = lhs[M, K] * rhs[K, N]`

    problem_shape: A tuple representing the matrix multiplication problem shape
      in the format (M, N, K), where M is the number of rows in the lhs matrix,
      N is the number of columns in the rhs matrix, and K is the number of
      columns in the lhs matrix and rows in the rhs matrix.
    lhs: A TensorDescription object representing the left-hand-side matrix operand.
    rhs: A TensorDescription object representing the right-hand-side matrix operand.
    result: A TensorDescription object representing the result matrix operand.
    """
    self.problem_shape = problem_shape  # Matmul problem shape [M, N, K]
    self.M = problem_shape[0]
    self.N = problem_shape[1]
    self.K = problem_shape[2]

    self.lhs = lhs  # TensorDescription
    self.rhs = rhs  # TensorDescription
    self.result = result  # TensorDescription
    self.operation_kind = OperationKind.Matmul

  def __eq__(self, other):
    return self.problem_shape == other.problem_shape and \
           self.lhs == other.lhs and \
           self.rhs == other.rhs and \
           self.result == other.result

  def name(self):
    """Procedurally generated name for the matmul operation.

    The name uniquely identifies a matmul operation by its matmul shape,
    lhs datatype and layout, rhs datatype and layout, and result
    datatype and layout.
    """
    return "matmul_{m}x{n}x{k}_"\
           "{typeLhs}{layoutLhs}_"\
           "{typeRhs}{layoutRhs}_"\
           "{typeResult}{layoutResult}".format(
               m=self.problem_shape[0],
               n=self.problem_shape[1],
               k=self.problem_shape[2],
               typeLhs=DataTypeName[self.lhs.datatype],
               layoutLhs=ShortLayoutTypeName[self.lhs.layout],
               typeRhs=DataTypeName[self.rhs.datatype],
               layoutRhs=ShortLayoutTypeName[self.rhs.layout],
               typeResult=DataTypeName[self.result.datatype],
               layoutResult=ShortLayoutTypeName[self.result.layout])

  def lhs_npy_shape(self):
    """Returns the shape of the lhs numpy array as a string in the format "MxKxDataType"."""
    return f"{self.M}x{self.K}x{DataTypeName[self.lhs.datatype]}"

  def rhs_npy_shape(self):
    """Returns the shape of the rhs numpy array as a string in the format "KxNxDataType"."""
    return f"{self.K}x{self.N}x{DataTypeName[self.rhs.datatype]}"

  def result_npy_shape(self):
    """Returns the shape of the result numpy array as a string in the format "MxNxDataType"."""
    return f"{self.M}x{self.N}x{DataTypeName[self.result.datatype]}"

  def bytes(self):
    """Returns the number of bytes read/written by the matmul operation."""
    num_bytes = (DataTypeSizeInBits[self.lhs.datatype] * self.M * self.K +
                 DataTypeSizeInBits[self.rhs.datatype] * self.K * self.N +
                 DataTypeSizeInBits[self.result.datatype] * self.M * self.N) // 8
    return num_bytes

  def flops(self):
    """Returns the number of floating point operations performed by the matmul operation."""
    return 2 * self.M * self.N * self.K
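
  # Worked example (f16 operands, 16 bits each, problem 128x256x8192):
  #   bytes() = (128*8192 + 8192*256 + 128*256) * 16 // 8 = 6,356,992 bytes
  #   flops() = 2 * 128 * 256 * 8192 = 536,870,912 FLOPs (~0.54 GFLOP)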


##############################################################################
class MatmulCompilationInfo:
  """Data structure that strictly describes the compilation passes and tiling configuration.

  For a matrix multiplication operation, the compilation passes and tiling
  configuration influence the performance of the compiled matmul operation,
  but not its functionality. This data structure should be independent of the
  matmul operation's functionality.

  Any change in this data structure should not affect the functionality of the
  matmul operation, i.e., we should be able to use the same reference results
  for a matmul operation compiled with different compilation info.
  """

  def __init__(self,
               tile_description,
               translation_info,
               config_type=CompilationConfigType.Custom):
    self.tile_description = tile_description  # TileDescription
    self.translation_info = translation_info  # TranslationInfo
    self.config_type = config_type  # CompilationConfigType

  def name(self):
    """Procedurally generated name for the matmul compilation info."""
    if self.config_type == CompilationConfigType.Default:
      return "tile_config_default"

    return "tile_config_{tbm}x{tbn}_{tbk}x{stages}_{translation_info}".format(
        tbm=self.tile_description.threadblock_shape[0],
        tbn=self.tile_description.threadblock_shape[1],
        tbk=self.tile_description.threadblock_shape[2],
        stages=self.tile_description.stages,
        translation_info=TranslationInfoName[self.translation_info])
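
  # Example: a tile description with threadblock shape [128, 128, 32] and
  # 3 pipeline stages yields a name like "tile_config_128x128_32x3_<name>",
  # where <name> is the TranslationInfoName entry for the translation info.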


################################################################################
class EmitMatmulCompilationInfo:
  """Emitters for the matmul compilation info."""

  def __init__(self):
    self.compilation_info_template = """
// matmul compilation info (tile configuration, translation info, workgroup size)
#${compilation_info_name} = #iree_codegen.compilation_info<
  lowering_config = <tile_sizes = [[${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}]]>,
  translation_info = <${translation_info} pipeline_depth = ${stages}>,
  workgroup_size = [${block_dim_x} : index, ${block_dim_y} : index, ${block_dim_z} : index]
>
"""

  def emit(self, compilation_info):
    """Emits the matmul compilation info as a string."""
    if compilation_info.config_type == CompilationConfigType.Default:
      return ""

    values = {
        'compilation_info_name':
            compilation_info.name(),
        'translation_info':
            TranslationInfoTag[compilation_info.translation_info],
        'threadblock_shape_m':
            str(compilation_info.tile_description.threadblock_shape[0]),
        'threadblock_shape_n':
            str(compilation_info.tile_description.threadblock_shape[1]),
        'threadblock_shape_k':
            str(compilation_info.tile_description.threadblock_shape[2]),
        'stages':
            str(compilation_info.tile_description.stages),
        'block_dim_x':
            str(compilation_info.tile_description.block_dim[0]),
        'block_dim_y':
            str(compilation_info.tile_description.block_dim[1]),
        'block_dim_z':
            str(compilation_info.tile_description.block_dim[2]),
    }

    return SubstituteTemplate(self.compilation_info_template, values)
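
  # Illustrative output for a [128, 128, 32] tile, 3 stages, and workgroup
  # size [64, 2, 1] (the exact translation-info text comes from
  # TranslationInfoTag in library.py):
  #
  #   #tile_config_128x128_32x3_... = #iree_codegen.compilation_info<
  #     lowering_config = <tile_sizes = [[128, 128, 32]]>,
  #     translation_info = <... pipeline_depth = 3>,
  #     workgroup_size = [64 : index, 2 : index, 1 : index]
  #   >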


###############################################################################
class EmitLinalgMatmulDispatch:
  """Emitters for the `linalg.matmul` dispatch."""

  def __init__(self):
    self.mlir_dialect = MlirDialect.Linalg

    self.linalg_row_row_matmul_template = """
// Dispatch linalg.matmul row-row layout
func.func @${operation_name}_${compilation_info_name}(
  %lhs: tensor<${problem_m}x${problem_k}x${datatype_lhs}>,
  %rhs: tensor<${problem_k}x${problem_n}x${datatype_rhs}>) -> tensor<${problem_m}x${problem_n}x${datatype_result}>
{
  %c0 = arith.constant 0.0 : ${datatype_result}
  %init = tensor.empty() : tensor<${problem_m}x${problem_n}x${datatype_result}>
  %initial_result = linalg.fill ins(%c0 : ${datatype_result}) outs(%init : tensor<${problem_m}x${problem_n}x${datatype_result}>) -> tensor<${problem_m}x${problem_n}x${datatype_result}>
  %result = linalg.matmul ${compilation_info_attribute}
            ins(%lhs, %rhs: tensor<${problem_m}x${problem_k}x${datatype_lhs}>, tensor<${problem_k}x${problem_n}x${datatype_rhs}>)
            outs(%initial_result: tensor<${problem_m}x${problem_n}x${datatype_result}>) -> tensor<${problem_m}x${problem_n}x${datatype_result}>
  return %result : tensor<${problem_m}x${problem_n}x${datatype_result}>
}
"""

  def emit(self, matmul_dispatch):
    """Emits the matmul operation in the MLIR dialect for a single compilation info."""
    compilation_info_attribute_template = """{compilation_info = #${compilation_info_name}}"""
    compilation_info_attribute_str = SubstituteTemplate(
        compilation_info_attribute_template,
        {'compilation_info_name': matmul_dispatch.configuration.name()})
    compilation_info_attribute = compilation_info_attribute_str \
        if matmul_dispatch.configuration.config_type != CompilationConfigType.Default else ""

    values = {
        'operation_name':
            matmul_dispatch.operation.name(),
        'compilation_info_attribute':
            compilation_info_attribute,
        'problem_m':
            str(matmul_dispatch.operation.problem_shape[0]),
        'problem_n':
            str(matmul_dispatch.operation.problem_shape[1]),
        'problem_k':
            str(matmul_dispatch.operation.problem_shape[2]),
        'datatype_lhs':
            DataTypeName[matmul_dispatch.operation.lhs.datatype],
        'datatype_rhs':
            DataTypeName[matmul_dispatch.operation.rhs.datatype],
        'datatype_result':
            DataTypeName[matmul_dispatch.operation.result.datatype],
        'compilation_info_name':
            matmul_dispatch.configuration.name(),
    }

    return SubstituteTemplate(self.linalg_row_row_matmul_template, values)
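
  # Sketch of one way these emitters might compose (Dispatch pairs an
  # operation with a configuration in dispatch.py):
  #   dispatch = Dispatch(operation, configuration)
  #   mlir_source = EmitMatmulCompilationInfo().emit(configuration) + \
  #       EmitLinalgMatmulDispatch().emit(dispatch)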


# Emit `mhlo.matmul` operation.
# TODO: Add support for testing the matmul op lowered from other dialects.
class EmitMhloMatmulOperation:

  def __init__(self):
    self.mlir_dialect = MlirDialect.Mhlo

    self.linalg_row_row_matmul_template = """
// mhlo.matmul operation row-row layout
"""

###############################################################################
class ReferenceMatmulOperation:
  """Reference implementation of the matmul operation in numpy.

  ReferenceMatmulOperation has the following responsibilities:
  1) Generates matmul operation inputs as np.arrays drawn from a desired
     distribution.
  2) Runs the reference matmul operation using np.matmul.
  3) Generates the matmul operation's expected output as an np.array.
  4) Additionally, generates the input and output filename strings.
  """

  def __init__(self, matmul_operation, dist_lhs, dist_rhs):
    self.matmul_operation = matmul_operation
    # Problem shape.
    self.M = matmul_operation.problem_shape[0]
    self.N = matmul_operation.problem_shape[1]
    self.K = matmul_operation.problem_shape[2]

    # Data types for the input and result matrices.
    self.dtype_lhs = DataTypeNumPyTag[matmul_operation.lhs.datatype]
    self.dtype_rhs = DataTypeNumPyTag[matmul_operation.rhs.datatype]
    self.dtype_result = DataTypeNumPyTag[matmul_operation.result.datatype]

    # Distributions of the input tensors.
    self.dist_lhs = dist_lhs
    self.dist_rhs = dist_rhs

    # Filename for the left-hand-side input tensor.
    self.filename_lhs = "m{problem_m}xk{problem_k}_"\
        "{tensor_description}_{dist}_lhs.npy".format(
            problem_m=self.M,
            problem_k=self.K,
            tensor_description=self.matmul_operation.lhs.name(),
            dist=DistributionName[self.dist_lhs])

    # Filename for the right-hand-side input tensor.
    self.filename_rhs = "k{problem_k}xn{problem_n}_"\
        "{tensor_description}_{dist}_rhs.npy".format(
            problem_k=self.K,
            problem_n=self.N,
            tensor_description=self.matmul_operation.rhs.name(),
            dist=DistributionName[self.dist_rhs])

    # Filename for the reference result tensor.
    self.reference_filename_result = "m{problem_m}xn{problem_n}_"\
        "{tensor_description}_reference_result.npy".format(
            problem_m=self.M,
            problem_n=self.N,
            tensor_description=self.matmul_operation.result.name())

  # Generates input data, runs the reference np.matmul, and saves npy files to
  # the output directory.
  def run_and_save(self, output_dir="."):

    # Generate the input data as np.arrays for the matmul operation.
    lhs_np_array = get_np_array(self.matmul_operation.lhs, (self.M, self.K),
                                self.dist_lhs)
    rhs_np_array = get_np_array(self.matmul_operation.rhs, (self.K, self.N),
                                self.dist_rhs)

    # Run the reference np.matmul and generate the result np.array.
    result = np.matmul(lhs_np_array, rhs_np_array)

    # Save the input data as np.arrays for the matmul operation.
    np.save(os.path.join(output_dir, self.filename_lhs),
            np.array(lhs_np_array, dtype=self.dtype_lhs))
    np.save(os.path.join(output_dir, self.filename_rhs),
            np.array(rhs_np_array, dtype=self.dtype_rhs))

    # Save the expected result as an np.array.
    np.save(os.path.join(output_dir, self.reference_filename_result),
            np.array(result, dtype=self.dtype_result))
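
  # Usage sketch: generate reference inputs/outputs for an operation `op` with
  # uniformly random operands (Distribution comes from library.py):
  #   ref_op = ReferenceMatmulOperation(op, Distribution.Random,
  #                                     Distribution.Random)
  #   ref_op.run_and_save(output_dir)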


###############################################################################
class MatmulOperationLauncher:
  """Launches the compilation and execution of the matmul operation.

  MatmulOperationLauncher has the following responsibilities:
  1) Launches compilation of the matmul for verification or profiling runs.
  2) Launches verification by running the IREE-compiled matmul operation
     against python numpy.matmul.
  3) Launches profiling by running the IREE-compiled matmul operation.
  """

  def __init__(self, args, operation):
    self.operation = operation

    # Variables from top-level argparse.
    self.generated_path = os.path.join(args.build_dir, 'generated',
                                       args.mlir_dialect)
    self.args = args
    self.benchmark_dispatch_repeat_count = args.batch_size
    self.batch_size = args.batch_size

    # Additional paths.
    self.matmul_path = os.path.join(self.generated_path, 'matmul')
    self.operation_path = os.path.join(self.matmul_path, operation.name())
    self.source_mlir_file = os.path.join(self.operation_path,
                                         operation.name() + '.mlir')

    # Path to the iree-compile tool (compiles the input MLIR file to a vmfb).
    self.iree_compile_path = os.path.join(args.build_dir, 'tools',
                                          'iree-compile')

    # Path to the iree-benchmark-module tool (performance benchmarking and
    # profiling).
    self.iree_benchmark_module_path = os.path.join(args.build_dir, 'tools',
                                                   'iree-benchmark-module')

    # Path to the iree-run-module tool (verification).
    self.iree_run_module_path = os.path.join(args.build_dir, 'tools',
                                             'iree-run-module')

    # Output vmfb files for the operation.
    self.vmfb_verify_file = os.path.join(self.operation_path,
                                         self.operation.name() + '_verify.vmfb')
    self.vmfb_benchmark_file = os.path.join(
        self.operation_path,
        self.operation.name() + '_benchmark.vmfb')

  def compile(self, compilation_mode):
    """Compiles the matmul operation into a vmfb file for verification or profiling."""

    benchmark_dispatch_repeat_count = self.benchmark_dispatch_repeat_count \
        if compilation_mode == CompilationMode.Profile else 1
    vmfb_file = self.vmfb_benchmark_file \
        if compilation_mode == CompilationMode.Profile else self.vmfb_verify_file

    # Base iree-compile command line.
    cmd = [self.iree_compile_path, self.source_mlir_file, "-o", f"{vmfb_file}"]

    # General compilation options.
    cmd += [f"--iree-hal-target-backends={self.args.device}"]
    cmd += [f"--iree-hal-cuda-llvm-target-arch={self.args.cuda_arch}"]
    if self.args.split_k_slices != "":
      cmd += [f"--iree-flow-split-matmul-reduction={self.args.split_k_slices}"]
    if self.args.use_mma_sync:
      cmd += ["--iree-codegen-llvmgpu-use-mma-sync"]
    if self.args.use_wmma:
      cmd += ["--iree-codegen-llvmgpu-use-wmma"]

    # Compilation options for profiling.
    cmd += [
        f"--iree-hal-benchmark-dispatch-repeat-count={benchmark_dispatch_repeat_count}"
    ]
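
    # Illustrative assembled command (actual paths and flag values depend on
    # the argparse settings):
    #   tools/iree-compile <op>.mlir -o <op>_verify.vmfb \
    #     --iree-hal-target-backends=cuda \
    #     --iree-hal-cuda-llvm-target-arch=sm_80 \
    #     --iree-hal-benchmark-dispatch-repeat-count=1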

    if not os.path.exists(vmfb_file) or self.args.force_compile:
      print(
          f">> Compilation command for {CompilationModeNames[compilation_mode]} : {' '.join(cmd)}"
      )
      subprocess.check_output(cmd)

    elif self.args.verbose:
      print("Skipping compilation of matmul operation: " + vmfb_file +
            " since it already exists.")

  def verify(self, configuration):
    """Verifies the matmul operation with a given configuration."""
    # First compile the operation to a vmfb file.
    self.compile(CompilationMode.Verify)

    # Verify using a random data distribution.
    # TODO 1) Make the input distribution configurable through the command line.
    # TODO 2) Make the reference run check whether the reference npy files are
    #         present; if so, do not re-run the reference.
    reference_op = ReferenceMatmulOperation(self.operation,
                                            Distribution.Random,
                                            Distribution.Random)

    lhs_npy_file = os.path.join(self.operation_path, reference_op.filename_lhs)
    rhs_npy_file = os.path.join(self.operation_path, reference_op.filename_rhs)
    expected_result_npy_file = os.path.join(
        self.operation_path, reference_op.reference_filename_result)

    # If the reference npy files do not exist, run the reference implementation
    # and generate them.
    if not os.path.exists(lhs_npy_file) or \
       not os.path.exists(rhs_npy_file) or \
       not os.path.exists(expected_result_npy_file):
      reference_op.run_and_save(self.operation_path)

    # `iree-run-module` command line for verification.
    cmd = [
        self.iree_run_module_path, f'--module={self.vmfb_verify_file}',
        f'--device={self.args.device}'
    ]

    # Operation-specific verification command line.
    # TODO: Abstract the operation-specific verification command line out of
    # verification.
    cmd.append(f'--function={self.operation.name()}_{configuration.name()}')
    cmd.append(f'--input=@{lhs_npy_file}')
    cmd.append(f'--input=@{rhs_npy_file}')
    cmd.append(f'--expected_output=@{expected_result_npy_file}')

    # Print the command if verbose.
    if self.args.verbose:
      print(">> Verification command: " + ' '.join(cmd))

    # Launch verification.
    cmd_output = subprocess.check_output(cmd, text=True)

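    # iree-run-module reports each expected_output comparison with a bracketed
    # status token, e.g. "[SUCCESS]" (compare the check against "SUCCESS"
    # below); the regex extracts that token.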
    # Parse the verification output.
    m = re.search(r"\[(?P<verification_result>[a-zA-Z]+)\]", cmd_output)
    if m is None:
      raise Exception(
          "Failed to parse verification output from iree-run-module: " +
          cmd_output)
    verification_result = m.group('verification_result')

    if self.args.verbose or verification_result != "SUCCESS":
      print(cmd_output)

    return verification_result

  def profile(self, configuration):
    """Profiles the matmul operation with a given configuration."""
    # First compile the operation to a vmfb file.
    self.compile(CompilationMode.Profile)

    # `iree-benchmark-module` command line for profiling.
    cmd = [
        self.iree_benchmark_module_path, f'--module={self.vmfb_benchmark_file}',
        f'--device={self.args.device}'
    ]

    # Profiling-specific flags.
    cmd += [f'--benchmark_repetitions={self.args.benchmark_repetitions}']
    cmd += [f'--batch_size={self.batch_size}']

    # Operation-specific profiling command line.
    cmd += [f'--function={self.operation.name()}_{configuration.name()}']
    cmd += [f'--input={self.operation.lhs_npy_shape()}']
    cmd += [f'--input={self.operation.rhs_npy_shape()}']

    # Print the command if verbose.
    if self.args.verbose:
      print(">> Profiling command: " + ' '.join(cmd))

    # Launch profiling.
    cmd_output = subprocess.check_output(cmd,
                                         text=True,
                                         stderr=subprocess.STDOUT)

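    # With --benchmark_repetitions > 1, iree-benchmark-module emits aggregate
    # rows containing "real_time_median  <value> ms"; the regex below pulls out
    # the median runtime in milliseconds.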
    # Parse the profiling output.
    m = re.search(r"real_time_median\s+(?P<runtime>\d+\.\d+)\s+ms", cmd_output)
    if m is None:
      raise Exception("Failed to parse runtime from benchmark result: " +
                      cmd_output)
    runtime_in_ms = float(m.group('runtime'))
    return runtime_in_ms


##############################################################################
class MatmulGenerator:
  """Matmul dispatch generator class.

  Generates a list of pre-defined matmul operations with reasonable tuning
  configurations. The generator functions are separated based on the target
  backend and the data type. See `MatmulGenerator._cuda_matmul_tensor_cores_f16`
  for an example targeting the cuda backend and the f16 data type."""

  def __init__(self, args):
    self.args = args

    self.default_config = args.default_config not in ['False', 'false', '0']

    self.translation_infos = [
        #TranslationInfo.LLVMGPUMatmulSimt,  # CUDA Core (SIMT)
        #TranslationInfo.LLVMGPUMatmulTensorCore,  # Tensor Core (WMMA)
        TranslationInfo.LLVMGPUMatmulTensorCoreMmaSync,  # Tensor Core (MMA.SYNC)
    ]

    self.problem_shapes = [[128, 256, 8192]]
    # Additional problem shapes, e.g.:
    # self.problem_shapes = [[128, 128, 256], [256, 512, 128], [1024, 512, 2048],
    #                        [2560, 2560, 2560], [3456, 1024, 2048]]

    # List of pre-defined matmul dispatch collections.
    self.dispatches_collection_list = []

    # CUDA-specific constants.
    self.cuda_warp_size = 32
    self.cuda_smem_capacity_in_bytes_sm80 = 192 << 10

  def _is_tile_aligned_shape(self, dispatch):
    """Checks if the problem shape is evenly divisible by the threadblock tile shape."""
    problem_shape = dispatch.operation.problem_shape
    threadblock_shape = dispatch.configuration.tile_description.threadblock_shape
    if len(problem_shape) != len(threadblock_shape):
      raise ValueError("Problem shape and threadblock shape must have the "
                       "same rank.")
    is_aligned = all(
        a % b == 0 for a, b in zip(problem_shape, threadblock_shape))
    return is_aligned
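
  # Example: problem shape [128, 256, 8192] with threadblock shape
  # [128, 128, 32] is aligned (128 % 128 == 0, 256 % 128 == 0, 8192 % 32 == 0),
  # while [100, 256, 8192] is not (100 % 128 != 0).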

  def _cuda_smem_required_in_bytes(self, dispatch):
    """Returns the shared memory size in bytes required for a given CUDA dispatch."""
    threadblock_shape = dispatch.configuration.tile_description.threadblock_shape
    num_stages = dispatch.configuration.tile_description.stages
    tile_shape_lhs = threadblock_shape[0] * threadblock_shape[2]
    tile_shape_rhs = threadblock_shape[2] * threadblock_shape[1]
    return ((tile_shape_lhs * DataTypeSizeInBits[dispatch.operation.lhs.datatype] +
             tile_shape_rhs * DataTypeSizeInBits[dispatch.operation.rhs.datatype]) *
            num_stages) // 8
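
  # Worked example: a [128, 128, 32] threadblock with f16 (16-bit) operands
  # and 3 stages needs ((128*32 + 32*128) * 16 * 3) // 8 = 49,152 bytes
  # (48 KiB), well under the 192 KiB SM80 capacity.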

  def _is_cuda_smem_available(self, dispatch):
    """Checks if the dispatch's shared memory requirement fits in SM80 capacity."""
    return self._cuda_smem_required_in_bytes(
        dispatch) <= self.cuda_smem_capacity_in_bytes_sm80

  def _cuda_supported_configuration_list(self, operation, configuration_list):
    """Returns a list of configurations supported by the CUDA backend."""
    supported_configuration_list = []
    for configuration in configuration_list:
      dispatch = Dispatch(operation, configuration)
      if not self._is_tile_aligned_shape(dispatch):
        print(f"Warning: {dispatch.name()} is not tile-aligned and is being skipped.")
        continue
      if not self._is_cuda_smem_available(dispatch):
        print(f"Warning: {dispatch.name()} requires {self._cuda_smem_required_in_bytes(dispatch)} "
              f"bytes of shared memory, which exceeds the SM80 capacity of "
              f"{self.cuda_smem_capacity_in_bytes_sm80} bytes.")
        continue

      # If all checks pass, add the configuration to the supported list.
      supported_configuration_list.append(configuration)

    return supported_configuration_list

  def _cuda_matmul_tensor_cores_f16(self):
    """Appends a list of matmul dispatches for GPU Tensor Core F16 data type."""

    tile_descriptions = [
        TileDescription([256, 128, 32], 3, [64, 4, 1]),
        TileDescription([128, 256, 32], 3, [128, 2, 1]),
        TileDescription([128, 128, 64], 4, [64, 2, 1]),
        TileDescription([128, 128, 32], 5, [64, 2, 1]),
        TileDescription([128, 64, 32], 5, [64, 2, 1]),
        TileDescription([64, 64, 64], 5, [64, 2, 1]),
        TileDescription([64, 64, 32], 10, [64, 2, 1]),
    ]

    # Create the configuration list from the tile descriptions and translation infos.
    configuration_list = []

    for tile_description in tile_descriptions:
      for translation_info in self.translation_infos:
        configuration_list.append(
            MatmulCompilationInfo(tile_description, translation_info))

    # Create a dispatch collection for each problem shape with the configuration list.
    for problem_shape in self.problem_shapes:
      operation = MatmulOperation(
          problem_shape,
          TensorDescription(DataType.f16, LayoutType.RowMajor),
          TensorDescription(DataType.f16, LayoutType.RowMajor),
          TensorDescription(DataType.f16, LayoutType.RowMajor))

      # Filter out configurations that are not supported by the LLVM GPU CUDA backend.
      supported_configuration_list = self._cuda_supported_configuration_list(
          operation, configuration_list)

      # Add the default configuration if requested.
      if self.default_config:
        supported_configuration_list.append(
            MatmulCompilationInfo([], [], CompilationConfigType.Default))

      self.dispatches_collection_list.append(DispatchCollection(
          operation, supported_configuration_list))

  ##############################################################################
  def _cuda_matmul_tensor_cores_f32(self):
    """Appends a list of matmul dispatches for GPU Tensor Core F32 data type."""

    tile_descriptions = [
        TileDescription([128, 256, 16], 3, [128, 2, 1]),
        TileDescription([256, 128, 16], 3, [64, 4, 1]),
        TileDescription([128, 128, 16], 5, [64, 2, 1]),
        TileDescription([128, 128, 32], 3, [64, 2, 1]),
        TileDescription([128, 128, 32], 4, [64, 2, 1]),
        TileDescription([64, 64, 64], 3, [64, 2, 1]),
    ]

    # Create the configuration list from the tile descriptions and translation infos.
    configuration_list = []

    for tile_description in tile_descriptions:
      for translation_info in self.translation_infos:
        configuration_list.append(
            MatmulCompilationInfo(tile_description, translation_info))

    # Create a dispatch collection for each problem shape with the configuration list.
    for problem_shape in self.problem_shapes:
      operation = MatmulOperation(
          problem_shape,
          TensorDescription(DataType.f32, LayoutType.RowMajor),
          TensorDescription(DataType.f32, LayoutType.RowMajor),
          TensorDescription(DataType.f32, LayoutType.RowMajor))

      # Filter out configurations that are not supported by the LLVM GPU CUDA backend.
      supported_configuration_list = self._cuda_supported_configuration_list(
          operation, configuration_list)

      # Add the default configuration if requested.
      if self.default_config:
        supported_configuration_list.append(
            MatmulCompilationInfo([], [], CompilationConfigType.Default))

      self.dispatches_collection_list.append(DispatchCollection(
          operation, supported_configuration_list))

  def generate(self):
    """Generates and returns the list of matmul dispatch collections."""
    if self.args.device == 'cuda':
      self._cuda_matmul_tensor_cores_f16()
      self._cuda_matmul_tensor_cores_f32()
    return self.dispatches_collection_list
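
  # Usage sketch (the argparse `args` namespace must provide the fields used
  # above, e.g. device, build_dir, mlir_dialect, default_config):
  #   generator = MatmulGenerator(args)
  #   dispatches_collection_list = generator.generate()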


##############################################################################