experimental/metal/builtin_executables.m - 3p/openxla/iree - Git at Google

 // Copyright 2023 The IREE Authors
 //
 // Licensed under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

 #include "experimental/metal/builtin_executables.h"

 #include <string.h>

 #include "experimental/metal/builtin/metal_buffer_kernels.h"
 #include "iree/base/api.h"
 #include "iree/base/tracing.h"
 #include "iree/hal/api.h"

 typedef struct iree_hal_metal_builtin_executable_data_t {
   const char* entry_point;
   uint32_t file_index;
 } iree_hal_metal_builtin_executable_data_t;

 // The list of builtin executable entry points and their source file index in builtin exectuable
 // embedded data. This MUST be consistent with kernel function names in MSL source code and the file
 // order in embedded data.
 static iree_hal_metal_builtin_executable_data_t iree_hal_metal_builtin_executable_entry_points[] = {
     {"fill_buffer_16byte", 1},  // Buffer fills; 16-byte aligned offset/length
     {"fill_buffer_4byte", 1},   // Buffer fills; 4-byte aligned offset/length
     {"fill_buffer_1byte", 1},   // Buffer fills; 1-byte aligned offset/length
     {"copy_buffer_1byte", 0},   // Buffer copies; 1-byte aligned offset/length
 };

 // The buffer fill specificiation. This MUST be consistent with the same struct in MSL source code.
 typedef struct iree_hal_metal_buffer_fill_spec_t {
   uint64_t buffer_offset;  // Buffer offset to fill (in bytes)
   uint64_t buffer_length;  // Buffer length to fill (in bytes)
   uint32_t pattern;        // 32-bit fill pattern
 } iree_hal_metal_buffer_fill_spec_t;

 typedef struct iree_hal_metal_buffer_copy_spec_t {
   uint64_t src_buffer_offset;  // Source buffer offset (in bytes)
   uint64_t dst_buffer_offset;  // Destination buffer offset (in bytes)
   uint64_t length;             // Buffer length to fill (in bytes)
 } iree_hal_metal_buffer_copy_spec_t;

 iree_status_t iree_hal_metal_builtin_executable_create(
     id<MTLDevice> device, iree_allocator_t host_allocator,
     iree_hal_metal_builtin_executable_t** out_executable) {
   IREE_ASSERT_ARGUMENT(out_executable);
   *out_executable = NULL;
   IREE_TRACE_ZONE_BEGIN(z0);

   iree_hal_metal_builtin_executable_t* executable = NULL;
   iree_host_size_t entry_point_count =
       IREE_ARRAYSIZE(iree_hal_metal_builtin_executable_entry_points);
   iree_host_size_t total_size =
       sizeof(*executable) + entry_point_count * sizeof(executable->entry_points[0]);
   iree_status_t status = iree_allocator_malloc(host_allocator, total_size, (void**)&executable);

   if (iree_status_is_ok(status)) {
     executable->host_allocator = host_allocator;
     executable->entry_point_count = entry_point_count;

     // Compile each MSL source string into a MTLLibrary and get the MTLFunction for the entry point
     // to build the pipeline state object.
     // TODO(antiagainst): We are performing synchronous compilation at runtime here. This is good
     // for debugging purposes but bad for performance. Enable offline compilation and make that as
     // the default.

     MTLCompileOptions* compile_options = [MTLCompileOptions new];  // +1
     compile_options.languageVersion = MTLLanguageVersion3_0;

     for (iree_host_size_t i = 0; i < IREE_ARRAYSIZE(iree_hal_metal_builtin_executable_entry_points);
          ++i) {
       const char* entry_point = iree_hal_metal_builtin_executable_entry_points[i].entry_point;
       uint32_t file_index = iree_hal_metal_builtin_executable_entry_points[i].file_index;
       iree_file_toc_t source_code = metal_buffer_kernels_create()[file_index];

       id<MTLLibrary> library = nil;
       id<MTLFunction> function = nil;
       id<MTLComputePipelineState> pso = nil;
       status = iree_hal_metal_compile_msl_and_create_pipeline_object(
           iree_make_string_view(source_code.data, source_code.size), IREE_SV(entry_point), device,
           compile_options, &library, &function, &pso);
       if (!iree_status_is_ok(status)) break;

       // Package required parameters for kernel launches for each entry point.
       iree_hal_metal_kernel_params_t* params = &executable->entry_points[i];
       params->library = library;
       params->function = function;
       params->pso = pso;
       // Thread group size for builtin executables are determined at runtime dispatch time.
       params->threadgroup_size[0] = 0;
       params->threadgroup_size[1] = 0;
       params->threadgroup_size[2] = 0;
       // We don't need the layout parameter for builtin executables too.
       params->layout = NULL;

       // Stash the entry point name in the string table for use when tracing.
       IREE_TRACE({ params->function_name = IREE_SV(entry_point); });
     }

     [compile_options release];  // -1
   }

   if (iree_status_is_ok(status)) {
     *out_executable = executable;
   } else {
     iree_hal_metal_builtin_executable_destroy(executable);
   }

   IREE_TRACE_ZONE_END(z0);
   return status;
 }

 void iree_hal_metal_builtin_executable_destroy(iree_hal_metal_builtin_executable_t* executable) {
   IREE_TRACE_ZONE_BEGIN(z0);

   for (iree_host_size_t i = 0; i < executable->entry_point_count; ++i) {
     iree_hal_metal_kernel_params_t* entry_point = &executable->entry_points[i];
     [entry_point->pso release];
     [entry_point->function release];
     [entry_point->library release];
     IREE_ASSERT_EQ(entry_point->layout, NULL);
   }
   iree_allocator_free(executable->host_allocator, executable);

   IREE_TRACE_ZONE_END(z0);
 }

 static inline iree_device_size_t iree_hal_metal_ceil_div(iree_device_size_t a,
                                                          iree_device_size_t b) {
   return (a + b - 1) / b;
 }

 iree_status_t iree_hal_metal_builtin_executable_fill_buffer(
     const iree_hal_metal_builtin_executable_t* executable, id<MTLComputeCommandEncoder> encoder,
     id<MTLBuffer> target_buffer, iree_device_size_t target_offset, iree_device_size_t length,
     uint32_t pattern) {
   id<MTLComputePipelineState> pso = nil;
   MTLResourceUsage usage = MTLResourceUsageWrite;
   const iree_device_size_t workgroup_size = 32;
   iree_device_size_t workgroup_count = 0;

   if (target_offset % 16 == 0 && length % 16 == 0) {  // 16-byte aligned case
     pso = executable->entry_points[0].pso;
     workgroup_count = iree_hal_metal_ceil_div(length, workgroup_size * 16);
   } else if (target_offset % 4 == 0 && length % 4 == 0) {  // 4-byte aligned case
     pso = executable->entry_points[1].pso;
     workgroup_count = iree_hal_metal_ceil_div(length, workgroup_size * 4);
   } else {  // 1-byte aligned case
     pso = executable->entry_points[2].pso;
     // We may potentially need to read some 32-bit scalars at unaligned addresses.
     usage |= MTLResourceUsageRead;
     // Calculate unaligned partial prefix/suffix byte count, and then get the middle aligned byte
     // count for distributing threads. This logic MUST be consistent with the MSL source code.
     iree_device_size_t left_byte_count = target_offset % 4;
     iree_device_size_t right_byte_count = (target_offset + length) % 4;
     int64_t middle_byte_count = length - left_byte_count - right_byte_count;
     // Note that in the extreme case, we don't have aligned bytes in the middle (0), or actually
     // prefix and suffix partial bytes are the same (< 0). We'd need one thread to handle the
     // partial bytes at least.
     if (middle_byte_count <= 0) middle_byte_count = 1;
     workgroup_count = iree_hal_metal_ceil_div(middle_byte_count, workgroup_size * 4);
   }

   iree_hal_metal_buffer_fill_spec_t spec = {
       .buffer_offset = target_offset,
       .buffer_length = length,
       .pattern = pattern,
   };

   [encoder setComputePipelineState:pso];

   // The following MUST exactly match the pipeline layout from MSL source code.
   // buffer(0) is the target buffer to fill. Note that we MUST set 0 as offset here--the offset
   // is to be handled directly in the kernels!
   [encoder setBuffer:target_buffer offset:0 atIndex:0];
   [encoder useResource:target_buffer usage:usage];
   // buffer(1) is the buffer fill spec.
   [encoder setBytes:&spec length:sizeof(spec) atIndex:1];

   // Encode the dispatch.
   [encoder dispatchThreadgroups:MTLSizeMake(workgroup_count, 1, 1)
           threadsPerThreadgroup:MTLSizeMake(workgroup_size, 1, 1)];
   return iree_ok_status();
 }

 iree_status_t iree_hal_metal_builtin_executable_copy_buffer(
     const iree_hal_metal_builtin_executable_t* executable, id<MTLComputeCommandEncoder> encoder,
     id<MTLBuffer> source_buffer, iree_device_size_t source_offset, id<MTLBuffer> target_buffer,
     iree_device_size_t target_offset, iree_device_size_t length) {
   id<MTLComputePipelineState> pso = executable->entry_points[3].pso;
   const iree_device_size_t workgroup_size = 32;
   iree_device_size_t workgroup_count = iree_hal_metal_ceil_div(length, workgroup_size * 4);

   iree_hal_metal_buffer_copy_spec_t spec = {
       .src_buffer_offset = source_offset,
       .dst_buffer_offset = target_offset,
       .length = length,
   };

   [encoder setComputePipelineState:pso];

   // The following MUST exactly match the pipeline layout from MSL source code.
   // buffer(0) is the source buffer. Note that we MUST set 0 as offset here--the offset is to be
   // handled directly in the kernels!
   [encoder setBuffer:source_buffer offset:0 atIndex:0];
   [encoder useResource:source_buffer usage:MTLResourceUsageRead];
   // buffer(0) is the target buffer. Note that we MUST set 0 as offset here--the offset is to be
   // handled directly in the kernels!
   [encoder setBuffer:target_buffer offset:0 atIndex:1];
   [encoder useResource:target_buffer usage:MTLResourceUsageWrite];
   // buffer(1) is the buffer copy spec.
   [encoder setBytes:&spec length:sizeof(spec) atIndex:2];

   // Encode the dispatch.
   [encoder dispatchThreadgroups:MTLSizeMake(workgroup_count, 1, 1)
           threadsPerThreadgroup:MTLSizeMake(workgroup_size, 1, 1)];
   return iree_ok_status();
 }
	// Copyright 2023 The IREE Authors
	//
	// Licensed under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

	#include "experimental/metal/builtin_executables.h"

	#include <string.h>

	#include "experimental/metal/builtin/metal_buffer_kernels.h"
	#include "iree/base/api.h"
	#include "iree/base/tracing.h"
	#include "iree/hal/api.h"

	typedef struct iree_hal_metal_builtin_executable_data_t {
	const char* entry_point;
	uint32_t file_index;
	} iree_hal_metal_builtin_executable_data_t;

	// The list of builtin executable entry points and their source file index in builtin exectuable
	// embedded data. This MUST be consistent with kernel function names in MSL source code and the file
	// order in embedded data.
	static iree_hal_metal_builtin_executable_data_t iree_hal_metal_builtin_executable_entry_points[] = {
	{"fill_buffer_16byte", 1}, // Buffer fills; 16-byte aligned offset/length
	{"fill_buffer_4byte", 1}, // Buffer fills; 4-byte aligned offset/length
	{"fill_buffer_1byte", 1}, // Buffer fills; 1-byte aligned offset/length
	{"copy_buffer_1byte", 0}, // Buffer copies; 1-byte aligned offset/length
	};

	// The buffer fill specificiation. This MUST be consistent with the same struct in MSL source code.
	typedef struct iree_hal_metal_buffer_fill_spec_t {
	uint64_t buffer_offset; // Buffer offset to fill (in bytes)
	uint64_t buffer_length; // Buffer length to fill (in bytes)
	uint32_t pattern; // 32-bit fill pattern
	} iree_hal_metal_buffer_fill_spec_t;

	typedef struct iree_hal_metal_buffer_copy_spec_t {
	uint64_t src_buffer_offset; // Source buffer offset (in bytes)
	uint64_t dst_buffer_offset; // Destination buffer offset (in bytes)
	uint64_t length; // Buffer length to fill (in bytes)
	} iree_hal_metal_buffer_copy_spec_t;

	iree_status_t iree_hal_metal_builtin_executable_create(
	id<MTLDevice> device, iree_allocator_t host_allocator,
	iree_hal_metal_builtin_executable_t** out_executable) {
	IREE_ASSERT_ARGUMENT(out_executable);
	*out_executable = NULL;
	IREE_TRACE_ZONE_BEGIN(z0);

	iree_hal_metal_builtin_executable_t* executable = NULL;
	iree_host_size_t entry_point_count =
	IREE_ARRAYSIZE(iree_hal_metal_builtin_executable_entry_points);
	iree_host_size_t total_size =
	sizeof(executable) + entry_point_count sizeof(executable->entry_points[0]);
	iree_status_t status = iree_allocator_malloc(host_allocator, total_size, (void**)&executable);

	if (iree_status_is_ok(status)) {
	executable->host_allocator = host_allocator;
	executable->entry_point_count = entry_point_count;

	// Compile each MSL source string into a MTLLibrary and get the MTLFunction for the entry point
	// to build the pipeline state object.
	// TODO(antiagainst): We are performing synchronous compilation at runtime here. This is good
	// for debugging purposes but bad for performance. Enable offline compilation and make that as
	// the default.

	MTLCompileOptions* compile_options = [MTLCompileOptions new]; // +1
	compile_options.languageVersion = MTLLanguageVersion3_0;

	for (iree_host_size_t i = 0; i < IREE_ARRAYSIZE(iree_hal_metal_builtin_executable_entry_points);
	++i) {
	const char* entry_point = iree_hal_metal_builtin_executable_entry_points[i].entry_point;
	uint32_t file_index = iree_hal_metal_builtin_executable_entry_points[i].file_index;
	iree_file_toc_t source_code = metal_buffer_kernels_create()[file_index];

	id<MTLLibrary> library = nil;
	id<MTLFunction> function = nil;
	id<MTLComputePipelineState> pso = nil;
	status = iree_hal_metal_compile_msl_and_create_pipeline_object(
	iree_make_string_view(source_code.data, source_code.size), IREE_SV(entry_point), device,
	compile_options, &library, &function, &pso);
	if (!iree_status_is_ok(status)) break;

	// Package required parameters for kernel launches for each entry point.
	iree_hal_metal_kernel_params_t* params = &executable->entry_points[i];
	params->library = library;
	params->function = function;
	params->pso = pso;
	// Thread group size for builtin executables are determined at runtime dispatch time.
	params->threadgroup_size[0] = 0;
	params->threadgroup_size[1] = 0;
	params->threadgroup_size[2] = 0;
	// We don't need the layout parameter for builtin executables too.
	params->layout = NULL;

	// Stash the entry point name in the string table for use when tracing.
	IREE_TRACE({ params->function_name = IREE_SV(entry_point); });
	}

	[compile_options release]; // -1
	}

	if (iree_status_is_ok(status)) {
	*out_executable = executable;
	} else {
	iree_hal_metal_builtin_executable_destroy(executable);
	}

	IREE_TRACE_ZONE_END(z0);
	return status;
	}

	void iree_hal_metal_builtin_executable_destroy(iree_hal_metal_builtin_executable_t* executable) {
	IREE_TRACE_ZONE_BEGIN(z0);

	for (iree_host_size_t i = 0; i < executable->entry_point_count; ++i) {
	iree_hal_metal_kernel_params_t* entry_point = &executable->entry_points[i];
	[entry_point->pso release];
	[entry_point->function release];
	[entry_point->library release];
	IREE_ASSERT_EQ(entry_point->layout, NULL);
	}
	iree_allocator_free(executable->host_allocator, executable);

	IREE_TRACE_ZONE_END(z0);
	}

	static inline iree_device_size_t iree_hal_metal_ceil_div(iree_device_size_t a,
	iree_device_size_t b) {
	return (a + b - 1) / b;
	}

	iree_status_t iree_hal_metal_builtin_executable_fill_buffer(
	const iree_hal_metal_builtin_executable_t* executable, id<MTLComputeCommandEncoder> encoder,
	id<MTLBuffer> target_buffer, iree_device_size_t target_offset, iree_device_size_t length,
	uint32_t pattern) {
	id<MTLComputePipelineState> pso = nil;
	MTLResourceUsage usage = MTLResourceUsageWrite;
	const iree_device_size_t workgroup_size = 32;
	iree_device_size_t workgroup_count = 0;

	if (target_offset % 16 == 0 && length % 16 == 0) { // 16-byte aligned case
	pso = executable->entry_points[0].pso;
	workgroup_count = iree_hal_metal_ceil_div(length, workgroup_size * 16);
	} else if (target_offset % 4 == 0 && length % 4 == 0) { // 4-byte aligned case
	pso = executable->entry_points[1].pso;
	workgroup_count = iree_hal_metal_ceil_div(length, workgroup_size * 4);
	} else { // 1-byte aligned case
	pso = executable->entry_points[2].pso;
	// We may potentially need to read some 32-bit scalars at unaligned addresses.
	usage \|= MTLResourceUsageRead;
	// Calculate unaligned partial prefix/suffix byte count, and then get the middle aligned byte
	// count for distributing threads. This logic MUST be consistent with the MSL source code.
	iree_device_size_t left_byte_count = target_offset % 4;
	iree_device_size_t right_byte_count = (target_offset + length) % 4;
	int64_t middle_byte_count = length - left_byte_count - right_byte_count;
	// Note that in the extreme case, we don't have aligned bytes in the middle (0), or actually
	// prefix and suffix partial bytes are the same (< 0). We'd need one thread to handle the
	// partial bytes at least.
	if (middle_byte_count <= 0) middle_byte_count = 1;
	workgroup_count = iree_hal_metal_ceil_div(middle_byte_count, workgroup_size * 4);
	}

	iree_hal_metal_buffer_fill_spec_t spec = {
	.buffer_offset = target_offset,
	.buffer_length = length,
	.pattern = pattern,
	};

	[encoder setComputePipelineState:pso];

	// The following MUST exactly match the pipeline layout from MSL source code.
	// buffer(0) is the target buffer to fill. Note that we MUST set 0 as offset here--the offset
	// is to be handled directly in the kernels!
	[encoder setBuffer:target_buffer offset:0 atIndex:0];
	[encoder useResource:target_buffer usage:usage];
	// buffer(1) is the buffer fill spec.
	[encoder setBytes:&spec length:sizeof(spec) atIndex:1];

	// Encode the dispatch.
	[encoder dispatchThreadgroups:MTLSizeMake(workgroup_count, 1, 1)
	threadsPerThreadgroup:MTLSizeMake(workgroup_size, 1, 1)];
	return iree_ok_status();
	}

	iree_status_t iree_hal_metal_builtin_executable_copy_buffer(
	const iree_hal_metal_builtin_executable_t* executable, id<MTLComputeCommandEncoder> encoder,
	id<MTLBuffer> source_buffer, iree_device_size_t source_offset, id<MTLBuffer> target_buffer,
	iree_device_size_t target_offset, iree_device_size_t length) {
	id<MTLComputePipelineState> pso = executable->entry_points[3].pso;
	const iree_device_size_t workgroup_size = 32;
	iree_device_size_t workgroup_count = iree_hal_metal_ceil_div(length, workgroup_size * 4);

	iree_hal_metal_buffer_copy_spec_t spec = {
	.src_buffer_offset = source_offset,
	.dst_buffer_offset = target_offset,
	.length = length,
	};

	[encoder setComputePipelineState:pso];

	// The following MUST exactly match the pipeline layout from MSL source code.
	// buffer(0) is the source buffer. Note that we MUST set 0 as offset here--the offset is to be
	// handled directly in the kernels!
	[encoder setBuffer:source_buffer offset:0 atIndex:0];
	[encoder useResource:source_buffer usage:MTLResourceUsageRead];
	// buffer(0) is the target buffer. Note that we MUST set 0 as offset here--the offset is to be
	// handled directly in the kernels!
	[encoder setBuffer:target_buffer offset:0 atIndex:1];
	[encoder useResource:target_buffer usage:MTLResourceUsageWrite];
	// buffer(1) is the buffer copy spec.
	[encoder setBytes:&spec length:sizeof(spec) atIndex:2];

	// Encode the dispatch.
	[encoder dispatchThreadgroups:MTLSizeMake(workgroup_count, 1, 1)
	threadsPerThreadgroup:MTLSizeMake(workgroup_size, 1, 1)];
	return iree_ok_status();
	}