Source code for psyclone.domain.gocean.transformations.gocean_opencl_trans

# -----------------------------------------------------------------------------
# BSD 3-Clause License
#
# Copyright (c) 2021-2026, Science and Technology Facilities Council.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
#   list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
#   contributors may be used to endorse or promote products derived from
#   this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
# -----------------------------------------------------------------------------
# Authors R. W. Ford, A. R. Porter and S. Siso, STFC Daresbury Lab

'''This module contains the GOcean-specific OpenCL transformation.
'''

import os

from psyclone.configuration import Config
from psyclone.domain.common.transformations import KernelModuleInlineTrans
from psyclone.errors import GenerationError
from psyclone.gocean1p0 import GOInvokeSchedule, GOLoop
from psyclone.psyGen import (
    Transformation, args_filter, InvokeSchedule, HaloExchange)
from psyclone.psyir.backend.opencl import OpenCLWriter
from psyclone.psyir.frontend.fortran import FortranReader
from psyclone.psyir.nodes import (
    Routine, Call, Reference, Literal, Assignment, IfBlock, ArrayReference,
    Schedule, BinaryOperation, StructureReference, FileContainer, CodeBlock,
    IntrinsicCall, Container, DataNode)
from psyclone.psyir.symbols import (
    ArrayType, DataSymbol, RoutineSymbol, ContainerSymbol,
    UnsupportedFortranType, ArgumentInterface, ImportInterface,
    ScalarType)
from psyclone.psyir.transformations.transformation_error import (
    TransformationError)


[docs] class GOOpenCLTrans(Transformation): ''' Switches on/off the generation of an OpenCL PSy layer for a given InvokeSchedule. Additionally, it will generate OpenCL kernels for each of the kernels referenced by the Invoke. For example: >>> from psyclone.parse.algorithm import parse >>> from psyclone.psyGen import PSyFactory >>> API = "gocean" >>> FILENAME = "shallow_alg.f90" # examples/gocean/eg1 >>> ast, invoke_info = parse(FILENAME, api=API) >>> psy = PSyFactory(API, distributed_memory=False).create(invoke_info) >>> schedule = psy.invokes.get('invoke_0').schedule >>> ocl_trans = GOOpenCLTrans() >>> ocl_trans.apply(schedule) >>> print(schedule.view()) ''' # Specify which OpenCL command queue to use for management operations like # data transfers when generating an OpenCL PSy-layer _OCL_MANAGEMENT_QUEUE = 1 # TODO #1572: These are class attributes because multiple invokes may need # to generate a single OpenCL environment (e.g. to share the device data # pointers) and therefore guarantee the same properties, but this hasn't # been tested. PSycloneBench ShallowWater could be an example of this. # Biggest queue number that any kernel is allocated to. The OpenCL # environment should be set up with at least this number of queues. _max_queue_number = 1 # Whether to enable the profiling option in the OpenCL environment _enable_profiling = False # Whether to enable the out_of_order option in the OpenCL environment _out_of_order = False # Total number of invokes that have been transformed to OpenCL _transformed_invokes = 0 # Reference to the OpenCL kernels file _kernels_file = None @property def name(self): ''' :returns: the name of this transformation. :rtype: str ''' return "GOOpenCLTrans"
[docs] def validate(self, node, options=None): ''' Checks that the supplied InvokeSchedule is valid and that an OpenCL version of it can be generated. :param node: the Schedule to check. :type node: :py:class:`psyclone.psyGen.InvokeSchedule` :param options: a dictionary with options for transformations. :type options: dict of str:value or None :param bool options["enable_profiling"]: whether or not to set up the OpenCL environment with the profiling option enabled. :param bool options["out_of_order"]: whether or not to set up the OpenCL environment with the out_of_order option enabled. :param bool options["end_barrier"]: whether or not to add an OpenCL barrier at the end of the transformed invoke. :raises TransformationError: if the InvokeSchedule is not for the GOcean API. :raises TransformationError: if any of the kernels have arguments which are passed as a literal. :raises TransformationError: if any of the provided options is invalid. :raises TransformationError: if any of the provided options is not compatible with a previous OpenCL environment. :raises TransformationError: if any kernel in this invoke has a global variable used by an import. :raises TransformationError: if any kernel does not iterate over the whole grid. ''' if isinstance(node, InvokeSchedule): if not isinstance(node, GOInvokeSchedule): raise TransformationError( f"OpenCL generation is currently only supported for the " f"GOcean API but got an InvokeSchedule of type: " f"'{type(node).__name__}'") else: raise TransformationError( f"Error in GOOpenCLTrans: the supplied node must be a (sub-" f"class of) InvokeSchedule but got {type(node)}") # Validate options map valid_options = ['end_barrier', 'enable_profiling', 'out_of_order'] if options: for key, value in options.items(): if key in valid_options: # All current options should contain boolean values if not isinstance(value, bool): raise TransformationError( f"InvokeSchedule OpenCL option '{key}' should be " f"a boolean.") else: raise TransformationError( f"InvokeSchedule does not support the OpenCL option " f"'{key}'. The supported options are: " f"{valid_options}.") # Validate that the options are valid with previously generated OpenCL if self._transformed_invokes > 0: if ('enable_profiling' in options and self._enable_profiling != options['enable_profiling']): raise TransformationError( f"Can't generate an OpenCL Invoke with enable_profiling='" f"{options['enable_profiling']}' since a previous " f"transformation used a different value, and their OpenCL" f" environments must match.") if ('out_of_order' in options and self._out_of_order != options['out_of_order']): raise TransformationError( f"Can't generate an OpenCL Invoke with out_of_order='" f"{options['out_of_order']}' since a previous " f"transformation used a different value, and their OpenCL " f"environments must match.") # Now we need to check that none of the invoke arguments is a literal args = args_filter(node.args, arg_types=["scalar"]) for arg in args: if arg.is_literal: raise TransformationError( f"Cannot generate OpenCL for Invokes that contain kernel " f"arguments which are a literal, but found the literal " f"'{arg.name}' used as an argument in invoke " f"'{node.name}'.") # Check that we can construct the PSyIR and SymbolTable of each of # the kernels in this Schedule. Also check that none of them access # any form of global data (that is not a routine argument or just # type information). for kern in node.kernels(): if not kern.module_inline: KernelModuleInlineTrans().validate(kern) for ksched in kern.get_callees(): global_variables = set(ksched.symbol_table.imported_symbols) prec_symbols = set(ksched.symbol_table.precision_datasymbols) if global_variables.difference(prec_symbols): names = sorted([sym.name for sym in global_variables.difference(prec_symbols)]) raise TransformationError( f"The Symbol Table for kernel '{kern.name}' contains " f"the following symbols with 'global' scope: {names}. " f"An OpenCL kernel cannot call other kernels and all " f"of the data it accesses must be passed by argument. " f"Use the KernelImportsToArguments transformation to " f"convert such symbols to kernel arguments first.") # In OpenCL all kernel loops should iterate the whole grid for kernel in node.kernels(): inner_loop = kernel.ancestor(GOLoop) outer_loop = inner_loop.ancestor(GOLoop) if not (inner_loop.field_space == "go_every" and outer_loop.field_space == "go_every" and inner_loop.iteration_space == "go_all_pts" and outer_loop.iteration_space == "go_all_pts"): raise TransformationError( f"The kernel '{kernel.name}' does not iterate over all " f"grid points. This is a necessary requirement for " f"generating the OpenCL code and can be done by applying " f"the GOMoveIterationBoundariesInsideKernelTrans to each " f"kernel before the GOOpenCLTrans.")
[docs] def apply(self, node, options=None): ''' Apply the OpenCL transformation to the supplied GOInvokeSchedule. This causes PSyclone to generate an OpenCL version of the corresponding PSy-layer routine. The generated code makes use of the FortCL library (https://github.com/stfc/FortCL) in order to manage the OpenCL device directly from Fortran. :param node: the InvokeSchedule to transform. :type node: :py:class:`psyclone.psyGen.GOInvokeSchedule` :param options: set of option to tune the OpenCL generation. :type options: dict of str:value or None :param bool options["enable_profiling"]: whether or not to set up the \ OpenCL environment with the profiling option enabled. :param bool options["out_of_order"]: whether or not to set up the \ OpenCL environment with the out_of_order option enabled. :param bool options["end_barrier"]: whether or not to add an OpenCL \ barrier at the end of the transformed invoke. ''' if not options: options = {} self.validate(node, options) api_config = Config.get().api_conf("gocean") # Update class attributes if 'enable_profiling' in options: self._enable_profiling = options['enable_profiling'] if 'out_of_order' in options: self._out_of_order = options['out_of_order'] self._transformed_invokes += 1 # Get end_barrier option end_barrier = options.get('end_barrier', True) # Update the maximum value that the queue_number have. for kernel in node.coded_kernels(): self._max_queue_number = max(self._max_queue_number, kernel.opencl_options["queue_number"]) # Insert, if they don't already exist, the necessary OpenCL helper # subroutines in the root Container. module = node.ancestor(Container) psy_init = self._insert_opencl_init_routine(module) init_grid = self._insert_initialise_grid_buffers(module) write_grid_buf = self._insert_write_grid_buffers(module) self._insert_ocl_read_from_device_function(module) self._insert_ocl_write_to_device_function(module) init_buf = self._insert_ocl_initialise_buffer(module) for kern in node.coded_kernels(): self._insert_ocl_arg_setter_routine(module, kern) # Insert fortcl, clfortran and c_iso_binding import statement fortcl = ContainerSymbol("fortcl") node.symbol_table.add(fortcl) get_num_cmd_queues = RoutineSymbol( "get_num_cmd_queues", interface=ImportInterface(fortcl)) get_cmd_queues = RoutineSymbol( "get_cmd_queues", interface=ImportInterface(fortcl)) get_kernel_by_name = RoutineSymbol( "get_kernel_by_name", interface=ImportInterface(fortcl)) node.symbol_table.add(get_num_cmd_queues) node.symbol_table.add(get_cmd_queues) node.symbol_table.add(get_kernel_by_name) clfortran = ContainerSymbol("clfortran") node.symbol_table.add(clfortran) cl_finish = RoutineSymbol( "clFinish", interface=ImportInterface(clfortran)) cl_launch = RoutineSymbol( "clEnqueueNDRangeKernel", interface=ImportInterface(clfortran)) node.symbol_table.add(cl_finish) node.symbol_table.add(cl_launch) iso_c_binding = ContainerSymbol("iso_c_binding") node.symbol_table.add(iso_c_binding) c_loc = RoutineSymbol( "C_LOC", interface=ImportInterface(iso_c_binding)) c_null = DataSymbol( "C_NULL_PTR", datatype=ScalarType.integer_type(), interface=ImportInterface(iso_c_binding)) node.symbol_table.add(c_loc) node.symbol_table.add(c_null) # Include the check_status subroutine if we are in debug_mode if api_config.debug_mode: ocl_utils = ContainerSymbol("ocl_utils_mod") check_status = RoutineSymbol( "check_status", interface=ImportInterface(ocl_utils)) node.symbol_table.add(ocl_utils) node.symbol_table.add(check_status) # Declare local variables needed by an OpenCL PSy-layer invoke qlist = node.symbol_table.new_symbol( "cmd_queues", symbol_type=DataSymbol, datatype=UnsupportedFortranType( "integer(kind=c_intptr_t), pointer, save :: cmd_queues(:)"), tag="opencl_cmd_queues") # 'first_time' needs to be an UnsupportedFortranType because it has # SAVE and initial value first = DataSymbol("first_time", datatype=UnsupportedFortranType( "logical, save :: first_time = .true.")) node.symbol_table.add(first, tag="first_time") flag = node.symbol_table.new_symbol( "ierr", symbol_type=DataSymbol, datatype=ScalarType.integer_type(), tag="opencl_error") global_size = node.symbol_table.new_symbol( "globalsize", symbol_type=DataSymbol, datatype=UnsupportedFortranType( "integer(kind=c_size_t), target :: globalsize(2)")) local_size = node.symbol_table.new_symbol( "localsize", symbol_type=DataSymbol, datatype=UnsupportedFortranType( "integer(kind=c_size_t), target :: localsize(2)")) # Bring all the boundaries at the beginning (since we are going to # use them during the setup block - and they don't change) boundary_vars = [] for tag, symbol in node.symbol_table.tags_dict.items(): if tag.startswith(("xstart_", "xstop_", "ystart_", "ystop_")): boundary_vars.append(symbol) cursor = 0 for assignment in node.walk(Assignment): if assignment.lhs.symbol in boundary_vars: node.children.insert(cursor, assignment.detach()) cursor += 1 # Create block of code to execute only the first time: setup_block = IfBlock.create(Reference(first), []) setup_block.preceding_comment = \ "Initialise OpenCL runtime, kernels and buffers" node.children.insert(cursor, setup_block) setup_block.if_body.addchild(Call.create(psy_init, [])) # Set up cmd_queues pointer cblock = CodeBlock.create(f"{qlist.name} => {get_cmd_queues.name}()", "pointer_assignment") setup_block.if_body.addchild(cblock) # Declare and assign kernel pointers for kern in node.coded_kernels(): name = "kernel_" + kern.name try: kpointer = node.symbol_table.lookup_with_tag(name) except KeyError: pointer_type = UnsupportedFortranType( "INTEGER(KIND=c_intptr_t), TARGET, SAVE :: " + name) kpointer = DataSymbol(name, datatype=pointer_type) node.symbol_table.add(kpointer, tag=name) setup_block.if_body.addchild( Assignment.create( Reference(kpointer), Call.create(get_kernel_by_name, [Literal(kern.name, ScalarType.character_type())]))) # Traverse all arguments and make sure all the buffers are initialised initialised_fields = set() there_is_a_grid_buffer = False for kern in node.coded_kernels(): for arg in kern.arguments.args: if arg.argument_type == "field": field = node.symbol_table.lookup(arg.name) if field not in initialised_fields: # Call the init_buffer routine with this field call = Call.create(init_buf, [Reference(field)]) setup_block.if_body.addchild(call) initialised_fields.add(field) elif (arg.argument_type == "grid_property" and not arg.is_scalar): if not there_is_a_grid_buffer: # Call the grid init_buffer routine field = node.symbol_table.lookup( kern.arguments.find_grid_access().name) call = Call.create(init_grid, [Reference(field)]) setup_block.if_body.addchild(call) there_is_a_grid_buffer = True if not arg.is_scalar: # All buffers will be assigned to a local OpenCL memory # object to easily reference them, make sure this local # variable is declared in the Invoke. name = arg.name + "_cl_mem" try: node.symbol_table.lookup_with_tag(name) except KeyError: node.symbol_table.new_symbol( name, tag=name, symbol_type=DataSymbol, datatype=UnsupportedFortranType( "INTEGER(KIND=c_intptr_t) :: " + name)) # Now call all the set_args routines because in some platforms (e.g. # in Xilinx FPGA) knowing which arguments each kernel is going to use # allows the write operation to place the data into the appropriate # memory bank. first_statement_comment = False kernel_names = set() for kern in node.coded_kernels(): if kern.name not in kernel_names: kernel_names.add(kern.name) callblock = self._generate_set_args_call(kern, node.scope) for child in callblock.pop_all_children(): setup_block.if_body.addchild(child) if not first_statement_comment: child.preceding_comment = ( "Do a set_args now so subsequent writes place the " "data appropriately") first_statement_comment = True # Now we can insert calls to write_to_device method for each buffer # and the grid writing call if there is one (in a new first time block) first_statement_comment = False for field in initialised_fields: call = Call.create( RoutineSymbol(field.name+"%write_to_device"), []) setup_block.if_body.addchild(call) if not first_statement_comment: call.preceding_comment = "Write data to the device" first_statement_comment = True if there_is_a_grid_buffer: fieldarg = node.coded_kernels()[0].arguments.find_grid_access() field = node.symbol_table.lookup(fieldarg.name) call = Call.create(write_grid_buf, [Reference(field)]) setup_block.if_body.addchild(call) # We will just mark the nodes we are replacing as deleting them inside # the loop would break the PSy-layer backward_dependency method in the # following iterations. We will detach all these nodes after the loop. nodes_to_detach = [] # Transform each kernel call loop construct to its equivalent FortCL # statements for kern in node.coded_kernels(): outerloop = kern.ancestor(GOLoop).ancestor(GOLoop) # Set up globalsize and localsize arrays garg = node.coded_kernels()[0].arguments.find_grid_access() num_x = api_config.grid_properties["go_grid_nx"].fortran\ .format(garg.name) num_y = api_config.grid_properties["go_grid_ny"].fortran\ .format(garg.name) assig = Assignment.create( Reference(global_size), Literal(f"(/{num_x}, {num_y}/)", ArrayType(ScalarType.integer_type(), [2]))) node.children.insert(outerloop.position, assig) local_size_value = kern.opencl_options['local_size'] assig = Assignment.create( Reference(local_size), Literal(f"(/{local_size_value}, 1/)", ArrayType(ScalarType.integer_type(), [2]))) node.children.insert(outerloop.position, assig) # Check that the global_size is multiple of the local_size if api_config.debug_mode: fortran_reader = FortranReader() global_size_expr = fortran_reader.psyir_from_expression( num_x, node.symbol_table) self._add_divisibility_check(node, outerloop.position, check_status, global_size_expr, local_size_value) # Retrieve kernel symbol kernelsym = node.symbol_table.lookup_with_tag( "kernel_" + kern.name) # Choose the command queue number to which to dispatch this kernel. # We have do deal with possible dependencies to kernels dispatched # in different command queues as the order of execution is not # guaranteed. queue_number = kern.opencl_options['queue_number'] cmd_queue = ArrayReference.create( qlist, [Literal(str(queue_number), ScalarType.integer_type())]) dependency = outerloop.backward_dependence() # If the dependency is a loop containing a kernel, add a barrier if # the previous kernels were dispatched in a different command queue if dependency: for kernel_dep in dependency.coded_kernels(): previous_queue = kernel_dep.opencl_options['queue_number'] if previous_queue != queue_number: # If the backward dependency is being executed in # another queue we add a barrier to make sure the # previous kernel has finished before this halo # exchange starts. barrier = Assignment.create( Reference(flag), Call.create(cl_finish, [ ArrayReference.create(qlist, [ Literal(str(previous_queue), ScalarType.integer_type())])])) node.children.insert(outerloop.position, barrier) # If the dependency is something other than a kernel, currently we # dispatch everything else to queue _OCL_MANAGEMENT_QUEUE, so add a # barrier if this kernel is not on queue _OCL_MANAGEMENT_QUEUE. if dependency and not dependency.coded_kernels() and \ queue_number != self._OCL_MANAGEMENT_QUEUE: barrier = Assignment.create( Reference(flag), Call.create(cl_finish, [ ArrayReference.create(qlist, [ Literal(str(self._OCL_MANAGEMENT_QUEUE), ScalarType.integer_type())])])) node.children.insert(outerloop.position, barrier) # Check that everything has succeeded before the kernel launch if api_config.debug_mode: self._add_ready_check(node, outerloop.position, check_status, kern.name, flag, cl_finish, cmd_queue.copy()) callblock = self._generate_set_args_call(kern, node.scope) for child in callblock.pop_all_children(): node.children.insert(outerloop.position, child) # Then we call the clEnqueueNDRangeKernel assig = Assignment.create( Reference(flag), Call.create(cl_launch, [ # OpenCL Command Queue cmd_queue, # OpenCL Kernel object Reference(kernelsym), # Number of work dimensions Literal("2", ScalarType.integer_type()), # Global offset (if NULL the global IDs start at # offset (0,0,0)) Reference(c_null), # Global work size Call.create(c_loc, [Reference(global_size)]), # Local work size Call.create(c_loc, [Reference(local_size)]), # Number of events in wait list Literal("0", ScalarType.integer_type()), # Event wait list that need to be completed before # this kernel Reference(c_null), # Event that identifies this kernel completion Reference(c_null)])) assig.preceding_comment = "Launch the kernel" node.children.insert(outerloop.position, assig) self._insert_kernel_code_in_opencl_file(kern) # Add additional checks if we are in debug mode if api_config.debug_mode: self._add_kernel_check(node, outerloop.position, check_status, kern.name, flag, cl_finish, cmd_queue.copy()) nodes_to_detach.append(outerloop) # If we execute the kernels asynchronously, we need to add wait # statements before the halo exchanges to guarantee that the data # has been updated for possible_dependent_node in node.walk(HaloExchange): # The backward_dependences returns the last Loop with a kernel # that has a dependency with this halo exchange dependency = possible_dependent_node.backward_dependence() if dependency: for kernel_dep in dependency.coded_kernels(): previous_queue = kernel_dep.opencl_options['queue_number'] if previous_queue != self._OCL_MANAGEMENT_QUEUE: # If the backward dependency is being executed in # another queue we add a barrier to make sure the # previous kernel has finished before this one starts. barrier = Assignment.create( Reference(flag), Call.create(cl_finish, [ ArrayReference.create(qlist, [ Literal(str(previous_queue), ScalarType.integer_type())])])) pos = possible_dependent_node.position node.children.insert(pos, barrier) for node_to_detach in nodes_to_detach: node_to_detach.detach() if end_barrier: self._add_end_barrier(node, flag, cl_finish, qlist) # And at the very end always makes sure that first_time value is False assign = Assignment.create(Reference(first), Literal("false", ScalarType.boolean_type())) assign.preceding_comment = "Unset the first time flag" node.addchild(assign) self._output_opencl_kernels_file()
def _add_end_barrier(self, node, flag, cl_finish, qlist): ''' Append into the given node a OpenCL Wait operation for each of the OpenCL queues in use. :param node: PSyIR node where to append the barrier. :type node: :py:class:`psyclone.psyir.nodes.Schedule` :param flag: PSyIR symbol to use as flag. :type flag: :py:class:`psyclone.psyir.symbols.DataSymbol` :param cl_finish: PSyIR symbol of the barrier routine. :type cl_finish: :py:class:`psyclone.psyir.symbols.RoutineSymbol` :param qlist: PSyIR symbol of the OpenCL queues array. :type qlist: :py:class:`psyclone.psyir.symbols.DataSymbol` ''' # We need a clFinish for each of the queues in the implementation added_comment = False for num in range(1, self._max_queue_number + 1): queue = ArrayReference.create(qlist, [Literal(str(num), ScalarType.integer_type())]) node.addchild( Assignment.create( Reference(flag), Call.create(cl_finish, [queue]))) if not added_comment: node.children[-1].preceding_comment = \ "Wait until all kernels have finished" added_comment = True @staticmethod def _add_divisibility_check(node, position, check_status, global_size_expr, local_size): ''' Insert into node a check that the global_size is exactly divisible by the local size. :param node: where to insert the divisibility check. :type node: :py:class:`psyclone.psyir.nodes.Schedule` :param int position: location where to insert the divisibilitay check. :param check_status: PSyIR symbol of the check routine. :type check_status: :py:class:`psyclone.psyir.symbols.RoutineSymbol` :param global_size_expr: PSyIR representing the global_size. :type global_size_expr: :py:class:`psyclone.psyir.nodes.DataNode` :param int local_size: size of the OpenCL local work_group. ''' check = BinaryOperation.create( BinaryOperation.Operator.NE, IntrinsicCall.create( IntrinsicCall.Intrinsic.MOD, [global_size_expr, Literal(str(local_size), ScalarType.integer_type())] ), Literal("0", ScalarType.integer_type())) message = ("Global size is not a multiple of local size (" "mandatory in OpenCL < 2.0).") error = Call.create(check_status, [Literal(message, ScalarType.character_type()), Literal("-1", ScalarType.integer_type())]) ifblock = IfBlock.create(check, [error]) node.children.insert(position, ifblock) @staticmethod def _add_kernel_check(node, position, check_status, kernel_name, flag, cl_finish, cmd_queue): ''' Insert into node a check that the kernel has been launched and has been executed successfully. :param node: where to insert the kernel check. :type node: :py:class:`psyclone.psyir.nodes.Schedule` :param int position: location where to insert the kernel check. :param check_status: PSyIR symbol of the check routine. :type check_status: :py:class:`psyclone.psyir.symbols.RoutineSymbol` :param str kernel_name: name of the kernel being checked. :param flag: PSyIR symbol to use as flag. :type flag: :py:class:`psyclone.psyir.symbols.DataSymbol` :param cl_finish: PSyIR symbol of the barrier routine. :type cl_finish: :py:class:`psyclone.psyir.symbols.RoutineSymbol` :param cmd_queue: PSyIR symbol of the OpenCL command queues array. :type cmd_queue: :py:class:`psyclone.psyir.symbols.DataSymbol` ''' # First check the launch return value message = Literal(f"{kernel_name} clEnqueueNDRangeKernel", ScalarType.character_type()) check = Call.create(check_status, [message, Reference(flag)]) node.children.insert(position, check) # Then add a barrier barrier = Assignment.create( Reference(flag), Call.create(cl_finish, [cmd_queue])) node.children.insert(position + 1, barrier) # And check the kernel executed successfully message = Literal(f"Errors during {kernel_name}", ScalarType.character_type()) check = Call.create(check_status, [message, Reference(flag)]) node.children.insert(position + 2, check) @staticmethod def _add_ready_check(node, position, check_status, kernel_name, flag, cl_finish, cmd_queue): ''' Insert into node a check that verifies if everything in the command queues previous to a kernel launch has completed successfully. :param node: where to insert the kernel check. :type node: :py:class:`psyclone.psyir.nodes.Schedule` :param int position: location where to insert the kernel check. :param check_status: PSyIR symbol of the check routine. :type check_status: :py:class:`psyclone.psyir.symbols.RoutineSymbol` :param str kernel_name: name of the kernel being checked. :param flag: PSyIR symbol to use as flag. :type flag: :py:class:`psyclone.psyir.symbols.DataSymbol` :param cl_finish: PSyIR symbol of the barrier routine. :type cl_finish: :py:class:`psyclone.psyir.symbols.RoutineSymbol` :param cmd_queue: PSyIR symbol of the OpenCL command queues array. :type cmd_queue: :py:class:`psyclone.psyir.symbols.DataSymbol` ''' barrier = Assignment.create( Reference(flag), Call.create(cl_finish, [cmd_queue])) node.children.insert(position, barrier) message = Literal(f"Errors before {kernel_name} launch", ScalarType.character_type()) check = Call.create(check_status, [message, Reference(flag)]) node.children.insert(position + 1, check) def _insert_kernel_code_in_opencl_file(self, kernel): ''' Insert the given kernel into a OpenCL file. For this we need to remove the 'go_wp' precision symbol which can't be generated by OpenCL. We assume 'go_wp' is a OpenCL double. :param kernel: the kernel to insert. :type kernel: :py:class:`psyclone.psyir.nodes.KernelSchedule` ''' if not self._kernels_file: self._kernels_file = FileContainer("opencl_kernels") # Create a copy of the kernel and remove precision symbols since they # are not supported in the OpenCL backend. # validate() has checked that the kernel is not polymorphic. schedule = kernel.get_callees()[0] kernel_copy = schedule.copy() symtab = kernel_copy.symbol_table # TODO #898: Removing symbols is not properly supported by PSyIR # because we have to deal with all references to it. In this case we # implement manually a conversion of all 'go_wp' to a double precision # and remove the symbol because we guarantee that it just appear in the # declarations of other symbols (symtab.datasymbols). # pylint: disable=protected-access for sym in symtab.datasymbols: # Not all types have the 'precision' attribute (e.g. # UnresolvedType) if (hasattr(sym.datatype, "precision") and isinstance(sym.datatype.precision, DataNode)): sym.datatype._precision = ScalarType.Precision.DOUBLE if 'go_wp' in symtab: del symtab._symbols['go_wp'] # Insert kernel in the OpenCL kernels file if it doesn't already exist for routine in self._kernels_file.walk(Routine): if routine.name == kernel.name: break # if it exist re-use existing one # TODO 1572: Here we assume that in the same Invoke (scope) a # kernel with the same name will be the same kernel, but that # may not be true when doing multiple invokes. else: self._kernels_file.addchild(kernel_copy) def _output_opencl_kernels_file(self) -> None: ''' Write the OpenCL kernels to a file using the OpenCL backend. ''' ocl_writer = OpenCLWriter(kernels_local_size=64) new_kern_code = ocl_writer(self._kernels_file) fdesc = None name_idx = -1 while not fdesc: name_idx += 1 new_name = f"opencl_kernels_{name_idx}.cl" try: # Atomically attempt to open the new kernel file (in case # this is part of a parallel build) fdesc = os.open( os.path.join(Config.get().kernel_output_dir, new_name), os.O_CREAT | os.O_WRONLY | os.O_EXCL) except (OSError, IOError): # The os.O_CREATE and os.O_EXCL flags in combination mean # that open() raises an error if the file exists continue # Write the modified AST out to file os.write(fdesc, new_kern_code.encode()) # Close the new kernel file os.close(fdesc) @staticmethod def _generate_set_args_call(kernel, scope): ''' Generate the Call statement to the set_args subroutine for the provided kernel. :param kernel: the kernel for which to generate a call to its \ arg_setter subroutine. :type kernel: :py:class:`psyclone.psyGen.CodedKern` :param scope: The node representing the scope where the call \ statements will be inserted. :type scope: :py:class:`psyclone.psyir.nodes.ScopingNode` :returns: a block of statements that represent the set_args call :rtype: :py:class:`psyclone.psyir.nodes.Schedule` ''' call_block = Schedule() # Retrieve symbol table and kernel symbol symtab = scope.symbol_table kernelsym = symtab.lookup_with_tag("kernel_" + kernel.name) # Find the symbol that defines each boundary for this kernel. # In OpenCL the iteration boundaries are passed as arguments to the # kernel because the global work size may exceed the dimensions and # therefore the updates outside the boundaries should be masked. # If any of the boundaries is not found, it can not proceed. boundaries = [] try: for boundary in ["xstart", "xstop", "ystart", "ystop"]: tag = boundary + "_" + kernel.name symbol = symtab.lookup_with_tag(tag) boundaries.append(symbol.name) except KeyError as err: raise GenerationError( f"Boundary symbol tag '{tag}' not found while generating the " f"OpenCL code for kernel '{kernel.name}'. Make sure to apply " f"the GOMoveIterationBoundariesInsideKernelTrans before " f"attempting the OpenCL code generation.") from err api_config = Config.get().api_conf("gocean") # Prepare the argument list for the set_args routine arguments = [Reference(kernelsym)] for arg in kernel.arguments.args: if arg.argument_type == "scalar": if arg.name in boundaries: # Boundary values are 0-indexed in OpenCL and 1-indexed in # PSyIR, therefore we need to subtract 1 bop = BinaryOperation.create( BinaryOperation.Operator.SUB, arg.psyir_expression(), Literal("1", ScalarType.integer_type())) arguments.append(bop) else: arguments.append(arg.psyir_expression()) elif arg.argument_type == "field": # Cast buffer to cl_mem type expected by OpenCL field = symtab.lookup(arg.name) symbol = symtab.lookup_with_tag(arg.name + "_cl_mem") source = StructureReference.create(field, ['device_ptr']) dest = Reference(symbol) icall = IntrinsicCall.create(IntrinsicCall.Intrinsic.TRANSFER, [source, dest]) assig = Assignment.create(dest.copy(), icall) call_block.addchild(assig) arguments.append(Reference(symbol)) elif arg.argument_type == "grid_property": garg = kernel.arguments.find_grid_access() if arg.is_scalar: # pylint: disable=protected-access arguments.append( StructureReference.create( symtab.lookup(garg.name), api_config.grid_properties[arg._property_name] .fortran.split('%')[1:] )) else: # Cast grid buffer to cl_mem type expected by OpenCL device_grid_property = arg.name + "_device" field = symtab.lookup(garg.name) source = StructureReference.create( field, ['grid', device_grid_property]) symbol = symtab.lookup_with_tag(arg.name + "_cl_mem") dest = Reference(symbol) icall = IntrinsicCall.create( IntrinsicCall.Intrinsic.TRANSFER, [source, dest]) assig = Assignment.create(dest.copy(), icall) call_block.addchild(assig) arguments.append(Reference(symbol)) call_symbol = symtab.lookup_with_tag(kernel.name + "_set_args") call_block.addchild(Call.create(call_symbol, arguments)) return call_block @staticmethod def _insert_ocl_arg_setter_routine(node, kernel): ''' Returns the symbol of the subroutine that sets the OpenCL kernel arguments for the provided PSy-layer kernel using FortCL. If the subroutine doesn't exist it also generates it. :param node: the container where the new subroutine will be inserted. :type node: :py:class:`psyclone.psyir.nodes.Container` :param kernel: the kernel call for which to provide the arg_setter \ subroutine. :type kernel: :py:class:`psyclone.psyGen.CodedKern` :returns: the symbol representing the arg_setter subroutine. :rtype: :py:class:`psyclone.psyir.symbols.RoutineSymbol` ''' # Check if the subroutine already exist. sub_name = kernel.name + "_set_args" try: return node.symbol_table.lookup_with_tag(sub_name) except KeyError: # If the Symbol does not exist, the rest of this method # will generate it. pass # Create the new Routine sub_name = node.symbol_table.next_available_name(sub_name) sub_symbol = node.symbol_table.new_symbol( sub_name, tag=kernel.name + "_set_args", symbol_type=RoutineSymbol) argsetter = Routine(sub_symbol) arg_list = [] # Add subroutine imported symbols clfortran = ContainerSymbol("clfortran") clsetkernelarg = RoutineSymbol("clSetKernelArg", interface=ImportInterface(clfortran)) iso_c = ContainerSymbol("iso_c_binding") c_sizeof = RoutineSymbol("C_SIZEOF", interface=ImportInterface(iso_c)) c_loc = RoutineSymbol("C_LOC", interface=ImportInterface(iso_c)) c_intptr_t = RoutineSymbol("c_intptr_t", interface=ImportInterface(iso_c)) ocl_utils = ContainerSymbol("ocl_utils_mod") check_status = RoutineSymbol("check_status", interface=ImportInterface(ocl_utils)) argsetter.symbol_table.add(clfortran) argsetter.symbol_table.add(clsetkernelarg) argsetter.symbol_table.add(iso_c) argsetter.symbol_table.add(c_sizeof) argsetter.symbol_table.add(c_loc) argsetter.symbol_table.add(c_intptr_t) argsetter.symbol_table.add(ocl_utils) argsetter.symbol_table.add(check_status) # Add an argument symbol for the kernel object kobj = argsetter.symbol_table.new_symbol( "kernel_obj", symbol_type=DataSymbol, interface=ArgumentInterface(ArgumentInterface.Access.READ), datatype=UnsupportedFortranType( "INTEGER(KIND=c_intptr_t), TARGET :: kernel_obj")) arg_list.append(kobj) # Include each kernel call argument as an argument of this routine for arg in kernel.arguments.args: name = argsetter.symbol_table.next_available_name(arg.name) # This function requires 'TARGET' annotated declarations which are # not supported in the PSyIR, so we build them as # UnsupportedFortranType for now. if arg.is_scalar and arg.intrinsic_type == "real": pointer_type = UnsupportedFortranType( "REAL(KIND=go_wp), INTENT(IN), TARGET :: " + name) elif arg.is_scalar: pointer_type = UnsupportedFortranType( "INTEGER, INTENT(IN), TARGET :: " + name) else: # Everything else is a cl_mem pointer (c_intptr_t) pointer_type = UnsupportedFortranType( "INTEGER(KIND=c_intptr_t), INTENT(IN), TARGET :: " + name) new_arg = DataSymbol( name, datatype=pointer_type, interface=ArgumentInterface(ArgumentInterface.Access.READ)) argsetter.symbol_table.add(new_arg) arg_list.append(new_arg) argsetter.symbol_table.specify_argument_list(arg_list) # Create the ierr local variable ierr = argsetter.symbol_table.new_symbol( "ierr", symbol_type=DataSymbol, datatype=ScalarType.integer_type()) # Call the clSetKernelArg for each argument and a check_status to # see if the OpenCL call has succeeded for index, variable in enumerate(arg_list[1:]): call = Call.create(clsetkernelarg, [Reference(kobj), Literal(str(index), ScalarType.integer_type()), Call.create(c_sizeof, [Reference(variable)]), Call.create(c_loc, [Reference(variable)])]) assignment = Assignment.create(Reference(ierr), call) argsetter.addchild(assignment) emsg = f"clSetKernelArg: arg {index} of {kernel.name}" call = Call.create(check_status, [Literal(emsg, ScalarType.character_type()), Reference(ierr)]) argsetter.addchild(call) argsetter.children[0].preceding_comment = \ f"Set the arguments for the {kernel.name} OpenCL Kernel" # Add the subroutine as child of the provided node node.addchild(argsetter) return node.symbol_table.lookup_with_tag(kernel.name + "_set_args") def _insert_opencl_init_routine(self, node): ''' Returns the symbol of the subroutine that initialises the OpenCL environment using FortCL. If the subroutine doesn't exist it also generates it. :param node: the container where the new subroutine will be inserted. :type node: :py:class:`psyclone.psyir.nodes.Container` :returns: the symbol representing the OpenCL initialisation subroutine. :rtype: :py:class:`psyclone.psyir.symbols.RoutineSymbol` ''' symtab = node.symbol_table try: # TODO #1572: The ocl_init routine may need to be regenerated if # there are multiple Invokes because _max_queue_number may have # increased and we need to load the kernels of both invokes. return symtab.lookup_with_tag("ocl_init_routine") except KeyError: # If the Symbol does not exist, the rest of this method # will generate it. pass # Choose a round-robin device number if it has MPI and multiple # accelerators. distributed_memory = Config.get().distributed_memory devices_per_node = Config.get().ocl_devices_per_node additional_uses = "" additional_stmts = "" if devices_per_node > 1 and distributed_memory: additional_uses += "USE parallel_mod, ONLY: get_rank" additional_stmts += \ f"ocl_device_num = mod(get_rank()-1, {devices_per_node}) + 1" # Get a set of all kernel names in the Container. This implementation # currently assumes all of them will be available in OpenCL unique_kernels = {kernel.name for kernel in node.coded_kernels()} # Code of the subroutine in Fortran code = f''' subroutine psy_init() {additional_uses} use fortcl, only: ocl_env_init, add_kernels character(len=30) kernel_names({len(unique_kernels)}) integer :: ocl_device_num=1 logical, save :: initialised=.false. ! Check to make sure we only execute this routine once if (.not. initialised) then initialised = .true. ! Initialise the opencl environment/device {additional_stmts} call ocl_env_init({self._max_queue_number}, ocl_device_num, & {".true." if self._enable_profiling else ".false."}, & {".true." if self._out_of_order else ".false."}) ! The kernels this psy layer module requires ''' for index, kernel_name in enumerate(unique_kernels): code += f"kernel_names({index + 1}) = \"{kernel_name}\"\n" code += f'''\ ! Create the opencl kernel objects. This expects to find all of ! the compiled kernels in FORTCL_KERNELS_FILE environment variable call add_kernels({len(unique_kernels)}, kernel_names) end if end subroutine psy_init''' # Create the symbol for the routine. subroutine_symbol = RoutineSymbol("psy_init") # Obtain the PSyIR representation of the code above fortran_reader = FortranReader() container = fortran_reader.psyir_from_source(code) subroutine = container.children[0] subroutine.detach() node.symbol_table.add(subroutine_symbol, tag="ocl_init_routine") subroutine.symbol = subroutine_symbol # Add the subroutine as child of the provided node node.addchild(subroutine) return symtab.lookup_with_tag("ocl_init_routine") @staticmethod def _insert_initialise_grid_buffers(node): ''' Returns the symbol of a subroutine that initialises all OpenCL grid buffers in the OpenCL device using FortCL. If the subroutine doesn't already exist it also generates it. :param node: the container where the new subroutine will be inserted. :type node: :py:class:`psyclone.psyir.nodes.Container` :returns: the symbol of the grid buffer initialisation subroutine. :rtype: :py:class:`psyclone.psyir.symbols.RoutineSymbol` ''' # pylint: disable=too-many-locals symtab = node.symbol_table try: return symtab.lookup_with_tag("ocl_init_grid_buffers") except KeyError: # If the Symbol does not exist, the rest of this method # will generate it. pass # Get the GOcean API property names used in this routine api_config = Config.get().api_conf("gocean") props = api_config.grid_properties num_x = props["go_grid_nx"].fortran.format("field") num_y = props["go_grid_ny"].fortran.format("field") int_arrays = [] real_arrays = [] for key, prop in props.items(): if key == "go_grid_data": # TODO #676: Ignore because go_grid_data is actually a field # property continue if prop.type == "array" and prop.intrinsic_type == "integer": int_arrays.append(prop.fortran.format("field")) elif prop.type == "array" and prop.intrinsic_type == "real": real_arrays.append(prop.fortran.format("field")) # Code of the subroutine in Fortran code = f''' subroutine initialise_device_grid(field) USE fortcl, ONLY: create_ronly_buffer USE iso_c_binding, only: c_size_t use field_mod type(r2d_field), intent(inout), target :: field integer(kind=c_size_t) size_in_bytes IF (.not. c_associated({int_arrays[0]}_device)) THEN ! Create integer grid fields size_in_bytes = int({num_x}*{num_y}, 8) * & c_sizeof({int_arrays[0]}(1,1)) ''' for int_array in int_arrays: code += f''' {int_array}_device = transfer( & create_ronly_buffer(size_in_bytes), {int_array}_device) ''' code += f''' ! Create real grid buffers size_in_bytes = int({num_x} * {num_y}, 8) * & c_sizeof({real_arrays[0]}(1,1)) ''' for real_array in real_arrays: code += f''' {real_array}_device = transfer( & create_ronly_buffer(size_in_bytes), {real_array}_device) ''' code += ''' END IF end subroutine initialise_device_grid ''' # Create the symbol for the routine. subroutine_symbol = RoutineSymbol("initialise_grid_device_buffers") # Obtain the PSyIR representation of the code above fortran_reader = FortranReader() container = fortran_reader.psyir_from_source(code) subroutine = container.children[0] symtab.add(subroutine_symbol, tag="ocl_init_grid_buffers") subroutine.detach() subroutine.symbol = subroutine_symbol # Add the subroutine as child of the provided node node.addchild(subroutine) return symtab.lookup_with_tag("ocl_init_grid_buffers") def _insert_write_grid_buffers(self, node): ''' Returns the symbol of a subroutine that writes the values of the grid properties into the OpenCL device buffers using FortCL. If the subroutine doesn't already exist it also generates it. :param node: the container where the new subroutine will be inserted. :type node: :py:class:`psyclone.psyir.nodes.Container` :returns: the symbol representing the grid buffers writing subroutine. :rtype: :py:class:`psyclone.psyir.symbols.RoutineSymbol` ''' # pylint: disable=too-many-locals symtab = node.symbol_table try: return symtab.lookup_with_tag("ocl_write_grid_buffers") except KeyError: # If the Symbol does not exist, the rest of this method # will generate it. pass # Get the GOcean API property names used in this routine api_config = Config.get().api_conf("gocean") props = api_config.grid_properties num_x = props["go_grid_nx"].fortran.format("field") num_y = props["go_grid_ny"].fortran.format("field") # Code of the subroutine in Fortran code = f''' subroutine write_device_grid(field) USE fortcl, ONLY: get_cmd_queues use iso_c_binding, only: c_intptr_t, c_size_t, c_sizeof USE clfortran USE ocl_utils_mod, ONLY: check_status type(r2d_field), intent(inout), target :: field integer(kind=c_size_t) size_in_bytes INTEGER(c_intptr_t), pointer :: cmd_queues(:) integer(c_intptr_t) :: cl_mem integer :: ierr cmd_queues => get_cmd_queues() ! Integer grid buffers size_in_bytes = int({num_x} * {num_y}, 8) * & c_sizeof(field%grid%tmask(1,1)) cl_mem = transfer(field%grid%tmask_device, cl_mem) ierr = clEnqueueWriteBuffer( & cmd_queues({self._OCL_MANAGEMENT_QUEUE}), & cl_mem, CL_TRUE, 0_8, size_in_bytes, & C_LOC(field%grid%tmask), 0, C_NULL_PTR, C_NULL_PTR) CALL check_status("clEnqueueWriteBuffer tmask", ierr) ! Real grid buffers size_in_bytes = int({num_x} * {num_y}, 8) * & c_sizeof(field%grid%area_t(1,1)) ''' write_str = ''' cl_mem = transfer(field%grid%{0}_device, cl_mem) ierr = clEnqueueWriteBuffer(cmd_queues({1}), & cl_mem, CL_TRUE, 0_8, size_in_bytes, & C_LOC(field%grid%{0}), 0, C_NULL_PTR, C_NULL_PTR) CALL check_status("clEnqueueWriteBuffer {0}_device", ierr) ''' for grid_prop in ['area_t', 'area_u', 'area_v', 'dx_u', 'dx_v', 'dx_t', 'dy_u', 'dy_v', 'dy_t', 'gphiu', 'gphiv']: code += write_str.format(grid_prop, self._OCL_MANAGEMENT_QUEUE) code += "end subroutine write_device_grid" # Create the symbol for the routine. subroutine_symbol = RoutineSymbol("write_grid_buffers") # Obtain the PSyIR representation of the code above fortran_reader = FortranReader() container = fortran_reader.psyir_from_source(code) subroutine = container.children[0] symtab.add(subroutine_symbol, tag="ocl_write_grid_buffers") subroutine.detach() subroutine.symbol = subroutine_symbol # Add the subroutine as child of the provided node node.addchild(subroutine) return symtab.lookup_with_tag("ocl_write_grid_buffers") def _insert_ocl_read_from_device_function(self, node): ''' Returns the symbol of a subroutine that retrieves the data back from an OpenCL device using FortCL. If the subroutine doesn't already exist it also generates it. :param node: the container where the new subroutine will be inserted. :type node: :py:class:`psyclone.psyir.nodes.Container` :returns: the symbol of the buffer data retrieving subroutine. :rtype: :py:class:`psyclone.psyir.symbols.RoutineSymbol` ''' symtab = node.symbol_table try: return symtab.lookup_with_tag("ocl_read_func") except KeyError: # If the subroutines does not exist, it needs to be # generated first. pass # Code of the subroutine in Fortran code = f''' subroutine read_sub(from, to, startx, starty, nx, ny, blocking) USE iso_c_binding, only: c_ptr, c_intptr_t, c_size_t, c_sizeof USE ocl_utils_mod, ONLY: check_status use kind_params_mod, only: go_wp USE clfortran USE fortcl, ONLY: get_cmd_queues type(c_ptr), intent(in) :: from real(go_wp), intent(inout), dimension(:,:), target :: to integer, intent(in) :: startx, starty, nx, ny logical, intent(in) :: blocking INTEGER(c_size_t) :: size_in_bytes, offset_in_bytes integer(c_intptr_t) :: cl_mem INTEGER(c_intptr_t), pointer :: cmd_queues(:) integer :: ierr, i ! Give the from pointer the appropriate OpenCL memory object type cl_mem = transfer(from, cl_mem) cmd_queues => get_cmd_queues() ! Two copy strategies depending on how much of the total length ! nx covers. if (nx < size(to, 1) / 2) then ! Dispatch asynchronous copies of just the contiguous data. do i = starty, starty+ny size_in_bytes = int(nx, 8) * c_sizeof(to(1,1)) offset_in_bytes = int(size(to, 1) * (i-1) + (startx-1)) & * c_sizeof(to(1,1)) ierr = clEnqueueReadBuffer( & cmd_queues({self._OCL_MANAGEMENT_QUEUE}), cl_mem, & CL_FALSE, offset_in_bytes, size_in_bytes, & C_LOC(to(startx, i)), 0, C_NULL_PTR, C_NULL_PTR) CALL check_status("clEnqueueReadBuffer", ierr) enddo if (blocking) then CALL check_status("clFinish on read", & clFinish(cmd_queues({self._OCL_MANAGEMENT_QUEUE}))) endif else ! Copy across the whole starty:starty+ny rows in a single ! copy operation. size_in_bytes = int(size(to, 1) * ny, 8) * c_sizeof(to(1,1)) offset_in_bytes = int(size(to,1)*(starty-1), 8) & * c_sizeof(to(1,1)) ierr = clEnqueueReadBuffer( & cmd_queues({self._OCL_MANAGEMENT_QUEUE}), cl_mem, & CL_TRUE, offset_in_bytes, size_in_bytes, & C_LOC(to(1,starty)), 0, C_NULL_PTR, C_NULL_PTR) CALL check_status("clEnqueueReadBuffer", ierr) endif end subroutine read_sub ''' # Create the symbol for the routine. subroutine_symbol = RoutineSymbol("read_from_device") # Obtain the PSyIR representation of the code above fortran_reader = FortranReader() container = fortran_reader.psyir_from_source(code) subroutine = container.children[0] symtab.add(subroutine_symbol, tag="ocl_read_func") subroutine.detach() subroutine.symbol = subroutine_symbol # Add the subroutine as child of the provided node node.addchild(subroutine) return symtab.lookup_with_tag("ocl_read_func") def _insert_ocl_write_to_device_function(self, node): ''' Returns the symbol of a subroutine that writes the buffer data into an OpenCL device using FortCL. If the subroutine doesn't already exist it also generates it. :param node: the container where the new subroutine will be inserted. :type node: :py:class:`psyclone.psyir.nodes.Container` :returns: the symbol of the buffer writing subroutine. :rtype: :py:class:`psyclone.psyir.symbols.RoutineSymbol` ''' symtab = node.symbol_table try: return symtab.lookup_with_tag("ocl_write_func") except KeyError: # If the subroutines does not exist, it needs to be # generated first. pass # Code of the subroutine in Fortran code = f''' subroutine write_sub(from, to, startx, starty, nx, ny, blocking) USE iso_c_binding, only: c_ptr, c_intptr_t, c_size_t, c_sizeof USE ocl_utils_mod, ONLY: check_status use kind_params_mod, only: go_wp USE clfortran USE fortcl, ONLY: get_cmd_queues real(go_wp), intent(in), dimension(:,:), target :: from type(c_ptr), intent(in) :: to integer, intent(in) :: startx, starty, nx, ny logical, intent(in) :: blocking integer(c_intptr_t) :: cl_mem INTEGER(c_size_t) :: size_in_bytes, offset_in_bytes INTEGER(c_intptr_t), pointer :: cmd_queues(:) integer :: ierr, i ! Give the to pointer the appropriate OpenCL memory object type cl_mem = transfer(to, cl_mem) cmd_queues => get_cmd_queues() ! Two copy strategies depending on how much of the total length ! nx covers. if (nx < size(from,1) / 2) then ! Dispatch asynchronous copies of just the contiguous data. do i=starty, starty+ny size_in_bytes = int(nx, 8) * c_sizeof(from(1,1)) offset_in_bytes = int(size(from, 1) * (i-1) + (startx-1)) & * c_sizeof(from(1,1)) ierr = clEnqueueWriteBuffer( & cmd_queues({self._OCL_MANAGEMENT_QUEUE}), cl_mem, & CL_FALSE, offset_in_bytes, size_in_bytes, & C_LOC(from(startx, i)), 0, C_NULL_PTR, C_NULL_PTR) CALL check_status("clEnqueueWriteBuffer", ierr) enddo if (blocking) then CALL check_status("clFinish on write", & clFinish(cmd_queues({self._OCL_MANAGEMENT_QUEUE}))) endif else ! Copy across the whole starty:starty+ny rows in a single ! copy operation. size_in_bytes = int(size(from,1) * ny, 8) * c_sizeof(from(1,1)) offset_in_bytes = int(size(from,1) * (starty-1)) & * c_sizeof(from(1,1)) ierr = clEnqueueWriteBuffer(& cmd_queues({self._OCL_MANAGEMENT_QUEUE}), cl_mem, & CL_TRUE, offset_in_bytes, size_in_bytes, & C_LOC(from(1, starty)), 0, C_NULL_PTR, C_NULL_PTR) CALL check_status("clEnqueueWriteBuffer", ierr) endif end subroutine write_sub ''' # Create the symbol for the routine. subroutine_symbol = RoutineSymbol("write_to_device") # Obtain the PSyIR representation of the code above fortran_reader = FortranReader() container = fortran_reader.psyir_from_source(code) subroutine = container.children[0] symtab.add(subroutine_symbol, tag="ocl_write_func") subroutine.detach() subroutine.symbol = subroutine_symbol # Add the subroutine as child of the provided node node.addchild(subroutine) return symtab.lookup_with_tag("ocl_write_func") @staticmethod def _insert_ocl_initialise_buffer(node): ''' Returns the symbol of a subroutine that initialises an OpenCL buffer in the OpenCL device using FortCL. If the subroutine doesn't already exist it also generates it. :param node: the container where the new subroutine will be inserted. :type node: :py:class:`psyclone.psyir.nodes.Container` :returns: the symbol of the buffer initialisation subroutine. :rtype: :py:class:`psyclone.psyir.symbols.RoutineSymbol` ''' # pylint: disable=too-many-locals symtab = node.symbol_table try: return symtab.lookup_with_tag("ocl_init_buffer_func") except KeyError: # If the Symbol does not exist, the rest of this method # will generate it. pass # Get the GOcean API property names used in this routine api_config = Config.get().api_conf("gocean") host_buff = \ api_config.grid_properties["go_grid_data"].fortran.format("field") props = api_config.grid_properties num_x = props["go_grid_nx"].fortran.format("field") num_y = props["go_grid_ny"].fortran.format("field") # Fields need to provide a function pointer to how the # device data is going to be read and written, if it doesn't # exist, create the appropriate subroutine first. read_fp = symtab.lookup_with_tag("ocl_read_func").name write_fp = symtab.lookup_with_tag("ocl_write_func").name # Code of the subroutine in Fortran code = f''' subroutine initialise_device_buffer(field) USE fortcl, ONLY: create_rw_buffer USE iso_c_binding, only: c_size_t use field_mod type(r2d_field), intent(inout), target :: field integer(kind=c_size_t) size_in_bytes IF (.NOT. field%data_on_device) THEN size_in_bytes = int({num_x} * {num_y}, 8) * & c_sizeof({host_buff}(1,1)) ! Create buffer on device, we store it without type information ! on the dl_esm_inf pointer (transfer/static_cast to void*) field%device_ptr = transfer( & create_rw_buffer(size_in_bytes), & field%device_ptr) field%data_on_device = .true. field%read_from_device_f => {read_fp} field%write_to_device_f => {write_fp} END IF end subroutine initialise_device_buffer ''' # Create the symbol for the routine . subroutine_symbol = RoutineSymbol("initialise_device_buffer") # Obtain the PSyIR representation of the code above fortran_reader = FortranReader() container = fortran_reader.psyir_from_source(code) subroutine = container.children[0] symtab.add(subroutine_symbol, tag="ocl_init_buffer_func") subroutine.detach() subroutine.symbol = subroutine_symbol # Add the subroutine as child of the provided node node.addchild(subroutine) return symtab.lookup_with_tag("ocl_init_buffer_func")
# For AutoAPI documentation generation __all__ = ["GOOpenCLTrans"]