Source code for psyclone.domain.gocean.transformations.gocean_opencl_trans

# -----------------------------------------------------------------------------
# BSD 3-Clause License
#
# Copyright (c) 2021-2024, Science and Technology Facilities Council.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
#   list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
#   contributors may be used to endorse or promote products derived from
#   this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
# -----------------------------------------------------------------------------
# Authors R. W. Ford, A. R. Porter and S. Siso, STFC Daresbury Lab

'''This module contains the GOcean-specific OpenCL transformation.
'''

import os

from fparser.two import Fortran2003
from psyclone.configuration import Config
from psyclone.domain.common.transformations import KernelModuleInlineTrans
from psyclone.errors import GenerationError
from psyclone.gocean1p0 import GOInvokeSchedule, GOLoop
from psyclone.psyGen import Transformation, args_filter, InvokeSchedule, \
    HaloExchange
from psyclone.psyir.backend.opencl import OpenCLWriter
from psyclone.psyir.frontend.fortran import FortranReader
from psyclone.psyir.nodes import Routine, Call, Reference, Literal, \
    Assignment, IfBlock, ArrayReference, Schedule, BinaryOperation, \
    StructureReference, FileContainer, CodeBlock, IntrinsicCall, Container
from psyclone.psyir.symbols import (
    ArrayType, DataSymbol, RoutineSymbol, ContainerSymbol,
    UnsupportedFortranType, ArgumentInterface, ImportInterface,
    INTEGER_TYPE, CHARACTER_TYPE, BOOLEAN_TYPE, ScalarType)
from psyclone.transformations import TransformationError


[docs] class GOOpenCLTrans(Transformation): ''' Switches on/off the generation of an OpenCL PSy layer for a given InvokeSchedule. Additionally, it will generate OpenCL kernels for each of the kernels referenced by the Invoke. For example: >>> from psyclone.parse.algorithm import parse >>> from psyclone.psyGen import PSyFactory >>> API = "gocean" >>> FILENAME = "shallow_alg.f90" # examples/gocean/eg1 >>> ast, invoke_info = parse(FILENAME, api=API) >>> psy = PSyFactory(API, distributed_memory=False).create(invoke_info) >>> schedule = psy.invokes.get('invoke_0').schedule >>> ocl_trans = GOOpenCLTrans() >>> ocl_trans.apply(schedule) >>> print(schedule.view()) ''' # Specify which OpenCL command queue to use for management operations like # data transfers when generating an OpenCL PSy-layer _OCL_MANAGEMENT_QUEUE = 1 # TODO #1572: These are class attributes because multiple invokes may need # to generate a single OpenCL environment (e.g. to share the device data # pointers) and therefore guarantee the same properties, but this hasn't # been tested. PSycloneBench ShallowWater could be an example of this. # Biggest queue number that any kernel is allocated to. The OpenCL # environment should be set up with at least this number of queues. _max_queue_number = 1 # Whether to enable the profiling option in the OpenCL environment _enable_profiling = False # Whether to enable the out_of_order option in the OpenCL environment _out_of_order = False # Total number of invokes that have been transformed to OpenCL _transformed_invokes = 0 # Reference to the OpenCL kernels file _kernels_file = None @property def name(self): ''' :returns: the name of this transformation. :rtype: str ''' return "GOOpenCLTrans" def validate(self, node, options=None): ''' Checks that the supplied InvokeSchedule is valid and that an OpenCL version of it can be generated. :param node: the Schedule to check. :type node: :py:class:`psyclone.psyGen.InvokeSchedule` :param options: a dictionary with options for transformations. :type options: dict of str:value or None :param bool options["enable_profiling"]: whether or not to set up the \ OpenCL environment with the profiling option enabled. :param bool options["out_of_order"]: whether or not to set up the \ OpenCL environment with the out_of_order option enabled. :param bool options["end_barrier"]: whether or not to add an OpenCL \ barrier at the end of the transformed invoke. :raises TransformationError: if the InvokeSchedule is not for the \ GOcean1.0 API. :raises TransformationError: if any of the kernels have arguments \ which are passed as a literal. :raises TransformationError: if any of the provided options is invalid. :raises TransformationError: if any of the provided options is not \ compatible with a previous OpenCL environment. :raises TransformationError: if any kernel in this invoke has a \ global variable used by an import. :raises TransformationError: if any kernel does not iterate over \ the whole grid. ''' if isinstance(node, InvokeSchedule): if not isinstance(node, GOInvokeSchedule): raise TransformationError( f"OpenCL generation is currently only supported for the " f"GOcean API but got an InvokeSchedule of type: " f"'{type(node).__name__}'") else: raise TransformationError( f"Error in GOOpenCLTrans: the supplied node must be a (sub-" f"class of) InvokeSchedule but got {type(node)}") # Validate options map valid_options = ['end_barrier', 'enable_profiling', 'out_of_order'] for key, value in options.items(): if key in valid_options: # All current options should contain boolean values if not isinstance(value, bool): raise TransformationError( f"InvokeSchedule OpenCL option '{key}' should be a " f"boolean.") else: raise TransformationError( f"InvokeSchedule does not support the OpenCL option " f"'{key}'. The supported options are: {valid_options}.") # Validate that the options are valid with previously generated OpenCL if self._transformed_invokes > 0: if ('enable_profiling' in options and self._enable_profiling != options['enable_profiling']): raise TransformationError( f"Can't generate an OpenCL Invoke with enable_profiling='" f"{options['enable_profiling']}' since a previous " f"transformation used a different value, and their OpenCL" f" environments must match.") if ('out_of_order' in options and self._out_of_order != options['out_of_order']): raise TransformationError( f"Can't generate an OpenCL Invoke with out_of_order='" f"{options['out_of_order']}' since a previous " f"transformation used a different value, and their OpenCL " f"environments must match.") # Now we need to check that none of the invoke arguments is a literal args = args_filter(node.args, arg_types=["scalar"]) for arg in args: if arg.is_literal: raise TransformationError( f"Cannot generate OpenCL for Invokes that contain kernel " f"arguments which are a literal, but found the literal " f"'{arg.name}' used as an argument in invoke " f"'{node.name}'.") # Check that we can construct the PSyIR and SymbolTable of each of # the kernels in this Schedule. Also check that none of them access # any form of global data (that is not a routine argument). for kern in node.kernels(): KernelModuleInlineTrans().validate(kern) ksched = kern.get_kernel_schedule() global_variables = ksched.symbol_table.imported_symbols if global_variables: raise TransformationError( f"The Symbol Table for kernel '{kern.name}' contains the " f"following symbols with 'global' scope: " f"{[sym.name for sym in global_variables]}. An OpenCL " f"kernel cannot call other kernels and all of the data it " f"accesses must be passed by argument. Use the " f"KernelImportsToArguments transformation to convert such " f"symbols to kernel arguments first.") # In OpenCL all kernel loops should iterate the whole grid for kernel in node.kernels(): inner_loop = kernel.ancestor(GOLoop) outer_loop = inner_loop.ancestor(GOLoop) if not (inner_loop.field_space == "go_every" and outer_loop.field_space == "go_every" and inner_loop.iteration_space == "go_all_pts" and outer_loop.iteration_space == "go_all_pts"): raise TransformationError( f"The kernel '{kernel.name}' does not iterate over all " f"grid points. This is a necessary requirement for " f"generating the OpenCL code and can be done by applying " f"the GOMoveIterationBoundariesInsideKernelTrans to each " f"kernel before the GOOpenCLTrans.")
[docs] def apply(self, node, options=None): ''' Apply the OpenCL transformation to the supplied GOInvokeSchedule. This causes PSyclone to generate an OpenCL version of the corresponding PSy-layer routine. The generated code makes use of the FortCL library (https://github.com/stfc/FortCL) in order to manage the OpenCL device directly from Fortran. :param node: the InvokeSchedule to transform. :type node: :py:class:`psyclone.psyGen.GOInvokeSchedule` :param options: set of option to tune the OpenCL generation. :type options: dict of str:value or None :param bool options["enable_profiling"]: whether or not to set up the \ OpenCL environment with the profiling option enabled. :param bool options["out_of_order"]: whether or not to set up the \ OpenCL environment with the out_of_order option enabled. :param bool options["end_barrier"]: whether or not to add an OpenCL \ barrier at the end of the transformed invoke. ''' if not options: options = {} self.validate(node, options) api_config = Config.get().api_conf("gocean") # Update class attributes if 'enable_profiling' in options: self._enable_profiling = options['enable_profiling'] if 'out_of_order' in options: self._out_of_order = options['out_of_order'] self._transformed_invokes += 1 # Get end_barrier option end_barrier = options.get('end_barrier', True) # Update the maximum value that the queue_number have. for kernel in node.coded_kernels(): self._max_queue_number = max(self._max_queue_number, kernel.opencl_options["queue_number"]) # Insert, if they don't already exist, the necessary OpenCL helper # subroutines in the root Container. module = node.ancestor(Container) psy_init = self._insert_opencl_init_routine(module) init_grid = self._insert_initialise_grid_buffers(module) write_grid_buf = self._insert_write_grid_buffers(module) self._insert_ocl_read_from_device_function(module) self._insert_ocl_write_to_device_function(module) init_buf = self._insert_ocl_initialise_buffer(module) for kern in node.coded_kernels(): self._insert_ocl_arg_setter_routine(module, kern) # Insert fortcl, clfortran and c_iso_binding import statement fortcl = ContainerSymbol("fortcl") node.symbol_table.add(fortcl) get_num_cmd_queues = RoutineSymbol( "get_num_cmd_queues", interface=ImportInterface(fortcl)) get_cmd_queues = RoutineSymbol( "get_cmd_queues", interface=ImportInterface(fortcl)) get_kernel_by_name = RoutineSymbol( "get_kernel_by_name", interface=ImportInterface(fortcl)) node.symbol_table.add(get_num_cmd_queues) node.symbol_table.add(get_cmd_queues) node.symbol_table.add(get_kernel_by_name) clfortran = ContainerSymbol("clfortran") node.symbol_table.add(clfortran) cl_finish = RoutineSymbol( "clFinish", interface=ImportInterface(clfortran)) cl_launch = RoutineSymbol( "clEnqueueNDRangeKernel", interface=ImportInterface(clfortran)) node.symbol_table.add(cl_finish) node.symbol_table.add(cl_launch) iso_c_binding = ContainerSymbol("iso_c_binding") node.symbol_table.add(iso_c_binding) c_loc = RoutineSymbol( "C_LOC", interface=ImportInterface(iso_c_binding)) c_null = DataSymbol( "C_NULL_PTR", datatype=INTEGER_TYPE, interface=ImportInterface(iso_c_binding)) node.symbol_table.add(c_loc) node.symbol_table.add(c_null) # Include the check_status subroutine if we are in debug_mode if api_config.debug_mode: ocl_utils = ContainerSymbol("ocl_utils_mod") check_status = RoutineSymbol( "check_status", interface=ImportInterface(ocl_utils)) node.symbol_table.add(ocl_utils) node.symbol_table.add(check_status) # Declare local variables needed by an OpenCL PSy-layer invoke qlist = node.symbol_table.new_symbol( "cmd_queues", symbol_type=DataSymbol, datatype=UnsupportedFortranType( "integer(kind=c_intptr_t), pointer, save :: cmd_queues(:)"), tag="opencl_cmd_queues") # 'first_time' needs to be an UnsupportedFortranType because it has # SAVE and initial value first = DataSymbol("first_time", datatype=UnsupportedFortranType( "logical, save :: first_time = .true.")) node.symbol_table.add(first, tag="first_time") flag = node.symbol_table.new_symbol( "ierr", symbol_type=DataSymbol, datatype=INTEGER_TYPE, tag="opencl_error") global_size = node.symbol_table.new_symbol( "globalsize", symbol_type=DataSymbol, datatype=UnsupportedFortranType( "integer(kind=c_size_t), target :: globalsize(2)")) local_size = node.symbol_table.new_symbol( "localsize", symbol_type=DataSymbol, datatype=UnsupportedFortranType( "integer(kind=c_size_t), target :: localsize(2)")) # Bring all the boundaries at the beginning (since we are going to # use them during the setup block - and they don't change) boundary_vars = [] for tag, symbol in node.symbol_table.tags_dict.items(): if tag.startswith(("xstart_", "xstop_", "ystart_", "ystop_")): boundary_vars.append(symbol) cursor = 0 for assignment in node.walk(Assignment): if assignment.lhs.symbol in boundary_vars: node.children.insert(cursor, assignment.detach()) cursor += 1 # Create block of code to execute only the first time: setup_block = IfBlock.create(Reference(first), []) setup_block.preceding_comment = \ "Initialise OpenCL runtime, kernels and buffers" node.children.insert(cursor, setup_block) setup_block.if_body.addchild(Call.create(psy_init, [])) # Set up cmd_queues pointer ptree = Fortran2003.Pointer_Assignment_Stmt( f"{qlist.name} => {get_cmd_queues.name}()") cblock = CodeBlock([ptree], CodeBlock.Structure.STATEMENT) setup_block.if_body.addchild(cblock) # Declare and assign kernel pointers for kern in node.coded_kernels(): name = "kernel_" + kern.name try: kpointer = node.symbol_table.lookup_with_tag(name) except KeyError: pointer_type = UnsupportedFortranType( "INTEGER(KIND=c_intptr_t), TARGET, SAVE :: " + name) kpointer = DataSymbol(name, datatype=pointer_type) node.symbol_table.add(kpointer, tag=name) setup_block.if_body.addchild( Assignment.create( Reference(kpointer), Call.create(get_kernel_by_name, [Literal(kern.name, CHARACTER_TYPE)]))) # Traverse all arguments and make sure all the buffers are initialised initialised_fields = set() there_is_a_grid_buffer = False for kern in node.coded_kernels(): for arg in kern.arguments.args: if arg.argument_type == "field": field = node.symbol_table.lookup(arg.name) if field not in initialised_fields: # Call the init_buffer routine with this field call = Call.create(init_buf, [Reference(field)]) setup_block.if_body.addchild(call) initialised_fields.add(field) elif (arg.argument_type == "grid_property" and not arg.is_scalar): if not there_is_a_grid_buffer: # Call the grid init_buffer routine field = node.symbol_table.lookup( kern.arguments.find_grid_access().name) call = Call.create(init_grid, [Reference(field)]) setup_block.if_body.addchild(call) there_is_a_grid_buffer = True if not arg.is_scalar: # All buffers will be assigned to a local OpenCL memory # object to easily reference them, make sure this local # variable is declared in the Invoke. name = arg.name + "_cl_mem" try: node.symbol_table.lookup_with_tag(name) except KeyError: node.symbol_table.new_symbol( name, tag=name, symbol_type=DataSymbol, datatype=UnsupportedFortranType( "INTEGER(KIND=c_intptr_t) :: " + name)) # Now call all the set_args routines because in some platforms (e.g. # in Xilinx FPGA) knowing which arguments each kernel is going to use # allows the write operation to place the data into the appropriate # memory bank. first_statement_comment = False kernel_names = set() for kern in node.coded_kernels(): if kern.name not in kernel_names: kernel_names.add(kern.name) callblock = self._generate_set_args_call(kern, node.scope) for child in callblock.pop_all_children(): setup_block.if_body.addchild(child) if not first_statement_comment: child.preceding_comment = ( "Do a set_args now so subsequent writes place the " "data appropriately") first_statement_comment = True # Now we can insert calls to write_to_device method for each buffer # and the grid writing call if there is one (in a new first time block) first_statement_comment = False for field in initialised_fields: call = Call.create( RoutineSymbol(field.name+"%write_to_device"), []) setup_block.if_body.addchild(call) if not first_statement_comment: call.preceding_comment = "Write data to the device" first_statement_comment = True if there_is_a_grid_buffer: fieldarg = node.coded_kernels()[0].arguments.find_grid_access() field = node.symbol_table.lookup(fieldarg.name) call = Call.create(write_grid_buf, [Reference(field)]) setup_block.if_body.addchild(call) # We will just mark the nodes we are replacing as deleting them inside # the loop would break the PSy-layer backward_dependency method in the # following iterations. We will detach all these nodes after the loop. nodes_to_detach = [] # Transform each kernel call loop construct to its equivalent FortCL # statements for kern in node.coded_kernels(): outerloop = kern.ancestor(GOLoop).ancestor(GOLoop) # Set up globalsize and localsize arrays garg = node.coded_kernels()[0].arguments.find_grid_access() num_x = api_config.grid_properties["go_grid_nx"].fortran\ .format(garg.name) num_y = api_config.grid_properties["go_grid_ny"].fortran\ .format(garg.name) assig = Assignment.create( Reference(global_size), Literal(f"(/{num_x}, {num_y}/)", ArrayType(INTEGER_TYPE, [2]))) node.children.insert(outerloop.position, assig) local_size_value = kern.opencl_options['local_size'] assig = Assignment.create( Reference(local_size), Literal(f"(/{local_size_value}, 1/)", ArrayType(INTEGER_TYPE, [2]))) node.children.insert(outerloop.position, assig) # Check that the global_size is multiple of the local_size if api_config.debug_mode: fortran_reader = FortranReader() global_size_expr = fortran_reader.psyir_from_expression( num_x, node.symbol_table) self._add_divisibility_check(node, outerloop.position, check_status, global_size_expr, local_size_value) # Retrieve kernel symbol kernelsym = node.symbol_table.lookup_with_tag( "kernel_" + kern.name) # Choose the command queue number to which to dispatch this kernel. # We have do deal with possible dependencies to kernels dispatched # in different command queues as the order of execution is not # guaranteed. queue_number = kern.opencl_options['queue_number'] cmd_queue = ArrayReference.create( qlist, [Literal(str(queue_number), INTEGER_TYPE)]) dependency = outerloop.backward_dependence() # If the dependency is a loop containing a kernel, add a barrier if # the previous kernels were dispatched in a different command queue if dependency: for kernel_dep in dependency.coded_kernels(): previous_queue = kernel_dep.opencl_options['queue_number'] if previous_queue != queue_number: # If the backward dependency is being executed in # another queue we add a barrier to make sure the # previous kernel has finished before this halo # exchange starts. barrier = Assignment.create( Reference(flag), Call.create(cl_finish, [ ArrayReference.create(qlist, [ Literal(str(previous_queue), INTEGER_TYPE)])])) node.children.insert(outerloop.position, barrier) # If the dependency is something other than a kernel, currently we # dispatch everything else to queue _OCL_MANAGEMENT_QUEUE, so add a # barrier if this kernel is not on queue _OCL_MANAGEMENT_QUEUE. if dependency and not dependency.coded_kernels() and \ queue_number != self._OCL_MANAGEMENT_QUEUE: barrier = Assignment.create( Reference(flag), Call.create(cl_finish, [ ArrayReference.create(qlist, [ Literal(str(self._OCL_MANAGEMENT_QUEUE), INTEGER_TYPE)])])) node.children.insert(outerloop.position, barrier) # Check that everything has succeeded before the kernel launch if api_config.debug_mode: self._add_ready_check(node, outerloop.position, check_status, kern.name, flag, cl_finish, cmd_queue.copy()) callblock = self._generate_set_args_call(kern, node.scope) for child in callblock.pop_all_children(): node.children.insert(outerloop.position, child) # Then we call the clEnqueueNDRangeKernel assig = Assignment.create( Reference(flag), Call.create(cl_launch, [ # OpenCL Command Queue cmd_queue, # OpenCL Kernel object Reference(kernelsym), # Number of work dimensions Literal("2", INTEGER_TYPE), # Global offset (if NULL the global IDs start at # offset (0,0,0)) Reference(c_null), # Global work size Call.create(c_loc, [Reference(global_size)]), # Local work size Call.create(c_loc, [Reference(local_size)]), # Number of events in wait list Literal("0", INTEGER_TYPE), # Event wait list that need to be completed before # this kernel Reference(c_null), # Event that identifies this kernel completion Reference(c_null)])) assig.preceding_comment = "Launch the kernel" node.children.insert(outerloop.position, assig) self._insert_kernel_code_in_opencl_file(kern) # Add additional checks if we are in debug mode if api_config.debug_mode: self._add_kernel_check(node, outerloop.position, check_status, kern.name, flag, cl_finish, cmd_queue.copy()) nodes_to_detach.append(outerloop) # If we execute the kernels asynchronously, we need to add wait # statements before the halo exchanges to guarantee that the data # has been updated for possible_dependent_node in node.walk(HaloExchange): # The backward_dependences returns the last Loop with a kernel # that has a dependency with this halo exchange dependency = possible_dependent_node.backward_dependence() if dependency: for kernel_dep in dependency.coded_kernels(): previous_queue = kernel_dep.opencl_options['queue_number'] if previous_queue != self._OCL_MANAGEMENT_QUEUE: # If the backward dependency is being executed in # another queue we add a barrier to make sure the # previous kernel has finished before this one starts. barrier = Assignment.create( Reference(flag), Call.create(cl_finish, [ ArrayReference.create(qlist, [ Literal(str(previous_queue), INTEGER_TYPE)])])) pos = possible_dependent_node.position node.children.insert(pos, barrier) for node_to_detach in nodes_to_detach: node_to_detach.detach() if end_barrier: self._add_end_barrier(node, flag, cl_finish, qlist) # And at the very end always makes sure that first_time value is False assign = Assignment.create(Reference(first), Literal("false", BOOLEAN_TYPE)) assign.preceding_comment = "Unset the first time flag" node.addchild(assign) self._output_opencl_kernels_file()
def _add_end_barrier(self, node, flag, cl_finish, qlist): ''' Append into the given node a OpenCL Wait operation for each of the OpenCL queues in use. :param node: PSyIR node where to append the barrier. :type node: :py:class:`psyclone.psyir.nodes.Schedule` :param flag: PSyIR symbol to use as flag. :type flag: :py:class:`psyclone.psyir.symbols.DataSymbol` :param cl_finish: PSyIR symbol of the barrier routine. :type cl_finish: :py:class:`psyclone.psyir.symbols.RoutineSymbol` :param qlist: PSyIR symbol of the OpenCL queues array. :type qlist: :py:class:`psyclone.psyir.symbols.DataSymbol` ''' # We need a clFinish for each of the queues in the implementation added_comment = False for num in range(1, self._max_queue_number + 1): queue = ArrayReference.create(qlist, [Literal(str(num), INTEGER_TYPE)]) node.addchild( Assignment.create( Reference(flag), Call.create(cl_finish, [queue]))) if not added_comment: node.children[-1].preceding_comment = \ "Wait until all kernels have finished" added_comment = True @staticmethod def _add_divisibility_check(node, position, check_status, global_size_expr, local_size): ''' Insert into node a check that the global_size is exactly divisible by the local size. :param node: where to insert the divisibility check. :type node: :py:class:`psyclone.psyir.nodes.Schedule` :param int position: location where to insert the divisibilitay check. :param check_status: PSyIR symbol of the check routine. :type check_status: :py:class:`psyclone.psyir.symbols.RoutineSymbol` :param global_size_expr: PSyIR representing the global_size. :type global_size_expr: :py:class:`psyclone.psyir.nodes.DataNode` :param int local_size: size of the OpenCL local work_group. ''' check = BinaryOperation.create( BinaryOperation.Operator.NE, IntrinsicCall.create( IntrinsicCall.Intrinsic.MOD, [global_size_expr, Literal(str(local_size), INTEGER_TYPE)] ), Literal("0", INTEGER_TYPE)) message = ("Global size is not a multiple of local size (" "mandatory in OpenCL < 2.0).") error = Call.create(check_status, [Literal(message, CHARACTER_TYPE), Literal("-1", INTEGER_TYPE)]) ifblock = IfBlock.create(check, [error]) node.children.insert(position, ifblock) @staticmethod def _add_kernel_check(node, position, check_status, kernel_name, flag, cl_finish, cmd_queue): ''' Insert into node a check that the kernel has been launched and has been executed successfully. :param node: where to insert the kernel check. :type node: :py:class:`psyclone.psyir.nodes.Schedule` :param int position: location where to insert the kernel check. :param check_status: PSyIR symbol of the check routine. :type check_status: :py:class:`psyclone.psyir.symbols.RoutineSymbol` :param str kernel_name: name of the kernel being checked. :param flag: PSyIR symbol to use as flag. :type flag: :py:class:`psyclone.psyir.symbols.DataSymbol` :param cl_finish: PSyIR symbol of the barrier routine. :type cl_finish: :py:class:`psyclone.psyir.symbols.RoutineSymbol` :param cmd_queue: PSyIR symbol of the OpenCL command queues array. :type cmd_queue: :py:class:`psyclone.psyir.symbols.DataSymbol` ''' # First check the launch return value message = Literal(f"{kernel_name} clEnqueueNDRangeKernel", CHARACTER_TYPE) check = Call.create(check_status, [message, Reference(flag)]) node.children.insert(position, check) # Then add a barrier barrier = Assignment.create( Reference(flag), Call.create(cl_finish, [cmd_queue])) node.children.insert(position + 1, barrier) # And check the kernel executed successfully message = Literal(f"Errors during {kernel_name}", CHARACTER_TYPE) check = Call.create(check_status, [message, Reference(flag)]) node.children.insert(position + 2, check) @staticmethod def _add_ready_check(node, position, check_status, kernel_name, flag, cl_finish, cmd_queue): ''' Insert into node a check that verifies if everything in the command queues previous to a kernel launch has completed successfully. :param node: where to insert the kernel check. :type node: :py:class:`psyclone.psyir.nodes.Schedule` :param int position: location where to insert the kernel check. :param check_status: PSyIR symbol of the check routine. :type check_status: :py:class:`psyclone.psyir.symbols.RoutineSymbol` :param str kernel_name: name of the kernel being checked. :param flag: PSyIR symbol to use as flag. :type flag: :py:class:`psyclone.psyir.symbols.DataSymbol` :param cl_finish: PSyIR symbol of the barrier routine. :type cl_finish: :py:class:`psyclone.psyir.symbols.RoutineSymbol` :param cmd_queue: PSyIR symbol of the OpenCL command queues array. :type cmd_queue: :py:class:`psyclone.psyir.symbols.DataSymbol` ''' barrier = Assignment.create( Reference(flag), Call.create(cl_finish, [cmd_queue])) node.children.insert(position, barrier) message = Literal(f"Errors before {kernel_name} launch", CHARACTER_TYPE) check = Call.create(check_status, [message, Reference(flag)]) node.children.insert(position + 1, check) def _insert_kernel_code_in_opencl_file(self, kernel): ''' Insert the given kernel into a OpenCL file. For this we need to remove the 'go_wp' precision symbol which can't be generated by OpenCL. We assume 'go_wp' is a OpenCL double. :param kernel: the kernel to insert. :type kernel: :py:class:`psyclone.psyir.nodes.KernelSchedule` ''' if not self._kernels_file: self._kernels_file = FileContainer("opencl_kernels") # Create a copy of the kernel and remove precision symbols since they # are not supported in the OpenCL backend. kernel_copy = kernel.get_kernel_schedule().copy() symtab = kernel_copy.symbol_table # TODO #898: Removing symbols is not properly supported by PSyIR # because we have to deal with all references to it. In this case we # implement manually a conversion of all 'go_wp' to a double precision # and remove the symbol because we guarantee that it just appear in the # declarations of other symbols (symtab.datasymbols). # pylint: disable=protected-access for sym in symtab.datasymbols: # Not all types have the 'precision' attribute (e.g. # UnresolvedType) if (hasattr(sym.datatype, "precision") and isinstance(sym.datatype.precision, DataSymbol)): sym.datatype._precision = ScalarType.Precision.DOUBLE if 'go_wp' in symtab: del symtab._symbols['go_wp'] # Insert kernel in the OpenCL kernels file if it doesn't already exist for routine in self._kernels_file.walk(Routine): if routine.name == kernel.name: break # if it exist re-use existing one # TODO 1572: Here we assume that in the same Invoke (scope) a # kernel with the same name will be the same kernel, but that # may not be true when doing multiple invokes. else: self._kernels_file.addchild(kernel_copy) def _output_opencl_kernels_file(self): ''' Write the OpenCL kernels to a file using the OpenCL backend. ''' # TODO 1013: The code below duplicates some logic of the CodedKern # rename_and_write method. Ideally this should be moved out of # the AST and transformations and put into some kind of IOManager. ocl_writer = OpenCLWriter(kernels_local_size=64) new_kern_code = ocl_writer(self._kernels_file) fdesc = None name_idx = -1 while not fdesc: name_idx += 1 new_name = f"opencl_kernels_{name_idx}.cl" try: # Atomically attempt to open the new kernel file (in case # this is part of a parallel build) fdesc = os.open( os.path.join(Config.get().kernel_output_dir, new_name), os.O_CREAT | os.O_WRONLY | os.O_EXCL) except (OSError, IOError): # The os.O_CREATE and os.O_EXCL flags in combination mean # that open() raises an error if the file exists continue # Write the modified AST out to file os.write(fdesc, new_kern_code.encode()) # Close the new kernel file os.close(fdesc) @staticmethod def _generate_set_args_call(kernel, scope): ''' Generate the Call statement to the set_args subroutine for the provided kernel. :param kernel: the kernel for which to generate a call to its \ arg_setter subroutine. :type kernel: :py:class:`psyclone.psyGen.CodedKern` :param scope: The node representing the scope where the call \ statements will be inserted. :type scope: :py:class:`psyclone.psyir.nodes.ScopingNode` :returns: a block of statements that represent the set_args call :rtype: :py:class:`psyclone.psyir.nodes.Schedule` ''' call_block = Schedule() # Retrieve symbol table and kernel symbol symtab = scope.symbol_table kernelsym = symtab.lookup_with_tag("kernel_" + kernel.name) # Find the symbol that defines each boundary for this kernel. # In OpenCL the iteration boundaries are passed as arguments to the # kernel because the global work size may exceed the dimensions and # therefore the updates outside the boundaries should be masked. # If any of the boundaries is not found, it can not proceed. boundaries = [] try: for boundary in ["xstart", "xstop", "ystart", "ystop"]: tag = boundary + "_" + kernel.name symbol = symtab.lookup_with_tag(tag) boundaries.append(symbol.name) except KeyError as err: raise GenerationError( f"Boundary symbol tag '{tag}' not found while generating the " f"OpenCL code for kernel '{kernel.name}'. Make sure to apply " f"the GOMoveIterationBoundariesInsideKernelTrans before " f"attempting the OpenCL code generation.") from err api_config = Config.get().api_conf("gocean") # Prepare the argument list for the set_args routine arguments = [Reference(kernelsym)] for arg in kernel.arguments.args: if arg.argument_type == "scalar": if arg.name in boundaries: # Boundary values are 0-indexed in OpenCL and 1-indexed in # PSyIR, therefore we need to subtract 1 bop = BinaryOperation.create(BinaryOperation.Operator.SUB, arg.psyir_expression(), Literal("1", INTEGER_TYPE)) arguments.append(bop) else: arguments.append(arg.psyir_expression()) elif arg.argument_type == "field": # Cast buffer to cl_mem type expected by OpenCL field = symtab.lookup(arg.name) symbol = symtab.lookup_with_tag(arg.name + "_cl_mem") source = StructureReference.create(field, ['device_ptr']) dest = Reference(symbol) icall = IntrinsicCall.create(IntrinsicCall.Intrinsic.TRANSFER, [source, dest]) assig = Assignment.create(dest.copy(), icall) call_block.addchild(assig) arguments.append(Reference(symbol)) elif arg.argument_type == "grid_property": garg = kernel.arguments.find_grid_access() if arg.is_scalar: # pylint: disable=protected-access arguments.append( StructureReference.create( symtab.lookup(garg.name), api_config.grid_properties[arg._property_name] .fortran.split('%')[1:] )) else: # Cast grid buffer to cl_mem type expected by OpenCL device_grid_property = arg.name + "_device" field = symtab.lookup(garg.name) source = StructureReference.create( field, ['grid', device_grid_property]) symbol = symtab.lookup_with_tag(arg.name + "_cl_mem") dest = Reference(symbol) icall = IntrinsicCall.create( IntrinsicCall.Intrinsic.TRANSFER, [source, dest]) assig = Assignment.create(dest.copy(), icall) call_block.addchild(assig) arguments.append(Reference(symbol)) call_symbol = symtab.lookup_with_tag(kernel.name + "_set_args") call_block.addchild(Call.create(call_symbol, arguments)) return call_block @staticmethod def _insert_ocl_arg_setter_routine(node, kernel): ''' Returns the symbol of the subroutine that sets the OpenCL kernel arguments for the provided PSy-layer kernel using FortCL. If the subroutine doesn't exist it also generates it. :param node: the container where the new subroutine will be inserted. :type node: :py:class:`psyclone.psyir.nodes.Container` :param kernel: the kernel call for which to provide the arg_setter \ subroutine. :type kernel: :py:class:`psyclone.psyGen.CodedKern` :returns: the symbol representing the arg_setter subroutine. :rtype: :py:class:`psyclone.psyir.symbols.RoutineSymbol` ''' # Check if the subroutine already exist. sub_name = kernel.name + "_set_args" try: return node.symbol_table.lookup_with_tag(sub_name) except KeyError: # If the Symbol does not exist, the rest of this method # will generate it. pass # Create the new Routine sub_name = node.symbol_table.next_available_name(sub_name) sub_symbol = node.symbol_table.new_symbol( sub_name, tag=kernel.name + "_set_args", symbol_type=RoutineSymbol) argsetter = Routine(sub_symbol) arg_list = [] # Add subroutine imported symbols clfortran = ContainerSymbol("clfortran") clsetkernelarg = RoutineSymbol("clSetKernelArg", interface=ImportInterface(clfortran)) iso_c = ContainerSymbol("iso_c_binding") c_sizeof = RoutineSymbol("C_SIZEOF", interface=ImportInterface(iso_c)) c_loc = RoutineSymbol("C_LOC", interface=ImportInterface(iso_c)) c_intptr_t = RoutineSymbol("c_intptr_t", interface=ImportInterface(iso_c)) ocl_utils = ContainerSymbol("ocl_utils_mod") check_status = RoutineSymbol("check_status", interface=ImportInterface(ocl_utils)) argsetter.symbol_table.add(clfortran) argsetter.symbol_table.add(clsetkernelarg) argsetter.symbol_table.add(iso_c) argsetter.symbol_table.add(c_sizeof) argsetter.symbol_table.add(c_loc) argsetter.symbol_table.add(c_intptr_t) argsetter.symbol_table.add(ocl_utils) argsetter.symbol_table.add(check_status) # Add an argument symbol for the kernel object kobj = argsetter.symbol_table.new_symbol( "kernel_obj", symbol_type=DataSymbol, interface=ArgumentInterface(ArgumentInterface.Access.READ), datatype=UnsupportedFortranType( "INTEGER(KIND=c_intptr_t), TARGET :: kernel_obj")) arg_list.append(kobj) # Include each kernel call argument as an argument of this routine for arg in kernel.arguments.args: name = argsetter.symbol_table.next_available_name(arg.name) # This function requires 'TARGET' annotated declarations which are # not supported in the PSyIR, so we build them as # UnsupportedFortranType for now. if arg.is_scalar and arg.intrinsic_type == "real": pointer_type = UnsupportedFortranType( "REAL(KIND=go_wp), INTENT(IN), TARGET :: " + name) elif arg.is_scalar: pointer_type = UnsupportedFortranType( "INTEGER, INTENT(IN), TARGET :: " + name) else: # Everything else is a cl_mem pointer (c_intptr_t) pointer_type = UnsupportedFortranType( "INTEGER(KIND=c_intptr_t), INTENT(IN), TARGET :: " + name) new_arg = DataSymbol( name, datatype=pointer_type, interface=ArgumentInterface(ArgumentInterface.Access.READ)) argsetter.symbol_table.add(new_arg) arg_list.append(new_arg) argsetter.symbol_table.specify_argument_list(arg_list) # Create the ierr local variable ierr = argsetter.symbol_table.new_symbol( "ierr", symbol_type=DataSymbol, datatype=INTEGER_TYPE) # Call the clSetKernelArg for each argument and a check_status to # see if the OpenCL call has succeeded for index, variable in enumerate(arg_list[1:]): call = Call.create(clsetkernelarg, [Reference(kobj), Literal(str(index), INTEGER_TYPE), Call.create(c_sizeof, [Reference(variable)]), Call.create(c_loc, [Reference(variable)])]) assignment = Assignment.create(Reference(ierr), call) argsetter.addchild(assignment) emsg = f"clSetKernelArg: arg {index} of {kernel.name}" call = Call.create(check_status, [Literal(emsg, CHARACTER_TYPE), Reference(ierr)]) argsetter.addchild(call) argsetter.children[0].preceding_comment = \ f"Set the arguments for the {kernel.name} OpenCL Kernel" # Add the subroutine as child of the provided node node.addchild(argsetter) return node.symbol_table.lookup_with_tag(kernel.name + "_set_args") def _insert_opencl_init_routine(self, node): ''' Returns the symbol of the subroutine that initialises the OpenCL environment using FortCL. If the subroutine doesn't exist it also generates it. :param node: the container where the new subroutine will be inserted. :type node: :py:class:`psyclone.psyir.nodes.Container` :returns: the symbol representing the OpenCL initialisation subroutine. :rtype: :py:class:`psyclone.psyir.symbols.RoutineSymbol` ''' symtab = node.symbol_table try: # TODO #1572: The ocl_init routine may need to be regenerated if # there are multiple Invokes because _max_queue_number may have # increased and we need to load the kernels of both invokes. return symtab.lookup_with_tag("ocl_init_routine") except KeyError: # If the Symbol does not exist, the rest of this method # will generate it. pass # Choose a round-robin device number if it has MPI and multiple # accelerators. distributed_memory = Config.get().distributed_memory devices_per_node = Config.get().ocl_devices_per_node additional_uses = "" additional_stmts = "" if devices_per_node > 1 and distributed_memory: additional_uses += "USE parallel_mod, ONLY: get_rank" additional_stmts += \ f"ocl_device_num = mod(get_rank()-1, {devices_per_node}) + 1" # Get a set of all kernel names in the Container. This implementation # currently assumes all of them will be available in OpenCL unique_kernels = {kernel.name for kernel in node.coded_kernels()} # Code of the subroutine in Fortran code = f''' subroutine psy_init() {additional_uses} use fortcl, only: ocl_env_init, add_kernels character(len=30) kernel_names({len(unique_kernels)}) integer :: ocl_device_num=1 logical, save :: initialised=.false. ! Check to make sure we only execute this routine once if (.not. initialised) then initialised = .true. ! Initialise the opencl environment/device {additional_stmts} call ocl_env_init({self._max_queue_number}, ocl_device_num, & {".true." if self._enable_profiling else ".false."}, & {".true." if self._out_of_order else ".false."}) ! The kernels this psy layer module requires ''' for index, kernel_name in enumerate(unique_kernels): code += f"kernel_names({index + 1}) = \"{kernel_name}\"\n" code += f'''\ ! Create the opencl kernel objects. This expects to find all of ! the compiled kernels in FORTCL_KERNELS_FILE environment variable call add_kernels({len(unique_kernels)}, kernel_names) end if end subroutine psy_init''' # Create the symbol for the routine. subroutine_symbol = RoutineSymbol("psy_init") # Obtain the PSyIR representation of the code above fortran_reader = FortranReader() container = fortran_reader.psyir_from_source(code) subroutine = container.children[0] subroutine.detach() node.symbol_table.add(subroutine_symbol, tag="ocl_init_routine") subroutine.symbol = subroutine_symbol # Add the subroutine as child of the provided node node.addchild(subroutine) return symtab.lookup_with_tag("ocl_init_routine") @staticmethod def _insert_initialise_grid_buffers(node): ''' Returns the symbol of a subroutine that initialises all OpenCL grid buffers in the OpenCL device using FortCL. If the subroutine doesn't already exist it also generates it. :param node: the container where the new subroutine will be inserted. :type node: :py:class:`psyclone.psyir.nodes.Container` :returns: the symbol of the grid buffer initialisation subroutine. :rtype: :py:class:`psyclone.psyir.symbols.RoutineSymbol` ''' # pylint: disable=too-many-locals symtab = node.symbol_table try: return symtab.lookup_with_tag("ocl_init_grid_buffers") except KeyError: # If the Symbol does not exist, the rest of this method # will generate it. pass # Get the GOcean API property names used in this routine api_config = Config.get().api_conf("gocean") props = api_config.grid_properties num_x = props["go_grid_nx"].fortran.format("field") num_y = props["go_grid_ny"].fortran.format("field") int_arrays = [] real_arrays = [] for key, prop in props.items(): if key == "go_grid_data": # TODO #676: Ignore because go_grid_data is actually a field # property continue if prop.type == "array" and prop.intrinsic_type == "integer": int_arrays.append(prop.fortran.format("field")) elif prop.type == "array" and prop.intrinsic_type == "real": real_arrays.append(prop.fortran.format("field")) # Code of the subroutine in Fortran code = f''' subroutine initialise_device_grid(field) USE fortcl, ONLY: create_ronly_buffer USE iso_c_binding, only: c_size_t use field_mod type(r2d_field), intent(inout), target :: field integer(kind=c_size_t) size_in_bytes IF (.not. c_associated({int_arrays[0]}_device)) THEN ! Create integer grid fields size_in_bytes = int({num_x}*{num_y}, 8) * & c_sizeof({int_arrays[0]}(1,1)) ''' for int_array in int_arrays: code += f''' {int_array}_device = transfer( & create_ronly_buffer(size_in_bytes), {int_array}_device) ''' code += f''' ! Create real grid buffers size_in_bytes = int({num_x} * {num_y}, 8) * & c_sizeof({real_arrays[0]}(1,1)) ''' for real_array in real_arrays: code += f''' {real_array}_device = transfer( & create_ronly_buffer(size_in_bytes), {real_array}_device) ''' code += ''' END IF end subroutine initialise_device_grid ''' # Create the symbol for the routine. subroutine_symbol = RoutineSymbol("initialise_grid_device_buffers") # Obtain the PSyIR representation of the code above fortran_reader = FortranReader() container = fortran_reader.psyir_from_source(code) subroutine = container.children[0] symtab.add(subroutine_symbol, tag="ocl_init_grid_buffers") subroutine.detach() subroutine.symbol = subroutine_symbol # Add the subroutine as child of the provided node node.addchild(subroutine) return symtab.lookup_with_tag("ocl_init_grid_buffers") def _insert_write_grid_buffers(self, node): ''' Returns the symbol of a subroutine that writes the values of the grid properties into the OpenCL device buffers using FortCL. If the subroutine doesn't already exist it also generates it. :param node: the container where the new subroutine will be inserted. :type node: :py:class:`psyclone.psyir.nodes.Container` :returns: the symbol representing the grid buffers writing subroutine. :rtype: :py:class:`psyclone.psyir.symbols.RoutineSymbol` ''' # pylint: disable=too-many-locals symtab = node.symbol_table try: return symtab.lookup_with_tag("ocl_write_grid_buffers") except KeyError: # If the Symbol does not exist, the rest of this method # will generate it. pass # Get the GOcean API property names used in this routine api_config = Config.get().api_conf("gocean") props = api_config.grid_properties num_x = props["go_grid_nx"].fortran.format("field") num_y = props["go_grid_ny"].fortran.format("field") # Code of the subroutine in Fortran code = f''' subroutine write_device_grid(field) USE fortcl, ONLY: get_cmd_queues use iso_c_binding, only: c_intptr_t, c_size_t, c_sizeof USE clfortran USE ocl_utils_mod, ONLY: check_status type(r2d_field), intent(inout), target :: field integer(kind=c_size_t) size_in_bytes INTEGER(c_intptr_t), pointer :: cmd_queues(:) integer(c_intptr_t) :: cl_mem integer :: ierr cmd_queues => get_cmd_queues() ! Integer grid buffers size_in_bytes = int({num_x} * {num_y}, 8) * & c_sizeof(field%grid%tmask(1,1)) cl_mem = transfer(field%grid%tmask_device, cl_mem) ierr = clEnqueueWriteBuffer( & cmd_queues({self._OCL_MANAGEMENT_QUEUE}), & cl_mem, CL_TRUE, 0_8, size_in_bytes, & C_LOC(field%grid%tmask), 0, C_NULL_PTR, C_NULL_PTR) CALL check_status("clEnqueueWriteBuffer tmask", ierr) ! Real grid buffers size_in_bytes = int({num_x} * {num_y}, 8) * & c_sizeof(field%grid%area_t(1,1)) ''' write_str = ''' cl_mem = transfer(field%grid%{0}_device, cl_mem) ierr = clEnqueueWriteBuffer(cmd_queues({1}), & cl_mem, CL_TRUE, 0_8, size_in_bytes, & C_LOC(field%grid%{0}), 0, C_NULL_PTR, C_NULL_PTR) CALL check_status("clEnqueueWriteBuffer {0}_device", ierr) ''' for grid_prop in ['area_t', 'area_u', 'area_v', 'dx_u', 'dx_v', 'dx_t', 'dy_u', 'dy_v', 'dy_t', 'gphiu', 'gphiv']: code += write_str.format(grid_prop, self._OCL_MANAGEMENT_QUEUE) code += "end subroutine write_device_grid" # Create the symbol for the routine. subroutine_symbol = RoutineSymbol("write_grid_buffers") # Obtain the PSyIR representation of the code above fortran_reader = FortranReader() container = fortran_reader.psyir_from_source(code) subroutine = container.children[0] symtab.add(subroutine_symbol, tag="ocl_write_grid_buffers") subroutine.detach() subroutine.symbol = subroutine_symbol # Add the subroutine as child of the provided node node.addchild(subroutine) return symtab.lookup_with_tag("ocl_write_grid_buffers") def _insert_ocl_read_from_device_function(self, node): ''' Returns the symbol of a subroutine that retrieves the data back from an OpenCL device using FortCL. If the subroutine doesn't already exist it also generates it. :param node: the container where the new subroutine will be inserted. :type node: :py:class:`psyclone.psyir.nodes.Container` :returns: the symbol of the buffer data retrieving subroutine. :rtype: :py:class:`psyclone.psyir.symbols.RoutineSymbol` ''' symtab = node.symbol_table try: return symtab.lookup_with_tag("ocl_read_func") except KeyError: # If the subroutines does not exist, it needs to be # generated first. pass # Code of the subroutine in Fortran code = f''' subroutine read_sub(from, to, startx, starty, nx, ny, blocking) USE iso_c_binding, only: c_ptr, c_intptr_t, c_size_t, c_sizeof USE ocl_utils_mod, ONLY: check_status use kind_params_mod, only: go_wp USE clfortran USE fortcl, ONLY: get_cmd_queues type(c_ptr), intent(in) :: from real(go_wp), intent(inout), dimension(:,:), target :: to integer, intent(in) :: startx, starty, nx, ny logical, intent(in) :: blocking INTEGER(c_size_t) :: size_in_bytes, offset_in_bytes integer(c_intptr_t) :: cl_mem INTEGER(c_intptr_t), pointer :: cmd_queues(:) integer :: ierr, i ! Give the from pointer the appropriate OpenCL memory object type cl_mem = transfer(from, cl_mem) cmd_queues => get_cmd_queues() ! Two copy strategies depending on how much of the total length ! nx covers. if (nx < size(to, 1) / 2) then ! Dispatch asynchronous copies of just the contiguous data. do i = starty, starty+ny size_in_bytes = int(nx, 8) * c_sizeof(to(1,1)) offset_in_bytes = int(size(to, 1) * (i-1) + (startx-1)) & * c_sizeof(to(1,1)) ierr = clEnqueueReadBuffer( & cmd_queues({self._OCL_MANAGEMENT_QUEUE}), cl_mem, & CL_FALSE, offset_in_bytes, size_in_bytes, & C_LOC(to(startx, i)), 0, C_NULL_PTR, C_NULL_PTR) CALL check_status("clEnqueueReadBuffer", ierr) enddo if (blocking) then CALL check_status("clFinish on read", & clFinish(cmd_queues({self._OCL_MANAGEMENT_QUEUE}))) endif else ! Copy across the whole starty:starty+ny rows in a single ! copy operation. size_in_bytes = int(size(to, 1) * ny, 8) * c_sizeof(to(1,1)) offset_in_bytes = int(size(to,1)*(starty-1), 8) & * c_sizeof(to(1,1)) ierr = clEnqueueReadBuffer( & cmd_queues({self._OCL_MANAGEMENT_QUEUE}), cl_mem, & CL_TRUE, offset_in_bytes, size_in_bytes, & C_LOC(to(1,starty)), 0, C_NULL_PTR, C_NULL_PTR) CALL check_status("clEnqueueReadBuffer", ierr) endif end subroutine read_sub ''' # Create the symbol for the routine. subroutine_symbol = RoutineSymbol("read_from_device") # Obtain the PSyIR representation of the code above fortran_reader = FortranReader() container = fortran_reader.psyir_from_source(code) subroutine = container.children[0] symtab.add(subroutine_symbol, tag="ocl_read_func") subroutine.detach() subroutine.symbol = subroutine_symbol # Add the subroutine as child of the provided node node.addchild(subroutine) return symtab.lookup_with_tag("ocl_read_func") def _insert_ocl_write_to_device_function(self, node): ''' Returns the symbol of a subroutine that writes the buffer data into an OpenCL device using FortCL. If the subroutine doesn't already exist it also generates it. :param node: the container where the new subroutine will be inserted. :type node: :py:class:`psyclone.psyir.nodes.Container` :returns: the symbol of the buffer writing subroutine. :rtype: :py:class:`psyclone.psyir.symbols.RoutineSymbol` ''' symtab = node.symbol_table try: return symtab.lookup_with_tag("ocl_write_func") except KeyError: # If the subroutines does not exist, it needs to be # generated first. pass # Code of the subroutine in Fortran code = f''' subroutine write_sub(from, to, startx, starty, nx, ny, blocking) USE iso_c_binding, only: c_ptr, c_intptr_t, c_size_t, c_sizeof USE ocl_utils_mod, ONLY: check_status use kind_params_mod, only: go_wp USE clfortran USE fortcl, ONLY: get_cmd_queues real(go_wp), intent(in), dimension(:,:), target :: from type(c_ptr), intent(in) :: to integer, intent(in) :: startx, starty, nx, ny logical, intent(in) :: blocking integer(c_intptr_t) :: cl_mem INTEGER(c_size_t) :: size_in_bytes, offset_in_bytes INTEGER(c_intptr_t), pointer :: cmd_queues(:) integer :: ierr, i ! Give the to pointer the appropriate OpenCL memory object type cl_mem = transfer(to, cl_mem) cmd_queues => get_cmd_queues() ! Two copy strategies depending on how much of the total length ! nx covers. if (nx < size(from,1) / 2) then ! Dispatch asynchronous copies of just the contiguous data. do i=starty, starty+ny size_in_bytes = int(nx, 8) * c_sizeof(from(1,1)) offset_in_bytes = int(size(from, 1) * (i-1) + (startx-1)) & * c_sizeof(from(1,1)) ierr = clEnqueueWriteBuffer( & cmd_queues({self._OCL_MANAGEMENT_QUEUE}), cl_mem, & CL_FALSE, offset_in_bytes, size_in_bytes, & C_LOC(from(startx, i)), 0, C_NULL_PTR, C_NULL_PTR) CALL check_status("clEnqueueWriteBuffer", ierr) enddo if (blocking) then CALL check_status("clFinish on write", & clFinish(cmd_queues({self._OCL_MANAGEMENT_QUEUE}))) endif else ! Copy across the whole starty:starty+ny rows in a single ! copy operation. size_in_bytes = int(size(from,1) * ny, 8) * c_sizeof(from(1,1)) offset_in_bytes = int(size(from,1) * (starty-1)) & * c_sizeof(from(1,1)) ierr = clEnqueueWriteBuffer(& cmd_queues({self._OCL_MANAGEMENT_QUEUE}), cl_mem, & CL_TRUE, offset_in_bytes, size_in_bytes, & C_LOC(from(1, starty)), 0, C_NULL_PTR, C_NULL_PTR) CALL check_status("clEnqueueWriteBuffer", ierr) endif end subroutine write_sub ''' # Create the symbol for the routine. subroutine_symbol = RoutineSymbol("write_to_device") # Obtain the PSyIR representation of the code above fortran_reader = FortranReader() container = fortran_reader.psyir_from_source(code) subroutine = container.children[0] symtab.add(subroutine_symbol, tag="ocl_write_func") subroutine.detach() subroutine.symbol = subroutine_symbol # Add the subroutine as child of the provided node node.addchild(subroutine) return symtab.lookup_with_tag("ocl_write_func") @staticmethod def _insert_ocl_initialise_buffer(node): ''' Returns the symbol of a subroutine that initialises an OpenCL buffer in the OpenCL device using FortCL. If the subroutine doesn't already exist it also generates it. :param node: the container where the new subroutine will be inserted. :type node: :py:class:`psyclone.psyir.nodes.Container` :returns: the symbol of the buffer initialisation subroutine. :rtype: :py:class:`psyclone.psyir.symbols.RoutineSymbol` ''' # pylint: disable=too-many-locals symtab = node.symbol_table try: return symtab.lookup_with_tag("ocl_init_buffer_func") except KeyError: # If the Symbol does not exist, the rest of this method # will generate it. pass # Get the GOcean API property names used in this routine api_config = Config.get().api_conf("gocean") host_buff = \ api_config.grid_properties["go_grid_data"].fortran.format("field") props = api_config.grid_properties num_x = props["go_grid_nx"].fortran.format("field") num_y = props["go_grid_ny"].fortran.format("field") # Fields need to provide a function pointer to how the # device data is going to be read and written, if it doesn't # exist, create the appropriate subroutine first. read_fp = symtab.lookup_with_tag("ocl_read_func").name write_fp = symtab.lookup_with_tag("ocl_write_func").name # Code of the subroutine in Fortran code = f''' subroutine initialise_device_buffer(field) USE fortcl, ONLY: create_rw_buffer USE iso_c_binding, only: c_size_t use field_mod type(r2d_field), intent(inout), target :: field integer(kind=c_size_t) size_in_bytes IF (.NOT. field%data_on_device) THEN size_in_bytes = int({num_x} * {num_y}, 8) * & c_sizeof({host_buff}(1,1)) ! Create buffer on device, we store it without type information ! on the dl_esm_inf pointer (transfer/static_cast to void*) field%device_ptr = transfer( & create_rw_buffer(size_in_bytes), & field%device_ptr) field%data_on_device = .true. field%read_from_device_f => {read_fp} field%write_to_device_f => {write_fp} END IF end subroutine initialise_device_buffer ''' # Create the symbol for the routine . subroutine_symbol = RoutineSymbol("initialise_device_buffer") # Obtain the PSyIR representation of the code above fortran_reader = FortranReader() container = fortran_reader.psyir_from_source(code) subroutine = container.children[0] symtab.add(subroutine_symbol, tag="ocl_init_buffer_func") subroutine.detach() subroutine.symbol = subroutine_symbol # Add the subroutine as child of the provided node node.addchild(subroutine) return symtab.lookup_with_tag("ocl_init_buffer_func")
# For AutoAPI documentation generation __all__ = ["GOOpenCLTrans"]