# -----------------------------------------------------------------------------
# BSD 3-Clause License
#
# Copyright (c) 2021-2025, Science and Technology Facilities Council.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
# -----------------------------------------------------------------------------
# Authors R. W. Ford, A. R. Porter and S. Siso, STFC Daresbury Lab
'''This module contains the GOcean-specific OpenCL transformation.
'''
import os
from fparser.two import Fortran2003
from psyclone.configuration import Config
from psyclone.domain.common.transformations import KernelModuleInlineTrans
from psyclone.errors import GenerationError
from psyclone.gocean1p0 import GOInvokeSchedule, GOLoop
from psyclone.psyGen import (
Transformation, args_filter, InvokeSchedule,
HaloExchange
)
from psyclone.psyir.backend.opencl import OpenCLWriter
from psyclone.psyir.frontend.fortran import FortranReader
from psyclone.psyir.nodes import (
Routine, Call, Reference, Literal,
Assignment, IfBlock, ArrayReference, Schedule, BinaryOperation,
StructureReference, FileContainer, CodeBlock, IntrinsicCall,
Container, DataNode)
from psyclone.psyir.symbols import (
ArrayType, DataSymbol, RoutineSymbol, ContainerSymbol,
UnsupportedFortranType, ArgumentInterface, ImportInterface,
INTEGER_TYPE, CHARACTER_TYPE, BOOLEAN_TYPE, ScalarType)
from psyclone.transformations import TransformationError
[docs]
class GOOpenCLTrans(Transformation):
'''
Switches on/off the generation of an OpenCL PSy layer for a given
InvokeSchedule. Additionally, it will generate OpenCL kernels for
each of the kernels referenced by the Invoke. For example:
>>> from psyclone.parse.algorithm import parse
>>> from psyclone.psyGen import PSyFactory
>>> API = "gocean"
>>> FILENAME = "shallow_alg.f90" # examples/gocean/eg1
>>> ast, invoke_info = parse(FILENAME, api=API)
>>> psy = PSyFactory(API, distributed_memory=False).create(invoke_info)
>>> schedule = psy.invokes.get('invoke_0').schedule
>>> ocl_trans = GOOpenCLTrans()
>>> ocl_trans.apply(schedule)
>>> print(schedule.view())
'''
# Specify which OpenCL command queue to use for management operations like
# data transfers when generating an OpenCL PSy-layer
_OCL_MANAGEMENT_QUEUE = 1
# TODO #1572: These are class attributes because multiple invokes may need
# to generate a single OpenCL environment (e.g. to share the device data
# pointers) and therefore guarantee the same properties, but this hasn't
# been tested. PSycloneBench ShallowWater could be an example of this.
# Biggest queue number that any kernel is allocated to. The OpenCL
# environment should be set up with at least this number of queues.
_max_queue_number = 1
# Whether to enable the profiling option in the OpenCL environment
_enable_profiling = False
# Whether to enable the out_of_order option in the OpenCL environment
_out_of_order = False
# Total number of invokes that have been transformed to OpenCL
_transformed_invokes = 0
# Reference to the OpenCL kernels file
_kernels_file = None
@property
def name(self):
'''
:returns: the name of this transformation.
:rtype: str
'''
return "GOOpenCLTrans"
[docs]
def validate(self, node, options=None):
'''
Checks that the supplied InvokeSchedule is valid and that an OpenCL
version of it can be generated.
:param node: the Schedule to check.
:type node: :py:class:`psyclone.psyGen.InvokeSchedule`
:param options: a dictionary with options for transformations.
:type options: dict of str:value or None
:param bool options["enable_profiling"]: whether or not to set up the
OpenCL environment with the profiling option enabled.
:param bool options["out_of_order"]: whether or not to set up the
OpenCL environment with the out_of_order option enabled.
:param bool options["end_barrier"]: whether or not to add an OpenCL
barrier at the end of the transformed invoke.
:raises TransformationError: if the InvokeSchedule is not for the
GOcean API.
:raises TransformationError: if any of the kernels have arguments
which are passed as a literal.
:raises TransformationError: if any of the provided options is invalid.
:raises TransformationError: if any of the provided options is not
compatible with a previous OpenCL environment.
:raises TransformationError: if any kernel in this invoke has a
global variable used by an import.
:raises TransformationError: if any kernel does not iterate over
the whole grid.
'''
if isinstance(node, InvokeSchedule):
if not isinstance(node, GOInvokeSchedule):
raise TransformationError(
f"OpenCL generation is currently only supported for the "
f"GOcean API but got an InvokeSchedule of type: "
f"'{type(node).__name__}'")
else:
raise TransformationError(
f"Error in GOOpenCLTrans: the supplied node must be a (sub-"
f"class of) InvokeSchedule but got {type(node)}")
# Validate options map
valid_options = ['end_barrier', 'enable_profiling', 'out_of_order']
if options:
for key, value in options.items():
if key in valid_options:
# All current options should contain boolean values
if not isinstance(value, bool):
raise TransformationError(
f"InvokeSchedule OpenCL option '{key}' should be "
f"a boolean.")
else:
raise TransformationError(
f"InvokeSchedule does not support the OpenCL option "
f"'{key}'. The supported options are: "
f"{valid_options}.")
# Validate that the options are valid with previously generated OpenCL
if self._transformed_invokes > 0:
if ('enable_profiling' in options and
self._enable_profiling != options['enable_profiling']):
raise TransformationError(
f"Can't generate an OpenCL Invoke with enable_profiling='"
f"{options['enable_profiling']}' since a previous "
f"transformation used a different value, and their OpenCL"
f" environments must match.")
if ('out_of_order' in options and
self._out_of_order != options['out_of_order']):
raise TransformationError(
f"Can't generate an OpenCL Invoke with out_of_order='"
f"{options['out_of_order']}' since a previous "
f"transformation used a different value, and their OpenCL "
f"environments must match.")
# Now we need to check that none of the invoke arguments is a literal
args = args_filter(node.args, arg_types=["scalar"])
for arg in args:
if arg.is_literal:
raise TransformationError(
f"Cannot generate OpenCL for Invokes that contain kernel "
f"arguments which are a literal, but found the literal "
f"'{arg.name}' used as an argument in invoke "
f"'{node.name}'.")
# Check that we can construct the PSyIR and SymbolTable of each of
# the kernels in this Schedule. Also check that none of them access
# any form of global data (that is not a routine argument or just
# type information).
for kern in node.kernels():
KernelModuleInlineTrans().validate(kern)
for ksched in kern.get_callees():
global_variables = set(ksched.symbol_table.imported_symbols)
prec_symbols = set(ksched.symbol_table.precision_datasymbols)
if global_variables.difference(prec_symbols):
names = sorted([sym.name for sym in
global_variables.difference(prec_symbols)])
raise TransformationError(
f"The Symbol Table for kernel '{kern.name}' contains "
f"the following symbols with 'global' scope: {names}. "
f"An OpenCL kernel cannot call other kernels and all "
f"of the data it accesses must be passed by argument. "
f"Use the KernelImportsToArguments transformation to "
f"convert such symbols to kernel arguments first.")
# In OpenCL all kernel loops should iterate the whole grid
for kernel in node.kernels():
inner_loop = kernel.ancestor(GOLoop)
outer_loop = inner_loop.ancestor(GOLoop)
if not (inner_loop.field_space == "go_every" and
outer_loop.field_space == "go_every" and
inner_loop.iteration_space == "go_all_pts" and
outer_loop.iteration_space == "go_all_pts"):
raise TransformationError(
f"The kernel '{kernel.name}' does not iterate over all "
f"grid points. This is a necessary requirement for "
f"generating the OpenCL code and can be done by applying "
f"the GOMoveIterationBoundariesInsideKernelTrans to each "
f"kernel before the GOOpenCLTrans.")
[docs]
def apply(self, node, options=None):
'''
Apply the OpenCL transformation to the supplied GOInvokeSchedule. This
causes PSyclone to generate an OpenCL version of the corresponding
PSy-layer routine. The generated code makes use of the FortCL
library (https://github.com/stfc/FortCL) in order to manage the
OpenCL device directly from Fortran.
:param node: the InvokeSchedule to transform.
:type node: :py:class:`psyclone.psyGen.GOInvokeSchedule`
:param options: set of option to tune the OpenCL generation.
:type options: dict of str:value or None
:param bool options["enable_profiling"]: whether or not to set up the \
OpenCL environment with the profiling option enabled.
:param bool options["out_of_order"]: whether or not to set up the \
OpenCL environment with the out_of_order option enabled.
:param bool options["end_barrier"]: whether or not to add an OpenCL \
barrier at the end of the transformed invoke.
'''
if not options:
options = {}
self.validate(node, options)
api_config = Config.get().api_conf("gocean")
# Update class attributes
if 'enable_profiling' in options:
self._enable_profiling = options['enable_profiling']
if 'out_of_order' in options:
self._out_of_order = options['out_of_order']
self._transformed_invokes += 1
# Get end_barrier option
end_barrier = options.get('end_barrier', True)
# Update the maximum value that the queue_number have.
for kernel in node.coded_kernels():
self._max_queue_number = max(self._max_queue_number,
kernel.opencl_options["queue_number"])
# Insert, if they don't already exist, the necessary OpenCL helper
# subroutines in the root Container.
module = node.ancestor(Container)
psy_init = self._insert_opencl_init_routine(module)
init_grid = self._insert_initialise_grid_buffers(module)
write_grid_buf = self._insert_write_grid_buffers(module)
self._insert_ocl_read_from_device_function(module)
self._insert_ocl_write_to_device_function(module)
init_buf = self._insert_ocl_initialise_buffer(module)
for kern in node.coded_kernels():
self._insert_ocl_arg_setter_routine(module, kern)
# Insert fortcl, clfortran and c_iso_binding import statement
fortcl = ContainerSymbol("fortcl")
node.symbol_table.add(fortcl)
get_num_cmd_queues = RoutineSymbol(
"get_num_cmd_queues", interface=ImportInterface(fortcl))
get_cmd_queues = RoutineSymbol(
"get_cmd_queues", interface=ImportInterface(fortcl))
get_kernel_by_name = RoutineSymbol(
"get_kernel_by_name", interface=ImportInterface(fortcl))
node.symbol_table.add(get_num_cmd_queues)
node.symbol_table.add(get_cmd_queues)
node.symbol_table.add(get_kernel_by_name)
clfortran = ContainerSymbol("clfortran")
node.symbol_table.add(clfortran)
cl_finish = RoutineSymbol(
"clFinish", interface=ImportInterface(clfortran))
cl_launch = RoutineSymbol(
"clEnqueueNDRangeKernel",
interface=ImportInterface(clfortran))
node.symbol_table.add(cl_finish)
node.symbol_table.add(cl_launch)
iso_c_binding = ContainerSymbol("iso_c_binding")
node.symbol_table.add(iso_c_binding)
c_loc = RoutineSymbol(
"C_LOC", interface=ImportInterface(iso_c_binding))
c_null = DataSymbol(
"C_NULL_PTR", datatype=INTEGER_TYPE,
interface=ImportInterface(iso_c_binding))
node.symbol_table.add(c_loc)
node.symbol_table.add(c_null)
# Include the check_status subroutine if we are in debug_mode
if api_config.debug_mode:
ocl_utils = ContainerSymbol("ocl_utils_mod")
check_status = RoutineSymbol(
"check_status", interface=ImportInterface(ocl_utils))
node.symbol_table.add(ocl_utils)
node.symbol_table.add(check_status)
# Declare local variables needed by an OpenCL PSy-layer invoke
qlist = node.symbol_table.new_symbol(
"cmd_queues", symbol_type=DataSymbol,
datatype=UnsupportedFortranType(
"integer(kind=c_intptr_t), pointer, save :: cmd_queues(:)"),
tag="opencl_cmd_queues")
# 'first_time' needs to be an UnsupportedFortranType because it has
# SAVE and initial value
first = DataSymbol("first_time",
datatype=UnsupportedFortranType(
"logical, save :: first_time = .true."))
node.symbol_table.add(first, tag="first_time")
flag = node.symbol_table.new_symbol(
"ierr", symbol_type=DataSymbol, datatype=INTEGER_TYPE,
tag="opencl_error")
global_size = node.symbol_table.new_symbol(
"globalsize", symbol_type=DataSymbol,
datatype=UnsupportedFortranType(
"integer(kind=c_size_t), target :: globalsize(2)"))
local_size = node.symbol_table.new_symbol(
"localsize", symbol_type=DataSymbol,
datatype=UnsupportedFortranType(
"integer(kind=c_size_t), target :: localsize(2)"))
# Bring all the boundaries at the beginning (since we are going to
# use them during the setup block - and they don't change)
boundary_vars = []
for tag, symbol in node.symbol_table.tags_dict.items():
if tag.startswith(("xstart_", "xstop_", "ystart_", "ystop_")):
boundary_vars.append(symbol)
cursor = 0
for assignment in node.walk(Assignment):
if assignment.lhs.symbol in boundary_vars:
node.children.insert(cursor, assignment.detach())
cursor += 1
# Create block of code to execute only the first time:
setup_block = IfBlock.create(Reference(first), [])
setup_block.preceding_comment = \
"Initialise OpenCL runtime, kernels and buffers"
node.children.insert(cursor, setup_block)
setup_block.if_body.addchild(Call.create(psy_init, []))
# Set up cmd_queues pointer
ptree = Fortran2003.Pointer_Assignment_Stmt(
f"{qlist.name} => {get_cmd_queues.name}()")
cblock = CodeBlock([ptree], CodeBlock.Structure.STATEMENT)
setup_block.if_body.addchild(cblock)
# Declare and assign kernel pointers
for kern in node.coded_kernels():
name = "kernel_" + kern.name
try:
kpointer = node.symbol_table.lookup_with_tag(name)
except KeyError:
pointer_type = UnsupportedFortranType(
"INTEGER(KIND=c_intptr_t), TARGET, SAVE :: " + name)
kpointer = DataSymbol(name, datatype=pointer_type)
node.symbol_table.add(kpointer, tag=name)
setup_block.if_body.addchild(
Assignment.create(
Reference(kpointer),
Call.create(get_kernel_by_name,
[Literal(kern.name, CHARACTER_TYPE)])))
# Traverse all arguments and make sure all the buffers are initialised
initialised_fields = set()
there_is_a_grid_buffer = False
for kern in node.coded_kernels():
for arg in kern.arguments.args:
if arg.argument_type == "field":
field = node.symbol_table.lookup(arg.name)
if field not in initialised_fields:
# Call the init_buffer routine with this field
call = Call.create(init_buf, [Reference(field)])
setup_block.if_body.addchild(call)
initialised_fields.add(field)
elif (arg.argument_type == "grid_property" and
not arg.is_scalar):
if not there_is_a_grid_buffer:
# Call the grid init_buffer routine
field = node.symbol_table.lookup(
kern.arguments.find_grid_access().name)
call = Call.create(init_grid, [Reference(field)])
setup_block.if_body.addchild(call)
there_is_a_grid_buffer = True
if not arg.is_scalar:
# All buffers will be assigned to a local OpenCL memory
# object to easily reference them, make sure this local
# variable is declared in the Invoke.
name = arg.name + "_cl_mem"
try:
node.symbol_table.lookup_with_tag(name)
except KeyError:
node.symbol_table.new_symbol(
name, tag=name, symbol_type=DataSymbol,
datatype=UnsupportedFortranType(
"INTEGER(KIND=c_intptr_t) :: " + name))
# Now call all the set_args routines because in some platforms (e.g.
# in Xilinx FPGA) knowing which arguments each kernel is going to use
# allows the write operation to place the data into the appropriate
# memory bank.
first_statement_comment = False
kernel_names = set()
for kern in node.coded_kernels():
if kern.name not in kernel_names:
kernel_names.add(kern.name)
callblock = self._generate_set_args_call(kern, node.scope)
for child in callblock.pop_all_children():
setup_block.if_body.addchild(child)
if not first_statement_comment:
child.preceding_comment = (
"Do a set_args now so subsequent writes place the "
"data appropriately")
first_statement_comment = True
# Now we can insert calls to write_to_device method for each buffer
# and the grid writing call if there is one (in a new first time block)
first_statement_comment = False
for field in initialised_fields:
call = Call.create(
RoutineSymbol(field.name+"%write_to_device"), [])
setup_block.if_body.addchild(call)
if not first_statement_comment:
call.preceding_comment = "Write data to the device"
first_statement_comment = True
if there_is_a_grid_buffer:
fieldarg = node.coded_kernels()[0].arguments.find_grid_access()
field = node.symbol_table.lookup(fieldarg.name)
call = Call.create(write_grid_buf, [Reference(field)])
setup_block.if_body.addchild(call)
# We will just mark the nodes we are replacing as deleting them inside
# the loop would break the PSy-layer backward_dependency method in the
# following iterations. We will detach all these nodes after the loop.
nodes_to_detach = []
# Transform each kernel call loop construct to its equivalent FortCL
# statements
for kern in node.coded_kernels():
outerloop = kern.ancestor(GOLoop).ancestor(GOLoop)
# Set up globalsize and localsize arrays
garg = node.coded_kernels()[0].arguments.find_grid_access()
num_x = api_config.grid_properties["go_grid_nx"].fortran\
.format(garg.name)
num_y = api_config.grid_properties["go_grid_ny"].fortran\
.format(garg.name)
assig = Assignment.create(
Reference(global_size),
Literal(f"(/{num_x}, {num_y}/)",
ArrayType(INTEGER_TYPE, [2])))
node.children.insert(outerloop.position, assig)
local_size_value = kern.opencl_options['local_size']
assig = Assignment.create(
Reference(local_size),
Literal(f"(/{local_size_value}, 1/)",
ArrayType(INTEGER_TYPE, [2])))
node.children.insert(outerloop.position, assig)
# Check that the global_size is multiple of the local_size
if api_config.debug_mode:
fortran_reader = FortranReader()
global_size_expr = fortran_reader.psyir_from_expression(
num_x, node.symbol_table)
self._add_divisibility_check(node, outerloop.position,
check_status, global_size_expr,
local_size_value)
# Retrieve kernel symbol
kernelsym = node.symbol_table.lookup_with_tag(
"kernel_" + kern.name)
# Choose the command queue number to which to dispatch this kernel.
# We have do deal with possible dependencies to kernels dispatched
# in different command queues as the order of execution is not
# guaranteed.
queue_number = kern.opencl_options['queue_number']
cmd_queue = ArrayReference.create(
qlist, [Literal(str(queue_number), INTEGER_TYPE)])
dependency = outerloop.backward_dependence()
# If the dependency is a loop containing a kernel, add a barrier if
# the previous kernels were dispatched in a different command queue
if dependency:
for kernel_dep in dependency.coded_kernels():
previous_queue = kernel_dep.opencl_options['queue_number']
if previous_queue != queue_number:
# If the backward dependency is being executed in
# another queue we add a barrier to make sure the
# previous kernel has finished before this halo
# exchange starts.
barrier = Assignment.create(
Reference(flag),
Call.create(cl_finish, [
ArrayReference.create(qlist, [
Literal(str(previous_queue),
INTEGER_TYPE)])]))
node.children.insert(outerloop.position, barrier)
# If the dependency is something other than a kernel, currently we
# dispatch everything else to queue _OCL_MANAGEMENT_QUEUE, so add a
# barrier if this kernel is not on queue _OCL_MANAGEMENT_QUEUE.
if dependency and not dependency.coded_kernels() and \
queue_number != self._OCL_MANAGEMENT_QUEUE:
barrier = Assignment.create(
Reference(flag),
Call.create(cl_finish, [
ArrayReference.create(qlist, [
Literal(str(self._OCL_MANAGEMENT_QUEUE),
INTEGER_TYPE)])]))
node.children.insert(outerloop.position, barrier)
# Check that everything has succeeded before the kernel launch
if api_config.debug_mode:
self._add_ready_check(node, outerloop.position, check_status,
kern.name, flag, cl_finish,
cmd_queue.copy())
callblock = self._generate_set_args_call(kern, node.scope)
for child in callblock.pop_all_children():
node.children.insert(outerloop.position, child)
# Then we call the clEnqueueNDRangeKernel
assig = Assignment.create(
Reference(flag),
Call.create(cl_launch, [
# OpenCL Command Queue
cmd_queue,
# OpenCL Kernel object
Reference(kernelsym),
# Number of work dimensions
Literal("2", INTEGER_TYPE),
# Global offset (if NULL the global IDs start at
# offset (0,0,0))
Reference(c_null),
# Global work size
Call.create(c_loc, [Reference(global_size)]),
# Local work size
Call.create(c_loc, [Reference(local_size)]),
# Number of events in wait list
Literal("0", INTEGER_TYPE),
# Event wait list that need to be completed before
# this kernel
Reference(c_null),
# Event that identifies this kernel completion
Reference(c_null)]))
assig.preceding_comment = "Launch the kernel"
node.children.insert(outerloop.position, assig)
self._insert_kernel_code_in_opencl_file(kern)
# Add additional checks if we are in debug mode
if api_config.debug_mode:
self._add_kernel_check(node, outerloop.position, check_status,
kern.name, flag, cl_finish,
cmd_queue.copy())
nodes_to_detach.append(outerloop)
# If we execute the kernels asynchronously, we need to add wait
# statements before the halo exchanges to guarantee that the data
# has been updated
for possible_dependent_node in node.walk(HaloExchange):
# The backward_dependences returns the last Loop with a kernel
# that has a dependency with this halo exchange
dependency = possible_dependent_node.backward_dependence()
if dependency:
for kernel_dep in dependency.coded_kernels():
previous_queue = kernel_dep.opencl_options['queue_number']
if previous_queue != self._OCL_MANAGEMENT_QUEUE:
# If the backward dependency is being executed in
# another queue we add a barrier to make sure the
# previous kernel has finished before this one starts.
barrier = Assignment.create(
Reference(flag),
Call.create(cl_finish, [
ArrayReference.create(qlist, [
Literal(str(previous_queue),
INTEGER_TYPE)])]))
pos = possible_dependent_node.position
node.children.insert(pos, barrier)
for node_to_detach in nodes_to_detach:
node_to_detach.detach()
if end_barrier:
self._add_end_barrier(node, flag, cl_finish, qlist)
# And at the very end always makes sure that first_time value is False
assign = Assignment.create(Reference(first),
Literal("false", BOOLEAN_TYPE))
assign.preceding_comment = "Unset the first time flag"
node.addchild(assign)
self._output_opencl_kernels_file()
def _add_end_barrier(self, node, flag, cl_finish, qlist):
''' Append into the given node a OpenCL Wait operation for each of
the OpenCL queues in use.
:param node: PSyIR node where to append the barrier.
:type node: :py:class:`psyclone.psyir.nodes.Schedule`
:param flag: PSyIR symbol to use as flag.
:type flag: :py:class:`psyclone.psyir.symbols.DataSymbol`
:param cl_finish: PSyIR symbol of the barrier routine.
:type cl_finish: :py:class:`psyclone.psyir.symbols.RoutineSymbol`
:param qlist: PSyIR symbol of the OpenCL queues array.
:type qlist: :py:class:`psyclone.psyir.symbols.DataSymbol`
'''
# We need a clFinish for each of the queues in the implementation
added_comment = False
for num in range(1, self._max_queue_number + 1):
queue = ArrayReference.create(qlist, [Literal(str(num),
INTEGER_TYPE)])
node.addchild(
Assignment.create(
Reference(flag), Call.create(cl_finish, [queue])))
if not added_comment:
node.children[-1].preceding_comment = \
"Wait until all kernels have finished"
added_comment = True
@staticmethod
def _add_divisibility_check(node, position, check_status, global_size_expr,
local_size):
''' Insert into node a check that the global_size is exactly
divisible by the local size.
:param node: where to insert the divisibility check.
:type node: :py:class:`psyclone.psyir.nodes.Schedule`
:param int position: location where to insert the divisibilitay check.
:param check_status: PSyIR symbol of the check routine.
:type check_status: :py:class:`psyclone.psyir.symbols.RoutineSymbol`
:param global_size_expr: PSyIR representing the global_size.
:type global_size_expr: :py:class:`psyclone.psyir.nodes.DataNode`
:param int local_size: size of the OpenCL local work_group.
'''
check = BinaryOperation.create(
BinaryOperation.Operator.NE,
IntrinsicCall.create(
IntrinsicCall.Intrinsic.MOD,
[global_size_expr,
Literal(str(local_size), INTEGER_TYPE)]
),
Literal("0", INTEGER_TYPE))
message = ("Global size is not a multiple of local size ("
"mandatory in OpenCL < 2.0).")
error = Call.create(check_status,
[Literal(message, CHARACTER_TYPE),
Literal("-1", INTEGER_TYPE)])
ifblock = IfBlock.create(check, [error])
node.children.insert(position, ifblock)
@staticmethod
def _add_kernel_check(node, position, check_status, kernel_name,
flag, cl_finish, cmd_queue):
''' Insert into node a check that the kernel has been launched and
has been executed successfully.
:param node: where to insert the kernel check.
:type node: :py:class:`psyclone.psyir.nodes.Schedule`
:param int position: location where to insert the kernel check.
:param check_status: PSyIR symbol of the check routine.
:type check_status: :py:class:`psyclone.psyir.symbols.RoutineSymbol`
:param str kernel_name: name of the kernel being checked.
:param flag: PSyIR symbol to use as flag.
:type flag: :py:class:`psyclone.psyir.symbols.DataSymbol`
:param cl_finish: PSyIR symbol of the barrier routine.
:type cl_finish: :py:class:`psyclone.psyir.symbols.RoutineSymbol`
:param cmd_queue: PSyIR symbol of the OpenCL command queues array.
:type cmd_queue: :py:class:`psyclone.psyir.symbols.DataSymbol`
'''
# First check the launch return value
message = Literal(f"{kernel_name} clEnqueueNDRangeKernel",
CHARACTER_TYPE)
check = Call.create(check_status, [message, Reference(flag)])
node.children.insert(position, check)
# Then add a barrier
barrier = Assignment.create(
Reference(flag),
Call.create(cl_finish, [cmd_queue]))
node.children.insert(position + 1, barrier)
# And check the kernel executed successfully
message = Literal(f"Errors during {kernel_name}", CHARACTER_TYPE)
check = Call.create(check_status, [message, Reference(flag)])
node.children.insert(position + 2, check)
@staticmethod
def _add_ready_check(node, position, check_status, kernel_name,
flag, cl_finish, cmd_queue):
''' Insert into node a check that verifies if everything in the
command queues previous to a kernel launch has completed successfully.
:param node: where to insert the kernel check.
:type node: :py:class:`psyclone.psyir.nodes.Schedule`
:param int position: location where to insert the kernel check.
:param check_status: PSyIR symbol of the check routine.
:type check_status: :py:class:`psyclone.psyir.symbols.RoutineSymbol`
:param str kernel_name: name of the kernel being checked.
:param flag: PSyIR symbol to use as flag.
:type flag: :py:class:`psyclone.psyir.symbols.DataSymbol`
:param cl_finish: PSyIR symbol of the barrier routine.
:type cl_finish: :py:class:`psyclone.psyir.symbols.RoutineSymbol`
:param cmd_queue: PSyIR symbol of the OpenCL command queues array.
:type cmd_queue: :py:class:`psyclone.psyir.symbols.DataSymbol`
'''
barrier = Assignment.create(
Reference(flag),
Call.create(cl_finish, [cmd_queue]))
node.children.insert(position, barrier)
message = Literal(f"Errors before {kernel_name} launch",
CHARACTER_TYPE)
check = Call.create(check_status, [message, Reference(flag)])
node.children.insert(position + 1, check)
def _insert_kernel_code_in_opencl_file(self, kernel):
''' Insert the given kernel into a OpenCL file. For this we need
to remove the 'go_wp' precision symbol which can't be generated
by OpenCL. We assume 'go_wp' is a OpenCL double.
:param kernel: the kernel to insert.
:type kernel: :py:class:`psyclone.psyir.nodes.KernelSchedule`
'''
if not self._kernels_file:
self._kernels_file = FileContainer("opencl_kernels")
# Create a copy of the kernel and remove precision symbols since they
# are not supported in the OpenCL backend.
# validate() has checked that the kernel is not polymorphic.
schedule = kernel.get_callees()[0]
kernel_copy = schedule.copy()
symtab = kernel_copy.symbol_table
# TODO #898: Removing symbols is not properly supported by PSyIR
# because we have to deal with all references to it. In this case we
# implement manually a conversion of all 'go_wp' to a double precision
# and remove the symbol because we guarantee that it just appear in the
# declarations of other symbols (symtab.datasymbols).
# pylint: disable=protected-access
for sym in symtab.datasymbols:
# Not all types have the 'precision' attribute (e.g.
# UnresolvedType)
if (hasattr(sym.datatype, "precision") and
isinstance(sym.datatype.precision, DataNode)):
sym.datatype._precision = ScalarType.Precision.DOUBLE
if 'go_wp' in symtab:
del symtab._symbols['go_wp']
# Insert kernel in the OpenCL kernels file if it doesn't already exist
for routine in self._kernels_file.walk(Routine):
if routine.name == kernel.name:
break # if it exist re-use existing one
# TODO 1572: Here we assume that in the same Invoke (scope) a
# kernel with the same name will be the same kernel, but that
# may not be true when doing multiple invokes.
else:
self._kernels_file.addchild(kernel_copy)
def _output_opencl_kernels_file(self):
''' Write the OpenCL kernels to a file using the OpenCL backend.
'''
# TODO 1013: The code below duplicates some logic of the CodedKern
# rename_and_write method. Ideally this should be moved out of
# the AST and transformations and put into some kind of IOManager.
ocl_writer = OpenCLWriter(kernels_local_size=64)
new_kern_code = ocl_writer(self._kernels_file)
fdesc = None
name_idx = -1
while not fdesc:
name_idx += 1
new_name = f"opencl_kernels_{name_idx}.cl"
try:
# Atomically attempt to open the new kernel file (in case
# this is part of a parallel build)
fdesc = os.open(
os.path.join(Config.get().kernel_output_dir, new_name),
os.O_CREAT | os.O_WRONLY | os.O_EXCL)
except (OSError, IOError):
# The os.O_CREATE and os.O_EXCL flags in combination mean
# that open() raises an error if the file exists
continue
# Write the modified AST out to file
os.write(fdesc, new_kern_code.encode())
# Close the new kernel file
os.close(fdesc)
@staticmethod
def _generate_set_args_call(kernel, scope):
'''
Generate the Call statement to the set_args subroutine for the
provided kernel.
:param kernel: the kernel for which to generate a call to its \
arg_setter subroutine.
:type kernel: :py:class:`psyclone.psyGen.CodedKern`
:param scope: The node representing the scope where the call \
statements will be inserted.
:type scope: :py:class:`psyclone.psyir.nodes.ScopingNode`
:returns: a block of statements that represent the set_args call
:rtype: :py:class:`psyclone.psyir.nodes.Schedule`
'''
call_block = Schedule()
# Retrieve symbol table and kernel symbol
symtab = scope.symbol_table
kernelsym = symtab.lookup_with_tag("kernel_" + kernel.name)
# Find the symbol that defines each boundary for this kernel.
# In OpenCL the iteration boundaries are passed as arguments to the
# kernel because the global work size may exceed the dimensions and
# therefore the updates outside the boundaries should be masked.
# If any of the boundaries is not found, it can not proceed.
boundaries = []
try:
for boundary in ["xstart", "xstop", "ystart", "ystop"]:
tag = boundary + "_" + kernel.name
symbol = symtab.lookup_with_tag(tag)
boundaries.append(symbol.name)
except KeyError as err:
raise GenerationError(
f"Boundary symbol tag '{tag}' not found while generating the "
f"OpenCL code for kernel '{kernel.name}'. Make sure to apply "
f"the GOMoveIterationBoundariesInsideKernelTrans before "
f"attempting the OpenCL code generation.") from err
api_config = Config.get().api_conf("gocean")
# Prepare the argument list for the set_args routine
arguments = [Reference(kernelsym)]
for arg in kernel.arguments.args:
if arg.argument_type == "scalar":
if arg.name in boundaries:
# Boundary values are 0-indexed in OpenCL and 1-indexed in
# PSyIR, therefore we need to subtract 1
bop = BinaryOperation.create(BinaryOperation.Operator.SUB,
arg.psyir_expression(),
Literal("1", INTEGER_TYPE))
arguments.append(bop)
else:
arguments.append(arg.psyir_expression())
elif arg.argument_type == "field":
# Cast buffer to cl_mem type expected by OpenCL
field = symtab.lookup(arg.name)
symbol = symtab.lookup_with_tag(arg.name + "_cl_mem")
source = StructureReference.create(field, ['device_ptr'])
dest = Reference(symbol)
icall = IntrinsicCall.create(IntrinsicCall.Intrinsic.TRANSFER,
[source, dest])
assig = Assignment.create(dest.copy(), icall)
call_block.addchild(assig)
arguments.append(Reference(symbol))
elif arg.argument_type == "grid_property":
garg = kernel.arguments.find_grid_access()
if arg.is_scalar:
# pylint: disable=protected-access
arguments.append(
StructureReference.create(
symtab.lookup(garg.name),
api_config.grid_properties[arg._property_name]
.fortran.split('%')[1:]
))
else:
# Cast grid buffer to cl_mem type expected by OpenCL
device_grid_property = arg.name + "_device"
field = symtab.lookup(garg.name)
source = StructureReference.create(
field, ['grid', device_grid_property])
symbol = symtab.lookup_with_tag(arg.name + "_cl_mem")
dest = Reference(symbol)
icall = IntrinsicCall.create(
IntrinsicCall.Intrinsic.TRANSFER,
[source, dest])
assig = Assignment.create(dest.copy(), icall)
call_block.addchild(assig)
arguments.append(Reference(symbol))
call_symbol = symtab.lookup_with_tag(kernel.name + "_set_args")
call_block.addchild(Call.create(call_symbol, arguments))
return call_block
@staticmethod
def _insert_ocl_arg_setter_routine(node, kernel):
'''
Returns the symbol of the subroutine that sets the OpenCL kernel
arguments for the provided PSy-layer kernel using FortCL. If the
subroutine doesn't exist it also generates it.
:param node: the container where the new subroutine will be inserted.
:type node: :py:class:`psyclone.psyir.nodes.Container`
:param kernel: the kernel call for which to provide the arg_setter \
subroutine.
:type kernel: :py:class:`psyclone.psyGen.CodedKern`
:returns: the symbol representing the arg_setter subroutine.
:rtype: :py:class:`psyclone.psyir.symbols.RoutineSymbol`
'''
# Check if the subroutine already exist.
sub_name = kernel.name + "_set_args"
try:
return node.symbol_table.lookup_with_tag(sub_name)
except KeyError:
# If the Symbol does not exist, the rest of this method
# will generate it.
pass
# Create the new Routine
sub_name = node.symbol_table.next_available_name(sub_name)
sub_symbol = node.symbol_table.new_symbol(
sub_name, tag=kernel.name + "_set_args",
symbol_type=RoutineSymbol)
argsetter = Routine(sub_symbol)
arg_list = []
# Add subroutine imported symbols
clfortran = ContainerSymbol("clfortran")
clsetkernelarg = RoutineSymbol("clSetKernelArg",
interface=ImportInterface(clfortran))
iso_c = ContainerSymbol("iso_c_binding")
c_sizeof = RoutineSymbol("C_SIZEOF", interface=ImportInterface(iso_c))
c_loc = RoutineSymbol("C_LOC", interface=ImportInterface(iso_c))
c_intptr_t = RoutineSymbol("c_intptr_t",
interface=ImportInterface(iso_c))
ocl_utils = ContainerSymbol("ocl_utils_mod")
check_status = RoutineSymbol("check_status",
interface=ImportInterface(ocl_utils))
argsetter.symbol_table.add(clfortran)
argsetter.symbol_table.add(clsetkernelarg)
argsetter.symbol_table.add(iso_c)
argsetter.symbol_table.add(c_sizeof)
argsetter.symbol_table.add(c_loc)
argsetter.symbol_table.add(c_intptr_t)
argsetter.symbol_table.add(ocl_utils)
argsetter.symbol_table.add(check_status)
# Add an argument symbol for the kernel object
kobj = argsetter.symbol_table.new_symbol(
"kernel_obj", symbol_type=DataSymbol,
interface=ArgumentInterface(ArgumentInterface.Access.READ),
datatype=UnsupportedFortranType(
"INTEGER(KIND=c_intptr_t), TARGET :: kernel_obj"))
arg_list.append(kobj)
# Include each kernel call argument as an argument of this routine
for arg in kernel.arguments.args:
name = argsetter.symbol_table.next_available_name(arg.name)
# This function requires 'TARGET' annotated declarations which are
# not supported in the PSyIR, so we build them as
# UnsupportedFortranType for now.
if arg.is_scalar and arg.intrinsic_type == "real":
pointer_type = UnsupportedFortranType(
"REAL(KIND=go_wp), INTENT(IN), TARGET :: " + name)
elif arg.is_scalar:
pointer_type = UnsupportedFortranType(
"INTEGER, INTENT(IN), TARGET :: " + name)
else:
# Everything else is a cl_mem pointer (c_intptr_t)
pointer_type = UnsupportedFortranType(
"INTEGER(KIND=c_intptr_t), INTENT(IN), TARGET :: " + name)
new_arg = DataSymbol(
name, datatype=pointer_type,
interface=ArgumentInterface(ArgumentInterface.Access.READ))
argsetter.symbol_table.add(new_arg)
arg_list.append(new_arg)
argsetter.symbol_table.specify_argument_list(arg_list)
# Create the ierr local variable
ierr = argsetter.symbol_table.new_symbol(
"ierr", symbol_type=DataSymbol, datatype=INTEGER_TYPE)
# Call the clSetKernelArg for each argument and a check_status to
# see if the OpenCL call has succeeded
for index, variable in enumerate(arg_list[1:]):
call = Call.create(clsetkernelarg,
[Reference(kobj),
Literal(str(index), INTEGER_TYPE),
Call.create(c_sizeof, [Reference(variable)]),
Call.create(c_loc, [Reference(variable)])])
assignment = Assignment.create(Reference(ierr), call)
argsetter.addchild(assignment)
emsg = f"clSetKernelArg: arg {index} of {kernel.name}"
call = Call.create(check_status, [Literal(emsg, CHARACTER_TYPE),
Reference(ierr)])
argsetter.addchild(call)
argsetter.children[0].preceding_comment = \
f"Set the arguments for the {kernel.name} OpenCL Kernel"
# Add the subroutine as child of the provided node
node.addchild(argsetter)
return node.symbol_table.lookup_with_tag(kernel.name + "_set_args")
def _insert_opencl_init_routine(self, node):
'''
Returns the symbol of the subroutine that initialises the OpenCL
environment using FortCL. If the subroutine doesn't exist it also
generates it.
:param node: the container where the new subroutine will be inserted.
:type node: :py:class:`psyclone.psyir.nodes.Container`
:returns: the symbol representing the OpenCL initialisation subroutine.
:rtype: :py:class:`psyclone.psyir.symbols.RoutineSymbol`
'''
symtab = node.symbol_table
try:
# TODO #1572: The ocl_init routine may need to be regenerated if
# there are multiple Invokes because _max_queue_number may have
# increased and we need to load the kernels of both invokes.
return symtab.lookup_with_tag("ocl_init_routine")
except KeyError:
# If the Symbol does not exist, the rest of this method
# will generate it.
pass
# Choose a round-robin device number if it has MPI and multiple
# accelerators.
distributed_memory = Config.get().distributed_memory
devices_per_node = Config.get().ocl_devices_per_node
additional_uses = ""
additional_stmts = ""
if devices_per_node > 1 and distributed_memory:
additional_uses += "USE parallel_mod, ONLY: get_rank"
additional_stmts += \
f"ocl_device_num = mod(get_rank()-1, {devices_per_node}) + 1"
# Get a set of all kernel names in the Container. This implementation
# currently assumes all of them will be available in OpenCL
unique_kernels = {kernel.name for kernel in node.coded_kernels()}
# Code of the subroutine in Fortran
code = f'''
subroutine psy_init()
{additional_uses}
use fortcl, only: ocl_env_init, add_kernels
character(len=30) kernel_names({len(unique_kernels)})
integer :: ocl_device_num=1
logical, save :: initialised=.false.
! Check to make sure we only execute this routine once
if (.not. initialised) then
initialised = .true.
! Initialise the opencl environment/device
{additional_stmts}
call ocl_env_init({self._max_queue_number}, ocl_device_num, &
{".true." if self._enable_profiling else ".false."}, &
{".true." if self._out_of_order else ".false."})
! The kernels this psy layer module requires
'''
for index, kernel_name in enumerate(unique_kernels):
code += f"kernel_names({index + 1}) = \"{kernel_name}\"\n"
code += f'''\
! Create the opencl kernel objects. This expects to find all of
! the compiled kernels in FORTCL_KERNELS_FILE environment variable
call add_kernels({len(unique_kernels)}, kernel_names)
end if
end subroutine psy_init'''
# Create the symbol for the routine.
subroutine_symbol = RoutineSymbol("psy_init")
# Obtain the PSyIR representation of the code above
fortran_reader = FortranReader()
container = fortran_reader.psyir_from_source(code)
subroutine = container.children[0]
subroutine.detach()
node.symbol_table.add(subroutine_symbol, tag="ocl_init_routine")
subroutine.symbol = subroutine_symbol
# Add the subroutine as child of the provided node
node.addchild(subroutine)
return symtab.lookup_with_tag("ocl_init_routine")
@staticmethod
def _insert_initialise_grid_buffers(node):
'''
Returns the symbol of a subroutine that initialises all OpenCL grid
buffers in the OpenCL device using FortCL. If the subroutine doesn't
already exist it also generates it.
:param node: the container where the new subroutine will be inserted.
:type node: :py:class:`psyclone.psyir.nodes.Container`
:returns: the symbol of the grid buffer initialisation subroutine.
:rtype: :py:class:`psyclone.psyir.symbols.RoutineSymbol`
'''
# pylint: disable=too-many-locals
symtab = node.symbol_table
try:
return symtab.lookup_with_tag("ocl_init_grid_buffers")
except KeyError:
# If the Symbol does not exist, the rest of this method
# will generate it.
pass
# Get the GOcean API property names used in this routine
api_config = Config.get().api_conf("gocean")
props = api_config.grid_properties
num_x = props["go_grid_nx"].fortran.format("field")
num_y = props["go_grid_ny"].fortran.format("field")
int_arrays = []
real_arrays = []
for key, prop in props.items():
if key == "go_grid_data":
# TODO #676: Ignore because go_grid_data is actually a field
# property
continue
if prop.type == "array" and prop.intrinsic_type == "integer":
int_arrays.append(prop.fortran.format("field"))
elif prop.type == "array" and prop.intrinsic_type == "real":
real_arrays.append(prop.fortran.format("field"))
# Code of the subroutine in Fortran
code = f'''
subroutine initialise_device_grid(field)
USE fortcl, ONLY: create_ronly_buffer
USE iso_c_binding, only: c_size_t
use field_mod
type(r2d_field), intent(inout), target :: field
integer(kind=c_size_t) size_in_bytes
IF (.not. c_associated({int_arrays[0]}_device)) THEN
! Create integer grid fields
size_in_bytes = int({num_x}*{num_y}, 8) * &
c_sizeof({int_arrays[0]}(1,1))
'''
for int_array in int_arrays:
code += f'''
{int_array}_device = transfer( &
create_ronly_buffer(size_in_bytes), {int_array}_device)
'''
code += f'''
! Create real grid buffers
size_in_bytes = int({num_x} * {num_y}, 8) * &
c_sizeof({real_arrays[0]}(1,1))
'''
for real_array in real_arrays:
code += f'''
{real_array}_device = transfer( &
create_ronly_buffer(size_in_bytes), {real_array}_device)
'''
code += '''
END IF
end subroutine initialise_device_grid
'''
# Create the symbol for the routine.
subroutine_symbol = RoutineSymbol("initialise_grid_device_buffers")
# Obtain the PSyIR representation of the code above
fortran_reader = FortranReader()
container = fortran_reader.psyir_from_source(code)
subroutine = container.children[0]
symtab.add(subroutine_symbol, tag="ocl_init_grid_buffers")
subroutine.detach()
subroutine.symbol = subroutine_symbol
# Add the subroutine as child of the provided node
node.addchild(subroutine)
return symtab.lookup_with_tag("ocl_init_grid_buffers")
def _insert_write_grid_buffers(self, node):
'''
Returns the symbol of a subroutine that writes the values of the grid
properties into the OpenCL device buffers using FortCL. If the
subroutine doesn't already exist it also generates it.
:param node: the container where the new subroutine will be inserted.
:type node: :py:class:`psyclone.psyir.nodes.Container`
:returns: the symbol representing the grid buffers writing subroutine.
:rtype: :py:class:`psyclone.psyir.symbols.RoutineSymbol`
'''
# pylint: disable=too-many-locals
symtab = node.symbol_table
try:
return symtab.lookup_with_tag("ocl_write_grid_buffers")
except KeyError:
# If the Symbol does not exist, the rest of this method
# will generate it.
pass
# Get the GOcean API property names used in this routine
api_config = Config.get().api_conf("gocean")
props = api_config.grid_properties
num_x = props["go_grid_nx"].fortran.format("field")
num_y = props["go_grid_ny"].fortran.format("field")
# Code of the subroutine in Fortran
code = f'''
subroutine write_device_grid(field)
USE fortcl, ONLY: get_cmd_queues
use iso_c_binding, only: c_intptr_t, c_size_t, c_sizeof
USE clfortran
USE ocl_utils_mod, ONLY: check_status
type(r2d_field), intent(inout), target :: field
integer(kind=c_size_t) size_in_bytes
INTEGER(c_intptr_t), pointer :: cmd_queues(:)
integer(c_intptr_t) :: cl_mem
integer :: ierr
cmd_queues => get_cmd_queues()
! Integer grid buffers
size_in_bytes = int({num_x} * {num_y}, 8) * &
c_sizeof(field%grid%tmask(1,1))
cl_mem = transfer(field%grid%tmask_device, cl_mem)
ierr = clEnqueueWriteBuffer( &
cmd_queues({self._OCL_MANAGEMENT_QUEUE}), &
cl_mem, CL_TRUE, 0_8, size_in_bytes, &
C_LOC(field%grid%tmask), 0, C_NULL_PTR, C_NULL_PTR)
CALL check_status("clEnqueueWriteBuffer tmask", ierr)
! Real grid buffers
size_in_bytes = int({num_x} * {num_y}, 8) * &
c_sizeof(field%grid%area_t(1,1))
'''
write_str = '''
cl_mem = transfer(field%grid%{0}_device, cl_mem)
ierr = clEnqueueWriteBuffer(cmd_queues({1}), &
cl_mem, CL_TRUE, 0_8, size_in_bytes, &
C_LOC(field%grid%{0}), 0, C_NULL_PTR, C_NULL_PTR)
CALL check_status("clEnqueueWriteBuffer {0}_device", ierr)
'''
for grid_prop in ['area_t', 'area_u', 'area_v', 'dx_u', 'dx_v',
'dx_t', 'dy_u', 'dy_v', 'dy_t', 'gphiu', 'gphiv']:
code += write_str.format(grid_prop, self._OCL_MANAGEMENT_QUEUE)
code += "end subroutine write_device_grid"
# Create the symbol for the routine.
subroutine_symbol = RoutineSymbol("write_grid_buffers")
# Obtain the PSyIR representation of the code above
fortran_reader = FortranReader()
container = fortran_reader.psyir_from_source(code)
subroutine = container.children[0]
symtab.add(subroutine_symbol, tag="ocl_write_grid_buffers")
subroutine.detach()
subroutine.symbol = subroutine_symbol
# Add the subroutine as child of the provided node
node.addchild(subroutine)
return symtab.lookup_with_tag("ocl_write_grid_buffers")
def _insert_ocl_read_from_device_function(self, node):
'''
Returns the symbol of a subroutine that retrieves the data back from
an OpenCL device using FortCL. If the subroutine doesn't already exist
it also generates it.
:param node: the container where the new subroutine will be inserted.
:type node: :py:class:`psyclone.psyir.nodes.Container`
:returns: the symbol of the buffer data retrieving subroutine.
:rtype: :py:class:`psyclone.psyir.symbols.RoutineSymbol`
'''
symtab = node.symbol_table
try:
return symtab.lookup_with_tag("ocl_read_func")
except KeyError:
# If the subroutines does not exist, it needs to be
# generated first.
pass
# Code of the subroutine in Fortran
code = f'''
subroutine read_sub(from, to, startx, starty, nx, ny, blocking)
USE iso_c_binding, only: c_ptr, c_intptr_t, c_size_t, c_sizeof
USE ocl_utils_mod, ONLY: check_status
use kind_params_mod, only: go_wp
USE clfortran
USE fortcl, ONLY: get_cmd_queues
type(c_ptr), intent(in) :: from
real(go_wp), intent(inout), dimension(:,:), target :: to
integer, intent(in) :: startx, starty, nx, ny
logical, intent(in) :: blocking
INTEGER(c_size_t) :: size_in_bytes, offset_in_bytes
integer(c_intptr_t) :: cl_mem
INTEGER(c_intptr_t), pointer :: cmd_queues(:)
integer :: ierr, i
! Give the from pointer the appropriate OpenCL memory object type
cl_mem = transfer(from, cl_mem)
cmd_queues => get_cmd_queues()
! Two copy strategies depending on how much of the total length
! nx covers.
if (nx < size(to, 1) / 2) then
! Dispatch asynchronous copies of just the contiguous data.
do i = starty, starty+ny
size_in_bytes = int(nx, 8) * c_sizeof(to(1,1))
offset_in_bytes = int(size(to, 1) * (i-1) + (startx-1)) &
* c_sizeof(to(1,1))
ierr = clEnqueueReadBuffer( &
cmd_queues({self._OCL_MANAGEMENT_QUEUE}), cl_mem, &
CL_FALSE, offset_in_bytes, size_in_bytes, &
C_LOC(to(startx, i)), 0, C_NULL_PTR, C_NULL_PTR)
CALL check_status("clEnqueueReadBuffer", ierr)
enddo
if (blocking) then
CALL check_status("clFinish on read", &
clFinish(cmd_queues({self._OCL_MANAGEMENT_QUEUE})))
endif
else
! Copy across the whole starty:starty+ny rows in a single
! copy operation.
size_in_bytes = int(size(to, 1) * ny, 8) * c_sizeof(to(1,1))
offset_in_bytes = int(size(to,1)*(starty-1), 8) &
* c_sizeof(to(1,1))
ierr = clEnqueueReadBuffer( &
cmd_queues({self._OCL_MANAGEMENT_QUEUE}), cl_mem, &
CL_TRUE, offset_in_bytes, size_in_bytes, &
C_LOC(to(1,starty)), 0, C_NULL_PTR, C_NULL_PTR)
CALL check_status("clEnqueueReadBuffer", ierr)
endif
end subroutine read_sub
'''
# Create the symbol for the routine.
subroutine_symbol = RoutineSymbol("read_from_device")
# Obtain the PSyIR representation of the code above
fortran_reader = FortranReader()
container = fortran_reader.psyir_from_source(code)
subroutine = container.children[0]
symtab.add(subroutine_symbol, tag="ocl_read_func")
subroutine.detach()
subroutine.symbol = subroutine_symbol
# Add the subroutine as child of the provided node
node.addchild(subroutine)
return symtab.lookup_with_tag("ocl_read_func")
def _insert_ocl_write_to_device_function(self, node):
'''
Returns the symbol of a subroutine that writes the buffer data into
an OpenCL device using FortCL. If the subroutine doesn't already exist
it also generates it.
:param node: the container where the new subroutine will be inserted.
:type node: :py:class:`psyclone.psyir.nodes.Container`
:returns: the symbol of the buffer writing subroutine.
:rtype: :py:class:`psyclone.psyir.symbols.RoutineSymbol`
'''
symtab = node.symbol_table
try:
return symtab.lookup_with_tag("ocl_write_func")
except KeyError:
# If the subroutines does not exist, it needs to be
# generated first.
pass
# Code of the subroutine in Fortran
code = f'''
subroutine write_sub(from, to, startx, starty, nx, ny, blocking)
USE iso_c_binding, only: c_ptr, c_intptr_t, c_size_t, c_sizeof
USE ocl_utils_mod, ONLY: check_status
use kind_params_mod, only: go_wp
USE clfortran
USE fortcl, ONLY: get_cmd_queues
real(go_wp), intent(in), dimension(:,:), target :: from
type(c_ptr), intent(in) :: to
integer, intent(in) :: startx, starty, nx, ny
logical, intent(in) :: blocking
integer(c_intptr_t) :: cl_mem
INTEGER(c_size_t) :: size_in_bytes, offset_in_bytes
INTEGER(c_intptr_t), pointer :: cmd_queues(:)
integer :: ierr, i
! Give the to pointer the appropriate OpenCL memory object type
cl_mem = transfer(to, cl_mem)
cmd_queues => get_cmd_queues()
! Two copy strategies depending on how much of the total length
! nx covers.
if (nx < size(from,1) / 2) then
! Dispatch asynchronous copies of just the contiguous data.
do i=starty, starty+ny
size_in_bytes = int(nx, 8) * c_sizeof(from(1,1))
offset_in_bytes = int(size(from, 1) * (i-1) + (startx-1)) &
* c_sizeof(from(1,1))
ierr = clEnqueueWriteBuffer( &
cmd_queues({self._OCL_MANAGEMENT_QUEUE}), cl_mem, &
CL_FALSE, offset_in_bytes, size_in_bytes, &
C_LOC(from(startx, i)), 0, C_NULL_PTR, C_NULL_PTR)
CALL check_status("clEnqueueWriteBuffer", ierr)
enddo
if (blocking) then
CALL check_status("clFinish on write", &
clFinish(cmd_queues({self._OCL_MANAGEMENT_QUEUE})))
endif
else
! Copy across the whole starty:starty+ny rows in a single
! copy operation.
size_in_bytes = int(size(from,1) * ny, 8) * c_sizeof(from(1,1))
offset_in_bytes = int(size(from,1) * (starty-1)) &
* c_sizeof(from(1,1))
ierr = clEnqueueWriteBuffer(&
cmd_queues({self._OCL_MANAGEMENT_QUEUE}), cl_mem, &
CL_TRUE, offset_in_bytes, size_in_bytes, &
C_LOC(from(1, starty)), 0, C_NULL_PTR, C_NULL_PTR)
CALL check_status("clEnqueueWriteBuffer", ierr)
endif
end subroutine write_sub
'''
# Create the symbol for the routine.
subroutine_symbol = RoutineSymbol("write_to_device")
# Obtain the PSyIR representation of the code above
fortran_reader = FortranReader()
container = fortran_reader.psyir_from_source(code)
subroutine = container.children[0]
symtab.add(subroutine_symbol, tag="ocl_write_func")
subroutine.detach()
subroutine.symbol = subroutine_symbol
# Add the subroutine as child of the provided node
node.addchild(subroutine)
return symtab.lookup_with_tag("ocl_write_func")
@staticmethod
def _insert_ocl_initialise_buffer(node):
'''
Returns the symbol of a subroutine that initialises an OpenCL buffer in
the OpenCL device using FortCL. If the subroutine doesn't already exist
it also generates it.
:param node: the container where the new subroutine will be inserted.
:type node: :py:class:`psyclone.psyir.nodes.Container`
:returns: the symbol of the buffer initialisation subroutine.
:rtype: :py:class:`psyclone.psyir.symbols.RoutineSymbol`
'''
# pylint: disable=too-many-locals
symtab = node.symbol_table
try:
return symtab.lookup_with_tag("ocl_init_buffer_func")
except KeyError:
# If the Symbol does not exist, the rest of this method
# will generate it.
pass
# Get the GOcean API property names used in this routine
api_config = Config.get().api_conf("gocean")
host_buff = \
api_config.grid_properties["go_grid_data"].fortran.format("field")
props = api_config.grid_properties
num_x = props["go_grid_nx"].fortran.format("field")
num_y = props["go_grid_ny"].fortran.format("field")
# Fields need to provide a function pointer to how the
# device data is going to be read and written, if it doesn't
# exist, create the appropriate subroutine first.
read_fp = symtab.lookup_with_tag("ocl_read_func").name
write_fp = symtab.lookup_with_tag("ocl_write_func").name
# Code of the subroutine in Fortran
code = f'''
subroutine initialise_device_buffer(field)
USE fortcl, ONLY: create_rw_buffer
USE iso_c_binding, only: c_size_t
use field_mod
type(r2d_field), intent(inout), target :: field
integer(kind=c_size_t) size_in_bytes
IF (.NOT. field%data_on_device) THEN
size_in_bytes = int({num_x} * {num_y}, 8) * &
c_sizeof({host_buff}(1,1))
! Create buffer on device, we store it without type information
! on the dl_esm_inf pointer (transfer/static_cast to void*)
field%device_ptr = transfer( &
create_rw_buffer(size_in_bytes), &
field%device_ptr)
field%data_on_device = .true.
field%read_from_device_f => {read_fp}
field%write_to_device_f => {write_fp}
END IF
end subroutine initialise_device_buffer
'''
# Create the symbol for the routine .
subroutine_symbol = RoutineSymbol("initialise_device_buffer")
# Obtain the PSyIR representation of the code above
fortran_reader = FortranReader()
container = fortran_reader.psyir_from_source(code)
subroutine = container.children[0]
symtab.add(subroutine_symbol, tag="ocl_init_buffer_func")
subroutine.detach()
subroutine.symbol = subroutine_symbol
# Add the subroutine as child of the provided node
node.addchild(subroutine)
return symtab.lookup_with_tag("ocl_init_buffer_func")
# For AutoAPI documentation generation
__all__ = ["GOOpenCLTrans"]