From 60d89d893026028ac7ce0098b728f2004cd46486 Mon Sep 17 00:00:00 2001 From: Tianjiao Sun Date: Thu, 11 Apr 2019 16:36:14 +0100 Subject: [PATCH 01/17] codegen: Implement SIMD vectorisation Only works when kernel is a Loopy kernel. --- pyop2/codegen/rep2loopy.py | 11 ++++-- pyop2/configuration.py | 2 + pyop2/sequential.py | 79 ++++++++++++++++++++++++++++++++++++-- 3 files changed, 85 insertions(+), 7 deletions(-) diff --git a/pyop2/codegen/rep2loopy.py b/pyop2/codegen/rep2loopy.py index 263b17b3c..c710be131 100644 --- a/pyop2/codegen/rep2loopy.py +++ b/pyop2/codegen/rep2loopy.py @@ -203,13 +203,14 @@ def solve_fn_lookup(target, identifier): class _PreambleGen(ImmutableRecord): - fields = set(("preamble", )) + fields = {"preamble", "idx"} - def __init__(self, preamble): + def __init__(self, preamble, idx="0"): self.preamble = preamble + self.idx = idx def __call__(self, preamble_info): - yield ("0", self.preamble) + yield (self.idx, self.preamble) class PyOP2KernelCallable(loopy.ScalarCallable): @@ -566,7 +567,9 @@ def generate(builder, wrapper_name=None): options=options, assumptions=assumptions, lang_version=(2018, 2), - name=wrapper_name) + name=wrapper_name, + # TODO, should these really be silenced? 
+ silenced_warnings=["write_race*"]) # prioritize loops for indices in context.index_ordering: diff --git a/pyop2/configuration.py b/pyop2/configuration.py index fe5a2c4c5..f70100ec3 100644 --- a/pyop2/configuration.py +++ b/pyop2/configuration.py @@ -79,6 +79,8 @@ class Configuration(dict): DEFAULTS = { "compiler": ("PYOP2_BACKEND_COMPILER", str, "gcc"), "simd_width": ("PYOP2_SIMD_WIDTH", int, 4), + "alignment": ("PYOP2_ALIGNMENT", int, 64), + "time": ("PYOP2_TIME", bool, False), "debug": ("PYOP2_DEBUG", bool, False), "cflags": ("PYOP2_CFLAGS", str, ""), "ldflags": ("PYOP2_LDFLAGS", str, ""), diff --git a/pyop2/sequential.py b/pyop2/sequential.py index 1dbab1c18..3a166660e 100644 --- a/pyop2/sequential.py +++ b/pyop2/sequential.py @@ -37,6 +37,8 @@ from copy import deepcopy as dcopy import ctypes +import loopy +import numpy from pyop2.datatypes import IntType, as_ctypes from pyop2 import base @@ -57,8 +59,48 @@ from pyop2.mpi import collective from pyop2.profiling import timed_region from pyop2.utils import cached_property, get_petsc_dir +from pyop2.configuration import configuration +from pyop2.codegen.rep2loopy import _PreambleGen -import loopy + +def vectorise(wrapper, iname, batch_size): + """Return a vectorised version of wrapper, vectorising over iname. + + :arg wrapper: A loopy kernel to vectorise. + :arg iname: The iteration index to vectorise over. 
+ :arg batch_size: The vector width.""" + if batch_size == 1: + return wrapper + + # create constant zero vectors + wrapper = wrapper.copy(target=loopy.CVecTarget()) + kernel = wrapper.root_kernel + zeros = loopy.TemporaryVariable("_zeros", shape=loopy.auto, dtype=numpy.float64, read_only=True, + initializer=numpy.array(0.0, dtype=numpy.float64), + address_space=loopy.AddressSpace.GLOBAL, zero_size=batch_size) + tmps = kernel.temporary_variables.copy() + tmps["_zeros"] = zeros + kernel = kernel.copy(temporary_variables=tmps) + + # split iname and vectorize the inner loop + inner_iname = iname + "_batch" + + # vectorize using vector extenstions + kernel = loopy.split_iname(kernel, iname, batch_size, slabs=(0, 1), inner_tag="c_vec", inner_iname=inner_iname) + + alignment = configuration["alignment"] + tmps = dict((name, tv.copy(alignment=alignment)) for name, tv in kernel.temporary_variables.items()) + kernel = kernel.copy(temporary_variables=tmps) + + wrapper = wrapper.with_root_kernel(kernel) + + # vector data type + vec_types = [("double", 8), ("int", 4)] # scalar type, bytes + preamble = ["typedef {0} {0}{1} __attribute__ ((vector_size ({2})));".format(t, batch_size, batch_size * b) for t, b in vec_types] + preamble = "\n" + "\n".join(preamble) + + wrapper = loopy.register_preamble_generators(wrapper, [_PreambleGen(preamble, idx="01")]) + return wrapper class JITModule(base.JITModule): @@ -122,6 +164,15 @@ def code_to_compile(self): builder.add_argument(arg) wrapper = generate(builder) + if self._iterset._extruded: + iname = "layer" + else: + iname = "n" + has_matrix = any(arg._is_mat for arg in self._args) + has_rw = any(arg.access == RW for arg in self._args) + if isinstance(self._kernel.code, loopy.LoopKernel) and not (has_matrix or has_rw): + wrapper = loopy.inline_callable_kernel(wrapper, self._kernel.name) + wrapper = vectorise(wrapper, iname, configuration["simd_width"]) code = loopy.generate_code_v2(wrapper) if self._kernel._cpp: @@ -137,8 +188,6 @@ def 
compile(self): if not hasattr(self, '_args'): raise RuntimeError("JITModule has no args associated with it, should never happen") - from pyop2.configuration import configuration - compiler = configuration["compiler"] extension = "cpp" if self._kernel._cpp else "c" cppargs = self._cppargs @@ -184,6 +233,24 @@ def argtypes(self): class ParLoop(petsc_base.ParLoop): + def set_nbytes(self, args): + nbytes = 0 + seen = set() + for arg in args: + if arg.access is INC: + nbytes += arg.data.nbytes + else: + nbytes += arg.data.nbytes + for map_ in arg.map_tuple: + if map_ is None: + continue + for k in map_._kernel_args_: + if k in seen: + continue + nbytes += map_.values.nbytes + seen.add(k) + self.nbytes = nbytes + def prepare_arglist(self, iterset, *args): arglist = iterset._kernel_args_ for arg in args: @@ -199,6 +266,8 @@ def prepare_arglist(self, iterset, *args): continue arglist += (k,) seen.add(k) + if configuration["time"]: + self.set_nbytes(args) return arglist @cached_property @@ -213,6 +282,10 @@ def _compute_event(self): @collective def _compute(self, part, fun, *arglist): + if configuration["time"]: + nbytes = self.comm.allreduce(self.nbytes) + if self.comm.Get_rank() == 0: + print("{0}_BYTES= {1}".format(self._jitmodule._wrapper_name, nbytes)) with self._compute_event: self.log_flops(part.size * self.num_flops) fun(part.offset, part.offset + part.size, *arglist) From 72f3b3f3c8c768a5fbc0174b2cd4e06c398bad6b Mon Sep 17 00:00:00 2001 From: tj sun Date: Thu, 1 Aug 2019 17:55:28 +0100 Subject: [PATCH 02/17] add omp simd vectorization mode --- pyop2/configuration.py | 1 + pyop2/sequential.py | 9 +++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/pyop2/configuration.py b/pyop2/configuration.py index f70100ec3..eab30d5e5 100644 --- a/pyop2/configuration.py +++ b/pyop2/configuration.py @@ -79,6 +79,7 @@ class Configuration(dict): DEFAULTS = { "compiler": ("PYOP2_BACKEND_COMPILER", str, "gcc"), "simd_width": ("PYOP2_SIMD_WIDTH", int, 4), + 
"vectorization_strategy":("PYOP2_VECT_STRATEGY", str, "ve"), "alignment": ("PYOP2_ALIGNMENT", int, 64), "time": ("PYOP2_TIME", bool, False), "debug": ("PYOP2_DEBUG", bool, False), diff --git a/pyop2/sequential.py b/pyop2/sequential.py index 3a166660e..cc8d78d55 100644 --- a/pyop2/sequential.py +++ b/pyop2/sequential.py @@ -85,8 +85,13 @@ def vectorise(wrapper, iname, batch_size): # split iname and vectorize the inner loop inner_iname = iname + "_batch" - # vectorize using vector extenstions - kernel = loopy.split_iname(kernel, iname, batch_size, slabs=(0, 1), inner_tag="c_vec", inner_iname=inner_iname) + if configuration["vectorization_strategy"] == "ve": + # vectorize using vector extenstions + kernel = loopy.split_iname(kernel, iname, batch_size, slabs=(0, 1), inner_tag="c_vec", inner_iname=inner_iname) + else: + # vectoriza using omp pragma simd + assert configuration["vectorization_strategy"] == "omp" + kernel = loopy.split_iname(kernel, iname, batch_size, slabs=(0, 1), inner_tag="omp_simd", inner_iname=inner_iname) alignment = configuration["alignment"] tmps = dict((name, tv.copy(alignment=alignment)) for name, tv in kernel.temporary_variables.items()) From 9a3a5493428d8f881f6b9922a0eb21b711b8c264 Mon Sep 17 00:00:00 2001 From: tj sun Date: Sun, 4 Aug 2019 19:46:27 +0100 Subject: [PATCH 03/17] add openmp flag and by pass workaround flag --- pyop2/codegen/rep2loopy.py | 9 +++++++++ pyop2/compilation.py | 4 ++-- pyop2/sequential.py | 9 ++++++--- 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/pyop2/codegen/rep2loopy.py b/pyop2/codegen/rep2loopy.py index c710be131..2cbf66eb7 100644 --- a/pyop2/codegen/rep2loopy.py +++ b/pyop2/codegen/rep2loopy.py @@ -570,6 +570,15 @@ def generate(builder, wrapper_name=None): name=wrapper_name, # TODO, should these really be silenced? 
silenced_warnings=["write_race*"]) + from pyop2.configuration import configuration + if configuration["time"]: + batch_size = configuration["simd_width"] + if builder.extruded: + start, end = parameters.layer_start, parameters.layer_end + else: + start, end = "start", "end" + wrapper = loopy.assume(wrapper, "{0} mod {1} = 0".format(end, batch_size)) + wrapper = loopy.assume(wrapper, "exists zz: zz > 0 and {0} = {1}*zz + {2}".format(end, configuration["simd_width"], start)) # prioritize loops for indices in context.index_ordering: diff --git a/pyop2/compilation.py b/pyop2/compilation.py index e5a9fefdd..1648be38e 100644 --- a/pyop2/compilation.py +++ b/pyop2/compilation.py @@ -218,7 +218,7 @@ def workaround_cflags(self): if version.StrictVersion("7.3") <= ver <= version.StrictVersion("7.5"): # GCC bug https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90055 # See also https://github.com/firedrakeproject/firedrake/issues/1442 - # And https://github.com/firedrakeproject/firedrake/issues/1717 + return # enable vectorization for paper # Bug also on skylake with the vectoriser in this # combination (disappears without # -fno-tree-loop-vectorize!) 
@@ -396,7 +396,7 @@ class LinuxCompiler(Compiler): :kwarg comm: Optional communicator to compile the code on (only rank 0 compiles code) (defaults to COMM_WORLD).""" def __init__(self, cppargs=[], ldargs=[], cpp=False, comm=None): - opt_flags = ['-march=native', '-O3', '-ffast-math'] + opt_flags = ['-O3', '-ffast-math', '-fopenmp'] if configuration['debug']: opt_flags = ['-O0', '-g'] cc = "mpicc" diff --git a/pyop2/sequential.py b/pyop2/sequential.py index cc8d78d55..bebe7a2c0 100644 --- a/pyop2/sequential.py +++ b/pyop2/sequential.py @@ -83,15 +83,18 @@ def vectorise(wrapper, iname, batch_size): kernel = kernel.copy(temporary_variables=tmps) # split iname and vectorize the inner loop + slabs = (1, 1) + if configuration["time"]: + slabs = (0, 0) inner_iname = iname + "_batch" if configuration["vectorization_strategy"] == "ve": # vectorize using vector extenstions - kernel = loopy.split_iname(kernel, iname, batch_size, slabs=(0, 1), inner_tag="c_vec", inner_iname=inner_iname) + kernel = loopy.split_iname(kernel, iname, batch_size, slabs=slabs, inner_tag="c_vec", inner_iname=inner_iname) else: # vectoriza using omp pragma simd assert configuration["vectorization_strategy"] == "omp" - kernel = loopy.split_iname(kernel, iname, batch_size, slabs=(0, 1), inner_tag="omp_simd", inner_iname=inner_iname) + kernel = loopy.split_iname(kernel, iname, batch_size, slabs=slabs, inner_tag="omp_simd", inner_iname=inner_iname) alignment = configuration["alignment"] tmps = dict((name, tv.copy(alignment=alignment)) for name, tv in kernel.temporary_variables.items()) @@ -243,7 +246,7 @@ def set_nbytes(self, args): seen = set() for arg in args: if arg.access is INC: - nbytes += arg.data.nbytes + nbytes += arg.data.nbytes * 2 else: nbytes += arg.data.nbytes for map_ in arg.map_tuple: From 3a2d5ff2ba39eb73d44fac437327e15a2d9a7e3d Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 11 Apr 2019 17:09:25 +0100 Subject: [PATCH 04/17] DROP BEFORE MERGE: test with correct loopy branch 
--- requirements-git.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-git.txt b/requirements-git.txt index 718e27330..4790f7f1b 100644 --- a/requirements-git.txt +++ b/requirements-git.txt @@ -1,4 +1,4 @@ git+https://github.com/firedrakeproject/petsc.git@firedrake#egg=petsc --no-deps git+https://github.com/firedrakeproject/petsc4py.git@firedrake#egg=petsc4py git+https://github.com/coneoproject/COFFEE.git#egg=coffee -git+https://github.com/firedrakeproject/loopy.git@firedrake#egg=loopy +git+https://github.com/firedrakeproject/loopy.git@cvec#egg=loopy From beedc5fe196b9a7e15575c71dc2ee86402b299e0 Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Wed, 1 Jul 2020 09:57:13 +0100 Subject: [PATCH 05/17] Turn off tree vectorize for certain gcc compilers. We might not need the tree vectorisation flag for our vectorisation anyways. --- pyop2/compilation.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pyop2/compilation.py b/pyop2/compilation.py index 1648be38e..7e951bcca 100644 --- a/pyop2/compilation.py +++ b/pyop2/compilation.py @@ -218,7 +218,6 @@ def workaround_cflags(self): if version.StrictVersion("7.3") <= ver <= version.StrictVersion("7.5"): # GCC bug https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90055 # See also https://github.com/firedrakeproject/firedrake/issues/1442 - return # enable vectorization for paper # Bug also on skylake with the vectoriser in this # combination (disappears without # -fno-tree-loop-vectorize!) From 567ec0cc85611ecfa812d5e0b2ac03f0c727d690 Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Wed, 1 Jul 2020 10:01:26 +0100 Subject: [PATCH 06/17] Add simd compiler flags. 
--- pyop2/compilation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyop2/compilation.py b/pyop2/compilation.py index 7e951bcca..94d058683 100644 --- a/pyop2/compilation.py +++ b/pyop2/compilation.py @@ -367,7 +367,7 @@ class MacCompiler(Compiler): """ def __init__(self, cppargs=[], ldargs=[], cpp=False, comm=None): - opt_flags = ['-march=native', '-O3', '-ffast-math'] + opt_flags = ['-march=native', '-O3', '-ffast-math', '-fopenmp-simd'] if configuration['debug']: opt_flags = ['-O0', '-g'] cc = "mpicc" @@ -395,7 +395,7 @@ class LinuxCompiler(Compiler): :kwarg comm: Optional communicator to compile the code on (only rank 0 compiles code) (defaults to COMM_WORLD).""" def __init__(self, cppargs=[], ldargs=[], cpp=False, comm=None): - opt_flags = ['-O3', '-ffast-math', '-fopenmp'] + opt_flags = ['-O3', '-ffast-math', '-fopenmp-simd'] if configuration['debug']: opt_flags = ['-O0', '-g'] cc = "mpicc" @@ -421,7 +421,7 @@ class LinuxIntelCompiler(Compiler): rank 0 compiles code) (defaults to COMM_WORLD). """ def __init__(self, cppargs=[], ldargs=[], cpp=False, comm=None): - opt_flags = ['-Ofast', '-xHost'] + opt_flags = ['-Ofast', '-xHost', '-qopenmp-simd'] if configuration['debug']: opt_flags = ['-O0', '-g'] cc = "mpicc" From 1f0ea23251f91c38c265b96538f432f5d865ee5e Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Wed, 1 Jul 2020 10:02:37 +0100 Subject: [PATCH 07/17] Remove time configuration. --- pyop2/codegen/rep2loopy.py | 9 --------- pyop2/sequential.py | 8 -------- 2 files changed, 17 deletions(-) diff --git a/pyop2/codegen/rep2loopy.py b/pyop2/codegen/rep2loopy.py index 2cbf66eb7..c710be131 100644 --- a/pyop2/codegen/rep2loopy.py +++ b/pyop2/codegen/rep2loopy.py @@ -570,15 +570,6 @@ def generate(builder, wrapper_name=None): name=wrapper_name, # TODO, should these really be silenced? 
silenced_warnings=["write_race*"]) - from pyop2.configuration import configuration - if configuration["time"]: - batch_size = configuration["simd_width"] - if builder.extruded: - start, end = parameters.layer_start, parameters.layer_end - else: - start, end = "start", "end" - wrapper = loopy.assume(wrapper, "{0} mod {1} = 0".format(end, batch_size)) - wrapper = loopy.assume(wrapper, "exists zz: zz > 0 and {0} = {1}*zz + {2}".format(end, configuration["simd_width"], start)) # prioritize loops for indices in context.index_ordering: diff --git a/pyop2/sequential.py b/pyop2/sequential.py index bebe7a2c0..98a2c5abe 100644 --- a/pyop2/sequential.py +++ b/pyop2/sequential.py @@ -84,8 +84,6 @@ def vectorise(wrapper, iname, batch_size): # split iname and vectorize the inner loop slabs = (1, 1) - if configuration["time"]: - slabs = (0, 0) inner_iname = iname + "_batch" if configuration["vectorization_strategy"] == "ve": @@ -274,8 +272,6 @@ def prepare_arglist(self, iterset, *args): continue arglist += (k,) seen.add(k) - if configuration["time"]: - self.set_nbytes(args) return arglist @cached_property @@ -290,10 +286,6 @@ def _compute_event(self): @collective def _compute(self, part, fun, *arglist): - if configuration["time"]: - nbytes = self.comm.allreduce(self.nbytes) - if self.comm.Get_rank() == 0: - print("{0}_BYTES= {1}".format(self._jitmodule._wrapper_name, nbytes)) with self._compute_event: self.log_flops(part.size * self.num_flops) fun(part.offset, part.offset + part.size, *arglist) From 5e885a7b5bd899d6790d59fe2b38021b996e641c Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Wed, 1 Jul 2020 13:02:23 +0100 Subject: [PATCH 08/17] Default SIMD width. 
--- pyop2/configuration.py | 20 ++++++++++++++++++-- requirements-ext.txt | 1 + 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/pyop2/configuration.py b/pyop2/configuration.py index eab30d5e5..ce666a64e 100644 --- a/pyop2/configuration.py +++ b/pyop2/configuration.py @@ -39,6 +39,22 @@ from pyop2.exceptions import ConfigurationError +def default_simd_width(): + from cpuinfo import get_cpu_info + avx_to_width = {'avx': 2, 'avx1': 2, 'avx128': 2, 'avx2': 4, + 'avx256': 4, 'avx3': 8, 'avx512': 8} + longest_ext = [t for t in get_cpu_info()["flags"] if t.startswith('avx')][-1] + if longest_ext not in avx_to_width.keys(): + if longest_ext[:6] not in avx_to_width.keys(): + assert longest_ext[:4] in avx_to_width.keys(), \ + "The vector extension of your architecture is unknown. Disable vectorisation!" + return avx_to_width[longest_ext[:4]] + else: + return avx_to_width[longest_ext[:6]] + else: + return avx_to_width[longest_ext] + + class Configuration(dict): r"""PyOP2 configuration parameters @@ -78,8 +94,8 @@ class Configuration(dict): # name, env variable, type, default, write once DEFAULTS = { "compiler": ("PYOP2_BACKEND_COMPILER", str, "gcc"), - "simd_width": ("PYOP2_SIMD_WIDTH", int, 4), - "vectorization_strategy":("PYOP2_VECT_STRATEGY", str, "ve"), + "simd_width": ("PYOP2_SIMD_WIDTH", int, default_simd_width()), + "vectorization_strategy": ("PYOP2_VECT_STRATEGY", str, "ve"), "alignment": ("PYOP2_ALIGNMENT", int, 64), "time": ("PYOP2_TIME", bool, False), "debug": ("PYOP2_DEBUG", bool, False), diff --git a/requirements-ext.txt b/requirements-ext.txt index 758ccd963..a73f7da24 100644 --- a/requirements-ext.txt +++ b/requirements-ext.txt @@ -5,3 +5,4 @@ flake8>=2.1.0 pycparser>=2.10 mpi4py>=1.3.1 decorator +py-cpuinfo From db6c2dbae83919efa22c080638e8a7e45718fd86 Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Fri, 3 Jul 2020 13:08:23 +0100 Subject: [PATCH 09/17] Generate CVec Target with batch size information and move typedef into loopy 
codebase. --- pyop2/sequential.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/pyop2/sequential.py b/pyop2/sequential.py index 98a2c5abe..12040fef9 100644 --- a/pyop2/sequential.py +++ b/pyop2/sequential.py @@ -73,7 +73,7 @@ def vectorise(wrapper, iname, batch_size): return wrapper # create constant zero vectors - wrapper = wrapper.copy(target=loopy.CVecTarget()) + wrapper = wrapper.copy(target=loopy.CVecTarget(batch_size)) kernel = wrapper.root_kernel zeros = loopy.TemporaryVariable("_zeros", shape=loopy.auto, dtype=numpy.float64, read_only=True, initializer=numpy.array(0.0, dtype=numpy.float64), @@ -100,12 +100,6 @@ def vectorise(wrapper, iname, batch_size): wrapper = wrapper.with_root_kernel(kernel) - # vector data type - vec_types = [("double", 8), ("int", 4)] # scalar type, bytes - preamble = ["typedef {0} {0}{1} __attribute__ ((vector_size ({2})));".format(t, batch_size, batch_size * b) for t, b in vec_types] - preamble = "\n" + "\n".join(preamble) - - wrapper = loopy.register_preamble_generators(wrapper, [_PreambleGen(preamble, idx="01")]) return wrapper From 8c172f033bef979eb38065de5c7acb03e4df745a Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Fri, 3 Jul 2020 15:59:45 +0100 Subject: [PATCH 10/17] Move zero declaration to loopy code base to be more robust in naming the variable. 
--- pyop2/sequential.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/pyop2/sequential.py b/pyop2/sequential.py index 12040fef9..5ffd4909e 100644 --- a/pyop2/sequential.py +++ b/pyop2/sequential.py @@ -38,7 +38,6 @@ import ctypes import loopy -import numpy from pyop2.datatypes import IntType, as_ctypes from pyop2 import base @@ -60,7 +59,6 @@ from pyop2.profiling import timed_region from pyop2.utils import cached_property, get_petsc_dir from pyop2.configuration import configuration -from pyop2.codegen.rep2loopy import _PreambleGen def vectorise(wrapper, iname, batch_size): @@ -75,12 +73,6 @@ def vectorise(wrapper, iname, batch_size): # create constant zero vectors wrapper = wrapper.copy(target=loopy.CVecTarget(batch_size)) kernel = wrapper.root_kernel - zeros = loopy.TemporaryVariable("_zeros", shape=loopy.auto, dtype=numpy.float64, read_only=True, - initializer=numpy.array(0.0, dtype=numpy.float64), - address_space=loopy.AddressSpace.GLOBAL, zero_size=batch_size) - tmps = kernel.temporary_variables.copy() - tmps["_zeros"] = zeros - kernel = kernel.copy(temporary_variables=tmps) # split iname and vectorize the inner loop slabs = (1, 1) From 17ec318fe09a4c7ba5f45c7a56be3224208bd0df Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Thu, 16 Jul 2020 00:33:57 +0100 Subject: [PATCH 11/17] Added conditionals when to vectorise: Don't vectorise, if complex arguments. Check if vect strategy specified, otw dont vectorise. 
--- pyop2/sequential.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pyop2/sequential.py b/pyop2/sequential.py index 5ffd4909e..a7fd81e68 100644 --- a/pyop2/sequential.py +++ b/pyop2/sequential.py @@ -160,11 +160,16 @@ def code_to_compile(self): iname = "layer" else: iname = "n" + has_matrix = any(arg._is_mat for arg in self._args) has_rw = any(arg.access == RW for arg in self._args) - if isinstance(self._kernel.code, loopy.LoopKernel) and not (has_matrix or has_rw): + is_cplx = any(arg.dtype.name == 'complex128' for arg in self._args) + vectorisable = not (has_matrix or has_rw) and (configuration["vectorization_strategy"]) + + if (isinstance(self._kernel.code, loopy.LoopKernel) and vectorisable): wrapper = loopy.inline_callable_kernel(wrapper, self._kernel.name) - wrapper = vectorise(wrapper, iname, configuration["simd_width"]) + if not is_cplx: + wrapper = vectorise(wrapper, iname, configuration["simd_width"]) code = loopy.generate_code_v2(wrapper) if self._kernel._cpp: From ec03e1e2fbda9fe2406a9a76a492cb39a680c794 Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Thu, 16 Jul 2020 00:18:15 +0100 Subject: [PATCH 12/17] Drop omp vectorisation. 
--- pyop2/sequential.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/pyop2/sequential.py b/pyop2/sequential.py index a7fd81e68..ddb6dd054 100644 --- a/pyop2/sequential.py +++ b/pyop2/sequential.py @@ -79,12 +79,7 @@ def vectorise(wrapper, iname, batch_size): inner_iname = iname + "_batch" if configuration["vectorization_strategy"] == "ve": - # vectorize using vector extenstions kernel = loopy.split_iname(kernel, iname, batch_size, slabs=slabs, inner_tag="c_vec", inner_iname=inner_iname) - else: - # vectoriza using omp pragma simd - assert configuration["vectorization_strategy"] == "omp" - kernel = loopy.split_iname(kernel, iname, batch_size, slabs=slabs, inner_tag="omp_simd", inner_iname=inner_iname) alignment = configuration["alignment"] tmps = dict((name, tv.copy(alignment=alignment)) for name, tv in kernel.temporary_variables.items()) From f4f2bebea21e501f9c7c19f08b7bbd2e993bec23 Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Thu, 16 Jul 2020 14:18:15 +0100 Subject: [PATCH 13/17] Add -march=native everywhere. --- pyop2/compilation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyop2/compilation.py b/pyop2/compilation.py index 94d058683..71a339cde 100644 --- a/pyop2/compilation.py +++ b/pyop2/compilation.py @@ -395,7 +395,7 @@ class LinuxCompiler(Compiler): :kwarg comm: Optional communicator to compile the code on (only rank 0 compiles code) (defaults to COMM_WORLD).""" def __init__(self, cppargs=[], ldargs=[], cpp=False, comm=None): - opt_flags = ['-O3', '-ffast-math', '-fopenmp-simd'] + opt_flags = ['-march=native', '-O3', '-ffast-math', '-fopenmp-simd'] if configuration['debug']: opt_flags = ['-O0', '-g'] cc = "mpicc" @@ -421,7 +421,7 @@ class LinuxIntelCompiler(Compiler): rank 0 compiles code) (defaults to COMM_WORLD). 
""" def __init__(self, cppargs=[], ldargs=[], cpp=False, comm=None): - opt_flags = ['-Ofast', '-xHost', '-qopenmp-simd'] + opt_flags = ['-march=native', '-Ofast', '-xHost', '-qopenmp-simd'] if configuration['debug']: opt_flags = ['-O0', '-g'] cc = "mpicc" From dc8ec6d84a8c9a2268e23984c6850dd7e81b1abe Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Wed, 22 Jul 2020 12:45:21 +0100 Subject: [PATCH 14/17] Silence warnings. --- pyop2/codegen/rep2loopy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyop2/codegen/rep2loopy.py b/pyop2/codegen/rep2loopy.py index c710be131..057ea9386 100644 --- a/pyop2/codegen/rep2loopy.py +++ b/pyop2/codegen/rep2loopy.py @@ -569,7 +569,7 @@ def generate(builder, wrapper_name=None): lang_version=(2018, 2), name=wrapper_name, # TODO, should these really be silenced? - silenced_warnings=["write_race*"]) + silenced_warnings=["write_race*", "data_dep*"]) # prioritize loops for indices in context.index_ordering: From 5b49dd660cf5f3e0627f1daee7db2d22fafc2f54 Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Mon, 24 Aug 2020 11:24:46 +0200 Subject: [PATCH 15/17] Change vector tag. 
--- pyop2/sequential.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyop2/sequential.py b/pyop2/sequential.py index ddb6dd054..6b6022995 100644 --- a/pyop2/sequential.py +++ b/pyop2/sequential.py @@ -79,7 +79,7 @@ def vectorise(wrapper, iname, batch_size): inner_iname = iname + "_batch" if configuration["vectorization_strategy"] == "ve": - kernel = loopy.split_iname(kernel, iname, batch_size, slabs=slabs, inner_tag="c_vec", inner_iname=inner_iname) + kernel = loopy.split_iname(kernel, iname, batch_size, slabs=slabs, inner_tag="vec", inner_iname=inner_iname) alignment = configuration["alignment"] tmps = dict((name, tv.copy(alignment=alignment)) for name, tv in kernel.temporary_variables.items()) From e2280730e5ce69ccfa364e7a8a02618a0a18dbad Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Fri, 4 Sep 2020 14:33:10 +0200 Subject: [PATCH 16/17] Drop initialisation with batch size. --- pyop2/sequential.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyop2/sequential.py b/pyop2/sequential.py index 6b6022995..5b442ec7c 100644 --- a/pyop2/sequential.py +++ b/pyop2/sequential.py @@ -71,7 +71,7 @@ def vectorise(wrapper, iname, batch_size): return wrapper # create constant zero vectors - wrapper = wrapper.copy(target=loopy.CVecTarget(batch_size)) + wrapper = wrapper.copy(target=loopy.CVecTarget()) kernel = wrapper.root_kernel # split iname and vectorize the inner loop From 6bf9de3cbb8cd2b3e963a24f17d07b70f8c5a15a Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Fri, 4 Sep 2020 14:59:58 +0200 Subject: [PATCH 17/17] Remove useless comment. 
--- pyop2/sequential.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pyop2/sequential.py b/pyop2/sequential.py index 5b442ec7c..1408fd4f8 100644 --- a/pyop2/sequential.py +++ b/pyop2/sequential.py @@ -70,7 +70,6 @@ def vectorise(wrapper, iname, batch_size): if batch_size == 1: return wrapper - # create constant zero vectors wrapper = wrapper.copy(target=loopy.CVecTarget()) kernel = wrapper.root_kernel