From 60d89d893026028ac7ce0098b728f2004cd46486 Mon Sep 17 00:00:00 2001
From: Tianjiao Sun <tianjiao.sun14@imperial.ac.uk>
Date: Thu, 11 Apr 2019 16:36:14 +0100
Subject: [PATCH 01/17] codegen: Implement SIMD vectorisation

Only works when kernel is a Loopy kernel.
---
 pyop2/codegen/rep2loopy.py | 11 ++++--
 pyop2/configuration.py     |  2 +
 pyop2/sequential.py        | 79 ++++++++++++++++++++++++++++++++++++--
 3 files changed, 85 insertions(+), 7 deletions(-)

diff --git a/pyop2/codegen/rep2loopy.py b/pyop2/codegen/rep2loopy.py
index 263b17b3c..c710be131 100644
--- a/pyop2/codegen/rep2loopy.py
+++ b/pyop2/codegen/rep2loopy.py
@@ -203,13 +203,14 @@ def solve_fn_lookup(target, identifier):
 
 
 class _PreambleGen(ImmutableRecord):
-    fields = set(("preamble", ))
+    fields = {"preamble", "idx"}
 
-    def __init__(self, preamble):
+    def __init__(self, preamble, idx="0"):
         self.preamble = preamble
+        self.idx = idx
 
     def __call__(self, preamble_info):
-        yield ("0", self.preamble)
+        yield (self.idx, self.preamble)
 
 
 class PyOP2KernelCallable(loopy.ScalarCallable):
@@ -566,7 +567,9 @@ def generate(builder, wrapper_name=None):
                                 options=options,
                                 assumptions=assumptions,
                                 lang_version=(2018, 2),
-                                name=wrapper_name)
+                                name=wrapper_name,
+                                # TODO, should these really be silenced?
+                                silenced_warnings=["write_race*"])
 
     # prioritize loops
     for indices in context.index_ordering:
diff --git a/pyop2/configuration.py b/pyop2/configuration.py
index fe5a2c4c5..f70100ec3 100644
--- a/pyop2/configuration.py
+++ b/pyop2/configuration.py
@@ -79,6 +79,8 @@ class Configuration(dict):
     DEFAULTS = {
         "compiler": ("PYOP2_BACKEND_COMPILER", str, "gcc"),
         "simd_width": ("PYOP2_SIMD_WIDTH", int, 4),
+        "alignment": ("PYOP2_ALIGNMENT", int, 64),
+        "time": ("PYOP2_TIME", bool, False),
         "debug": ("PYOP2_DEBUG", bool, False),
         "cflags": ("PYOP2_CFLAGS", str, ""),
         "ldflags": ("PYOP2_LDFLAGS", str, ""),
diff --git a/pyop2/sequential.py b/pyop2/sequential.py
index 1dbab1c18..3a166660e 100644
--- a/pyop2/sequential.py
+++ b/pyop2/sequential.py
@@ -37,6 +37,8 @@
 from copy import deepcopy as dcopy
 
 import ctypes
+import loopy
+import numpy
 
 from pyop2.datatypes import IntType, as_ctypes
 from pyop2 import base
@@ -57,8 +59,48 @@
 from pyop2.mpi import collective
 from pyop2.profiling import timed_region
 from pyop2.utils import cached_property, get_petsc_dir
+from pyop2.configuration import configuration
+from pyop2.codegen.rep2loopy import _PreambleGen
 
-import loopy
+
+def vectorise(wrapper, iname, batch_size):
+    """Return a vectorised version of wrapper, vectorising over iname.
+
+    :arg wrapper: A loopy kernel to vectorise.
+    :arg iname: The iteration index to vectorise over.
+    :arg batch_size: The vector width."""
+    if batch_size == 1:
+        return wrapper
+
+    # create constant zero vectors
+    wrapper = wrapper.copy(target=loopy.CVecTarget())
+    kernel = wrapper.root_kernel
+    zeros = loopy.TemporaryVariable("_zeros", shape=loopy.auto, dtype=numpy.float64, read_only=True,
+                                    initializer=numpy.array(0.0, dtype=numpy.float64),
+                                    address_space=loopy.AddressSpace.GLOBAL, zero_size=batch_size)
+    tmps = kernel.temporary_variables.copy()
+    tmps["_zeros"] = zeros
+    kernel = kernel.copy(temporary_variables=tmps)
+
+    # split iname and vectorize the inner loop
+    inner_iname = iname + "_batch"
+
+    # vectorize using vector extenstions
+    kernel = loopy.split_iname(kernel, iname, batch_size, slabs=(0, 1), inner_tag="c_vec", inner_iname=inner_iname)
+
+    alignment = configuration["alignment"]
+    tmps = dict((name, tv.copy(alignment=alignment)) for name, tv in kernel.temporary_variables.items())
+    kernel = kernel.copy(temporary_variables=tmps)
+
+    wrapper = wrapper.with_root_kernel(kernel)
+
+    # vector data type
+    vec_types = [("double", 8), ("int", 4)]  # scalar type, bytes
+    preamble = ["typedef {0} {0}{1} __attribute__ ((vector_size ({2})));".format(t, batch_size, batch_size * b) for t, b in vec_types]
+    preamble = "\n" + "\n".join(preamble)
+
+    wrapper = loopy.register_preamble_generators(wrapper, [_PreambleGen(preamble, idx="01")])
+    return wrapper
 
 
 class JITModule(base.JITModule):
@@ -122,6 +164,15 @@ def code_to_compile(self):
             builder.add_argument(arg)
 
         wrapper = generate(builder)
+        if self._iterset._extruded:
+            iname = "layer"
+        else:
+            iname = "n"
+        has_matrix = any(arg._is_mat for arg in self._args)
+        has_rw = any(arg.access == RW for arg in self._args)
+        if isinstance(self._kernel.code, loopy.LoopKernel) and not (has_matrix or has_rw):
+            wrapper = loopy.inline_callable_kernel(wrapper, self._kernel.name)
+            wrapper = vectorise(wrapper, iname, configuration["simd_width"])
         code = loopy.generate_code_v2(wrapper)
 
         if self._kernel._cpp:
@@ -137,8 +188,6 @@ def compile(self):
         if not hasattr(self, '_args'):
             raise RuntimeError("JITModule has no args associated with it, should never happen")
 
-        from pyop2.configuration import configuration
-
         compiler = configuration["compiler"]
         extension = "cpp" if self._kernel._cpp else "c"
         cppargs = self._cppargs
@@ -184,6 +233,24 @@ def argtypes(self):
 
 class ParLoop(petsc_base.ParLoop):
 
+    def set_nbytes(self, args):
+        nbytes = 0
+        seen = set()
+        for arg in args:
+            if arg.access is INC:
+                nbytes += arg.data.nbytes
+            else:
+                nbytes += arg.data.nbytes
+            for map_ in arg.map_tuple:
+                if map_ is None:
+                    continue
+                for k in map_._kernel_args_:
+                    if k in seen:
+                        continue
+                    nbytes += map_.values.nbytes
+                    seen.add(k)
+        self.nbytes = nbytes
+
     def prepare_arglist(self, iterset, *args):
         arglist = iterset._kernel_args_
         for arg in args:
@@ -199,6 +266,8 @@ def prepare_arglist(self, iterset, *args):
                         continue
                     arglist += (k,)
                     seen.add(k)
+        if configuration["time"]:
+            self.set_nbytes(args)
         return arglist
 
     @cached_property
@@ -213,6 +282,10 @@ def _compute_event(self):
 
     @collective
     def _compute(self, part, fun, *arglist):
+        if configuration["time"]:
+            nbytes = self.comm.allreduce(self.nbytes)
+            if self.comm.Get_rank() == 0:
+                print("{0}_BYTES= {1}".format(self._jitmodule._wrapper_name, nbytes))
         with self._compute_event:
             self.log_flops(part.size * self.num_flops)
             fun(part.offset, part.offset + part.size, *arglist)

From 72f3b3f3c8c768a5fbc0174b2cd4e06c398bad6b Mon Sep 17 00:00:00 2001
From: tj sun <me@tjsun.info>
Date: Thu, 1 Aug 2019 17:55:28 +0100
Subject: [PATCH 02/17] add omp simd vectorization mode

---
 pyop2/configuration.py | 1 +
 pyop2/sequential.py    | 9 +++++++--
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/pyop2/configuration.py b/pyop2/configuration.py
index f70100ec3..eab30d5e5 100644
--- a/pyop2/configuration.py
+++ b/pyop2/configuration.py
@@ -79,6 +79,7 @@ class Configuration(dict):
     DEFAULTS = {
         "compiler": ("PYOP2_BACKEND_COMPILER", str, "gcc"),
         "simd_width": ("PYOP2_SIMD_WIDTH", int, 4),
+        "vectorization_strategy":("PYOP2_VECT_STRATEGY", str, "ve"),
         "alignment": ("PYOP2_ALIGNMENT", int, 64),
         "time": ("PYOP2_TIME", bool, False),
         "debug": ("PYOP2_DEBUG", bool, False),
diff --git a/pyop2/sequential.py b/pyop2/sequential.py
index 3a166660e..cc8d78d55 100644
--- a/pyop2/sequential.py
+++ b/pyop2/sequential.py
@@ -85,8 +85,13 @@ def vectorise(wrapper, iname, batch_size):
     # split iname and vectorize the inner loop
     inner_iname = iname + "_batch"
 
-    # vectorize using vector extenstions
-    kernel = loopy.split_iname(kernel, iname, batch_size, slabs=(0, 1), inner_tag="c_vec", inner_iname=inner_iname)
+    if configuration["vectorization_strategy"] == "ve":
+        # vectorize using vector extenstions
+        kernel = loopy.split_iname(kernel, iname, batch_size, slabs=(0, 1), inner_tag="c_vec", inner_iname=inner_iname)
+    else:
+        # vectoriza using omp pragma simd
+        assert configuration["vectorization_strategy"] == "omp"
+        kernel = loopy.split_iname(kernel, iname, batch_size, slabs=(0, 1), inner_tag="omp_simd", inner_iname=inner_iname)
 
     alignment = configuration["alignment"]
     tmps = dict((name, tv.copy(alignment=alignment)) for name, tv in kernel.temporary_variables.items())

From 9a3a5493428d8f881f6b9922a0eb21b711b8c264 Mon Sep 17 00:00:00 2001
From: tj sun <me@tjsun.info>
Date: Sun, 4 Aug 2019 19:46:27 +0100
Subject: [PATCH 03/17] add openmp flag and bypass workaround flag

---
 pyop2/codegen/rep2loopy.py | 9 +++++++++
 pyop2/compilation.py       | 4 ++--
 pyop2/sequential.py        | 9 ++++++---
 3 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/pyop2/codegen/rep2loopy.py b/pyop2/codegen/rep2loopy.py
index c710be131..2cbf66eb7 100644
--- a/pyop2/codegen/rep2loopy.py
+++ b/pyop2/codegen/rep2loopy.py
@@ -570,6 +570,15 @@ def generate(builder, wrapper_name=None):
                                 name=wrapper_name,
                                 # TODO, should these really be silenced?
                                 silenced_warnings=["write_race*"])
+    from pyop2.configuration import configuration
+    if configuration["time"]:
+        batch_size = configuration["simd_width"]
+        if builder.extruded:
+            start, end = parameters.layer_start, parameters.layer_end
+        else:
+            start, end = "start", "end"
+        wrapper = loopy.assume(wrapper, "{0} mod {1} = 0".format(end, batch_size))
+        wrapper = loopy.assume(wrapper, "exists zz: zz > 0 and {0} = {1}*zz + {2}".format(end, configuration["simd_width"], start))
 
     # prioritize loops
     for indices in context.index_ordering:
diff --git a/pyop2/compilation.py b/pyop2/compilation.py
index e5a9fefdd..1648be38e 100644
--- a/pyop2/compilation.py
+++ b/pyop2/compilation.py
@@ -218,7 +218,7 @@ def workaround_cflags(self):
             if version.StrictVersion("7.3") <= ver <= version.StrictVersion("7.5"):
                 # GCC bug https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90055
                 # See also https://github.com/firedrakeproject/firedrake/issues/1442
-                # And https://github.com/firedrakeproject/firedrake/issues/1717
+                return  # enable vectorization for paper
                 # Bug also on skylake with the vectoriser in this
                 # combination (disappears without
                 # -fno-tree-loop-vectorize!)
@@ -396,7 +396,7 @@ class LinuxCompiler(Compiler):
     :kwarg comm: Optional communicator to compile the code on (only
     rank 0 compiles code) (defaults to COMM_WORLD)."""
     def __init__(self, cppargs=[], ldargs=[], cpp=False, comm=None):
-        opt_flags = ['-march=native', '-O3', '-ffast-math']
+        opt_flags = ['-O3', '-ffast-math', '-fopenmp']
         if configuration['debug']:
             opt_flags = ['-O0', '-g']
         cc = "mpicc"
diff --git a/pyop2/sequential.py b/pyop2/sequential.py
index cc8d78d55..bebe7a2c0 100644
--- a/pyop2/sequential.py
+++ b/pyop2/sequential.py
@@ -83,15 +83,18 @@ def vectorise(wrapper, iname, batch_size):
     kernel = kernel.copy(temporary_variables=tmps)
 
     # split iname and vectorize the inner loop
+    slabs = (1, 1)
+    if configuration["time"]:
+        slabs = (0, 0)
     inner_iname = iname + "_batch"
 
     if configuration["vectorization_strategy"] == "ve":
         # vectorize using vector extenstions
-        kernel = loopy.split_iname(kernel, iname, batch_size, slabs=(0, 1), inner_tag="c_vec", inner_iname=inner_iname)
+        kernel = loopy.split_iname(kernel, iname, batch_size, slabs=slabs, inner_tag="c_vec", inner_iname=inner_iname)
     else:
         # vectoriza using omp pragma simd
         assert configuration["vectorization_strategy"] == "omp"
-        kernel = loopy.split_iname(kernel, iname, batch_size, slabs=(0, 1), inner_tag="omp_simd", inner_iname=inner_iname)
+        kernel = loopy.split_iname(kernel, iname, batch_size, slabs=slabs, inner_tag="omp_simd", inner_iname=inner_iname)
 
     alignment = configuration["alignment"]
     tmps = dict((name, tv.copy(alignment=alignment)) for name, tv in kernel.temporary_variables.items())
@@ -243,7 +246,7 @@ def set_nbytes(self, args):
         seen = set()
         for arg in args:
             if arg.access is INC:
-                nbytes += arg.data.nbytes
+                nbytes += arg.data.nbytes * 2
             else:
                 nbytes += arg.data.nbytes
             for map_ in arg.map_tuple:

From 3a2d5ff2ba39eb73d44fac437327e15a2d9a7e3d Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lawrence@wence.uk>
Date: Thu, 11 Apr 2019 17:09:25 +0100
Subject: [PATCH 04/17] DROP BEFORE MERGE: test with correct loopy branch

---
 requirements-git.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements-git.txt b/requirements-git.txt
index 718e27330..4790f7f1b 100644
--- a/requirements-git.txt
+++ b/requirements-git.txt
@@ -1,4 +1,4 @@
 git+https://github.com/firedrakeproject/petsc.git@firedrake#egg=petsc
 --no-deps git+https://github.com/firedrakeproject/petsc4py.git@firedrake#egg=petsc4py
 git+https://github.com/coneoproject/COFFEE.git#egg=coffee
-git+https://github.com/firedrakeproject/loopy.git@firedrake#egg=loopy
+git+https://github.com/firedrakeproject/loopy.git@cvec#egg=loopy

From beedc5fe196b9a7e15575c71dc2ee86402b299e0 Mon Sep 17 00:00:00 2001
From: Sophia Vorderwuelbecke <sv2518@ic.ac.uk>
Date: Wed, 1 Jul 2020 09:57:13 +0100
Subject: [PATCH 05/17] Turn off tree vectorize for certain gcc compilers. We
 might not need the tree vectorisation flag for our vectorisation anyway.

---
 pyop2/compilation.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pyop2/compilation.py b/pyop2/compilation.py
index 1648be38e..7e951bcca 100644
--- a/pyop2/compilation.py
+++ b/pyop2/compilation.py
@@ -218,7 +218,6 @@ def workaround_cflags(self):
             if version.StrictVersion("7.3") <= ver <= version.StrictVersion("7.5"):
                 # GCC bug https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90055
                 # See also https://github.com/firedrakeproject/firedrake/issues/1442
-                return  # enable vectorization for paper
                 # Bug also on skylake with the vectoriser in this
                 # combination (disappears without
                 # -fno-tree-loop-vectorize!)

From 567ec0cc85611ecfa812d5e0b2ac03f0c727d690 Mon Sep 17 00:00:00 2001
From: Sophia Vorderwuelbecke <sv2518@ic.ac.uk>
Date: Wed, 1 Jul 2020 10:01:26 +0100
Subject: [PATCH 06/17] Add simd compiler flags.

---
 pyop2/compilation.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pyop2/compilation.py b/pyop2/compilation.py
index 7e951bcca..94d058683 100644
--- a/pyop2/compilation.py
+++ b/pyop2/compilation.py
@@ -367,7 +367,7 @@ class MacCompiler(Compiler):
     """
 
     def __init__(self, cppargs=[], ldargs=[], cpp=False, comm=None):
-        opt_flags = ['-march=native', '-O3', '-ffast-math']
+        opt_flags = ['-march=native', '-O3', '-ffast-math', '-fopenmp-simd']
         if configuration['debug']:
             opt_flags = ['-O0', '-g']
         cc = "mpicc"
@@ -395,7 +395,7 @@ class LinuxCompiler(Compiler):
     :kwarg comm: Optional communicator to compile the code on (only
     rank 0 compiles code) (defaults to COMM_WORLD)."""
     def __init__(self, cppargs=[], ldargs=[], cpp=False, comm=None):
-        opt_flags = ['-O3', '-ffast-math', '-fopenmp']
+        opt_flags = ['-O3', '-ffast-math', '-fopenmp-simd']
         if configuration['debug']:
             opt_flags = ['-O0', '-g']
         cc = "mpicc"
@@ -421,7 +421,7 @@ class LinuxIntelCompiler(Compiler):
         rank 0 compiles code) (defaults to COMM_WORLD).
     """
     def __init__(self, cppargs=[], ldargs=[], cpp=False, comm=None):
-        opt_flags = ['-Ofast', '-xHost']
+        opt_flags = ['-Ofast', '-xHost', '-qopenmp-simd']
         if configuration['debug']:
             opt_flags = ['-O0', '-g']
         cc = "mpicc"

From 1f0ea23251f91c38c265b96538f432f5d865ee5e Mon Sep 17 00:00:00 2001
From: Sophia Vorderwuelbecke <sv2518@ic.ac.uk>
Date: Wed, 1 Jul 2020 10:02:37 +0100
Subject: [PATCH 07/17] Remove time configuration.

---
 pyop2/codegen/rep2loopy.py | 9 ---------
 pyop2/sequential.py        | 8 --------
 2 files changed, 17 deletions(-)

diff --git a/pyop2/codegen/rep2loopy.py b/pyop2/codegen/rep2loopy.py
index 2cbf66eb7..c710be131 100644
--- a/pyop2/codegen/rep2loopy.py
+++ b/pyop2/codegen/rep2loopy.py
@@ -570,15 +570,6 @@ def generate(builder, wrapper_name=None):
                                 name=wrapper_name,
                                 # TODO, should these really be silenced?
                                 silenced_warnings=["write_race*"])
-    from pyop2.configuration import configuration
-    if configuration["time"]:
-        batch_size = configuration["simd_width"]
-        if builder.extruded:
-            start, end = parameters.layer_start, parameters.layer_end
-        else:
-            start, end = "start", "end"
-        wrapper = loopy.assume(wrapper, "{0} mod {1} = 0".format(end, batch_size))
-        wrapper = loopy.assume(wrapper, "exists zz: zz > 0 and {0} = {1}*zz + {2}".format(end, configuration["simd_width"], start))
 
     # prioritize loops
     for indices in context.index_ordering:
diff --git a/pyop2/sequential.py b/pyop2/sequential.py
index bebe7a2c0..98a2c5abe 100644
--- a/pyop2/sequential.py
+++ b/pyop2/sequential.py
@@ -84,8 +84,6 @@ def vectorise(wrapper, iname, batch_size):
 
     # split iname and vectorize the inner loop
     slabs = (1, 1)
-    if configuration["time"]:
-        slabs = (0, 0)
     inner_iname = iname + "_batch"
 
     if configuration["vectorization_strategy"] == "ve":
@@ -274,8 +272,6 @@ def prepare_arglist(self, iterset, *args):
                         continue
                     arglist += (k,)
                     seen.add(k)
-        if configuration["time"]:
-            self.set_nbytes(args)
         return arglist
 
     @cached_property
@@ -290,10 +286,6 @@ def _compute_event(self):
 
     @collective
     def _compute(self, part, fun, *arglist):
-        if configuration["time"]:
-            nbytes = self.comm.allreduce(self.nbytes)
-            if self.comm.Get_rank() == 0:
-                print("{0}_BYTES= {1}".format(self._jitmodule._wrapper_name, nbytes))
         with self._compute_event:
             self.log_flops(part.size * self.num_flops)
             fun(part.offset, part.offset + part.size, *arglist)

From 5e885a7b5bd899d6790d59fe2b38021b996e641c Mon Sep 17 00:00:00 2001
From: Sophia Vorderwuelbecke <sv2518@ic.ac.uk>
Date: Wed, 1 Jul 2020 13:02:23 +0100
Subject: [PATCH 08/17] Default SIMD width.

---
 pyop2/configuration.py | 20 ++++++++++++++++++--
 requirements-ext.txt   |  1 +
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/pyop2/configuration.py b/pyop2/configuration.py
index eab30d5e5..ce666a64e 100644
--- a/pyop2/configuration.py
+++ b/pyop2/configuration.py
@@ -39,6 +39,22 @@
 from pyop2.exceptions import ConfigurationError
 
 
+def default_simd_width():
+    from cpuinfo import get_cpu_info
+    avx_to_width = {'avx': 2, 'avx1': 2, 'avx128': 2, 'avx2': 4,
+                    'avx256': 4, 'avx3': 8, 'avx512': 8}
+    longest_ext = [t for t in get_cpu_info()["flags"] if t.startswith('avx')][-1]
+    if longest_ext not in avx_to_width.keys():
+        if longest_ext[:6] not in avx_to_width.keys():
+            assert longest_ext[:4] in avx_to_width.keys(), \
+                "The vector extension of your architecture is unknown. Disable vectorisation!"
+            return avx_to_width[longest_ext[:4]]
+        else:
+            return avx_to_width[longest_ext[:6]]
+    else:
+        return avx_to_width[longest_ext]
+
+
 class Configuration(dict):
     r"""PyOP2 configuration parameters
 
@@ -78,8 +94,8 @@ class Configuration(dict):
     # name, env variable, type, default, write once
     DEFAULTS = {
         "compiler": ("PYOP2_BACKEND_COMPILER", str, "gcc"),
-        "simd_width": ("PYOP2_SIMD_WIDTH", int, 4),
-        "vectorization_strategy":("PYOP2_VECT_STRATEGY", str, "ve"),
+        "simd_width": ("PYOP2_SIMD_WIDTH", int, default_simd_width()),
+        "vectorization_strategy": ("PYOP2_VECT_STRATEGY", str, "ve"),
         "alignment": ("PYOP2_ALIGNMENT", int, 64),
         "time": ("PYOP2_TIME", bool, False),
         "debug": ("PYOP2_DEBUG", bool, False),
diff --git a/requirements-ext.txt b/requirements-ext.txt
index 758ccd963..a73f7da24 100644
--- a/requirements-ext.txt
+++ b/requirements-ext.txt
@@ -5,3 +5,4 @@ flake8>=2.1.0
 pycparser>=2.10
 mpi4py>=1.3.1
 decorator
+py-cpuinfo

From db6c2dbae83919efa22c080638e8a7e45718fd86 Mon Sep 17 00:00:00 2001
From: Sophia Vorderwuelbecke <sv2518@ic.ac.uk>
Date: Fri, 3 Jul 2020 13:08:23 +0100
Subject: [PATCH 09/17] Generate CVec Target with batch size information and
 move typedef into loopy codebase.

---
 pyop2/sequential.py | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/pyop2/sequential.py b/pyop2/sequential.py
index 98a2c5abe..12040fef9 100644
--- a/pyop2/sequential.py
+++ b/pyop2/sequential.py
@@ -73,7 +73,7 @@ def vectorise(wrapper, iname, batch_size):
         return wrapper
 
     # create constant zero vectors
-    wrapper = wrapper.copy(target=loopy.CVecTarget())
+    wrapper = wrapper.copy(target=loopy.CVecTarget(batch_size))
     kernel = wrapper.root_kernel
     zeros = loopy.TemporaryVariable("_zeros", shape=loopy.auto, dtype=numpy.float64, read_only=True,
                                     initializer=numpy.array(0.0, dtype=numpy.float64),
@@ -100,12 +100,6 @@ def vectorise(wrapper, iname, batch_size):
 
     wrapper = wrapper.with_root_kernel(kernel)
 
-    # vector data type
-    vec_types = [("double", 8), ("int", 4)]  # scalar type, bytes
-    preamble = ["typedef {0} {0}{1} __attribute__ ((vector_size ({2})));".format(t, batch_size, batch_size * b) for t, b in vec_types]
-    preamble = "\n" + "\n".join(preamble)
-
-    wrapper = loopy.register_preamble_generators(wrapper, [_PreambleGen(preamble, idx="01")])
     return wrapper
 
 

From 8c172f033bef979eb38065de5c7acb03e4df745a Mon Sep 17 00:00:00 2001
From: Sophia Vorderwuelbecke <sv2518@ic.ac.uk>
Date: Fri, 3 Jul 2020 15:59:45 +0100
Subject: [PATCH 10/17] Move zero declaration to loopy code base to be more
 robust in naming the variable.

---
 pyop2/sequential.py | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/pyop2/sequential.py b/pyop2/sequential.py
index 12040fef9..5ffd4909e 100644
--- a/pyop2/sequential.py
+++ b/pyop2/sequential.py
@@ -38,7 +38,6 @@
 
 import ctypes
 import loopy
-import numpy
 
 from pyop2.datatypes import IntType, as_ctypes
 from pyop2 import base
@@ -60,7 +59,6 @@
 from pyop2.profiling import timed_region
 from pyop2.utils import cached_property, get_petsc_dir
 from pyop2.configuration import configuration
-from pyop2.codegen.rep2loopy import _PreambleGen
 
 
 def vectorise(wrapper, iname, batch_size):
@@ -75,12 +73,6 @@ def vectorise(wrapper, iname, batch_size):
     # create constant zero vectors
     wrapper = wrapper.copy(target=loopy.CVecTarget(batch_size))
     kernel = wrapper.root_kernel
-    zeros = loopy.TemporaryVariable("_zeros", shape=loopy.auto, dtype=numpy.float64, read_only=True,
-                                    initializer=numpy.array(0.0, dtype=numpy.float64),
-                                    address_space=loopy.AddressSpace.GLOBAL, zero_size=batch_size)
-    tmps = kernel.temporary_variables.copy()
-    tmps["_zeros"] = zeros
-    kernel = kernel.copy(temporary_variables=tmps)
 
     # split iname and vectorize the inner loop
     slabs = (1, 1)

From 17ec318fe09a4c7ba5f45c7a56be3224208bd0df Mon Sep 17 00:00:00 2001
From: Sophia Vorderwuelbecke <sv2518@ic.ac.uk>
Date: Thu, 16 Jul 2020 00:33:57 +0100
Subject: [PATCH 11/17] Add conditions for when to vectorise: don't vectorise
 if there are complex arguments or no vectorisation strategy is specified.

---
 pyop2/sequential.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/pyop2/sequential.py b/pyop2/sequential.py
index 5ffd4909e..a7fd81e68 100644
--- a/pyop2/sequential.py
+++ b/pyop2/sequential.py
@@ -160,11 +160,16 @@ def code_to_compile(self):
             iname = "layer"
         else:
             iname = "n"
+
         has_matrix = any(arg._is_mat for arg in self._args)
         has_rw = any(arg.access == RW for arg in self._args)
-        if isinstance(self._kernel.code, loopy.LoopKernel) and not (has_matrix or has_rw):
+        is_cplx = any(arg.dtype.name == 'complex128' for arg in self._args)
+        vectorisable = not (has_matrix or has_rw) and (configuration["vectorization_strategy"])
+
+        if (isinstance(self._kernel.code, loopy.LoopKernel) and vectorisable):
             wrapper = loopy.inline_callable_kernel(wrapper, self._kernel.name)
-            wrapper = vectorise(wrapper, iname, configuration["simd_width"])
+            if not is_cplx:
+                wrapper = vectorise(wrapper, iname, configuration["simd_width"])
         code = loopy.generate_code_v2(wrapper)
 
         if self._kernel._cpp:

From ec03e1e2fbda9fe2406a9a76a492cb39a680c794 Mon Sep 17 00:00:00 2001
From: Sophia Vorderwuelbecke <sv2518@ic.ac.uk>
Date: Thu, 16 Jul 2020 00:18:15 +0100
Subject: [PATCH 12/17] Drop omp vectorisation.

---
 pyop2/sequential.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/pyop2/sequential.py b/pyop2/sequential.py
index a7fd81e68..ddb6dd054 100644
--- a/pyop2/sequential.py
+++ b/pyop2/sequential.py
@@ -79,12 +79,7 @@ def vectorise(wrapper, iname, batch_size):
     inner_iname = iname + "_batch"
 
     if configuration["vectorization_strategy"] == "ve":
-        # vectorize using vector extenstions
         kernel = loopy.split_iname(kernel, iname, batch_size, slabs=slabs, inner_tag="c_vec", inner_iname=inner_iname)
-    else:
-        # vectoriza using omp pragma simd
-        assert configuration["vectorization_strategy"] == "omp"
-        kernel = loopy.split_iname(kernel, iname, batch_size, slabs=slabs, inner_tag="omp_simd", inner_iname=inner_iname)
 
     alignment = configuration["alignment"]
     tmps = dict((name, tv.copy(alignment=alignment)) for name, tv in kernel.temporary_variables.items())

From f4f2bebea21e501f9c7c19f08b7bbd2e993bec23 Mon Sep 17 00:00:00 2001
From: Sophia Vorderwuelbecke <sv2518@ic.ac.uk>
Date: Thu, 16 Jul 2020 14:18:15 +0100
Subject: [PATCH 13/17] Add -march=native everywhere.

---
 pyop2/compilation.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyop2/compilation.py b/pyop2/compilation.py
index 94d058683..71a339cde 100644
--- a/pyop2/compilation.py
+++ b/pyop2/compilation.py
@@ -395,7 +395,7 @@ class LinuxCompiler(Compiler):
     :kwarg comm: Optional communicator to compile the code on (only
     rank 0 compiles code) (defaults to COMM_WORLD)."""
     def __init__(self, cppargs=[], ldargs=[], cpp=False, comm=None):
-        opt_flags = ['-O3', '-ffast-math', '-fopenmp-simd']
+        opt_flags = ['-march=native', '-O3', '-ffast-math', '-fopenmp-simd']
         if configuration['debug']:
             opt_flags = ['-O0', '-g']
         cc = "mpicc"
@@ -421,7 +421,7 @@ class LinuxIntelCompiler(Compiler):
         rank 0 compiles code) (defaults to COMM_WORLD).
     """
     def __init__(self, cppargs=[], ldargs=[], cpp=False, comm=None):
-        opt_flags = ['-Ofast', '-xHost', '-qopenmp-simd']
+        opt_flags = ['-march=native', '-Ofast', '-xHost', '-qopenmp-simd']
         if configuration['debug']:
             opt_flags = ['-O0', '-g']
         cc = "mpicc"

From dc8ec6d84a8c9a2268e23984c6850dd7e81b1abe Mon Sep 17 00:00:00 2001
From: Sophia Vorderwuelbecke <sv2518@ic.ac.uk>
Date: Wed, 22 Jul 2020 12:45:21 +0100
Subject: [PATCH 14/17] Silence warnings.

---
 pyop2/codegen/rep2loopy.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyop2/codegen/rep2loopy.py b/pyop2/codegen/rep2loopy.py
index c710be131..057ea9386 100644
--- a/pyop2/codegen/rep2loopy.py
+++ b/pyop2/codegen/rep2loopy.py
@@ -569,7 +569,7 @@ def generate(builder, wrapper_name=None):
                                 lang_version=(2018, 2),
                                 name=wrapper_name,
                                 # TODO, should these really be silenced?
-                                silenced_warnings=["write_race*"])
+                                silenced_warnings=["write_race*", "data_dep*"])
 
     # prioritize loops
     for indices in context.index_ordering:

From 5b49dd660cf5f3e0627f1daee7db2d22fafc2f54 Mon Sep 17 00:00:00 2001
From: Sophia Vorderwuelbecke <sv2518@ic.ac.uk>
Date: Mon, 24 Aug 2020 11:24:46 +0200
Subject: [PATCH 15/17] Change vector tag.

---
 pyop2/sequential.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyop2/sequential.py b/pyop2/sequential.py
index ddb6dd054..6b6022995 100644
--- a/pyop2/sequential.py
+++ b/pyop2/sequential.py
@@ -79,7 +79,7 @@ def vectorise(wrapper, iname, batch_size):
     inner_iname = iname + "_batch"
 
     if configuration["vectorization_strategy"] == "ve":
-        kernel = loopy.split_iname(kernel, iname, batch_size, slabs=slabs, inner_tag="c_vec", inner_iname=inner_iname)
+        kernel = loopy.split_iname(kernel, iname, batch_size, slabs=slabs, inner_tag="vec", inner_iname=inner_iname)
 
     alignment = configuration["alignment"]
     tmps = dict((name, tv.copy(alignment=alignment)) for name, tv in kernel.temporary_variables.items())

From e2280730e5ce69ccfa364e7a8a02618a0a18dbad Mon Sep 17 00:00:00 2001
From: Sophia Vorderwuelbecke <sv2518@ic.ac.uk>
Date: Fri, 4 Sep 2020 14:33:10 +0200
Subject: [PATCH 16/17] Drop initialisation with batch size.

---
 pyop2/sequential.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyop2/sequential.py b/pyop2/sequential.py
index 6b6022995..5b442ec7c 100644
--- a/pyop2/sequential.py
+++ b/pyop2/sequential.py
@@ -71,7 +71,7 @@ def vectorise(wrapper, iname, batch_size):
         return wrapper
 
     # create constant zero vectors
-    wrapper = wrapper.copy(target=loopy.CVecTarget(batch_size))
+    wrapper = wrapper.copy(target=loopy.CVecTarget())
     kernel = wrapper.root_kernel
 
     # split iname and vectorize the inner loop

From 6bf9de3cbb8cd2b3e963a24f17d07b70f8c5a15a Mon Sep 17 00:00:00 2001
From: Sophia Vorderwuelbecke <sv2518@ic.ac.uk>
Date: Fri, 4 Sep 2020 14:59:58 +0200
Subject: [PATCH 17/17] Remove useless comment.

---
 pyop2/sequential.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pyop2/sequential.py b/pyop2/sequential.py
index 5b442ec7c..1408fd4f8 100644
--- a/pyop2/sequential.py
+++ b/pyop2/sequential.py
@@ -70,7 +70,6 @@ def vectorise(wrapper, iname, batch_size):
     if batch_size == 1:
         return wrapper
 
-    # create constant zero vectors
     wrapper = wrapper.copy(target=loopy.CVecTarget())
     kernel = wrapper.root_kernel