From fa419fd4d03f4984f4169fd13646baa259962803 Mon Sep 17 00:00:00 2001
From: Paul <baconpaul@users.noreply.github.com>
Date: Fri, 1 Nov 2024 11:16:48 -0400
Subject: [PATCH] Move to new SIMD methodology (#61)

Move to a new SIMD methodology which allows arm64ec to use neon instructions while keeping our code mostly intact.
---
 .github/workflows/build-pr.yml                |   39 +-
 CMakeLists.txt                                |   37 +-
 cmake/CPM.cmake                               | 1281 ++++++++++++++++-
 .../filter_plot_tool/filter_plot_tool.cpp     |   14 +
 .../FilterPlotComponent.cpp                   |   14 +
 .../FilterPlotComponent.h                     |   14 +
 .../filters_example_plugin/FiltersPlugin.cpp  |   61 +-
 .../filters_example_plugin/FiltersPlugin.h    |   14 +
 .../FiltersPluginEditor.cpp                   |   14 +
 .../FiltersPluginEditor.h                     |   14 +
 include-extras/sst/filters/FilterPlotter.h    |   96 +-
 include/sst/filters/CutoffWarp.h              |   77 +-
 include/sst/filters/CytomicSVF.h              |  147 +-
 include/sst/filters/DiodeLadder.h             |  102 +-
 .../sst/filters/FilterCoefficientMaker_Impl.h |    8 +-
 include/sst/filters/HalfRateFilter.h          |  311 ++--
 include/sst/filters/K35Filter.h               |   50 +-
 include/sst/filters/OBXDFilter.h              |  231 +--
 include/sst/filters/QuadFilterUnit.h          |    8 +-
 include/sst/filters/QuadFilterUnit_Impl.h     |  790 +++++-----
 include/sst/filters/ResonanceWarp.h           |   18 +-
 include/sst/filters/TriPoleFilter.h           |  111 +-
 include/sst/filters/VintageLadders.h          |   75 +-
 include/sst/utilities/globals.h               |   23 +-
 include/sst/utilities/shared.h                |   18 +-
 scripts/fix_file_comments.pl                  |    4 +-
 tests/BasicFiltersTest.cpp                    |   14 +
 tests/BiquadTest.cpp                          |   14 +
 tests/CMakeLists.txt                          |    1 +
 tests/CutoffWarpTest.cpp                      |   14 +
 tests/CytomicSVFTests.cpp                     |   17 +-
 tests/DiodeLadderTest.cpp                     |   14 +
 tests/HalfRateTest.cpp                        |   15 +-
 tests/K35FilterTest.cpp                       |   14 +
 tests/OBXDFilterTest.cpp                      |   14 +
 tests/ResonanceWarpTest.cpp                   |   14 +
 tests/TestUtils.h                             |   21 +-
 tests/TriPoleFilterTest.cpp                   |   14 +
 tests/VintageLaddersTest.cpp                  |   14 +
 tests/tests.cpp                               |   14 +
 40 files changed, 2691 insertions(+), 1074 deletions(-)

diff --git a/.github/workflows/build-pr.yml b/.github/workflows/build-pr.yml
index 17b87fb..a76d0ea 100644
--- a/.github/workflows/build-pr.yml
+++ b/.github/workflows/build-pr.yml
@@ -7,21 +7,47 @@ on:
 
 jobs:
   build_tests:
-    name: Test ${{ matrix.os }}
+    name: Test ${{ matrix.name }}
     runs-on: ${{ matrix.os }}
     strategy:
+      fail-fast: false
       matrix:
-        os: [ ubuntu-latest, macos-latest, windows-latest ]
         include:
           - os: ubuntu-latest
             name: linux
+            runTest: true
             testExe: build/tests/sst-filters-tests
+
           - os: macos-latest
-            name: mac
+            name: mac-x86
+            runTest: true
             testExe: build/tests/sst-filters-tests
+            cmakeArgs: -DCMAKE_OSX_ARCHITECTURES=x86_64
+
+          - os: macos-latest
+            name: mac-arm
+            cmakeArgs: -DCMAKE_OSX_ARCHITECTURES=arm64
+
+          - os: macos-latest
+            name: mac-arm-nonative
+            cmakeArgs: -DCMAKE_OSX_ARCHITECTURES=arm64 -DSST_BASIC_BLOCKS_SIMD_OMIT_NATIVE_ALIASES=TRUE
+
+          - os: windows-latest
+            name: win-x86
+            runTest: true
+            testExe:  build/tests/Release/sst-filters-tests.exe
+
+          - os: windows-latest
+            name: win-arm64
+            cmakeArgs: -G"Visual Studio 17 2022" -A arm64 -DCMAKE_SYSTEM_VERSION=10
+
+          - os: windows-latest
+            name: win-arm64ec
+            cmakeArgs: -G"Visual Studio 17 2022" -A arm64ec -DCMAKE_SYSTEM_VERSION=10
+
           - os: windows-latest
-            name: win
-            testExe: build/tests/Release/sst-filters-tests.exe
+            name: win-arm64-non-native
+            cmakeArgs: -G"Visual Studio 17 2022" -A arm64 -DCMAKE_SYSTEM_VERSION=10 -DSST_BASIC_BLOCKS_SIMD_OMIT_NATIVE_ALIASES=TRUE
 
     steps:
 
@@ -32,10 +58,11 @@ jobs:
 
       - name: Build Smoke test
         run: |
-          cmake -S . -B ./build -DCMAKE_BUILD_TYPE=Release -DSST_FILTERS_BUILD_TESTS=TRUE -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64"
+          cmake -S . -B ./build -DCMAKE_BUILD_TYPE=Release -DSST_FILTERS_BUILD_TESTS=TRUE ${{ matrix.cmakeArgs }}
           cmake --build ./build --config Release
 
       - name: Run Smoke Test
+        if: ${{ matrix.runTest }}  # skip on build-only matrix entries, which set neither runTest nor testExe
         run: |
           ls ${{ matrix.testExe }}
           ${{ matrix.testExe }}
diff --git a/CMakeLists.txt b/CMakeLists.txt
index fedffd3..24fb7a5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,12 +6,12 @@ set(CMAKE_CXX_STANDARD 17)
 add_library(${PROJECT_NAME} INTERFACE)
 target_include_directories(${PROJECT_NAME} INTERFACE include)
 
-if(MSVC)
+if (MSVC)
     target_compile_definitions(${PROJECT_NAME}
-        INTERFACE
+            INTERFACE
             _USE_MATH_DEFINES=1 # So that we can have M_PI on MSVC
     )
-endif()
+endif ()
 
 add_library(${PROJECT_NAME}-extras INTERFACE)
 target_include_directories(${PROJECT_NAME}-extras INTERFACE include-extras)
@@ -30,42 +30,43 @@ option(SST_FILTERS_BUILD_EXAMPLES "Add targets for building and running sst-filt
 if (SST_FILTERS_BUILD_TESTS OR SST_FILTERS_BUILD_EXAMPLES)
     message(STATUS "Importing SIMDE with CPM")
 
-    if (NOT TARGET sst-basic-blocks)
-        CPMAddPackage(NAME sst-basic-blocks
-                GITHUB_REPOSITORY surge-synthesizer/sst-basic-blocks
-                GIT_TAG main
-                )
-    endif()
-
     if (NOT TARGET simde)
         CPMAddPackage(NAME simde
                 GITHUB_REPOSITORY simd-everywhere/simde
                 VERSION 0.7.2
-                )
+        )
         add_library(simde INTERFACE)
         target_include_directories(simde INTERFACE ${simde_SOURCE_DIR})
-    endif()
+    endif ()
+
+    if (NOT TARGET sst-basic-blocks)
+        CPMAddPackage(NAME sst-basic-blocks
+                GITHUB_REPOSITORY surge-synthesizer/sst-basic-blocks
+                GIT_TAG main
+        )
+    endif ()
 endif ()
 
 if (SST_GET_BASIC_BLOCKS)
     CPMAddPackage(NAME sst-basic-blocks
             GITHUB_REPOSITORY surge-synthesizer/sst-basic-blocks
             GIT_TAG main
-            )
-endif()
+    )
+endif ()
 
 if (NOT TARGET sst-basic-blocks)
     message(FATAL_ERROR "sst-basic-blocks is not available in this context. Set SST_GET_BASIC_BLOCKS=1 or add it")
-else()
+else ()
     target_link_libraries(${PROJECT_NAME} INTERFACE sst-basic-blocks)
-endif()
+endif ()
 
 if (SST_FILTERS_BUILD_TESTS)
+    message(STATUS "Adding test targets")
     add_subdirectory(tests)
 endif ()
 
-if(SST_FILTERS_BUILD_EXAMPLES)
+if (SST_FILTERS_BUILD_EXAMPLES)
     add_subdirectory(examples)
-endif()
+endif ()
 
 
diff --git a/cmake/CPM.cmake b/cmake/CPM.cmake
index eaa397b..8269a8b 100644
--- a/cmake/CPM.cmake
+++ b/cmake/CPM.cmake
@@ -1,32 +1,1269 @@
-set(CPM_DOWNLOAD_VERSION 0.36.0)
+# CPM.cmake - CMake's missing package manager
+# ===========================================
+# See https://github.com/cpm-cmake/CPM.cmake for usage and update instructions.
+#
+# MIT License
+# -----------
+#[[
+  Copyright (c) 2019-2023 Lars Melchior and contributors
 
-if(CPM_SOURCE_CACHE)
-  set(CPM_DOWNLOAD_LOCATION "${CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
-elseif(DEFINED ENV{CPM_SOURCE_CACHE})
-  set(CPM_DOWNLOAD_LOCATION "$ENV{CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
+  Permission is hereby granted, free of charge, to any person obtaining a copy
+  of this software and associated documentation files (the "Software"), to deal
+  in the Software without restriction, including without limitation the rights
+  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+  copies of the Software, and to permit persons to whom the Software is
+  furnished to do so, subject to the following conditions:
+
+  The above copyright notice and this permission notice shall be included in all
+  copies or substantial portions of the Software.
+
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+  SOFTWARE.
+]]
+
+cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
+
+# Initialize logging prefix
+if(NOT CPM_INDENT)
+  set(CPM_INDENT
+      "CPM:"
+      CACHE INTERNAL ""
+  )
+endif()
+
+if(NOT COMMAND cpm_message)
+  function(cpm_message)
+    message(${ARGV})
+  endfunction()
+endif()
+
+set(CURRENT_CPM_VERSION 0.40.2)
+
+get_filename_component(CPM_CURRENT_DIRECTORY "${CMAKE_CURRENT_LIST_DIR}" REALPATH)
+if(CPM_DIRECTORY)
+  if(NOT CPM_DIRECTORY STREQUAL CPM_CURRENT_DIRECTORY)
+    if(CPM_VERSION VERSION_LESS CURRENT_CPM_VERSION)
+      message(
+        AUTHOR_WARNING
+          "${CPM_INDENT} \
+A dependency is using a more recent CPM version (${CURRENT_CPM_VERSION}) than the current project (${CPM_VERSION}). \
+It is recommended to upgrade CPM to the most recent version. \
+See https://github.com/cpm-cmake/CPM.cmake for more information."
+      )
+    endif()
+    if(${CMAKE_VERSION} VERSION_LESS "3.17.0")
+      include(FetchContent)
+    endif()
+    return()
+  endif()
+
+  get_property(
+    CPM_INITIALIZED GLOBAL ""
+    PROPERTY CPM_INITIALIZED
+    SET
+  )
+  if(CPM_INITIALIZED)
+    return()
+  endif()
+endif()
+
+if(CURRENT_CPM_VERSION MATCHES "development-version")
+  message(
+    WARNING "${CPM_INDENT} Your project is using an unstable development version of CPM.cmake. \
+Please update to a recent release if possible. \
+See https://github.com/cpm-cmake/CPM.cmake for details."
+  )
+endif()
+
+set_property(GLOBAL PROPERTY CPM_INITIALIZED true)
+
+macro(cpm_set_policies)
+  # the policy allows us to change options without caching
+  cmake_policy(SET CMP0077 NEW)
+  set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
+
+  # the policy allows us to change set(CACHE) without caching
+  if(POLICY CMP0126)
+    cmake_policy(SET CMP0126 NEW)
+    set(CMAKE_POLICY_DEFAULT_CMP0126 NEW)
+  endif()
+
+  # The policy uses the download time for timestamp, instead of the timestamp in the archive. This
+  # allows for proper rebuilds when a projects url changes
+  if(POLICY CMP0135)
+    cmake_policy(SET CMP0135 NEW)
+    set(CMAKE_POLICY_DEFAULT_CMP0135 NEW)
+  endif()
+
+  # treat relative git repository paths as being relative to the parent project's remote
+  if(POLICY CMP0150)
+    cmake_policy(SET CMP0150 NEW)
+    set(CMAKE_POLICY_DEFAULT_CMP0150 NEW)
+  endif()
+endmacro()
+cpm_set_policies()
+
+option(CPM_USE_LOCAL_PACKAGES "Always try to use `find_package` to get dependencies"
+       $ENV{CPM_USE_LOCAL_PACKAGES}
+)
+option(CPM_LOCAL_PACKAGES_ONLY "Only use `find_package` to get dependencies"
+       $ENV{CPM_LOCAL_PACKAGES_ONLY}
+)
+option(CPM_DOWNLOAD_ALL "Always download dependencies from source" $ENV{CPM_DOWNLOAD_ALL})
+option(CPM_DONT_UPDATE_MODULE_PATH "Don't update the module path to allow using find_package"
+       $ENV{CPM_DONT_UPDATE_MODULE_PATH}
+)
+option(CPM_DONT_CREATE_PACKAGE_LOCK "Don't create a package lock file in the binary path"
+       $ENV{CPM_DONT_CREATE_PACKAGE_LOCK}
+)
+option(CPM_INCLUDE_ALL_IN_PACKAGE_LOCK
+       "Add all packages added through CPM.cmake to the package lock"
+       $ENV{CPM_INCLUDE_ALL_IN_PACKAGE_LOCK}
+)
+option(CPM_USE_NAMED_CACHE_DIRECTORIES
+       "Use additional directory of package name in cache on the most nested level."
+       $ENV{CPM_USE_NAMED_CACHE_DIRECTORIES}
+)
+
+set(CPM_VERSION
+    ${CURRENT_CPM_VERSION}
+    CACHE INTERNAL ""
+)
+set(CPM_DIRECTORY
+    ${CPM_CURRENT_DIRECTORY}
+    CACHE INTERNAL ""
+)
+set(CPM_FILE
+    ${CMAKE_CURRENT_LIST_FILE}
+    CACHE INTERNAL ""
+)
+set(CPM_PACKAGES
+    ""
+    CACHE INTERNAL ""
+)
+set(CPM_DRY_RUN
+    OFF
+    CACHE INTERNAL "Don't download or configure dependencies (for testing)"
+)
+
+if(DEFINED ENV{CPM_SOURCE_CACHE})
+  set(CPM_SOURCE_CACHE_DEFAULT $ENV{CPM_SOURCE_CACHE})
 else()
-  set(CPM_DOWNLOAD_LOCATION "${CMAKE_BINARY_DIR}/cmake/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
+  set(CPM_SOURCE_CACHE_DEFAULT OFF)
 endif()
 
-# Expand relative path. This is important if the provided path contains a tilde (~)
-get_filename_component(CPM_DOWNLOAD_LOCATION ${CPM_DOWNLOAD_LOCATION} ABSOLUTE)
+set(CPM_SOURCE_CACHE
+    ${CPM_SOURCE_CACHE_DEFAULT}
+    CACHE PATH "Directory to download CPM dependencies"
+)
+
+if(NOT CPM_DONT_UPDATE_MODULE_PATH)
+  set(CPM_MODULE_PATH
+      "${CMAKE_BINARY_DIR}/CPM_modules"
+      CACHE INTERNAL ""
+  )
+  # remove old modules
+  file(REMOVE_RECURSE ${CPM_MODULE_PATH})
+  file(MAKE_DIRECTORY ${CPM_MODULE_PATH})
+  # locally added CPM modules should override global packages
+  set(CMAKE_MODULE_PATH "${CPM_MODULE_PATH};${CMAKE_MODULE_PATH}")
+endif()
 
-function(download_cpm)
-  message(STATUS "Downloading CPM.cmake to ${CPM_DOWNLOAD_LOCATION}")
-  file(DOWNLOAD
-       https://github.com/cpm-cmake/CPM.cmake/releases/download/v${CPM_DOWNLOAD_VERSION}/CPM.cmake
-       ${CPM_DOWNLOAD_LOCATION}
+if(NOT CPM_DONT_CREATE_PACKAGE_LOCK)
+  set(CPM_PACKAGE_LOCK_FILE
+      "${CMAKE_BINARY_DIR}/cpm-package-lock.cmake"
+      CACHE INTERNAL ""
   )
+  file(WRITE ${CPM_PACKAGE_LOCK_FILE}
+       "# CPM Package Lock\n# This file should be committed to version control\n\n"
+  )
+endif()
+
+include(FetchContent)
+
+# Try to infer package name from git repository uri (path or url)
+function(cpm_package_name_from_git_uri URI RESULT)
+  if("${URI}" MATCHES "([^/:]+)/?.git/?$")
+    set(${RESULT}
+        ${CMAKE_MATCH_1}
+        PARENT_SCOPE
+    )
+  else()
+    unset(${RESULT} PARENT_SCOPE)
+  endif()
 endfunction()
 
-if(NOT (EXISTS ${CPM_DOWNLOAD_LOCATION}))
-  download_cpm()
-else()
-  # resume download if it previously failed
-  file(READ ${CPM_DOWNLOAD_LOCATION} check)
-  if("${check}" STREQUAL "")
-    download_cpm()
+# Try to infer package name and version from a url
+function(cpm_package_name_and_ver_from_url url outName outVer)
+  if(url MATCHES "[/\\?]([a-zA-Z0-9_\\.-]+)\\.(tar|tar\\.gz|tar\\.bz2|zip|ZIP)(\\?|/|$)")
+    # We matched an archive
+    set(filename "${CMAKE_MATCH_1}")
+
+    if(filename MATCHES "([a-zA-Z0-9_\\.-]+)[_-]v?(([0-9]+\\.)*[0-9]+[a-zA-Z0-9]*)")
+      # We matched <name>-<version> (ie foo-1.2.3)
+      set(${outName}
+          "${CMAKE_MATCH_1}"
+          PARENT_SCOPE
+      )
+      set(${outVer}
+          "${CMAKE_MATCH_2}"
+          PARENT_SCOPE
+      )
+    elseif(filename MATCHES "(([0-9]+\\.)+[0-9]+[a-zA-Z0-9]*)")
+      # We couldn't find a name, but we found a version
+      #
+      # In many cases (which we don't handle here) the url would look something like
+      # `irrelevant/ACTUAL_PACKAGE_NAME/irrelevant/1.2.3.zip`. In such a case we can't possibly
+      # distinguish the package name from the irrelevant bits. Moreover if we try to match the
+      # package name from the filename, we'd get bogus at best.
+      unset(${outName} PARENT_SCOPE)
+      set(${outVer}
+          "${CMAKE_MATCH_1}"
+          PARENT_SCOPE
+      )
+    else()
+      # Boldly assume that the file name is the package name.
+      #
+      # Yes, something like `irrelevant/ACTUAL_NAME/irrelevant/download.zip` will ruin our day, but
+      # such cases should be quite rare. No popular service does this... we think.
+      set(${outName}
+          "${filename}"
+          PARENT_SCOPE
+      )
+      unset(${outVer} PARENT_SCOPE)
+    endif()
+  else()
+    # No ideas yet what to do with non-archives
+    unset(${outName} PARENT_SCOPE)
+    unset(${outVer} PARENT_SCOPE)
+  endif()
+endfunction()
+
+function(cpm_find_package NAME VERSION)
+  string(REPLACE " " ";" EXTRA_ARGS "${ARGN}")
+  find_package(${NAME} ${VERSION} ${EXTRA_ARGS} QUIET)
+  if(${CPM_ARGS_NAME}_FOUND)
+    if(DEFINED ${CPM_ARGS_NAME}_VERSION)
+      set(VERSION ${${CPM_ARGS_NAME}_VERSION})
+    endif()
+    cpm_message(STATUS "${CPM_INDENT} Using local package ${CPM_ARGS_NAME}@${VERSION}")
+    CPMRegisterPackage(${CPM_ARGS_NAME} "${VERSION}")
+    set(CPM_PACKAGE_FOUND
+        YES
+        PARENT_SCOPE
+    )
+  else()
+    set(CPM_PACKAGE_FOUND
+        NO
+        PARENT_SCOPE
+    )
+  endif()
+endfunction()
+
+# Create a custom FindXXX.cmake module for a CPM package This prevents `find_package(NAME)` from
+# finding the system library
+function(cpm_create_module_file Name)
+  if(NOT CPM_DONT_UPDATE_MODULE_PATH)
+    # erase any previous modules
+    file(WRITE ${CPM_MODULE_PATH}/Find${Name}.cmake
+         "include(\"${CPM_FILE}\")\n${ARGN}\nset(${Name}_FOUND TRUE)"
+    )
+  endif()
+endfunction()
+
+# Find a package locally or fallback to CPMAddPackage
+function(CPMFindPackage)
+  set(oneValueArgs NAME VERSION GIT_TAG FIND_PACKAGE_ARGUMENTS)
+
+  cmake_parse_arguments(CPM_ARGS "" "${oneValueArgs}" "" ${ARGN})
+
+  if(NOT DEFINED CPM_ARGS_VERSION)
+    if(DEFINED CPM_ARGS_GIT_TAG)
+      cpm_get_version_from_git_tag("${CPM_ARGS_GIT_TAG}" CPM_ARGS_VERSION)
+    endif()
+  endif()
+
+  set(downloadPackage ${CPM_DOWNLOAD_ALL})
+  if(DEFINED CPM_DOWNLOAD_${CPM_ARGS_NAME})
+    set(downloadPackage ${CPM_DOWNLOAD_${CPM_ARGS_NAME}})
+  elseif(DEFINED ENV{CPM_DOWNLOAD_${CPM_ARGS_NAME}})
+    set(downloadPackage $ENV{CPM_DOWNLOAD_${CPM_ARGS_NAME}})
+  endif()
+  if(downloadPackage)
+    CPMAddPackage(${ARGN})
+    cpm_export_variables(${CPM_ARGS_NAME})
+    return()
+  endif()
+
+  cpm_find_package(${CPM_ARGS_NAME} "${CPM_ARGS_VERSION}" ${CPM_ARGS_FIND_PACKAGE_ARGUMENTS})
+
+  if(NOT CPM_PACKAGE_FOUND)
+    CPMAddPackage(${ARGN})
+    cpm_export_variables(${CPM_ARGS_NAME})
+  endif()
+
+endfunction()
+
+# checks if a package has been added before
+function(cpm_check_if_package_already_added CPM_ARGS_NAME CPM_ARGS_VERSION)
+  if("${CPM_ARGS_NAME}" IN_LIST CPM_PACKAGES)
+    CPMGetPackageVersion(${CPM_ARGS_NAME} CPM_PACKAGE_VERSION)
+    if("${CPM_PACKAGE_VERSION}" VERSION_LESS "${CPM_ARGS_VERSION}")
+      message(
+        WARNING
+          "${CPM_INDENT} Requires a newer version of ${CPM_ARGS_NAME} (${CPM_ARGS_VERSION}) than currently included (${CPM_PACKAGE_VERSION})."
+      )
+    endif()
+    cpm_get_fetch_properties(${CPM_ARGS_NAME})
+    set(${CPM_ARGS_NAME}_ADDED NO)
+    set(CPM_PACKAGE_ALREADY_ADDED
+        YES
+        PARENT_SCOPE
+    )
+    cpm_export_variables(${CPM_ARGS_NAME})
+  else()
+    set(CPM_PACKAGE_ALREADY_ADDED
+        NO
+        PARENT_SCOPE
+    )
+  endif()
+endfunction()
+
+# Parse the argument of CPMAddPackage in case a single one was provided and convert it to a list of
+# arguments which can then be parsed idiomatically. For example gh:foo/bar@1.2.3 will be converted
+# to: GITHUB_REPOSITORY;foo/bar;VERSION;1.2.3
+function(cpm_parse_add_package_single_arg arg outArgs)
+  # Look for a scheme
+  if("${arg}" MATCHES "^([a-zA-Z]+):(.+)$")
+    string(TOLOWER "${CMAKE_MATCH_1}" scheme)
+    set(uri "${CMAKE_MATCH_2}")
+
+    # Check for CPM-specific schemes
+    if(scheme STREQUAL "gh")
+      set(out "GITHUB_REPOSITORY;${uri}")
+      set(packageType "git")
+    elseif(scheme STREQUAL "gl")
+      set(out "GITLAB_REPOSITORY;${uri}")
+      set(packageType "git")
+    elseif(scheme STREQUAL "bb")
+      set(out "BITBUCKET_REPOSITORY;${uri}")
+      set(packageType "git")
+      # A CPM-specific scheme was not found. Looks like this is a generic URL so try to determine
+      # type
+    elseif(arg MATCHES ".git/?(@|#|$)")
+      set(out "GIT_REPOSITORY;${arg}")
+      set(packageType "git")
+    else()
+      # Fall back to a URL
+      set(out "URL;${arg}")
+      set(packageType "archive")
+
+      # We could also check for SVN since FetchContent supports it, but SVN is so rare these days.
+      # We just won't bother with the additional complexity it will induce in this function. SVN is
+      # done by multi-arg
+    endif()
+  else()
+    if(arg MATCHES ".git/?(@|#|$)")
+      set(out "GIT_REPOSITORY;${arg}")
+      set(packageType "git")
+    else()
+      # Give up
+      message(FATAL_ERROR "${CPM_INDENT} Can't determine package type of '${arg}'")
+    endif()
+  endif()
+
+  # For all packages we interpret @... as version. Only replace the last occurrence. Thus URIs
+  # containing '@' can be used
+  string(REGEX REPLACE "@([^@]+)$" ";VERSION;\\1" out "${out}")
+
+  # Parse the rest according to package type
+  if(packageType STREQUAL "git")
+    # For git repos we interpret #... as a tag or branch or commit hash
+    string(REGEX REPLACE "#([^#]+)$" ";GIT_TAG;\\1" out "${out}")
+  elseif(packageType STREQUAL "archive")
+    # For archives we interpret #... as a URL hash.
+    string(REGEX REPLACE "#([^#]+)$" ";URL_HASH;\\1" out "${out}")
+    # We don't try to parse the version if it's not provided explicitly. cpm_get_version_from_url
+    # should do this at a later point
+  else()
+    # We should never get here. This is an assertion and hitting it means there's a problem with the
+    # code above. A packageType was set, but not handled by this if-else.
+    message(FATAL_ERROR "${CPM_INDENT} Unsupported package type '${packageType}' of '${arg}'")
+  endif()
+
+  set(${outArgs}
+      ${out}
+      PARENT_SCOPE
+  )
+endfunction()
+
+# Check that the working directory for a git repo is clean
+function(cpm_check_git_working_dir_is_clean repoPath gitTag isClean)
+
+  find_package(Git REQUIRED)
+
+  if(NOT GIT_EXECUTABLE)
+    # No git executable, assume directory is clean
+    set(${isClean}
+        TRUE
+        PARENT_SCOPE
+    )
+    return()
+  endif()
+
+  # check for uncommitted changes
+  execute_process(
+    COMMAND ${GIT_EXECUTABLE} status --porcelain
+    RESULT_VARIABLE resultGitStatus
+    OUTPUT_VARIABLE repoStatus
+    OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_QUIET
+    WORKING_DIRECTORY ${repoPath}
+  )
+  if(resultGitStatus)
+    # not supposed to happen, assume clean anyway
+    message(WARNING "${CPM_INDENT} Calling git status on folder ${repoPath} failed")
+    set(${isClean}
+        TRUE
+        PARENT_SCOPE
+    )
+    return()
+  endif()
+
+  if(NOT "${repoStatus}" STREQUAL "")
+    set(${isClean}
+        FALSE
+        PARENT_SCOPE
+    )
+    return()
+  endif()
+
+  # check for committed changes
+  execute_process(
+    COMMAND ${GIT_EXECUTABLE} diff -s --exit-code ${gitTag}
+    RESULT_VARIABLE resultGitDiff
+    OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_QUIET
+    WORKING_DIRECTORY ${repoPath}
+  )
+
+  if(${resultGitDiff} EQUAL 0)
+    set(${isClean}
+        TRUE
+        PARENT_SCOPE
+    )
+  else()
+    set(${isClean}
+        FALSE
+        PARENT_SCOPE
+    )
+  endif()
+
+endfunction()
+
+# Add PATCH_COMMAND to CPM_ARGS_UNPARSED_ARGUMENTS. This method consumes a list of files in ARGN
+# then generates a `PATCH_COMMAND` appropriate for `ExternalProject_Add()`. This command is appended
+# to the parent scope's `CPM_ARGS_UNPARSED_ARGUMENTS`.
+function(cpm_add_patches)
+  # Return if no patch files are supplied.
+  if(NOT ARGN)
+    return()
+  endif()
+
+  # Find the patch program.
+  find_program(PATCH_EXECUTABLE patch)
+  if(WIN32 AND NOT PATCH_EXECUTABLE)
+    # The Windows git executable is distributed with patch.exe. Find the path to the executable, if
+    # it exists, then search `../usr/bin` and `../../usr/bin` for patch.exe.
+    find_package(Git QUIET)
+    if(GIT_EXECUTABLE)
+      get_filename_component(extra_search_path ${GIT_EXECUTABLE} DIRECTORY)
+      get_filename_component(extra_search_path_1up ${extra_search_path} DIRECTORY)
+      get_filename_component(extra_search_path_2up ${extra_search_path_1up} DIRECTORY)
+      find_program(
+        PATCH_EXECUTABLE patch HINTS "${extra_search_path_1up}/usr/bin"
+                                     "${extra_search_path_2up}/usr/bin"
+      )
+    endif()
+  endif()
+  if(NOT PATCH_EXECUTABLE)
+    message(FATAL_ERROR "Couldn't find `patch` executable to use with PATCHES keyword.")
+  endif()
+
+  # Create a temporary
+  set(temp_list ${CPM_ARGS_UNPARSED_ARGUMENTS})
+
+  # Ensure each file exists (or error out) and add it to the list.
+  set(first_item True)
+  foreach(PATCH_FILE ${ARGN})
+    # Make sure the patch file exists, if we can't find it, try again in the current directory.
+    if(NOT EXISTS "${PATCH_FILE}")
+      if(NOT EXISTS "${CMAKE_CURRENT_LIST_DIR}/${PATCH_FILE}")
+        message(FATAL_ERROR "Couldn't find patch file: '${PATCH_FILE}'")
+      endif()
+      set(PATCH_FILE "${CMAKE_CURRENT_LIST_DIR}/${PATCH_FILE}")
+    endif()
+
+    # Convert to absolute path for use with patch file command.
+    get_filename_component(PATCH_FILE "${PATCH_FILE}" ABSOLUTE)
+
+    # The first patch entry must be preceded by "PATCH_COMMAND" while the following items are
+    # preceded by "&&".
+    if(first_item)
+      set(first_item False)
+      list(APPEND temp_list "PATCH_COMMAND")
+    else()
+      list(APPEND temp_list "&&")
+    endif()
+    # Add the patch command to the list
+    list(APPEND temp_list "${PATCH_EXECUTABLE}" "-p1" "<" "${PATCH_FILE}")
+  endforeach()
+
+  # Move temp out into parent scope.
+  set(CPM_ARGS_UNPARSED_ARGUMENTS
+      ${temp_list}
+      PARENT_SCOPE
+  )
+
+endfunction()
+
+# method to overwrite internal FetchContent properties, to allow using CPM.cmake to overload
+# FetchContent calls. As these are internal cmake properties, this method should be used carefully
+# and may need modification in future CMake versions. Source:
+# https://github.com/Kitware/CMake/blob/dc3d0b5a0a7d26d43d6cfeb511e224533b5d188f/Modules/FetchContent.cmake#L1152
+function(cpm_override_fetchcontent contentName)
+  cmake_parse_arguments(PARSE_ARGV 1 arg "" "SOURCE_DIR;BINARY_DIR" "")
+  if(NOT "${arg_UNPARSED_ARGUMENTS}" STREQUAL "")
+    message(FATAL_ERROR "${CPM_INDENT} Unsupported arguments: ${arg_UNPARSED_ARGUMENTS}")
+  endif()
+
+  string(TOLOWER ${contentName} contentNameLower)
+  set(prefix "_FetchContent_${contentNameLower}")
+
+  set(propertyName "${prefix}_sourceDir")
+  define_property(
+    GLOBAL
+    PROPERTY ${propertyName}
+    BRIEF_DOCS "Internal implementation detail of FetchContent_Populate()"
+    FULL_DOCS "Details used by FetchContent_Populate() for ${contentName}"
+  )
+  set_property(GLOBAL PROPERTY ${propertyName} "${arg_SOURCE_DIR}")
+
+  set(propertyName "${prefix}_binaryDir")
+  define_property(
+    GLOBAL
+    PROPERTY ${propertyName}
+    BRIEF_DOCS "Internal implementation detail of FetchContent_Populate()"
+    FULL_DOCS "Details used by FetchContent_Populate() for ${contentName}"
+  )
+  set_property(GLOBAL PROPERTY ${propertyName} "${arg_BINARY_DIR}")
+
+  set(propertyName "${prefix}_populated")
+  define_property(
+    GLOBAL
+    PROPERTY ${propertyName}
+    BRIEF_DOCS "Internal implementation detail of FetchContent_Populate()"
+    FULL_DOCS "Details used by FetchContent_Populate() for ${contentName}"
+  )
+  set_property(GLOBAL PROPERTY ${propertyName} TRUE)
+endfunction()
+
+# Download and add a package from source
+function(CPMAddPackage)
+  cpm_set_policies()
+
+  list(LENGTH ARGN argnLength)
+  if(argnLength EQUAL 1)
+    cpm_parse_add_package_single_arg("${ARGN}" ARGN)
+
+    # The shorthand syntax implies EXCLUDE_FROM_ALL and SYSTEM
+    set(ARGN "${ARGN};EXCLUDE_FROM_ALL;YES;SYSTEM;YES;")
+  endif()
+
+  set(oneValueArgs
+      NAME
+      FORCE
+      VERSION
+      GIT_TAG
+      DOWNLOAD_ONLY
+      GITHUB_REPOSITORY
+      GITLAB_REPOSITORY
+      BITBUCKET_REPOSITORY
+      GIT_REPOSITORY
+      SOURCE_DIR
+      FIND_PACKAGE_ARGUMENTS
+      NO_CACHE
+      SYSTEM
+      GIT_SHALLOW
+      EXCLUDE_FROM_ALL
+      SOURCE_SUBDIR
+      CUSTOM_CACHE_KEY
+  )
+
+  set(multiValueArgs URL OPTIONS DOWNLOAD_COMMAND PATCHES)
+
+  cmake_parse_arguments(CPM_ARGS "" "${oneValueArgs}" "${multiValueArgs}" "${ARGN}")
+
+  # Set default values for arguments
+
+  if(NOT DEFINED CPM_ARGS_VERSION)
+    if(DEFINED CPM_ARGS_GIT_TAG)
+      cpm_get_version_from_git_tag("${CPM_ARGS_GIT_TAG}" CPM_ARGS_VERSION)
+    endif()
+  endif()
+
+  if(CPM_ARGS_DOWNLOAD_ONLY)
+    set(DOWNLOAD_ONLY ${CPM_ARGS_DOWNLOAD_ONLY})
+  else()
+    set(DOWNLOAD_ONLY NO)
+  endif()
+
+  if(DEFINED CPM_ARGS_GITHUB_REPOSITORY)
+    set(CPM_ARGS_GIT_REPOSITORY "https://github.com/${CPM_ARGS_GITHUB_REPOSITORY}.git")
+  elseif(DEFINED CPM_ARGS_GITLAB_REPOSITORY)
+    set(CPM_ARGS_GIT_REPOSITORY "https://gitlab.com/${CPM_ARGS_GITLAB_REPOSITORY}.git")
+  elseif(DEFINED CPM_ARGS_BITBUCKET_REPOSITORY)
+    set(CPM_ARGS_GIT_REPOSITORY "https://bitbucket.org/${CPM_ARGS_BITBUCKET_REPOSITORY}.git")
   endif()
-endif()
 
-include(${CPM_DOWNLOAD_LOCATION})
+  if(DEFINED CPM_ARGS_GIT_REPOSITORY)
+    list(APPEND CPM_ARGS_UNPARSED_ARGUMENTS GIT_REPOSITORY ${CPM_ARGS_GIT_REPOSITORY})
+    if(NOT DEFINED CPM_ARGS_GIT_TAG)
+      set(CPM_ARGS_GIT_TAG v${CPM_ARGS_VERSION})
+    endif()
+
+    # If a name wasn't provided, try to infer it from the git repo
+    if(NOT DEFINED CPM_ARGS_NAME)
+      cpm_package_name_from_git_uri(${CPM_ARGS_GIT_REPOSITORY} CPM_ARGS_NAME)
+    endif()
+  endif()
+
+  set(CPM_SKIP_FETCH FALSE)
+
+  if(DEFINED CPM_ARGS_GIT_TAG)
+    list(APPEND CPM_ARGS_UNPARSED_ARGUMENTS GIT_TAG ${CPM_ARGS_GIT_TAG})
+    # If GIT_SHALLOW is explicitly specified, honor the value.
+    if(DEFINED CPM_ARGS_GIT_SHALLOW)
+      list(APPEND CPM_ARGS_UNPARSED_ARGUMENTS GIT_SHALLOW ${CPM_ARGS_GIT_SHALLOW})
+    endif()
+  endif()
+
+  if(DEFINED CPM_ARGS_URL)
+    # If a name or version aren't provided, try to infer them from the URL
+    list(GET CPM_ARGS_URL 0 firstUrl)
+    cpm_package_name_and_ver_from_url(${firstUrl} nameFromUrl verFromUrl)
+    # If we fail to obtain name and version from the first URL, we could try other URLs if any.
+    # However multiple URLs are expected to be quite rare, so for now we won't bother.
+
+    # If the caller provided their own name and version, they trump the inferred ones.
+    if(NOT DEFINED CPM_ARGS_NAME)
+      set(CPM_ARGS_NAME ${nameFromUrl})
+    endif()
+    if(NOT DEFINED CPM_ARGS_VERSION)
+      set(CPM_ARGS_VERSION ${verFromUrl})
+    endif()
+
+    list(APPEND CPM_ARGS_UNPARSED_ARGUMENTS URL "${CPM_ARGS_URL}")
+  endif()
+
+  # Check for required arguments
+
+  if(NOT DEFINED CPM_ARGS_NAME)
+    message(
+      FATAL_ERROR
+        "${CPM_INDENT} 'NAME' was not provided and couldn't be automatically inferred for package added with arguments: '${ARGN}'"
+    )
+  endif()
+
+  # Check if package has been added before
+  cpm_check_if_package_already_added(${CPM_ARGS_NAME} "${CPM_ARGS_VERSION}")
+  if(CPM_PACKAGE_ALREADY_ADDED)
+    cpm_export_variables(${CPM_ARGS_NAME})
+    return()
+  endif()
+
+  # Check for manual overrides
+  if(NOT CPM_ARGS_FORCE AND NOT "${CPM_${CPM_ARGS_NAME}_SOURCE}" STREQUAL "")
+    set(PACKAGE_SOURCE ${CPM_${CPM_ARGS_NAME}_SOURCE})
+    set(CPM_${CPM_ARGS_NAME}_SOURCE "")
+    CPMAddPackage(
+      NAME "${CPM_ARGS_NAME}"
+      SOURCE_DIR "${PACKAGE_SOURCE}"
+      EXCLUDE_FROM_ALL "${CPM_ARGS_EXCLUDE_FROM_ALL}"
+      SYSTEM "${CPM_ARGS_SYSTEM}"
+      PATCHES "${CPM_ARGS_PATCHES}"
+      OPTIONS "${CPM_ARGS_OPTIONS}"
+      SOURCE_SUBDIR "${CPM_ARGS_SOURCE_SUBDIR}"
+      DOWNLOAD_ONLY "${DOWNLOAD_ONLY}"
+      FORCE True
+    )
+    cpm_export_variables(${CPM_ARGS_NAME})
+    return()
+  endif()
+
+  # Check for available declaration
+  if(NOT CPM_ARGS_FORCE AND NOT "${CPM_DECLARATION_${CPM_ARGS_NAME}}" STREQUAL "")
+    set(declaration ${CPM_DECLARATION_${CPM_ARGS_NAME}})
+    set(CPM_DECLARATION_${CPM_ARGS_NAME} "")
+    CPMAddPackage(${declaration})
+    cpm_export_variables(${CPM_ARGS_NAME})
+    # checking again to ensure version and option compatibility
+    cpm_check_if_package_already_added(${CPM_ARGS_NAME} "${CPM_ARGS_VERSION}")
+    return()
+  endif()
+
+  if(NOT CPM_ARGS_FORCE)
+    if(CPM_USE_LOCAL_PACKAGES OR CPM_LOCAL_PACKAGES_ONLY)
+      cpm_find_package(${CPM_ARGS_NAME} "${CPM_ARGS_VERSION}" ${CPM_ARGS_FIND_PACKAGE_ARGUMENTS})
+
+      if(CPM_PACKAGE_FOUND)
+        cpm_export_variables(${CPM_ARGS_NAME})
+        return()
+      endif()
+
+      if(CPM_LOCAL_PACKAGES_ONLY)
+        message(
+          SEND_ERROR
+            "${CPM_INDENT} ${CPM_ARGS_NAME} not found via find_package(${CPM_ARGS_NAME} ${CPM_ARGS_VERSION})"
+        )
+      endif()
+    endif()
+  endif()
+
+  CPMRegisterPackage("${CPM_ARGS_NAME}" "${CPM_ARGS_VERSION}")
+
+  if(DEFINED CPM_ARGS_GIT_TAG)
+    set(PACKAGE_INFO "${CPM_ARGS_GIT_TAG}")
+  elseif(DEFINED CPM_ARGS_SOURCE_DIR)
+    set(PACKAGE_INFO "${CPM_ARGS_SOURCE_DIR}")
+  else()
+    set(PACKAGE_INFO "${CPM_ARGS_VERSION}")
+  endif()
+
+  if(DEFINED FETCHCONTENT_BASE_DIR)
+    # respect user's FETCHCONTENT_BASE_DIR if set
+    set(CPM_FETCHCONTENT_BASE_DIR ${FETCHCONTENT_BASE_DIR})
+  else()
+    set(CPM_FETCHCONTENT_BASE_DIR ${CMAKE_BINARY_DIR}/_deps)
+  endif()
+
+  cpm_add_patches(${CPM_ARGS_PATCHES})
+
+  if(DEFINED CPM_ARGS_DOWNLOAD_COMMAND)
+    list(APPEND CPM_ARGS_UNPARSED_ARGUMENTS DOWNLOAD_COMMAND ${CPM_ARGS_DOWNLOAD_COMMAND})
+  elseif(DEFINED CPM_ARGS_SOURCE_DIR)
+    list(APPEND CPM_ARGS_UNPARSED_ARGUMENTS SOURCE_DIR ${CPM_ARGS_SOURCE_DIR})
+    if(NOT IS_ABSOLUTE ${CPM_ARGS_SOURCE_DIR})
+      # Expand `CPM_ARGS_SOURCE_DIR` relative path. This is important because EXISTS doesn't work
+      # for relative paths.
+      get_filename_component(
+        source_directory ${CPM_ARGS_SOURCE_DIR} REALPATH BASE_DIR ${CMAKE_CURRENT_BINARY_DIR}
+      )
+    else()
+      set(source_directory ${CPM_ARGS_SOURCE_DIR})
+    endif()
+    if(NOT EXISTS ${source_directory})
+      string(TOLOWER ${CPM_ARGS_NAME} lower_case_name)
+      # remove timestamps so CMake will re-download the dependency
+      file(REMOVE_RECURSE "${CPM_FETCHCONTENT_BASE_DIR}/${lower_case_name}-subbuild")
+    endif()
+  elseif(CPM_SOURCE_CACHE AND NOT CPM_ARGS_NO_CACHE)
+    string(TOLOWER ${CPM_ARGS_NAME} lower_case_name)
+    set(origin_parameters ${CPM_ARGS_UNPARSED_ARGUMENTS})
+    list(SORT origin_parameters)
+    if(CPM_ARGS_CUSTOM_CACHE_KEY)
+      # Application set a custom unique directory name
+      set(download_directory ${CPM_SOURCE_CACHE}/${lower_case_name}/${CPM_ARGS_CUSTOM_CACHE_KEY})
+    elseif(CPM_USE_NAMED_CACHE_DIRECTORIES)
+      string(SHA1 origin_hash "${origin_parameters};NEW_CACHE_STRUCTURE_TAG")
+      set(download_directory ${CPM_SOURCE_CACHE}/${lower_case_name}/${origin_hash}/${CPM_ARGS_NAME})
+    else()
+      string(SHA1 origin_hash "${origin_parameters}")
+      set(download_directory ${CPM_SOURCE_CACHE}/${lower_case_name}/${origin_hash})
+    endif()
+    # Expand `download_directory` relative path. This is important because EXISTS doesn't work for
+    # relative paths.
+    get_filename_component(download_directory ${download_directory} ABSOLUTE)
+    list(APPEND CPM_ARGS_UNPARSED_ARGUMENTS SOURCE_DIR ${download_directory})
+
+    if(CPM_SOURCE_CACHE)
+      file(LOCK ${download_directory}/../cmake.lock)
+    endif()
+
+    if(EXISTS ${download_directory})
+      if(CPM_SOURCE_CACHE)
+        file(LOCK ${download_directory}/../cmake.lock RELEASE)
+      endif()
+
+      cpm_store_fetch_properties(
+        ${CPM_ARGS_NAME} "${download_directory}"
+        "${CPM_FETCHCONTENT_BASE_DIR}/${lower_case_name}-build"
+      )
+      cpm_get_fetch_properties("${CPM_ARGS_NAME}")
+
+      if(DEFINED CPM_ARGS_GIT_TAG AND NOT (PATCH_COMMAND IN_LIST CPM_ARGS_UNPARSED_ARGUMENTS))
+        # warn if cache has been changed since checkout
+        cpm_check_git_working_dir_is_clean(${download_directory} ${CPM_ARGS_GIT_TAG} IS_CLEAN)
+        if(NOT ${IS_CLEAN})
+          message(
+            WARNING "${CPM_INDENT} Cache for ${CPM_ARGS_NAME} (${download_directory}) is dirty"
+          )
+        endif()
+      endif()
+
+      cpm_add_subdirectory(
+        "${CPM_ARGS_NAME}"
+        "${DOWNLOAD_ONLY}"
+        "${${CPM_ARGS_NAME}_SOURCE_DIR}/${CPM_ARGS_SOURCE_SUBDIR}"
+        "${${CPM_ARGS_NAME}_BINARY_DIR}"
+        "${CPM_ARGS_EXCLUDE_FROM_ALL}"
+        "${CPM_ARGS_SYSTEM}"
+        "${CPM_ARGS_OPTIONS}"
+      )
+      set(PACKAGE_INFO "${PACKAGE_INFO} at ${download_directory}")
+
+      # As the source dir is already cached/populated, we override the call to FetchContent.
+      set(CPM_SKIP_FETCH TRUE)
+      cpm_override_fetchcontent(
+        "${lower_case_name}" SOURCE_DIR "${${CPM_ARGS_NAME}_SOURCE_DIR}/${CPM_ARGS_SOURCE_SUBDIR}"
+        BINARY_DIR "${${CPM_ARGS_NAME}_BINARY_DIR}"
+      )
+
+    else()
+      # Enable shallow clone when GIT_TAG is not a commit hash. Our guess may not be accurate, but
+      # it should guarantee no commit hash get mis-detected.
+      if(NOT DEFINED CPM_ARGS_GIT_SHALLOW)
+        cpm_is_git_tag_commit_hash("${CPM_ARGS_GIT_TAG}" IS_HASH)
+        if(NOT ${IS_HASH})
+          list(APPEND CPM_ARGS_UNPARSED_ARGUMENTS GIT_SHALLOW TRUE)
+        endif()
+      endif()
+
+      # remove timestamps so CMake will re-download the dependency
+      file(REMOVE_RECURSE ${CPM_FETCHCONTENT_BASE_DIR}/${lower_case_name}-subbuild)
+      set(PACKAGE_INFO "${PACKAGE_INFO} to ${download_directory}")
+    endif()
+  endif()
+
+  cpm_create_module_file(${CPM_ARGS_NAME} "CPMAddPackage(\"${ARGN}\")")
+
+  if(CPM_PACKAGE_LOCK_ENABLED)
+    if((CPM_ARGS_VERSION AND NOT CPM_ARGS_SOURCE_DIR) OR CPM_INCLUDE_ALL_IN_PACKAGE_LOCK)
+      cpm_add_to_package_lock(${CPM_ARGS_NAME} "${ARGN}")
+    elseif(CPM_ARGS_SOURCE_DIR)
+      cpm_add_comment_to_package_lock(${CPM_ARGS_NAME} "local directory")
+    else()
+      cpm_add_comment_to_package_lock(${CPM_ARGS_NAME} "${ARGN}")
+    endif()
+  endif()
+
+  cpm_message(
+    STATUS "${CPM_INDENT} Adding package ${CPM_ARGS_NAME}@${CPM_ARGS_VERSION} (${PACKAGE_INFO})"
+  )
+
+  if(NOT CPM_SKIP_FETCH)
+    # CMake 3.28 added EXCLUDE, SYSTEM (3.25), and SOURCE_SUBDIR (3.18) to FetchContent_Declare.
+    # Calling FetchContent_MakeAvailable will then internally forward these options to
+    # add_subdirectory. Up until these changes, we had to call FetchContent_Populate and
+    # add_subdirectory separately, which is no longer necessary and has been deprecated as of 3.30.
+    set(fetchContentDeclareExtraArgs "")
+    if(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.28.0")
+      if(${CPM_ARGS_EXCLUDE_FROM_ALL})
+        list(APPEND fetchContentDeclareExtraArgs EXCLUDE_FROM_ALL)
+      endif()
+      if(${CPM_ARGS_SYSTEM})
+        list(APPEND fetchContentDeclareExtraArgs SYSTEM)
+      endif()
+      if(DEFINED CPM_ARGS_SOURCE_SUBDIR)
+        list(APPEND fetchContentDeclareExtraArgs SOURCE_SUBDIR ${CPM_ARGS_SOURCE_SUBDIR})
+      endif()
+      # For CMake version <3.28 OPTIONS are parsed in cpm_add_subdirectory
+      if(CPM_ARGS_OPTIONS AND NOT DOWNLOAD_ONLY)
+        foreach(OPTION ${CPM_ARGS_OPTIONS})
+          cpm_parse_option("${OPTION}")
+          set(${OPTION_KEY} "${OPTION_VALUE}")
+        endforeach()
+      endif()
+    endif()
+    cpm_declare_fetch(
+      "${CPM_ARGS_NAME}" ${fetchContentDeclareExtraArgs} "${CPM_ARGS_UNPARSED_ARGUMENTS}"
+    )
+
+    cpm_fetch_package("${CPM_ARGS_NAME}" ${DOWNLOAD_ONLY} populated ${CPM_ARGS_UNPARSED_ARGUMENTS})
+    if(CPM_SOURCE_CACHE AND download_directory)
+      file(LOCK ${download_directory}/../cmake.lock RELEASE)
+    endif()
+    if(${populated} AND ${CMAKE_VERSION} VERSION_LESS "3.28.0")
+      cpm_add_subdirectory(
+        "${CPM_ARGS_NAME}"
+        "${DOWNLOAD_ONLY}"
+        "${${CPM_ARGS_NAME}_SOURCE_DIR}/${CPM_ARGS_SOURCE_SUBDIR}"
+        "${${CPM_ARGS_NAME}_BINARY_DIR}"
+        "${CPM_ARGS_EXCLUDE_FROM_ALL}"
+        "${CPM_ARGS_SYSTEM}"
+        "${CPM_ARGS_OPTIONS}"
+      )
+    endif()
+    cpm_get_fetch_properties("${CPM_ARGS_NAME}")
+  endif()
+
+  set(${CPM_ARGS_NAME}_ADDED YES)
+  cpm_export_variables("${CPM_ARGS_NAME}")
+endfunction()
+
+# Fetch a previously declared package (CPM_DECLARATION_${Name} is defined); otherwise SEND_ERROR
+macro(CPMGetPackage Name)
+  if(DEFINED "CPM_DECLARATION_${Name}")
+    CPMAddPackage(NAME ${Name})
+  else()
+    message(SEND_ERROR "${CPM_INDENT} Cannot retrieve package ${Name}: no declaration available")
+  endif()
+endmacro()
+
+# re-exports ${name}_SOURCE_DIR/_BINARY_DIR/_ADDED and CPM_LAST_PACKAGE_NAME to the parent scope
+macro(cpm_export_variables name) # a macro, so PARENT_SCOPE is relative to the invoking function
+  set(${name}_SOURCE_DIR
+      "${${name}_SOURCE_DIR}"
+      PARENT_SCOPE
+  )
+  set(${name}_BINARY_DIR
+      "${${name}_BINARY_DIR}"
+      PARENT_SCOPE
+  )
+  set(${name}_ADDED
+      "${${name}_ADDED}"
+      PARENT_SCOPE
+  )
+  set(CPM_LAST_PACKAGE_NAME
+      "${name}"
+      PARENT_SCOPE
+  )
+endmacro()
+
+# declares a package, so that any call to CPMAddPackage for the package name will use these
+# arguments instead. Previous declarations will not be overridden.
+macro(CPMDeclarePackage Name)
+  if(NOT DEFINED "CPM_DECLARATION_${Name}")
+    set("CPM_DECLARATION_${Name}" "${ARGN}") # ARGN kept as one list; read back by CPMAddPackage
+  endif()
+endmacro()
+
+function(cpm_add_to_package_lock Name) # appends a CPMDeclarePackage entry to the lock file
+  if(NOT CPM_DONT_CREATE_PACKAGE_LOCK)
+    cpm_prettify_package_arguments(PRETTY_ARGN false ${ARGN})
+    file(APPEND ${CPM_PACKAGE_LOCK_FILE} "# ${Name}\nCPMDeclarePackage(${Name}\n${PRETTY_ARGN})\n")
+  endif()
+endfunction()
+
+function(cpm_add_comment_to_package_lock Name) # appends a commented-out entry to the lock file
+  if(NOT CPM_DONT_CREATE_PACKAGE_LOCK)
+    cpm_prettify_package_arguments(PRETTY_ARGN true ${ARGN})
+    file(APPEND ${CPM_PACKAGE_LOCK_FILE}
+         "# ${Name} (unversioned)\n# CPMDeclarePackage(${Name}\n${PRETTY_ARGN}#)\n"
+    )
+  endif()
+endfunction()
+
+# includes the package lock file if it exists, creates a target `cpm-update-package-lock` that
+# copies CPM_PACKAGE_LOCK_FILE over it, and sets CPM_PACKAGE_LOCK_ENABLED
+macro(CPMUsePackageLock file)
+  if(NOT CPM_DONT_CREATE_PACKAGE_LOCK)
+    get_filename_component(CPM_ABSOLUTE_PACKAGE_LOCK_PATH ${file} ABSOLUTE)
+    if(EXISTS ${CPM_ABSOLUTE_PACKAGE_LOCK_PATH})
+      include(${CPM_ABSOLUTE_PACKAGE_LOCK_PATH})
+    endif()
+    if(NOT TARGET cpm-update-package-lock)
+      add_custom_target(
+        cpm-update-package-lock COMMAND ${CMAKE_COMMAND} -E copy ${CPM_PACKAGE_LOCK_FILE}
+                                        ${CPM_ABSOLUTE_PACKAGE_LOCK_PATH}
+      )
+    endif()
+    set(CPM_PACKAGE_LOCK_ENABLED true)
+  endif()
+endmacro()
+
+# registers a package: appends it to CPM_PACKAGES and caches CPM_PACKAGE_${PACKAGE}_VERSION
+function(CPMRegisterPackage PACKAGE VERSION)
+  list(APPEND CPM_PACKAGES ${PACKAGE})
+  set(CPM_PACKAGES
+      ${CPM_PACKAGES}
+      CACHE INTERNAL ""
+  )
+  set("CPM_PACKAGE_${PACKAGE}_VERSION"
+      ${VERSION}
+      CACHE INTERNAL ""
+  )
+endfunction()
+
+# stores CPM_PACKAGE_${PACKAGE}_VERSION in the caller's variable named by OUTPUT
+function(CPMGetPackageVersion PACKAGE OUTPUT)
+  set(${OUTPUT}
+      "${CPM_PACKAGE_${PACKAGE}_VERSION}"
+      PARENT_SCOPE
+  )
+endfunction()
+
+# declares PACKAGE via FetchContent_Declare(${ARGN}); only logs a message when CPM_DRY_RUN is set
+function(cpm_declare_fetch PACKAGE)
+  if(${CPM_DRY_RUN})
+    cpm_message(STATUS "${CPM_INDENT} Package not declared (dry run)")
+    return()
+  endif()
+
+  FetchContent_Declare(${PACKAGE} ${ARGN})
+endfunction()
+
+# exports ${PACKAGE}_SOURCE_DIR/_BINARY_DIR from the values cached by cpm_store_fetch_properties
+function(cpm_get_fetch_properties PACKAGE)
+  if(${CPM_DRY_RUN}) # dry run: export nothing
+    return()
+  endif()
+
+  set(${PACKAGE}_SOURCE_DIR
+      "${CPM_PACKAGE_${PACKAGE}_SOURCE_DIR}"
+      PARENT_SCOPE
+  )
+  set(${PACKAGE}_BINARY_DIR
+      "${CPM_PACKAGE_${PACKAGE}_BINARY_DIR}"
+      PARENT_SCOPE
+  )
+endfunction()
+
+function(cpm_store_fetch_properties PACKAGE source_dir binary_dir) # caches dirs (CACHE INTERNAL)
+  if(${CPM_DRY_RUN}) # dry run: cache nothing
+    return()
+  endif()
+
+  set(CPM_PACKAGE_${PACKAGE}_SOURCE_DIR
+      "${source_dir}"
+      CACHE INTERNAL ""
+  )
+  set(CPM_PACKAGE_${PACKAGE}_BINARY_DIR
+      "${binary_dir}"
+      CACHE INTERNAL ""
+  )
+endfunction()
+
+# adds SOURCE_DIR via add_subdirectory unless DOWNLOAD_ONLY is set or it has no CMakeLists.txt
+function(
+  cpm_add_subdirectory
+  PACKAGE
+  DOWNLOAD_ONLY
+  SOURCE_DIR
+  BINARY_DIR
+  EXCLUDE
+  SYSTEM
+  OPTIONS
+)
+
+  if(NOT DOWNLOAD_ONLY AND EXISTS ${SOURCE_DIR}/CMakeLists.txt)
+    set(addSubdirectoryExtraArgs "")
+    if(EXCLUDE)
+      list(APPEND addSubdirectoryExtraArgs EXCLUDE_FROM_ALL)
+    endif()
+    if("${SYSTEM}" AND "${CMAKE_VERSION}" VERSION_GREATER_EQUAL "3.25")
+      # https://cmake.org/cmake/help/latest/prop_dir/SYSTEM.html#prop_dir:SYSTEM
+      list(APPEND addSubdirectoryExtraArgs SYSTEM)
+    endif()
+    if(OPTIONS)
+      foreach(OPTION ${OPTIONS})
+        cpm_parse_option("${OPTION}")
+        set(${OPTION_KEY} "${OPTION_VALUE}") # plain variable in this function's scope
+      endforeach()
+    endif()
+    set(CPM_OLD_INDENT "${CPM_INDENT}")
+    set(CPM_INDENT "${CPM_INDENT} ${PACKAGE}:") # extend the message prefix with the package name
+    add_subdirectory(${SOURCE_DIR} ${BINARY_DIR} ${addSubdirectoryExtraArgs})
+    set(CPM_INDENT "${CPM_OLD_INDENT}")
+  endif()
+endfunction()
+
+# downloads a previously declared package via FetchContent, caches its source/binary dirs and
+# exports `${PACKAGE}_SOURCE_DIR`, `${PACKAGE}_BINARY_DIR` and `${populated}` to the parent scope
+function(cpm_fetch_package PACKAGE DOWNLOAD_ONLY populated)
+  set(${populated}
+      FALSE
+      PARENT_SCOPE
+  )
+  if(${CPM_DRY_RUN})
+    cpm_message(STATUS "${CPM_INDENT} Package ${PACKAGE} not fetched (dry run)")
+    return()
+  endif()
+
+  FetchContent_GetProperties(${PACKAGE})
+
+  string(TOLOWER "${PACKAGE}" lower_case_name)
+
+  if(NOT ${lower_case_name}_POPULATED)
+    if(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.28.0")
+      if(DOWNLOAD_ONLY)
+        # MakeAvailable will call add_subdirectory internally which is not what we want when
+        # DOWNLOAD_ONLY is set. Populate will only download the dependency without adding it to the
+        # build
+        FetchContent_Populate(
+          ${PACKAGE}
+          SOURCE_DIR "${CPM_FETCHCONTENT_BASE_DIR}/${lower_case_name}-src"
+          BINARY_DIR "${CPM_FETCHCONTENT_BASE_DIR}/${lower_case_name}-build"
+          SUBBUILD_DIR "${CPM_FETCHCONTENT_BASE_DIR}/${lower_case_name}-subbuild"
+          ${ARGN}
+        )
+      else()
+        FetchContent_MakeAvailable(${PACKAGE})
+      endif()
+    else()
+      FetchContent_Populate(${PACKAGE})
+    endif()
+    set(${populated}
+        TRUE
+        PARENT_SCOPE
+    )
+  endif()
+
+  cpm_store_fetch_properties( # keyed by PACKAGE, not by a variable leaked from the caller's scope
+    ${PACKAGE} ${${lower_case_name}_SOURCE_DIR} ${${lower_case_name}_BINARY_DIR}
+  )
+
+  set(${PACKAGE}_SOURCE_DIR
+      ${${lower_case_name}_SOURCE_DIR}
+      PARENT_SCOPE
+  )
+  set(${PACKAGE}_BINARY_DIR
+      ${${lower_case_name}_BINARY_DIR}
+      PARENT_SCOPE
+  )
+endfunction()
+
+# splits a package option "KEY VALUE" at the first space into OPTION_KEY and OPTION_VALUE
+function(cpm_parse_option OPTION)
+  string(REGEX MATCH "^[^ ]+" OPTION_KEY "${OPTION}")
+  string(LENGTH "${OPTION}" OPTION_LENGTH)
+  string(LENGTH "${OPTION_KEY}" OPTION_KEY_LENGTH)
+  if(OPTION_KEY_LENGTH STREQUAL OPTION_LENGTH)
+    # no value for key provided, assume user wants to set option to "ON"
+    set(OPTION_VALUE "ON")
+  else()
+    math(EXPR OPTION_KEY_LENGTH "${OPTION_KEY_LENGTH}+1") # skip the separating space
+    string(SUBSTRING "${OPTION}" "${OPTION_KEY_LENGTH}" "-1" OPTION_VALUE)
+  endif()
+  set(OPTION_KEY
+      "${OPTION_KEY}"
+      PARENT_SCOPE
+  )
+  set(OPTION_VALUE
+      "${OPTION_VALUE}"
+      PARENT_SCOPE
+  )
+endfunction()
+
+# guesses the version from a git tag: 0 for 40-char tags, else the digits/dots after an optional v
+function(cpm_get_version_from_git_tag GIT_TAG RESULT)
+  string(LENGTH "${GIT_TAG}" length)
+  if(length EQUAL 40)
+    # GIT_TAG is probably a git hash
+    set(${RESULT}
+        0
+        PARENT_SCOPE
+    )
+  else()
+    string(REGEX MATCH "v?([0123456789.]*).*" _ "${GIT_TAG}")
+    set(${RESULT}
+        ${CMAKE_MATCH_1}
+        PARENT_SCOPE
+    )
+  endif()
+endfunction()
+
+# sets RESULT to 1 if GIT_TAG looks like a commit hash (7-40 hex digits), otherwise to 0
+function(cpm_is_git_tag_commit_hash GIT_TAG RESULT)
+  string(LENGTH "${GIT_TAG}" length)
+  # full hash has 40 characters, and short hash has at least 7 characters.
+  if(length LESS 7 OR length GREATER 40)
+    set(${RESULT}
+        0
+        PARENT_SCOPE
+    )
+  else()
+    if("${GIT_TAG}" MATCHES "^[a-fA-F0-9]+$") # quoted: the tag text is never dereferenced
+      set(${RESULT}
+          1
+          PARENT_SCOPE
+      )
+    else()
+      set(${RESULT}
+          0
+          PARENT_SCOPE
+      )
+    endif()
+  endif()
+endfunction()
+
+function(cpm_prettify_package_arguments OUT_VAR IS_IN_COMMENT) # formats ARGN for the lock file
+  set(oneValueArgs
+      NAME
+      FORCE
+      VERSION
+      GIT_TAG
+      DOWNLOAD_ONLY
+      GITHUB_REPOSITORY
+      GITLAB_REPOSITORY
+      BITBUCKET_REPOSITORY
+      GIT_REPOSITORY
+      SOURCE_DIR
+      FIND_PACKAGE_ARGUMENTS
+      NO_CACHE
+      SYSTEM
+      GIT_SHALLOW
+      EXCLUDE_FROM_ALL
+      SOURCE_SUBDIR
+  )
+  set(multiValueArgs URL OPTIONS DOWNLOAD_COMMAND)
+  cmake_parse_arguments(CPM_ARGS "" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+  foreach(oneArgName ${oneValueArgs})
+    if(DEFINED CPM_ARGS_${oneArgName})
+      if(${IS_IN_COMMENT})
+        string(APPEND PRETTY_OUT_VAR "#")
+      endif()
+      if("${oneArgName}" STREQUAL "SOURCE_DIR") # quoted so a variable named SOURCE_DIR is ignored
+        string(REPLACE "${CMAKE_SOURCE_DIR}" "\${CMAKE_SOURCE_DIR}" CPM_ARGS_${oneArgName}
+                       "${CPM_ARGS_${oneArgName}}"
+        )
+      endif()
+      string(APPEND PRETTY_OUT_VAR "  ${oneArgName} ${CPM_ARGS_${oneArgName}}\n")
+    endif()
+  endforeach()
+  foreach(multiArgName ${multiValueArgs})
+    if(DEFINED CPM_ARGS_${multiArgName})
+      if(${IS_IN_COMMENT})
+        string(APPEND PRETTY_OUT_VAR "#")
+      endif()
+      string(APPEND PRETTY_OUT_VAR "  ${multiArgName}\n")
+      foreach(singleOption ${CPM_ARGS_${multiArgName}})
+        if(${IS_IN_COMMENT})
+          string(APPEND PRETTY_OUT_VAR "#")
+        endif()
+        string(APPEND PRETTY_OUT_VAR "    \"${singleOption}\"\n")
+      endforeach()
+    endif()
+  endforeach()
+
+  if(NOT "${CPM_ARGS_UNPARSED_ARGUMENTS}" STREQUAL "") # pass unrecognised arguments through
+    if(${IS_IN_COMMENT})
+      string(APPEND PRETTY_OUT_VAR "#")
+    endif()
+    string(APPEND PRETTY_OUT_VAR " ")
+    foreach(CPM_ARGS_UNPARSED_ARGUMENT ${CPM_ARGS_UNPARSED_ARGUMENTS})
+      string(APPEND PRETTY_OUT_VAR " ${CPM_ARGS_UNPARSED_ARGUMENT}")
+    endforeach()
+    string(APPEND PRETTY_OUT_VAR "\n")
+  endif()
+
+  set(${OUT_VAR}
+      ${PRETTY_OUT_VAR}
+      PARENT_SCOPE
+  )
+
+endfunction()
diff --git a/examples/filter_plot_tool/filter_plot_tool.cpp b/examples/filter_plot_tool/filter_plot_tool.cpp
index 53e6444..f284853 100644
--- a/examples/filter_plot_tool/filter_plot_tool.cpp
+++ b/examples/filter_plot_tool/filter_plot_tool.cpp
@@ -1,3 +1,17 @@
+/*
+ * sst-filters - A header-only collection of SIMD filter
+ * implementations by the Surge Synth Team
+ *
+ * Copyright 2019-2024, various authors, as described in the GitHub
+ * transaction log.
+ *
+ * sst-filters is released under the GNU General Public License
+ * version 3 or later. Some of the filters in this package
+ * originated in the version of Surge open sourced in 2018.
+ *
+ * All source in sst-filters available at
+ * https://github.com/surge-synthesizer/sst-filters
+ */
 #include <matplotlibcpp.h>
 #include <sst/filters.h>
 #include <sst/filters/FilterPlotter.h>
diff --git a/examples/filters_example_plugin/FilterPlotComponent.cpp b/examples/filters_example_plugin/FilterPlotComponent.cpp
index 5d4426f..72fd024 100644
--- a/examples/filters_example_plugin/FilterPlotComponent.cpp
+++ b/examples/filters_example_plugin/FilterPlotComponent.cpp
@@ -1,3 +1,17 @@
+/*
+ * sst-filters - A header-only collection of SIMD filter
+ * implementations by the Surge Synth Team
+ *
+ * Copyright 2019-2024, various authors, as described in the GitHub
+ * transaction log.
+ *
+ * sst-filters is released under the GNU General Public License
+ * version 3 or later. Some of the filters in this package
+ * originated in the version of Surge open sourced in 2018.
+ *
+ * All source in sst-filters available at
+ * https://github.com/surge-synthesizer/sst-filters
+ */
 #include "FilterPlotComponent.h"
 #include "FiltersPlugin.h"
 
diff --git a/examples/filters_example_plugin/FilterPlotComponent.h b/examples/filters_example_plugin/FilterPlotComponent.h
index b34d5ce..2ba35ae 100644
--- a/examples/filters_example_plugin/FilterPlotComponent.h
+++ b/examples/filters_example_plugin/FilterPlotComponent.h
@@ -1,3 +1,17 @@
+/*
+ * sst-filters - A header-only collection of SIMD filter
+ * implementations by the Surge Synth Team
+ *
+ * Copyright 2019-2024, various authors, as described in the GitHub
+ * transaction log.
+ *
+ * sst-filters is released under the GNU General Public License
+ * version 3 or later. Some of the filters in this package
+ * originated in the version of Surge open sourced in 2018.
+ *
+ * All source in sst-filters available at
+ * https://github.com/surge-synthesizer/sst-filters
+ */
 #ifndef SSTFILTERS_EXAMPLES_FILTERS_EXAMPLE_PLUGIN_FILTERPLOTCOMPONENT_H
 #define SSTFILTERS_EXAMPLES_FILTERS_EXAMPLE_PLUGIN_FILTERPLOTCOMPONENT_H
 
diff --git a/examples/filters_example_plugin/FiltersPlugin.cpp b/examples/filters_example_plugin/FiltersPlugin.cpp
index c6d3ece..acfe024 100644
--- a/examples/filters_example_plugin/FiltersPlugin.cpp
+++ b/examples/filters_example_plugin/FiltersPlugin.cpp
@@ -1,3 +1,17 @@
+/*
+ * sst-filters - A header-only collection of SIMD filter
+ * implementations by the Surge Synth Team
+ *
+ * Copyright 2019-2024, various authors, as described in the GitHub
+ * transaction log.
+ *
+ * sst-filters is released under the GNU General Public License
+ * version 3 or later. Some of the filters in this package
+ * originated in the version of Surge open sourced in 2018.
+ *
+ * All source in sst-filters available at
+ * https://github.com/surge-synthesizer/sst-filters
+ */
 #include "FiltersPlugin.h"
 #include "FiltersPluginEditor.h"
 
@@ -13,10 +27,10 @@ FiltersPlugin::FiltersPlugin()
       vts(*this, nullptr, juce::Identifier("Parameters"), createParameters())
 {
     using namespace ParamTags;
-    freqHzParam = vts.getRawParameterValue (freqTag);
-    resParam = vts.getRawParameterValue (resTag);
-    filterTypeParam = vts.getRawParameterValue (filterTypeTag);
-    filterSubTypeParam = vts.getRawParameterValue (filterSubTypeTag);
+    freqHzParam = vts.getRawParameterValue(freqTag);
+    resParam = vts.getRawParameterValue(resTag);
+    filterTypeParam = vts.getRawParameterValue(filterTypeTag);
+    filterSubTypeParam = vts.getRawParameterValue(filterSubTypeTag);
 }
 
 juce::AudioProcessorValueTreeState::ParameterLayout FiltersPlugin::createParameters()
@@ -49,11 +63,13 @@ juce::AudioProcessorValueTreeState::ParameterLayout FiltersPlugin::createParamet
         resTag, "Resonance", juce::NormalisableRange{0.0f, 1.0f}, 0.5f));
 
     juce::StringArray filterTypeChoices;
-    for (const auto& filter_type_name : sst::filters::filter_type_names)
+    for (const auto &filter_type_name : sst::filters::filter_type_names)
         filterTypeChoices.add(filter_type_name);
 
-    params.push_back(std::make_unique<juce::AudioParameterChoice> (filterTypeTag, "Filter Type", filterTypeChoices, 0));
-    params.push_back(std::make_unique<juce::AudioParameterInt>(filterSubTypeTag, "Filter Sub-Type", 0, sst::filters::FilterSubType::st_tripole_HHH3, 0));
+    params.push_back(std::make_unique<juce::AudioParameterChoice>(filterTypeTag, "Filter Type",
+                                                                  filterTypeChoices, 0));
+    params.push_back(std::make_unique<juce::AudioParameterInt>(
+        filterSubTypeTag, "Filter Sub-Type", 0, sst::filters::FilterSubType::st_tripole_HHH3, 0));
 
     return {params.begin(), params.end()};
 }
@@ -80,8 +96,8 @@ void FiltersPlugin::prepareToPlay(double sampleRate, int samplesPerBlock)
     for (auto &filt : filterUnits)
         filt.reset();
 
-    lastFilterType = ParamConversions::getFilterType (filterTypeParam);
-    lastFilterSubType = ParamConversions::getFilterSubType (filterSubTypeParam);
+    lastFilterType = ParamConversions::getFilterType(filterTypeParam);
+    lastFilterSubType = ParamConversions::getFilterSubType(filterSubTypeParam);
 }
 
 void FiltersPlugin::processBlock(juce::AudioBuffer<float> &buffer, juce::MidiBuffer &)
@@ -102,51 +118,48 @@ void FiltersPlugin::processBlock(juce::AudioBuffer<float> &buffer, juce::MidiBuf
     }
 
     auto filterUnitPtr = sst::filters::GetQFPtrFilterUnit(filterType, filterSubType);
-    coeffMaker.MakeCoeffs(ParamConversions::freq_hz_to_note_num (*freqHzParam), *resParam, filterType, filterSubType, nullptr, false);
+    coeffMaker.MakeCoeffs(ParamConversions::freq_hz_to_note_num(*freqHzParam), *resParam,
+                          filterType, filterSubType, nullptr, false);
 
     if (filterUnitPtr == nullptr)
         return; // no filter to process!
 
     for (int ch = 0; ch < numChannels; ++ch)
     {
-        auto* x = buffer.getWritePointer (ch);
+        auto *x = buffer.getWritePointer(ch);
 
-        auto& filter = filterUnits[ch];
-        coeffMaker.updateState (filter.filterState);
+        auto &filter = filterUnits[ch];
+        coeffMaker.updateState(filter.filterState);
 
         for (int n = 0; n < numSamples; ++n)
         {
             auto yVec = filterUnitPtr(&filter.filterState, _mm_set_ps1(x[n]));
 
             float yArr alignas(16)[4];
-            _mm_store_ps (yArr, yVec);
+            _mm_store_ps(yArr, yVec);
             x[n] = yArr[0];
         }
-
     }
 
     coeffMaker.updateCoefficients(filterUnits[0].filterState);
 }
 
-juce::AudioProcessorEditor *FiltersPlugin::createEditor()
-{
-    return new FiltersPluginEditor (*this);
-}
+juce::AudioProcessorEditor *FiltersPlugin::createEditor() { return new FiltersPluginEditor(*this); }
 
 void FiltersPlugin::getStateInformation(juce::MemoryBlock &data)
 {
     auto state = vts.copyState();
-    std::unique_ptr<juce::XmlElement> xml (state.createXml());
-    copyXmlToBinary (*xml, data);
+    std::unique_ptr<juce::XmlElement> xml(state.createXml());
+    copyXmlToBinary(*xml, data);
 }
 
 void FiltersPlugin::setStateInformation(const void *data, int sizeInBytes)
 {
-    std::unique_ptr<juce::XmlElement> xmlState (getXmlFromBinary (data, sizeInBytes));
+    std::unique_ptr<juce::XmlElement> xmlState(getXmlFromBinary(data, sizeInBytes));
 
     if (xmlState != nullptr)
-        if (xmlState->hasTagName (vts.state.getType()))
-            vts.replaceState (juce::ValueTree::fromXml (*xmlState));
+        if (xmlState->hasTagName(vts.state.getType()))
+            vts.replaceState(juce::ValueTree::fromXml(*xmlState));
 }
 
 // This creates new instances of the plugin
diff --git a/examples/filters_example_plugin/FiltersPlugin.h b/examples/filters_example_plugin/FiltersPlugin.h
index 5e9543b..079cadf 100644
--- a/examples/filters_example_plugin/FiltersPlugin.h
+++ b/examples/filters_example_plugin/FiltersPlugin.h
@@ -1,3 +1,17 @@
+/*
+ * sst-filters - A header-only collection of SIMD filter
+ * implementations by the Surge Synth Team
+ *
+ * Copyright 2019-2024, various authors, as described in the GitHub
+ * transaction log.
+ *
+ * sst-filters is released under the GNU General Public License
+ * version 3 or later. Some of the filters in this package
+ * originated in the version of Surge open sourced in 2018.
+ *
+ * All source in sst-filters available at
+ * https://github.com/surge-synthesizer/sst-filters
+ */
 #ifndef SSTFILTERS_EXAMPLES_FILTERS_EXAMPLE_PLUGIN_FILTERSPLUGIN_H
 #define SSTFILTERS_EXAMPLES_FILTERS_EXAMPLE_PLUGIN_FILTERSPLUGIN_H
 
diff --git a/examples/filters_example_plugin/FiltersPluginEditor.cpp b/examples/filters_example_plugin/FiltersPluginEditor.cpp
index a7798ba..0117e11 100644
--- a/examples/filters_example_plugin/FiltersPluginEditor.cpp
+++ b/examples/filters_example_plugin/FiltersPluginEditor.cpp
@@ -1,3 +1,17 @@
+/*
+ * sst-filters - A header-only collection of SIMD filter
+ * implementations by the Surge Synth Team
+ *
+ * Copyright 2019-2024, various authors, as described in the GitHub
+ * transaction log.
+ *
+ * sst-filters is released under the GNU General Public License
+ * version 3 or later. Some of the filters in this package
+ * originated in the version of Surge open sourced in 2018.
+ *
+ * All source in sst-filters available at
+ * https://github.com/surge-synthesizer/sst-filters
+ */
 #include "FiltersPluginEditor.h"
 
 class FiltersPluginEditor::SubTypeComboBoxParameterAttachment : private juce::ComboBox::Listener
diff --git a/examples/filters_example_plugin/FiltersPluginEditor.h b/examples/filters_example_plugin/FiltersPluginEditor.h
index 3564be9..6ab8f0c 100644
--- a/examples/filters_example_plugin/FiltersPluginEditor.h
+++ b/examples/filters_example_plugin/FiltersPluginEditor.h
@@ -1,3 +1,17 @@
+/*
+ * sst-filters - A header-only collection of SIMD filter
+ * implementations by the Surge Synth Team
+ *
+ * Copyright 2019-2024, various authors, as described in the GitHub
+ * transaction log.
+ *
+ * sst-filters is released under the GNU General Public License
+ * version 3 or later. Some of the filters in this package
+ * originated in the version of Surge open sourced in 2018.
+ *
+ * All source in sst-filters available at
+ * https://github.com/surge-synthesizer/sst-filters
+ */
 #ifndef SSTFILTERS_EXAMPLES_FILTERS_EXAMPLE_PLUGIN_FILTERSPLUGINEDITOR_H
 #define SSTFILTERS_EXAMPLES_FILTERS_EXAMPLE_PLUGIN_FILTERSPLUGINEDITOR_H
 
diff --git a/include-extras/sst/filters/FilterPlotter.h b/include-extras/sst/filters/FilterPlotter.h
index f65ba79..c1f6a5a 100644
--- a/include-extras/sst/filters/FilterPlotter.h
+++ b/include-extras/sst/filters/FilterPlotter.h
@@ -1,3 +1,17 @@
+/*
+ * sst-filters - A header-only collection of SIMD filter
+ * implementations by the Surge Synth Team
+ *
+ * Copyright 2019-2024, various authors, as described in the GitHub
+ * transaction log.
+ *
+ * sst-filters is released under the GNU General Public License
+ * version 3 or later. Some of the filters in this package
+ * originated in the version of Surge open sourced in 2018.
+ *
+ * All source in sst-filters available at
+ * https://github.com/surge-synthesizer/sst-filters
+ */
 #ifndef INCLUDE_EXTRAS_SST_FILTERS_FILTERPLOTTER_H
 #define INCLUDE_EXTRAS_SST_FILTERS_FILTERPLOTTER_H
 
@@ -28,7 +42,7 @@ class FilterPlotter
     std::pair<std::vector<float>, std::vector<float>>
     plotFilterMagnitudeResponse(sst::filters::FilterType filterType,
                                 sst::filters::FilterSubType filterSubType, float pitch, float res,
-                                const FilterPlotParameters& params = {})
+                                const FilterPlotParameters &params = {})
     {
         // set up input sweep
         std::vector<float> sweepBuffer(fftSize, 0.0f);
@@ -36,9 +50,9 @@ class FilterPlotter
 
         // set up filter
         float delayBuffer[4][sst::filters::utilities::MAX_FB_COMB +
-                       sst::filters::utilities::SincTable::FIRipol_N];
+                             sst::filters::utilities::SincTable::FIRipol_N];
         auto filterState = sst::filters::QuadFilterUnitState{};
-        for (auto i=0; i<4; ++i)
+        for (auto i = 0; i < 4; ++i)
         {
             filterState.DB[i] = &(delayBuffer[i][0]);
         }
@@ -52,37 +66,45 @@ class FilterPlotter
         // process filter
         std::vector<float> filterBuffer(fftSize, 0.0f);
         if (filterUnitPtr != nullptr)
-            runFilter (filterState, filterUnitPtr, sweepBuffer.data(), filterBuffer.data(), fftSize);
+            runFilter(filterState, filterUnitPtr, sweepBuffer.data(), filterBuffer.data(), fftSize);
         else
-            std::copy (sweepBuffer.begin(), sweepBuffer.end(), filterBuffer.begin());
+            std::copy(sweepBuffer.begin(), sweepBuffer.end(), filterBuffer.begin());
 
-        auto magResponseDB = computeFrequencyResponse(sweepBuffer.data(), filterBuffer.data(), fftSize);
-        auto magResponseDBSmoothed = freqSmooth(magResponseDB.data(), (int) magResponseDB.size(), params.freqSmoothOctaves);
-        auto freqAxis = fftFreqs((int) magResponseDB.size(), 1.0f / params.sampleRate);
+        auto magResponseDB =
+            computeFrequencyResponse(sweepBuffer.data(), filterBuffer.data(), fftSize);
+        auto magResponseDBSmoothed =
+            freqSmooth(magResponseDB.data(), (int)magResponseDB.size(), params.freqSmoothOctaves);
+        auto freqAxis = fftFreqs((int)magResponseDB.size(), 1.0f / params.sampleRate);
 
-        return { std::move (freqAxis), std::move (magResponseDBSmoothed) };
+        return {std::move(freqAxis), std::move(magResponseDBSmoothed)};
     }
 
   private:
-    static void generateLogSweep(float *buffer, int nSamples, const FilterPlotParameters& params)
+    static void generateLogSweep(float *buffer, int nSamples, const FilterPlotParameters &params)
     {
         const auto beta = (float)nSamples / std::log(params.endFreqHz / params.startFreqHz);
 
         for (int i = 0; i < nSamples; i++)
         {
-            float phase = 2.0f * (float)M_PI * beta * params.startFreqHz *
-                          (std::pow(params.endFreqHz / params.startFreqHz, (float)i / (float)nSamples) - 1.0f);
+            float phase =
+                2.0f * (float)M_PI * beta * params.startFreqHz *
+                (std::pow(params.endFreqHz / params.startFreqHz, (float)i / (float)nSamples) -
+                 1.0f);
 
-            buffer[i] = params.inputAmplitude * std::sin((phase + (float)M_PI / 180.0f) / params.sampleRate);
+            buffer[i] = params.inputAmplitude *
+                        std::sin((phase + (float)M_PI / 180.0f) / params.sampleRate);
         }
     }
 
-    static void runFilter (sst::filters::QuadFilterUnitState &filterState, sst::filters::FilterUnitQFPtr &filterUnitPtr, const float* inBuffer, float* outBuffer, int numSamples)
+    static void runFilter(sst::filters::QuadFilterUnitState &filterState,
+                          sst::filters::FilterUnitQFPtr &filterUnitPtr, const float *inBuffer,
+                          float *outBuffer, int numSamples)
     {
         // reset filter state
-        std::fill (filterState.R, &filterState.R[sst::filters::n_filter_registers], _mm_setzero_ps());
+        std::fill(filterState.R, &filterState.R[sst::filters::n_filter_registers],
+                  _mm_setzero_ps());
 
-        for (int i=0; i<4; ++i)
+        for (int i = 0; i < 4; ++i)
         {
             filterState.WP[i] = 0;
             filterState.active[i] = 0;
@@ -94,24 +116,25 @@ class FilterPlotter
             auto yVec = filterUnitPtr(&filterState, _mm_set_ps1(inBuffer[i]));
 
             float yArr alignas(16)[4];
-            _mm_store_ps (yArr, yVec);
+            _mm_store_ps(yArr, yVec);
             outBuffer[i] = yArr[0];
         }
     };
 
-    std::vector<float> computeFrequencyResponse(float* sweepBuffer, float* filterBuffer, int numSamples)
+    std::vector<float> computeFrequencyResponse(float *sweepBuffer, float *filterBuffer,
+                                                int numSamples)
     {
         const auto fftDataSize = numSamples * 2;
-        std::vector<float> sweepFFT (fftDataSize, 0.0f);
-        std::copy (sweepBuffer, sweepBuffer + numSamples, sweepFFT.begin());
-        fft.performFrequencyOnlyForwardTransform (sweepFFT.data(), true);
+        std::vector<float> sweepFFT(fftDataSize, 0.0f);
+        std::copy(sweepBuffer, sweepBuffer + numSamples, sweepFFT.begin());
+        fft.performFrequencyOnlyForwardTransform(sweepFFT.data(), true);
 
-        std::vector<float> filtFFT (fftDataSize, 0.0f);
-        std::copy (filterBuffer, filterBuffer + numSamples, filtFFT.begin());
-        fft.performFrequencyOnlyForwardTransform (filtFFT.data(), true);
+        std::vector<float> filtFFT(fftDataSize, 0.0f);
+        std::copy(filterBuffer, filterBuffer + numSamples, filtFFT.begin());
+        fft.performFrequencyOnlyForwardTransform(filtFFT.data(), true);
 
         const auto fftOutSize = numSamples / 2 + 1;
-        std::vector<float> magnitudeResponseDB (fftOutSize, 0.0f);
+        std::vector<float> magnitudeResponseDB(fftOutSize, 0.0f);
         for (int i = 0; i < fftOutSize; ++i)
             magnitudeResponseDB[i] = juce::Decibels::gainToDecibels(filtFFT[i] / sweepFFT[i]);
 
@@ -120,26 +143,29 @@ class FilterPlotter
 
     static std::vector<float> fftFreqs(int N, float T)
     {
-        auto val = 0.5f / ((float) N * T);
+        auto val = 0.5f / ((float)N * T);
 
-        std::vector<float> results (N, 0.0f);
-        std::iota (results.begin(), results.end(), 0.0f);
-        std::transform(results.begin(), results.end(), results.begin(), [val] (auto x) { return x * val; });
+        std::vector<float> results(N, 0.0f);
+        std::iota(results.begin(), results.end(), 0.0f);
+        std::transform(results.begin(), results.end(), results.begin(),
+                       [val](auto x) { return x * val; });
 
         return results;
     }
 
-    static std::vector<float> freqSmooth (const float* data, int numSamples, float smFactor = 1.0f / 24.0f)
+    static std::vector<float> freqSmooth(const float *data, int numSamples,
+                                         float smFactor = 1.0f / 24.0f)
     {
-        const auto s = smFactor > 1.0f ? smFactor : std::sqrt (std::pow (2.0f, smFactor));
+        const auto s = smFactor > 1.0f ? smFactor : std::sqrt(std::pow(2.0f, smFactor));
 
-        std::vector<float> smoothedVec (numSamples, 0.0f);
+        std::vector<float> smoothedVec(numSamples, 0.0f);
         for (int i = 0; i < numSamples; ++i)
         {
-            auto i1 = std::max (int ((float) i / s), 0);
-            auto i2 = std::min (int ((float) i * s) + 1, numSamples - 1);
+            auto i1 = std::max(int((float)i / s), 0);
+            auto i2 = std::min(int((float)i * s) + 1, numSamples - 1);
 
-            smoothedVec[i] = i2 > i1 ? std::accumulate(data + i1, data + i2, 0.0f) / float (i2 - i1) : 0.0f;
+            smoothedVec[i] =
+                i2 > i1 ? std::accumulate(data + i1, data + i2, 0.0f) / float(i2 - i1) : 0.0f;
         }
 
         return smoothedVec;
diff --git a/include/sst/filters/CutoffWarp.h b/include/sst/filters/CutoffWarp.h
index 885365d..1553540 100644
--- a/include/sst/filters/CutoffWarp.h
+++ b/include/sst/filters/CutoffWarp.h
@@ -37,10 +37,10 @@ static float clampedFrequency(float pitch, float sampleRate, TuningProvider *pro
     return freq;
 }
 
-#define F(a) _mm_set_ps1(a)
-#define M(a, b) _mm_mul_ps(a, b)
-#define A(a, b) _mm_add_ps(a, b)
-#define S(a, b) _mm_sub_ps(a, b)
+#define F(a) SIMD_MM(set_ps1)(a)
+#define M(a, b) SIMD_MM(mul_ps)(a, b)
+#define A(a, b) SIMD_MM(add_ps)(a, b)
+#define S(a, b) SIMD_MM(sub_ps)(a, b)
 
 enum Saturator
 {
@@ -52,48 +52,53 @@ enum Saturator
 
 // this is a duplicate of the code in QuadFilterWaveshapers.cpp except without the multiplication by
 // 'drive' and without the unused QuadFilterWaveshaperState pointer.
-static inline __m128 ojd_waveshaper_ps(const __m128 x) noexcept
+static inline SIMD_M128 ojd_waveshaper_ps(const SIMD_M128 x) noexcept
 {
-    const auto pm17 = _mm_set1_ps(-1.7f);
-    const auto p11 = _mm_set1_ps(1.1f);
-    const auto pm03 = _mm_set1_ps(-0.3f);
-    const auto p09 = _mm_set1_ps(0.9f);
-
-    const auto denLow = _mm_set1_ps(1.f / (4 * (1 - 0.3f)));
-    const auto denHigh = _mm_set1_ps(1.f / (4 * (1 - 0.9f)));
-
-    auto maskNeg = _mm_cmple_ps(x, pm17);                         // in <= -1.7f
-    auto maskPos = _mm_cmpge_ps(x, p11);                          // in > 1.1f
-    auto maskLow = _mm_andnot_ps(maskNeg, _mm_cmplt_ps(x, pm03)); // in > -1.7 && in < =0.3
-    auto maskHigh = _mm_andnot_ps(maskPos, _mm_cmpgt_ps(x, p09)); // in > 0.9 && in < 1.1
-    auto maskMid = _mm_and_ps(_mm_cmpge_ps(x, pm03), _mm_cmple_ps(x, p09)); // the middle
-
-    const auto vNeg = _mm_set1_ps(-1.0);
-    const auto vPos = _mm_set1_ps(1.0);
+    const auto pm17 = SIMD_MM(set1_ps)(-1.7f);
+    const auto p11 = SIMD_MM(set1_ps)(1.1f);
+    const auto pm03 = SIMD_MM(set1_ps)(-0.3f);
+    const auto p09 = SIMD_MM(set1_ps)(0.9f);
+
+    const auto denLow = SIMD_MM(set1_ps)(1.f / (4 * (1 - 0.3f)));
+    const auto denHigh = SIMD_MM(set1_ps)(1.f / (4 * (1 - 0.9f)));
+
+    auto maskNeg = SIMD_MM(cmple_ps)(x, pm17); // in <= -1.7f
+    auto maskPos = SIMD_MM(cmpge_ps)(x, p11);  // in >= 1.1f
+    auto maskLow =
+        SIMD_MM(andnot_ps)(maskNeg, SIMD_MM(cmplt_ps)(x, pm03)); // in > -1.7 && in < -0.3
+    auto maskHigh = SIMD_MM(andnot_ps)(maskPos, SIMD_MM(cmpgt_ps)(x, p09)); // in > 0.9 && in < 1.1
+    auto maskMid =
+        SIMD_MM(and_ps)(SIMD_MM(cmpge_ps)(x, pm03), SIMD_MM(cmple_ps)(x, p09)); // the middle
+
+    const auto vNeg = SIMD_MM(set1_ps)(-1.0);
+    const auto vPos = SIMD_MM(set1_ps)(1.0);
     auto vMid = x;
 
-    auto xlow = _mm_sub_ps(x, pm03);
-    auto vLow = _mm_add_ps(xlow, _mm_mul_ps(denLow, _mm_mul_ps(xlow, xlow)));
-    vLow = _mm_add_ps(vLow, pm03);
+    auto xlow = SIMD_MM(sub_ps)(x, pm03);
+    auto vLow = SIMD_MM(add_ps)(xlow, SIMD_MM(mul_ps)(denLow, SIMD_MM(mul_ps)(xlow, xlow)));
+    vLow = SIMD_MM(add_ps)(vLow, pm03);
 
-    auto xhi = _mm_sub_ps(x, p09);
-    auto vHi = _mm_sub_ps(xhi, _mm_mul_ps(denHigh, _mm_mul_ps(xhi, xhi)));
-    vHi = _mm_add_ps(vHi, p09);
+    auto xhi = SIMD_MM(sub_ps)(x, p09);
+    auto vHi = SIMD_MM(sub_ps)(xhi, SIMD_MM(mul_ps)(denHigh, SIMD_MM(mul_ps)(xhi, xhi)));
+    vHi = SIMD_MM(add_ps)(vHi, p09);
 
-    return _mm_add_ps(_mm_add_ps(_mm_add_ps(_mm_and_ps(maskNeg, vNeg), _mm_and_ps(maskLow, vLow)),
-                                 _mm_add_ps(_mm_and_ps(maskHigh, vHi), _mm_and_ps(maskPos, vPos))),
-                      _mm_and_ps(maskMid, vMid));
+    return SIMD_MM(add_ps)(
+        SIMD_MM(add_ps)(
+            SIMD_MM(add_ps)(SIMD_MM(and_ps)(maskNeg, vNeg), SIMD_MM(and_ps)(maskLow, vLow)),
+            SIMD_MM(add_ps)(SIMD_MM(and_ps)(maskHigh, vHi), SIMD_MM(and_ps)(maskPos, vPos))),
+        SIMD_MM(and_ps)(maskMid, vMid));
 }
 
-static inline __m128 doNLFilter(const __m128 input, const __m128 a1, const __m128 a2,
-                                const __m128 b0, const __m128 b1, const __m128 b2,
-                                const __m128 makeup, const int sat, __m128 &z1, __m128 &z2) noexcept
+static inline SIMD_M128 doNLFilter(const SIMD_M128 input, const SIMD_M128 a1, const SIMD_M128 a2,
+                                   const SIMD_M128 b0, const SIMD_M128 b1, const SIMD_M128 b2,
+                                   const SIMD_M128 makeup, const int sat, SIMD_M128 &z1,
+                                   SIMD_M128 &z2) noexcept
 {
     // out = z1 + b0 * input
-    const __m128 out = A(z1, M(b0, input));
+    const auto out = A(z1, M(b0, input));
 
     // nonlinear feedback = saturator(out)
-    __m128 nf;
+    SIMD_M128 nf;
     switch (sat)
     {
     case SAT_SOFT:
@@ -228,7 +233,7 @@ void makeCoefficients(FilterCoefficientMaker<TuningProvider> *cm, float freq, fl
 }
 
 template <FilterSubType subtype>
-inline __m128 process(QuadFilterUnitState *__restrict f, __m128 input)
+inline SIMD_M128 process(QuadFilterUnitState *__restrict f, SIMD_M128 input)
 {
     // lower 2 bits of subtype is the stage count
     const int stages = subtype & 3;
diff --git a/include/sst/filters/CytomicSVF.h b/include/sst/filters/CytomicSVF.h
index d4c917d..f7cd829 100644
--- a/include/sst/filters/CytomicSVF.h
+++ b/include/sst/filters/CytomicSVF.h
@@ -59,13 +59,13 @@ namespace sst::filters
 {
 struct CytomicSVF
 {
-    __m128 ic1eq{_mm_setzero_ps()}, ic2eq{_mm_setzero_ps()};
-    __m128 g, k, gk, a1, a2, a3, m0, m1, m2;
+    SIMD_M128 ic1eq{SIMD_MM(setzero_ps)()}, ic2eq{SIMD_MM(setzero_ps)()};
+    SIMD_M128 g, k, gk, a1, a2, a3, m0, m1, m2;
 
-    __m128 oneSSE{_mm_set1_ps(1.0)};
-    __m128 negoneSSE{_mm_set1_ps(-1.0)};
-    __m128 twoSSE{_mm_set1_ps(2.0)};
-    __m128 negtwoSSE{_mm_set1_ps(-2.0)};
+    SIMD_M128 oneSSE{SIMD_MM(set1_ps)(1.0)};
+    SIMD_M128 negoneSSE{SIMD_MM(set1_ps)(-1.0)};
+    SIMD_M128 twoSSE{SIMD_MM(set1_ps)(2.0)};
+    SIMD_M128 negtwoSSE{SIMD_MM(set1_ps)(-2.0)};
     enum Mode
     {
         LP,
@@ -94,13 +94,13 @@ struct CytomicSVF
         res = std::clamp(res, 0.f, 0.98f);
         bellShelfAmp = std::max(bellShelfAmp, 0.001f);
 
-        g = _mm_set1_ps(sst::basic_blocks::dsp::fasttan(M_PI * conorm));
-        k = _mm_set1_ps(2.0 - 2 * res);
+        g = SIMD_MM(set1_ps)(sst::basic_blocks::dsp::fasttan(M_PI * conorm));
+        k = SIMD_MM(set1_ps)(2.0 - 2 * res);
         if (mode == BELL)
         {
-            k = _mm_div_ps(k, _mm_set1_ps(bellShelfAmp));
+            k = SIMD_MM(div_ps)(k, SIMD_MM(set1_ps)(bellShelfAmp));
         }
-        setCoeffPostGK(mode, _mm_set1_ps(bellShelfAmp));
+        setCoeffPostGK(mode, SIMD_MM(set1_ps)(bellShelfAmp));
     }
 
     void setCoeff(Mode mode, float freqL, float freqR, float resL, float resR, float srInv,
@@ -108,87 +108,88 @@ struct CytomicSVF
     {
         auto coL = M_PI * std::clamp(freqL * srInv, 0.f, 0.499f); // stable until nyquist
         auto coR = M_PI * std::clamp(freqR * srInv, 0.f, 0.499f); // stable until nyquist
-        g = sst::basic_blocks::dsp::fasttanhSSE(_mm_set_ps(0, 0, coR, coL));
-        auto res = _mm_set_ps(0, 0, std::clamp(resR, 0.f, 0.98f), std::clamp(resL, 0.f, 0.98f));
+        g = sst::basic_blocks::dsp::fasttanhSSE(SIMD_MM(set_ps)(0, 0, coR, coL));
+        auto res =
+            SIMD_MM(set_ps)(0, 0, std::clamp(resR, 0.f, 0.98f), std::clamp(resL, 0.f, 0.98f));
 
         auto bellShelfAmp =
-            _mm_set_ps(0, 0, std::max(bellShelfAmpL, 0.001f), std::max(bellShelfAmpR, 0.001f));
+            SIMD_MM(set_ps)(0, 0, std::max(bellShelfAmpL, 0.001f), std::max(bellShelfAmpR, 0.001f));
 
-        k = _mm_sub_ps(twoSSE, _mm_mul_ps(twoSSE, res));
+        k = SIMD_MM(sub_ps)(twoSSE, SIMD_MM(mul_ps)(twoSSE, res));
         if (mode == BELL)
         {
-            k = _mm_div_ps(k, bellShelfAmp);
+            k = SIMD_MM(div_ps)(k, bellShelfAmp);
         }
         setCoeffPostGK(mode, bellShelfAmp);
     }
 
-    void setCoeffPostGK(Mode mode, __m128 bellShelfSSE)
+    void setCoeffPostGK(Mode mode, SIMD_M128 bellShelfSSE)
     {
-        gk = _mm_add_ps(g, k);
-        a1 = _mm_div_ps(oneSSE, _mm_add_ps(oneSSE, _mm_mul_ps(g, gk)));
-        a2 = _mm_mul_ps(g, a1);
-        a3 = _mm_mul_ps(g, a2);
+        gk = SIMD_MM(add_ps)(g, k);
+        a1 = SIMD_MM(div_ps)(oneSSE, SIMD_MM(add_ps)(oneSSE, SIMD_MM(mul_ps)(g, gk)));
+        a2 = SIMD_MM(mul_ps)(g, a1);
+        a3 = SIMD_MM(mul_ps)(g, a2);
 
         switch (mode)
         {
         case LP:
-            m0 = _mm_setzero_ps();
-            m1 = _mm_setzero_ps();
+            m0 = SIMD_MM(setzero_ps)();
+            m1 = SIMD_MM(setzero_ps)();
             m2 = oneSSE;
             break;
         case BP:
-            m0 = _mm_setzero_ps();
+            m0 = SIMD_MM(setzero_ps)();
             m1 = oneSSE;
-            m2 = _mm_setzero_ps();
+            m2 = SIMD_MM(setzero_ps)();
             break;
         case HP:
             m0 = oneSSE;
-            m1 = _mm_sub_ps(_mm_setzero_ps(), k);
+            m1 = SIMD_MM(sub_ps)(SIMD_MM(setzero_ps)(), k);
             m2 = negoneSSE;
             break;
         case NOTCH:
             m0 = oneSSE;
-            m1 = _mm_sub_ps(_mm_setzero_ps(), k);
-            m2 = _mm_setzero_ps();
+            m1 = SIMD_MM(sub_ps)(SIMD_MM(setzero_ps)(), k);
+            m2 = SIMD_MM(setzero_ps)();
             break;
         case PEAK:
             m0 = oneSSE;
-            m1 = _mm_sub_ps(_mm_setzero_ps(), k);
+            m1 = SIMD_MM(sub_ps)(SIMD_MM(setzero_ps)(), k);
             m2 = negtwoSSE;
             break;
         case ALL:
             m0 = oneSSE;
-            m1 = _mm_mul_ps(negtwoSSE, k);
-            m2 = _mm_setzero_ps();
+            m1 = SIMD_MM(mul_ps)(negtwoSSE, k);
+            m2 = SIMD_MM(setzero_ps)();
             break;
         case BELL:
         {
             auto A = bellShelfSSE;
             m0 = oneSSE;
-            m1 = _mm_mul_ps(k, _mm_sub_ps(_mm_mul_ps(A, A), oneSSE));
-            m2 = _mm_setzero_ps();
+            m1 = SIMD_MM(mul_ps)(k, SIMD_MM(sub_ps)(SIMD_MM(mul_ps)(A, A), oneSSE));
+            m2 = SIMD_MM(setzero_ps)();
         }
         break;
         case LOW_SHELF:
         {
             auto A = bellShelfSSE;
             m0 = oneSSE;
-            m1 = _mm_mul_ps(k, _mm_sub_ps(A, oneSSE));
-            m2 = _mm_sub_ps(_mm_mul_ps(A, A), oneSSE);
+            m1 = SIMD_MM(mul_ps)(k, SIMD_MM(sub_ps)(A, oneSSE));
+            m2 = SIMD_MM(sub_ps)(SIMD_MM(mul_ps)(A, A), oneSSE);
         }
         break;
         case HIGH_SHELF:
         {
             auto A = bellShelfSSE;
-            m0 = _mm_mul_ps(A, A);
-            m1 = _mm_mul_ps(_mm_mul_ps(k, _mm_sub_ps(oneSSE, A)), A);
-            m2 = _mm_sub_ps(oneSSE, _mm_mul_ps(A, A));
+            m0 = SIMD_MM(mul_ps)(A, A);
+            m1 = SIMD_MM(mul_ps)(SIMD_MM(mul_ps)(k, SIMD_MM(sub_ps)(oneSSE, A)), A);
+            m2 = SIMD_MM(sub_ps)(oneSSE, SIMD_MM(mul_ps)(A, A));
         }
         break;
         default:
-            m0 = _mm_setzero_ps();
-            m1 = _mm_setzero_ps();
-            m2 = _mm_setzero_ps();
+            m0 = SIMD_MM(setzero_ps)();
+            m1 = SIMD_MM(setzero_ps)();
+            m2 = SIMD_MM(setzero_ps)();
             break;
         }
     }
@@ -208,41 +209,43 @@ struct CytomicSVF
 
     static void step(CytomicSVF &that, float &L, float &R)
     {
-        auto vin = _mm_set_ps(0, 0, R, L);
+        auto vin = SIMD_MM(set_ps)(0, 0, R, L);
         auto res = stepSSE(that, vin);
         float r4 alignas(16)[4];
-        _mm_store_ps(r4, res);
+        SIMD_MM(store_ps)(r4, res);
         L = r4[0];
         R = r4[1];
     }
 
-    static __m128 stepSSE(CytomicSVF &that, __m128 vin)
+    static SIMD_M128 stepSSE(CytomicSVF &that, SIMD_M128 vin)
     {
         // v3 = v0 - ic2eq
-        auto v3 = _mm_sub_ps(vin, that.ic2eq);
+        auto v3 = SIMD_MM(sub_ps)(vin, that.ic2eq);
 
         // v1 = a1 * ic1eq + a2 * v3
-        auto v1 = _mm_add_ps(_mm_mul_ps(that.a1, that.ic1eq), _mm_mul_ps(that.a2, v3));
+        auto v1 =
+            SIMD_MM(add_ps)(SIMD_MM(mul_ps)(that.a1, that.ic1eq), SIMD_MM(mul_ps)(that.a2, v3));
 
         // v2 = ic2eq + a2 * ic1eq + a3 * v3
-        auto v2 = _mm_add_ps(that.ic2eq,
-                             _mm_add_ps(_mm_mul_ps(that.a2, that.ic1eq), _mm_mul_ps(that.a3, v3)));
+        auto v2 = SIMD_MM(add_ps)(that.ic2eq, SIMD_MM(add_ps)(SIMD_MM(mul_ps)(that.a2, that.ic1eq),
+                                                              SIMD_MM(mul_ps)(that.a3, v3)));
 
         // ic1eq = 2 * v1 - ic1eq
-        that.ic1eq = _mm_sub_ps(_mm_mul_ps(that.twoSSE, v1), that.ic1eq);
+        that.ic1eq = SIMD_MM(sub_ps)(SIMD_MM(mul_ps)(that.twoSSE, v1), that.ic1eq);
 
         // ic2eq = 2 * v2 - ic2eq
-        that.ic2eq = _mm_sub_ps(_mm_mul_ps(that.twoSSE, v2), that.ic2eq);
+        that.ic2eq = SIMD_MM(sub_ps)(SIMD_MM(mul_ps)(that.twoSSE, v2), that.ic2eq);
 
-        return _mm_add_ps(_mm_mul_ps(that.m0, vin),
-                          _mm_add_ps(_mm_mul_ps(that.m1, v1), _mm_mul_ps(that.m2, v2)));
+        return SIMD_MM(add_ps)(
+            SIMD_MM(mul_ps)(that.m0, vin),
+            SIMD_MM(add_ps)(SIMD_MM(mul_ps)(that.m1, v1), SIMD_MM(mul_ps)(that.m2, v2)));
     }
 
     /*
      * Process across a block with smoothing
      */
-    __m128 a1_prior, a2_prior, a3_prior;
-    __m128 da1, da2, da3;
+    SIMD_M128 a1_prior, a2_prior, a3_prior;
+    SIMD_M128 da1, da2, da3;
     bool firstBlock{true};
 
     template <int blockSize>
@@ -267,16 +270,16 @@ struct CytomicSVF
 
         // then for each one calculate the change across the block
         static constexpr float obsf = 1.f / blockSize;
-        auto obs = _mm_set1_ps(obsf);
+        auto obs = SIMD_MM(set1_ps)(obsf);
 
         // and set the changeup, and reset a1 to the prior value so we move in the block
-        da1 = _mm_mul_ps(_mm_sub_ps(a1, a1_prior), obs);
+        da1 = SIMD_MM(mul_ps)(SIMD_MM(sub_ps)(a1, a1_prior), obs);
         a1 = a1_prior;
 
-        da2 = _mm_mul_ps(_mm_sub_ps(a2, a2_prior), obs);
+        da2 = SIMD_MM(mul_ps)(SIMD_MM(sub_ps)(a2, a2_prior), obs);
         a2 = a2_prior;
 
-        da3 = _mm_mul_ps(_mm_sub_ps(a3, a3_prior), obs);
+        da3 = SIMD_MM(mul_ps)(SIMD_MM(sub_ps)(a3, a3_prior), obs);
         a3 = a3_prior;
     }
 
@@ -304,41 +307,41 @@ struct CytomicSVF
 
         // then for each one calculate the change across the block
         static constexpr float obsf = 1.f / blockSize;
-        auto obs = _mm_set1_ps(obsf);
+        auto obs = SIMD_MM(set1_ps)(obsf);
 
         // and set the changeup, and reset a1 to the prior value so we move in the block
-        da1 = _mm_mul_ps(_mm_sub_ps(a1, a1_prior), obs);
+        da1 = SIMD_MM(mul_ps)(SIMD_MM(sub_ps)(a1, a1_prior), obs);
         a1 = a1_prior;
 
-        da2 = _mm_mul_ps(_mm_sub_ps(a2, a2_prior), obs);
+        da2 = SIMD_MM(mul_ps)(SIMD_MM(sub_ps)(a2, a2_prior), obs);
         a2 = a2_prior;
 
-        da3 = _mm_mul_ps(_mm_sub_ps(a3, a3_prior), obs);
+        da3 = SIMD_MM(mul_ps)(SIMD_MM(sub_ps)(a3, a3_prior), obs);
         a3 = a3_prior;
     }
 
     template <int blockSize> void retainCoeffForBlock()
     {
-        da1 = _mm_setzero_ps();
-        da2 = _mm_setzero_ps();
-        da3 = _mm_setzero_ps();
+        da1 = SIMD_MM(setzero_ps)();
+        da2 = SIMD_MM(setzero_ps)();
+        da3 = SIMD_MM(setzero_ps)();
     }
 
     void processBlockStep(float &L, float &R)
     {
         step(*this, L, R);
-        a1 = _mm_add_ps(a1, da1);
-        a2 = _mm_add_ps(a2, da2);
-        a3 = _mm_add_ps(a3, da3);
+        a1 = SIMD_MM(add_ps)(a1, da1);
+        a2 = SIMD_MM(add_ps)(a2, da2);
+        a3 = SIMD_MM(add_ps)(a3, da3);
     }
 
     void processBlockStep(float &L)
     {
         float tmp{0.f};
         step(*this, L, tmp);
-        a1 = _mm_add_ps(a1, da1);
-        a2 = _mm_add_ps(a2, da2);
-        a3 = _mm_add_ps(a3, da3);
+        a1 = SIMD_MM(add_ps)(a1, da1);
+        a2 = SIMD_MM(add_ps)(a2, da2);
+        a3 = SIMD_MM(add_ps)(a3, da3);
     }
 
     template <int blockSize>
@@ -363,8 +366,8 @@ struct CytomicSVF
 
     void init()
     {
-        ic1eq = _mm_setzero_ps();
-        ic2eq = _mm_setzero_ps();
+        ic1eq = SIMD_MM(setzero_ps)();
+        ic2eq = SIMD_MM(setzero_ps)();
     }
 };
 } // namespace sst::filters
diff --git a/include/sst/filters/DiodeLadder.h b/include/sst/filters/DiodeLadder.h
index 0f3ee24..cc39216 100644
--- a/include/sst/filters/DiodeLadder.h
+++ b/include/sst/filters/DiodeLadder.h
@@ -34,31 +34,31 @@ static float clampedFrequency(float pitch, float sampleRate, TuningProvider *pro
     return freq;
 }
 
-#define F(a) _mm_set_ps1(a)
-#define M(a, b) _mm_mul_ps(a, b)
-#define D(a, b) _mm_div_ps(a, b)
-#define A(a, b) _mm_add_ps(a, b)
-#define S(a, b) _mm_sub_ps(a, b)
+#define F(a) SIMD_MM(set_ps1)(a)
+#define M(a, b) SIMD_MM(mul_ps)(a, b)
+#define D(a, b) SIMD_MM(div_ps)(a, b)
+#define A(a, b) SIMD_MM(add_ps)(a, b)
+#define S(a, b) SIMD_MM(sub_ps)(a, b)
 // reciprocal
-#define reci(a) _mm_rcp_ps(a)
+#define reci(a) SIMD_MM(rcp_ps)(a)
 
-static inline __m128 getFO(const __m128 beta, const __m128 delta, const __m128 feedback,
-                           const __m128 z) noexcept
+static inline SIMD_M128 getFO(const SIMD_M128 beta, const SIMD_M128 delta, const SIMD_M128 feedback,
+                              const SIMD_M128 z) noexcept
 {
     // (feedback * delta + z) * beta
     return M(A(M(feedback, delta), z), beta);
 }
 
 // @TODO: it looks like the `beta` and `delta` arguments are not being used?
-static inline __m128 doLpf(const __m128 input, const __m128 alpha, const __m128 beta,
-                           const __m128 gamma, const __m128 delta, const __m128 epsilon,
-                           const __m128 ma0, const __m128 feedback, const __m128 feedback_output,
-                           __m128 &z) noexcept
+static inline SIMD_M128 doLpf(const SIMD_M128 input, const SIMD_M128 alpha, const SIMD_M128 beta,
+                              const SIMD_M128 gamma, const SIMD_M128 delta, const SIMD_M128 epsilon,
+                              const SIMD_M128 ma0, const SIMD_M128 feedback,
+                              const SIMD_M128 feedback_output, SIMD_M128 &z) noexcept
 {
     // input * gamma + feedback + epsilon * feedback_output
-    const __m128 i = A(A(M(input, gamma), feedback), M(epsilon, feedback_output));
-    const __m128 v = M(S(M(ma0, i), z), alpha);
-    const __m128 result = A(v, z);
+    const auto i = A(A(M(input, gamma), feedback), M(epsilon, feedback_output));
+    const auto v = M(S(M(ma0, i), z), alpha);
+    const auto result = A(v, z);
     z = A(v, result);
     return result;
 }
@@ -122,7 +122,7 @@ void makeCoefficients(FilterCoefficientMaker<TuningProvider> *cm, float freq, fl
 }
 
 template <FilterSubType subtype>
-inline __m128 process(QuadFilterUnitState *__restrict f, __m128 input)
+inline SIMD_M128 process(QuadFilterUnitState *__restrict f, SIMD_M128 input)
 {
     for (int i = 0; i < n_cm_coeffs; ++i)
     {
@@ -131,72 +131,70 @@ inline __m128 process(QuadFilterUnitState *__restrict f, __m128 input)
 
     // hopefully the optimiser will take care of the duplicatey bits
 
-    const __m128 zero = F(0.0f);
-    const __m128 one = F(1.0f);
-    const __m128 half = F(0.5f);
+    const auto zero = F(0.0f);
+    const auto one = F(1.0f);
+    const auto half = F(0.5f);
 
-    const __m128 sg3 = f->C[dlf_G4];
-    const __m128 sg2 = M(sg3, f->C[dlf_G3]);
-    const __m128 sg1 = M(sg2, f->C[dlf_G2]);
+    const auto sg3 = f->C[dlf_G4];
+    const auto sg2 = M(sg3, f->C[dlf_G3]);
+    const auto sg1 = M(sg2, f->C[dlf_G2]);
     // sg4 is 1.0, just inline it
 
-    const __m128 g = f->C[dlf_g];
+    const auto g = f->C[dlf_g];
     // g plus one, common so do it only once
-    const __m128 gp1 = A(g, one);
+    const auto gp1 = A(g, one);
     // half of g
-    const __m128 hg = M(f->C[dlf_g], half);
+    const auto hg = M(f->C[dlf_g], half);
 
     // 1.0 / (gp1 - g * G2)
-    const __m128 beta1 = reci(S(gp1, M(g, f->C[dlf_G2])));
+    const auto beta1 = reci(S(gp1, M(g, f->C[dlf_G2])));
     // 1.0 / (gp1 - g * 0.5 * G3
-    const __m128 beta2 = reci(S(gp1, M(hg, f->C[dlf_G3])));
+    const auto beta2 = reci(S(gp1, M(hg, f->C[dlf_G3])));
     // 1.0 / (gp1 - g * 0.5 * G4
-    const __m128 beta3 = reci(S(gp1, M(hg, f->C[dlf_G4])));
+    const auto beta3 = reci(S(gp1, M(hg, f->C[dlf_G4])));
     // 1.0 / gp1
-    const __m128 beta4 = reci(gp1);
+    const auto beta4 = reci(gp1);
 
     // nothing to compute for deltas, inline them
 
     // G1 * G2 + 1.0
-    const __m128 gamma1 = A(M(f->C[dlf_G1], f->C[dlf_G2]), one);
+    const auto gamma1 = A(M(f->C[dlf_G1], f->C[dlf_G2]), one);
     // G2 * G3 + 1.0
-    const __m128 gamma2 = A(M(f->C[dlf_G2], f->C[dlf_G3]), one);
+    const auto gamma2 = A(M(f->C[dlf_G2], f->C[dlf_G3]), one);
     // G3 * G4 + 1.0
-    const __m128 gamma3 = A(M(f->C[dlf_G3], f->C[dlf_G4]), one);
+    const auto gamma3 = A(M(f->C[dlf_G3], f->C[dlf_G4]), one);
     // gamma4 is always 1.0, just inline it
 
     // nothing to compute for epsilons or ma0, inline them
 
     // feedback4 is always zero, inline it
-    const __m128 feedback3 = getFO(beta4, zero, zero, f->R[dlf_z4]);
-    const __m128 feedback2 = getFO(beta3, hg, f->R[dlf_feedback3], f->R[dlf_z3]);
-    const __m128 feedback1 = getFO(beta2, hg, f->R[dlf_feedback2], f->R[dlf_z2]);
+    const auto feedback3 = getFO(beta4, zero, zero, f->R[dlf_z4]);
+    const auto feedback2 = getFO(beta3, hg, f->R[dlf_feedback3], f->R[dlf_z3]);
+    const auto feedback1 = getFO(beta2, hg, f->R[dlf_feedback2], f->R[dlf_z2]);
 
-    const __m128 sigma = A(A(A(M(sg1, getFO(beta1, g, feedback1, f->R[dlf_z1])),
-                               M(sg2, getFO(beta2, hg, feedback2, f->R[dlf_z2]))),
-                             M(sg3, getFO(beta3, hg, feedback3, f->R[dlf_z3]))),
-                           M(one, getFO(beta4, zero, zero, f->R[dlf_z4])));
+    const auto sigma = A(A(A(M(sg1, getFO(beta1, g, feedback1, f->R[dlf_z1])),
+                             M(sg2, getFO(beta2, hg, feedback2, f->R[dlf_z2]))),
+                           M(sg3, getFO(beta3, hg, feedback3, f->R[dlf_z3]))),
+                         M(one, getFO(beta4, zero, zero, f->R[dlf_z4])));
 
     f->R[dlf_feedback3] = feedback3;
     f->R[dlf_feedback2] = feedback2;
     f->R[dlf_feedback1] = feedback1;
 
     // gain compensation
-    const __m128 comp = M(A(M(F(0.3f), f->C[dlf_km]), one), input);
+    const auto comp = M(A(M(F(0.3f), f->C[dlf_km]), one), input);
 
     // (comp - km * sigma) / (km * gamma + 1.0)
-    const __m128 u = D(S(comp, M(f->C[dlf_km], sigma)), A(M(f->C[dlf_km], f->C[dlf_gamma]), one));
-
-    const __m128 result1 = doLpf(u, f->C[dlf_alpha], beta1, gamma1, g, f->C[dlf_G2], one, feedback1,
-                                 getFO(beta1, g, feedback1, f->R[dlf_z1]), f->R[dlf_z1]);
-    const __m128 result2 =
-        doLpf(result1, f->C[dlf_alpha], beta2, gamma2, hg, f->C[dlf_G3], half, feedback2,
-              getFO(beta2, hg, feedback2, f->R[dlf_z2]), f->R[dlf_z2]);
-    const __m128 result3 =
-        doLpf(result2, f->C[dlf_alpha], beta3, gamma3, hg, f->C[dlf_G4], half, feedback3,
-              getFO(beta3, hg, feedback3, f->R[dlf_z3]), f->R[dlf_z3]);
-    const __m128 result4 = doLpf(result3, f->C[dlf_alpha], beta4, one, zero, zero, half, zero,
-                                 getFO(beta4, zero, zero, f->R[dlf_z4]), f->R[dlf_z4]);
+    const auto u = D(S(comp, M(f->C[dlf_km], sigma)), A(M(f->C[dlf_km], f->C[dlf_gamma]), one));
+
+    const auto result1 = doLpf(u, f->C[dlf_alpha], beta1, gamma1, g, f->C[dlf_G2], one, feedback1,
+                               getFO(beta1, g, feedback1, f->R[dlf_z1]), f->R[dlf_z1]);
+    const auto result2 = doLpf(result1, f->C[dlf_alpha], beta2, gamma2, hg, f->C[dlf_G3], half,
+                               feedback2, getFO(beta2, hg, feedback2, f->R[dlf_z2]), f->R[dlf_z2]);
+    const auto result3 = doLpf(result2, f->C[dlf_alpha], beta3, gamma3, hg, f->C[dlf_G4], half,
+                               feedback3, getFO(beta3, hg, feedback3, f->R[dlf_z3]), f->R[dlf_z3]);
+    const auto result4 = doLpf(result3, f->C[dlf_alpha], beta4, one, zero, zero, half, zero,
+                               getFO(beta4, zero, zero, f->R[dlf_z4]), f->R[dlf_z4]);
 
     // Just like in QuadFilterUnit.cpp/LPMOOGquad, it's fine for the whole quad to return the same
     // subtype because integer parameters like f->WP are not modulatable and QuadFilterUnit is only
diff --git a/include/sst/filters/FilterCoefficientMaker_Impl.h b/include/sst/filters/FilterCoefficientMaker_Impl.h
index 11dd4a8..0fa7e06 100644
--- a/include/sst/filters/FilterCoefficientMaker_Impl.h
+++ b/include/sst/filters/FilterCoefficientMaker_Impl.h
@@ -45,9 +45,9 @@ void FilterCoefficientMaker<TuningProvider>::setSampleRateAndBlockSize(float new
 
 namespace detail
 {
-inline void set1f(__m128 &m, int i, float f) { *((float *)&m + i) = f; }
+inline void set1f(SIMD_M128 &m, int i, float f) { *((float *)&m + i) = f; }
 
-inline float get1f(__m128 m, int i) { return *((float *)&m + i); }
+inline float get1f(SIMD_M128 m, int i) { return *((float *)&m + i); }
 } // namespace detail
 
 template <typename TuningProvider>
@@ -58,8 +58,8 @@ void FilterCoefficientMaker<TuningProvider>::updateState(StateType &state, int c
     {
         for (int i = 0; i < n_cm_coeffs; ++i)
         {
-            state.C[i] = _mm_set1_ps(C[i]);
-            state.dC[i] = _mm_set1_ps(dC[i]);
+            state.C[i] = SIMD_MM(set1_ps)(C[i]);
+            state.dC[i] = SIMD_MM(set1_ps)(dC[i]);
         }
     }
     else
diff --git a/include/sst/filters/HalfRateFilter.h b/include/sst/filters/HalfRateFilter.h
index 8a2fda5..2518659 100644
--- a/include/sst/filters/HalfRateFilter.h
+++ b/include/sst/filters/HalfRateFilter.h
@@ -16,6 +16,7 @@
 #define INCLUDE_SST_FILTERS_HALFRATEFILTER_H
 
 #include <cstdint>
+#include <cmath>
 #include "sst/utilities/globals.h"
 
 namespace sst::filters::HalfRate
@@ -28,16 +29,16 @@ class alignas(16) HalfRateFilter
 {
   private:
     // Remember leave these first so they stay aligned
-    __m128 va[halfrate_max_M];
-    __m128 vx0[halfrate_max_M];
-    __m128 vx1[halfrate_max_M];
-    __m128 vx2[halfrate_max_M];
-    __m128 vy0[halfrate_max_M];
-    __m128 vy1[halfrate_max_M];
-    __m128 vy2[halfrate_max_M];
-    __m128 oldout;
+    SIMD_M128 va[halfrate_max_M];
+    SIMD_M128 vx0[halfrate_max_M];
+    SIMD_M128 vx1[halfrate_max_M];
+    SIMD_M128 vx2[halfrate_max_M];
+    SIMD_M128 vy0[halfrate_max_M];
+    SIMD_M128 vy1[halfrate_max_M];
+    SIMD_M128 vy2[halfrate_max_M];
+    SIMD_M128 oldout;
 
-    const __m128 half = _mm_set_ps1(0.5f);
+    const SIMD_M128 half = SIMD_MM(set_ps1)(0.5f);
 
   public:
     /**
@@ -58,30 +59,30 @@ class alignas(16) HalfRateFilter
 
     void process_block(float *floatL, float *floatR, int nsamples)
     {
-        __m128 *__restrict L = (__m128 *)floatL;
-        __m128 *__restrict R = (__m128 *)floatR;
-        __m128 o[hr_BLOCK_SIZE];
+        SIMD_M128 *__restrict L = (SIMD_M128 *)floatL;
+        SIMD_M128 *__restrict R = (SIMD_M128 *)floatR;
+        SIMD_M128 o[hr_BLOCK_SIZE];
         auto N = nsamples;
         // fill the buffer with interleaved stereo samples
         for (int k = 0; k < N; k += 4)
         {
             //[o3,o2,o1,o0] = [L0,L0,R0,R0]
-            o[k] = _mm_shuffle_ps(L[k >> 2], R[k >> 2], _MM_SHUFFLE(0, 0, 0, 0));
-            o[k + 1] = _mm_shuffle_ps(L[k >> 2], R[k >> 2], _MM_SHUFFLE(1, 1, 1, 1));
-            o[k + 2] = _mm_shuffle_ps(L[k >> 2], R[k >> 2], _MM_SHUFFLE(2, 2, 2, 2));
-            o[k + 3] = _mm_shuffle_ps(L[k >> 2], R[k >> 2], _MM_SHUFFLE(3, 3, 3, 3));
+            o[k] = SIMD_MM(shuffle_ps)(L[k >> 2], R[k >> 2], SIMD_MM_SHUFFLE(0, 0, 0, 0));
+            o[k + 1] = SIMD_MM(shuffle_ps)(L[k >> 2], R[k >> 2], SIMD_MM_SHUFFLE(1, 1, 1, 1));
+            o[k + 2] = SIMD_MM(shuffle_ps)(L[k >> 2], R[k >> 2], SIMD_MM_SHUFFLE(2, 2, 2, 2));
+            o[k + 3] = SIMD_MM(shuffle_ps)(L[k >> 2], R[k >> 2], SIMD_MM_SHUFFLE(3, 3, 3, 3));
         }
 
         // process filters
         for (auto j = 0U; j < M; j++)
         {
-            __m128 tx0 = vx0[j];
-            __m128 tx1 = vx1[j];
-            __m128 tx2 = vx2[j];
-            __m128 ty0 = vy0[j];
-            __m128 ty1 = vy1[j];
-            __m128 ty2 = vy2[j];
-            __m128 ta = va[j];
+            auto tx0 = vx0[j];
+            auto tx1 = vx1[j];
+            auto tx2 = vx2[j];
+            auto ty0 = vy0[j];
+            auto ty1 = vy1[j];
+            auto ty2 = vy2[j];
+            auto ta = va[j];
 
             for (int k = 0; k < N; k += 2)
             {
@@ -93,7 +94,7 @@ class alignas(16) HalfRateFilter
                 ty2 = ty1;
                 ty1 = ty0;
                 // allpass filter 1
-                ty0 = _mm_add_ps(tx2, _mm_mul_ps(_mm_sub_ps(tx0, ty2), ta));
+                ty0 = SIMD_MM(add_ps)(tx2, SIMD_MM(mul_ps)(SIMD_MM(sub_ps)(tx0, ty2), ta));
                 o[k] = ty0;
 
                 // shuffle inputs
@@ -104,7 +105,7 @@ class alignas(16) HalfRateFilter
                 ty2 = ty1;
                 ty1 = ty0;
                 // allpass filter 1
-                ty0 = _mm_add_ps(tx2, _mm_mul_ps(_mm_sub_ps(tx0, ty2), ta));
+                ty0 = SIMD_MM(add_ps)(tx2, SIMD_MM(mul_ps)(SIMD_MM(sub_ps)(tx0, ty2), ta));
                 o[k + 1] = ty0;
             }
             vx0[j] = tx0;
@@ -135,40 +136,40 @@ class alignas(16) HalfRateFilter
 
         float *fL = (float *)L;
         float *fR = (float *)R;
-        __m128 faR = _mm_setzero_ps();
-        __m128 fbR = _mm_setzero_ps();
+        auto faR = SIMD_MM(setzero_ps)();
+        auto fbR = SIMD_MM(setzero_ps)();
 
         for (int k = 0; k < N; k++)
         {
             //	const double output=(filter_a.process(input)+oldout)*0.5;
             //	oldout=filter_b.process(input);
 
-            __m128 vL = _mm_add_ss(o[k], oldout);
-            vL = _mm_mul_ss(vL, half);
-            _mm_store_ss(&fL[k], vL);
+            auto vL = SIMD_MM(add_ss)(o[k], oldout);
+            vL = SIMD_MM(mul_ss)(vL, half);
+            SIMD_MM(store_ss)(&fL[k], vL);
 
-            faR = _mm_movehl_ps(faR, o[k]);
-            fbR = _mm_movehl_ps(fbR, oldout);
+            faR = SIMD_MM(movehl_ps)(faR, o[k]);
+            fbR = SIMD_MM(movehl_ps)(fbR, oldout);
 
-            __m128 vR = _mm_add_ss(faR, fbR);
-            vR = _mm_mul_ss(vR, half);
-            _mm_store_ss(&fR[k], vR);
+            auto vR = SIMD_MM(add_ss)(faR, fbR);
+            vR = SIMD_MM(mul_ss)(vR, half);
+            SIMD_MM(store_ss)(&fR[k], vR);
 
-            oldout = _mm_shuffle_ps(o[k], o[k], _MM_SHUFFLE(3, 3, 1, 1));
+            oldout = SIMD_MM(shuffle_ps)(o[k], o[k], SIMD_MM_SHUFFLE(3, 3, 1, 1));
         }
     }
     void process_block_D2(float *floatL, float *floatR, int nsamples, float *outL = 0,
                           float *outR = 0) // process in-place. the new block will be half the size
     {
-        __m128 *L = (__m128 *)floatL;
-        __m128 *R = (__m128 *)floatR;
-        __m128 o[hr_BLOCK_SIZE];
+        auto *L = (SIMD_M128 *)floatL;
+        auto *R = (SIMD_M128 *)floatR;
+        SIMD_M128 o[hr_BLOCK_SIZE];
 
         /*
          * fill the buffer with interleaved stereo samples by rotating the
          * input simd-in-time a bit
          *
-         * _mm_shuffle_ps(a,b,_MM_SHUFFLE(i,j,k,l)) returns a[i], a[j], b[k], b[l]
+         * SIMD_MM(shuffle_ps)(a,b,SIMD_MM_SHUFFLE(i,j,k,l)) returns a[l], a[k], b[j], b[i]
          *
          * So this loop makes o look like the rotation of L and R. That is
          *
@@ -178,10 +179,10 @@ class alignas(16) HalfRateFilter
          */
         for (int k = 0; k < nsamples; k += 4)
         {
-            o[k] = _mm_shuffle_ps(L[k >> 2], R[k >> 2], _MM_SHUFFLE(0, 0, 0, 0));
-            o[k + 1] = _mm_shuffle_ps(L[k >> 2], R[k >> 2], _MM_SHUFFLE(1, 1, 1, 1));
-            o[k + 2] = _mm_shuffle_ps(L[k >> 2], R[k >> 2], _MM_SHUFFLE(2, 2, 2, 2));
-            o[k + 3] = _mm_shuffle_ps(L[k >> 2], R[k >> 2], _MM_SHUFFLE(3, 3, 3, 3));
+            o[k] = SIMD_MM(shuffle_ps)(L[k >> 2], R[k >> 2], SIMD_MM_SHUFFLE(0, 0, 0, 0));
+            o[k + 1] = SIMD_MM(shuffle_ps)(L[k >> 2], R[k >> 2], SIMD_MM_SHUFFLE(1, 1, 1, 1));
+            o[k + 2] = SIMD_MM(shuffle_ps)(L[k >> 2], R[k >> 2], SIMD_MM_SHUFFLE(2, 2, 2, 2));
+            o[k + 3] = SIMD_MM(shuffle_ps)(L[k >> 2], R[k >> 2], SIMD_MM_SHUFFLE(3, 3, 3, 3));
         }
 
         /*
@@ -210,13 +211,13 @@ class alignas(16) HalfRateFilter
          */
         for (auto j = 0U; j < M; j++)
         {
-            __m128 tx0 = vx0[j];
-            __m128 tx1 = vx1[j];
-            __m128 tx2 = vx2[j];
-            __m128 ty0 = vy0[j];
-            __m128 ty1 = vy1[j];
-            __m128 ty2 = vy2[j];
-            __m128 ta = va[j];
+            auto tx0 = vx0[j];
+            auto tx1 = vx1[j];
+            auto tx2 = vx2[j];
+            auto ty0 = vy0[j];
+            auto ty1 = vy1[j];
+            auto ty2 = vy2[j];
+            auto ta = va[j];
 
             // Why is this loop hand-unrolled?
             for (int k = 0; k < nsamples; k += 2)
@@ -229,7 +230,7 @@ class alignas(16) HalfRateFilter
                 ty2 = ty1;
                 ty1 = ty0;
                 // allpass filter 1
-                ty0 = _mm_add_ps(tx2, _mm_mul_ps(_mm_sub_ps(tx0, ty2), ta));
+                ty0 = SIMD_MM(add_ps)(tx2, SIMD_MM(mul_ps)(SIMD_MM(sub_ps)(tx0, ty2), ta));
                 o[k] = ty0;
 
                 // shuffle inputs
@@ -240,7 +241,7 @@ class alignas(16) HalfRateFilter
                 ty2 = ty1;
                 ty1 = ty0;
                 // allpass filter 1
-                ty0 = _mm_add_ps(tx2, _mm_mul_ps(_mm_sub_ps(tx0, ty2), ta));
+                ty0 = SIMD_MM(add_ps)(tx2, SIMD_MM(mul_ps)(SIMD_MM(sub_ps)(tx0, ty2), ta));
                 o[k + 1] = ty0;
             }
             vx0[j] = tx0;
@@ -251,20 +252,20 @@ class alignas(16) HalfRateFilter
             vy2[j] = ty2;
         }
 
-        __m128 aR = _mm_setzero_ps();
-        __m128 bR = _mm_setzero_ps();
-        __m128 cR = _mm_setzero_ps();
-        __m128 dR = _mm_setzero_ps();
+        auto aR = SIMD_MM(setzero_ps)();
+        auto bR = SIMD_MM(setzero_ps)();
+        auto cR = SIMD_MM(setzero_ps)();
+        auto dR = SIMD_MM(setzero_ps)();
 
         if (outL)
-            L = (__m128 *)outL;
+            L = (SIMD_M128 *)outL;
         if (outR)
-            R = (__m128 *)outR;
+            R = (SIMD_M128 *)outR;
 
         /*
          * OK so now we have all the filtered signals we want to reconstruct the output.
          * This is basically the sample selection stage. To read this code you need
-         * to remember that _mm_movehl_ps(a,b) results in b[3], b[4], a[3], a[4] as the
+         * to remember that SIMD_MM(movehl_ps)(a,b) results in b[3], b[4], a[3], a[4] as the
          * simd output.
          *
          * The code had this comment
@@ -274,29 +275,29 @@ class alignas(16) HalfRateFilter
          *
          * atop this code
          *
-         * __m128 tL0 = _mm_shuffle_ps(o[k], o[k], _MM_SHUFFLE(1, 1, 1, 1));
-         * __m128 tR0 = _mm_shuffle_ps(o[k], o[k], _MM_SHUFFLE(3, 3, 3, 3));
-         * __m128 aL = _mm_add_ss(tL0, o[k + 1]);
-         * aR = _mm_movehl_ps(aR, o[k + 1]);
-         * aR = _mm_add_ss(aR, tR0);
+         * auto tL0 = SIMD_MM(shuffle_ps)(o[k], o[k], SIMD_MM_SHUFFLE(1, 1, 1, 1));
+         * auto tR0 = SIMD_MM(shuffle_ps)(o[k], o[k], SIMD_MM_SHUFFLE(3, 3, 3, 3));
+         * auto aL = SIMD_MM(add_ss)(tL0, o[k + 1]);
+         * aR = SIMD_MM(movehl_ps)(aR, o[k + 1]);
+         * aR = SIMD_MM(add_ss)(aR, tR0);
          *
          * So can we make that tie out? Remembering o now has the for B_L A_L, B_R, A_R
          *
-         * So tL0 = _mm_shuffle_ps(o[k], o[k], 11111)
+         * So tL0 = SIMD_MM(shuffle_ps)(o[k], o[k], SIMD_MM_SHUFFLE(1, 1, 1, 1))
          * or tL0 = o[k][1] in every slot or tl0 is A_L across the board at sample k.
          * Similarly tR0 = A_R across the board at sample K.
          *
-         * Now recall _mm_add_ss(a,b) gives you (a[0]+b[0], a[1], a[2], a[3]) so now we do
+         * Now recall SIMD_MM(add_ss)(a,b) gives you (a[0]+b[0], a[1], a[2], a[3]) so now we do
          *
-         * __m128 aL = _mm_add_ss(tL0, o[k + 1]);
+         * auto aL = SIMD_MM(add_ss)(tL0, o[k + 1]);
          * aL = (A_L[k] + B_L[k+1], A_L[k],  A_L[k], A_L[k]);
          *
          * Now
          *
-         * aR = _mm_movehl_ps(aR, o[k + 1]);
+         * aR = SIMD_MM(movehl_ps)(aR, o[k + 1]);
          * aR = B_R[k+1], A_R[k+1], aR[3], aR[4]
          *
-         * aR = _mm_add_ss(aR, tR0) or
+         * aR = SIMD_MM(add_ss)(aR, tR0) or
          * aR = (A_R[k] + B_R[k+1], A_R[k+1], aR[3], aR[4])
          *
          * (At this point I'm suspecting that the rest of the SIMD registeres in A wont matter)
@@ -306,14 +307,14 @@ class alignas(16) HalfRateFilter
          *
          * So once those stages are assembled we do this
          *
-         * aL = _mm_movelh_ps(aL, bL);
-         * cL = _mm_movelh_ps(cL, dL);
-         * L[k >> 3] = _mm_shuffle_ps(aL, cL, _MM_SHUFFLE(2, 0, 2, 0));
+         * aL = SIMD_MM(movelh_ps)(aL, bL);
+         * cL = SIMD_MM(movelh_ps)(cL, dL);
+         * L[k >> 3] = SIMD_MM(shuffle_ps)(aL, cL, SIMD_MM_SHUFFLE(2, 0, 2, 0));
          *
          * And similarly for R. So what's that doing. So first of all _mm_novelh_ps [note lh
          * not hl] has signatlre
          *
-         * _mm_movelh_ps(a,b) = a[0],a[1],b[0],b[1]
+         * SIMD_MM(movelh_ps)(a,b) = a[0],a[1],b[0],b[1]
          *
          * so this sets
          *
@@ -342,78 +343,78 @@ class alignas(16) HalfRateFilter
             /*	const double output=(filter_a.process(input)+oldout)*0.5;
             oldout=filter_b.process(input);*/
 
-            __m128 tL0 = _mm_shuffle_ps(o[k], o[k], _MM_SHUFFLE(1, 1, 1, 1));
-            __m128 tR0 = _mm_shuffle_ps(o[k], o[k], _MM_SHUFFLE(3, 3, 3, 3));
-            __m128 aL = _mm_add_ss(tL0, o[k + 1]);
-            aR = _mm_movehl_ps(aR, o[k + 1]);
-            aR = _mm_add_ss(aR, tR0);
-
-            tL0 = _mm_shuffle_ps(o[k + 2], o[k + 2], _MM_SHUFFLE(1, 1, 1, 1));
-            tR0 = _mm_shuffle_ps(o[k + 2], o[k + 2], _MM_SHUFFLE(3, 3, 3, 3));
-            __m128 bL = _mm_add_ss(tL0, o[k + 3]);
-            bR = _mm_movehl_ps(aR, o[k + 3]);
-            bR = _mm_add_ss(bR, tR0);
-
-            tL0 = _mm_shuffle_ps(o[k + 4], o[k + 4], _MM_SHUFFLE(1, 1, 1, 1));
-            tR0 = _mm_shuffle_ps(o[k + 4], o[k + 4], _MM_SHUFFLE(3, 3, 3, 3));
-            __m128 cL = _mm_add_ss(tL0, o[k + 5]);
-            cR = _mm_movehl_ps(cR, o[k + 5]);
-            cR = _mm_add_ss(cR, tR0);
-
-            tL0 = _mm_shuffle_ps(o[k + 6], o[k + 6], _MM_SHUFFLE(1, 1, 1, 1));
-            tR0 = _mm_shuffle_ps(o[k + 6], o[k + 6], _MM_SHUFFLE(3, 3, 3, 3));
-            __m128 dL = _mm_add_ss(tL0, o[k + 7]);
-            dR = _mm_movehl_ps(dR, o[k + 7]);
-            dR = _mm_add_ss(dR, tR0);
-
-            aL = _mm_movelh_ps(aL, bL);
-            cL = _mm_movelh_ps(cL, dL);
-            aR = _mm_movelh_ps(aR, bR);
-            cR = _mm_movelh_ps(cR, dR);
-
-            L[k >> 3] = _mm_shuffle_ps(aL, cL, _MM_SHUFFLE(2, 0, 2, 0));
-            R[k >> 3] = _mm_shuffle_ps(aR, cR, _MM_SHUFFLE(2, 0, 2, 0));
+            auto tL0 = SIMD_MM(shuffle_ps)(o[k], o[k], SIMD_MM_SHUFFLE(1, 1, 1, 1));
+            auto tR0 = SIMD_MM(shuffle_ps)(o[k], o[k], SIMD_MM_SHUFFLE(3, 3, 3, 3));
+            auto aL = SIMD_MM(add_ss)(tL0, o[k + 1]);
+            aR = SIMD_MM(movehl_ps)(aR, o[k + 1]);
+            aR = SIMD_MM(add_ss)(aR, tR0);
+
+            tL0 = SIMD_MM(shuffle_ps)(o[k + 2], o[k + 2], SIMD_MM_SHUFFLE(1, 1, 1, 1));
+            tR0 = SIMD_MM(shuffle_ps)(o[k + 2], o[k + 2], SIMD_MM_SHUFFLE(3, 3, 3, 3));
+            auto bL = SIMD_MM(add_ss)(tL0, o[k + 3]);
+            bR = SIMD_MM(movehl_ps)(aR, o[k + 3]);
+            bR = SIMD_MM(add_ss)(bR, tR0);
+
+            tL0 = SIMD_MM(shuffle_ps)(o[k + 4], o[k + 4], SIMD_MM_SHUFFLE(1, 1, 1, 1));
+            tR0 = SIMD_MM(shuffle_ps)(o[k + 4], o[k + 4], SIMD_MM_SHUFFLE(3, 3, 3, 3));
+            auto cL = SIMD_MM(add_ss)(tL0, o[k + 5]);
+            cR = SIMD_MM(movehl_ps)(cR, o[k + 5]);
+            cR = SIMD_MM(add_ss)(cR, tR0);
+
+            tL0 = SIMD_MM(shuffle_ps)(o[k + 6], o[k + 6], SIMD_MM_SHUFFLE(1, 1, 1, 1));
+            tR0 = SIMD_MM(shuffle_ps)(o[k + 6], o[k + 6], SIMD_MM_SHUFFLE(3, 3, 3, 3));
+            auto dL = SIMD_MM(add_ss)(tL0, o[k + 7]);
+            dR = SIMD_MM(movehl_ps)(dR, o[k + 7]);
+            dR = SIMD_MM(add_ss)(dR, tR0);
+
+            aL = SIMD_MM(movelh_ps)(aL, bL);
+            cL = SIMD_MM(movelh_ps)(cL, dL);
+            aR = SIMD_MM(movelh_ps)(aR, bR);
+            cR = SIMD_MM(movelh_ps)(cR, dR);
+
+            L[k >> 3] = SIMD_MM(shuffle_ps)(aL, cL, SIMD_MM_SHUFFLE(2, 0, 2, 0));
+            R[k >> 3] = SIMD_MM(shuffle_ps)(aR, cR, SIMD_MM_SHUFFLE(2, 0, 2, 0));
 
             // optional: *=0.5;
-            const __m128 half = _mm_set_ps1(0.5f);
-            L[k >> 3] = _mm_mul_ps(L[k >> 3], half);
-            R[k >> 3] = _mm_mul_ps(R[k >> 3], half);
+            const auto half = SIMD_MM(set_ps1)(0.5f);
+            L[k >> 3] = SIMD_MM(mul_ps)(L[k >> 3], half);
+            R[k >> 3] = SIMD_MM(mul_ps)(R[k >> 3], half);
         }
     }
 
     void process_block_U2(float *floatL_in, float *floatR_in, float *floatL, float *floatR,
                           int nsamples)
     {
-        __m128 *L = (__m128 *)floatL;
-        __m128 *R = (__m128 *)floatR;
-        __m128 *L_in = (__m128 *)floatL_in;
-        __m128 *R_in = (__m128 *)floatR_in;
+        auto *L = (SIMD_M128 *)floatL;
+        auto *R = (SIMD_M128 *)floatR;
+        auto *L_in = (SIMD_M128 *)floatL_in;
+        auto *R_in = (SIMD_M128 *)floatR_in;
 
-        __m128 o[hr_BLOCK_SIZE];
+        SIMD_M128 o[hr_BLOCK_SIZE];
         // fill the buffer with interleaved stereo samples
         for (int k = 0; k < nsamples; k += 8)
         {
             //[o3,o2,o1,o0] = [L0,L0,R0,R0]
-            o[k] = _mm_shuffle_ps(L_in[k >> 3], R_in[k >> 3], _MM_SHUFFLE(0, 0, 0, 0));
-            o[k + 1] = _mm_setzero_ps();
-            o[k + 2] = _mm_shuffle_ps(L_in[k >> 3], R_in[k >> 3], _MM_SHUFFLE(1, 1, 1, 1));
-            o[k + 3] = _mm_setzero_ps();
-            o[k + 4] = _mm_shuffle_ps(L_in[k >> 3], R_in[k >> 3], _MM_SHUFFLE(2, 2, 2, 2));
-            o[k + 5] = _mm_setzero_ps();
-            o[k + 6] = _mm_shuffle_ps(L_in[k >> 3], R_in[k >> 3], _MM_SHUFFLE(3, 3, 3, 3));
-            o[k + 7] = _mm_setzero_ps();
+            o[k] = SIMD_MM(shuffle_ps)(L_in[k >> 3], R_in[k >> 3], SIMD_MM_SHUFFLE(0, 0, 0, 0));
+            o[k + 1] = SIMD_MM(setzero_ps)();
+            o[k + 2] = SIMD_MM(shuffle_ps)(L_in[k >> 3], R_in[k >> 3], SIMD_MM_SHUFFLE(1, 1, 1, 1));
+            o[k + 3] = SIMD_MM(setzero_ps)();
+            o[k + 4] = SIMD_MM(shuffle_ps)(L_in[k >> 3], R_in[k >> 3], SIMD_MM_SHUFFLE(2, 2, 2, 2));
+            o[k + 5] = SIMD_MM(setzero_ps)();
+            o[k + 6] = SIMD_MM(shuffle_ps)(L_in[k >> 3], R_in[k >> 3], SIMD_MM_SHUFFLE(3, 3, 3, 3));
+            o[k + 7] = SIMD_MM(setzero_ps)();
         }
 
         // process filters
         for (auto j = 0U; j < M; j++)
         {
-            __m128 tx0 = vx0[j];
-            __m128 tx1 = vx1[j];
-            __m128 tx2 = vx2[j];
-            __m128 ty0 = vy0[j];
-            __m128 ty1 = vy1[j];
-            __m128 ty2 = vy2[j];
-            __m128 ta = va[j];
+            auto tx0 = vx0[j];
+            auto tx1 = vx1[j];
+            auto tx2 = vx2[j];
+            auto ty0 = vy0[j];
+            auto ty1 = vy1[j];
+            auto ty2 = vy2[j];
+            auto ta = va[j];
 
             for (int k = 0; k < nsamples; k += 2)
             {
@@ -425,7 +426,7 @@ class alignas(16) HalfRateFilter
                 ty2 = ty1;
                 ty1 = ty0;
                 // allpass filter 1
-                ty0 = _mm_add_ps(tx2, _mm_mul_ps(_mm_sub_ps(tx0, ty2), ta));
+                ty0 = SIMD_MM(add_ps)(tx2, SIMD_MM(mul_ps)(SIMD_MM(sub_ps)(tx0, ty2), ta));
                 o[k] = ty0;
 
                 // shuffle inputs
@@ -436,7 +437,7 @@ class alignas(16) HalfRateFilter
                 ty2 = ty1;
                 ty1 = ty0;
                 // allpass filter 1
-                ty0 = _mm_add_ps(tx2, _mm_mul_ps(_mm_sub_ps(tx0, ty2), ta));
+                ty0 = SIMD_MM(add_ps)(tx2, SIMD_MM(mul_ps)(SIMD_MM(sub_ps)(tx0, ty2), ta));
                 o[k + 1] = ty0;
             }
             vx0[j] = tx0;
@@ -447,33 +448,33 @@ class alignas(16) HalfRateFilter
             vy2[j] = ty2;
         }
 
-        /*__m128 aR = _mm_setzero_ps();
-        __m128 bR = _mm_setzero_ps();
-        __m128 cR = _mm_setzero_ps();
-        __m128 dR = _mm_setzero_ps();*/
+        /*auto aR = SIMD_MM(setzero_ps)();
+        auto bR = SIMD_MM(setzero_ps)();
+        auto cR = SIMD_MM(setzero_ps)();
+        auto dR = SIMD_MM(setzero_ps)();*/
 
         float *fL = (float *)L;
         float *fR = (float *)R;
-        __m128 faR = _mm_setzero_ps();
-        __m128 fbR = _mm_setzero_ps();
+        auto faR = SIMD_MM(setzero_ps)();
+        auto fbR = SIMD_MM(setzero_ps)();
 
         for (int k = 0; k < nsamples; k++)
         {
             //	const double output=(filter_a.process(input)+oldout)*0.5;
             //	oldout=filter_b.process(input);
 
-            __m128 vL = _mm_add_ss(o[k], oldout);
-            vL = _mm_mul_ss(vL, half);
-            _mm_store_ss(&fL[k], vL);
+            auto vL = SIMD_MM(add_ss)(o[k], oldout);
+            vL = SIMD_MM(mul_ss)(vL, half);
+            SIMD_MM(store_ss)(&fL[k], vL);
 
-            faR = _mm_movehl_ps(faR, o[k]);
-            fbR = _mm_movehl_ps(fbR, oldout);
+            faR = SIMD_MM(movehl_ps)(faR, o[k]);
+            fbR = SIMD_MM(movehl_ps)(fbR, oldout);
 
-            __m128 vR = _mm_add_ss(faR, fbR);
-            vR = _mm_mul_ss(vR, half);
-            _mm_store_ss(&fR[k], vR);
+            auto vR = SIMD_MM(add_ss)(faR, fbR);
+            vR = SIMD_MM(mul_ss)(vR, half);
+            SIMD_MM(store_ss)(&fR[k], vR);
 
-            oldout = _mm_shuffle_ps(o[k], o[k], _MM_SHUFFLE(3, 3, 1, 1));
+            oldout = SIMD_MM(shuffle_ps)(o[k], o[k], SIMD_MM_SHUFFLE(3, 3, 1, 1));
         }
 
         // If you want to avoid downsampling, do this
@@ -496,7 +497,7 @@ class alignas(16) HalfRateFilter
     {
         for (auto i = 0U; i < M; i++)
         {
-            va[i] = _mm_setzero_ps();
+            va[i] = SIMD_MM(setzero_ps)();
         }
 
         int order = M << 1;
@@ -631,22 +632,22 @@ class alignas(16) HalfRateFilter
     {
         for (auto i = 0U; i < M; i++)
         {
-            // va[i] = _mm_set_ps(cA[i],cB[i],cA[i],cB[i]);
-            va[i] = _mm_set_ps(cB[i], cA[i], cB[i], cA[i]);
+            // va[i] = SIMD_MM(set_ps)(cA[i],cB[i],cA[i],cB[i]);
+            va[i] = SIMD_MM(set_ps)(cB[i], cA[i], cB[i], cA[i]);
         }
     }
     void reset()
     {
         for (auto i = 0U; i < M; i++)
         {
-            vx0[i] = _mm_setzero_ps();
-            vx1[i] = _mm_setzero_ps();
-            vx2[i] = _mm_setzero_ps();
-            vy0[i] = _mm_setzero_ps();
-            vy1[i] = _mm_setzero_ps();
-            vy2[i] = _mm_setzero_ps();
+            vx0[i] = SIMD_MM(setzero_ps)();
+            vx1[i] = SIMD_MM(setzero_ps)();
+            vx2[i] = SIMD_MM(setzero_ps)();
+            vy0[i] = SIMD_MM(setzero_ps)();
+            vy1[i] = SIMD_MM(setzero_ps)();
+            vy2[i] = SIMD_MM(setzero_ps)();
         }
-        oldout = _mm_setzero_ps();
+        oldout = SIMD_MM(setzero_ps)();
     }
 
   private:
diff --git a/include/sst/filters/K35Filter.h b/include/sst/filters/K35Filter.h
index d5213aa..96dc03e 100644
--- a/include/sst/filters/K35Filter.h
+++ b/include/sst/filters/K35Filter.h
@@ -34,11 +34,11 @@ static float clampedFrequency(float pitch, float sampleRate, TuningProvider *pro
     return std::clamp(freq, 5.f, (sampleRate * 0.3f));
 }
 
-#define F(a) _mm_set_ps1(a)
-#define M(a, b) _mm_mul_ps(a, b)
-#define D(a, b) _mm_div_ps(a, b)
-#define A(a, b) _mm_add_ps(a, b)
-#define S(a, b) _mm_sub_ps(a, b)
+#define F(a) SIMD_MM(set_ps1)(a)
+#define M(a, b) SIMD_MM(mul_ps)(a, b)
+#define D(a, b) SIMD_MM(div_ps)(a, b)
+#define A(a, b) SIMD_MM(add_ps)(a, b)
+#define S(a, b) SIMD_MM(sub_ps)(a, b)
 
 // note that things that were NOPs in the Odin code have been removed.
 // m_gamma remains 1.0 so xn * m_gamma == xn; that's a NOP
@@ -47,14 +47,14 @@ static float clampedFrequency(float pitch, float sampleRate, TuningProvider *pro
 // m_a_0 remains 1 so that's also a NOP
 // so we only need to compute:
 // (xn - z) * alpha + za
-static inline __m128 doLpf(const __m128 &G, const __m128 &input, __m128 &z) noexcept
+static inline SIMD_M128 doLpf(const SIMD_M128 &G, const SIMD_M128 &input, SIMD_M128 &z) noexcept
 {
-    const __m128 v = M(S(input, z), G);
-    const __m128 result = A(v, z);
+    const auto v = M(S(input, z), G);
+    const auto result = A(v, z);
     z = A(v, result);
     return result;
 }
-static inline __m128 doHpf(const __m128 &G, const __m128 &input, __m128 &z) noexcept
+static inline SIMD_M128 doHpf(const SIMD_M128 &G, const SIMD_M128 &input, SIMD_M128 &z) noexcept
 {
     return S(input, doLpf(G, input, z));
 }
@@ -125,47 +125,47 @@ inline void processCoeffs(QuadFilterUnitState *__restrict f)
         f->C[i] = A(f->C[i], f->dC[i]);
 }
 
-inline __m128 process_lp(QuadFilterUnitState *__restrict f, __m128 input)
+inline SIMD_M128 process_lp(QuadFilterUnitState *__restrict f, SIMD_M128 input)
 {
     processCoeffs(f);
 
-    const __m128 y1 = doLpf(f->C[k35_G], input, f->R[k35_lz]);
+    const auto y1 = doLpf(f->C[k35_G], input, f->R[k35_lz]);
     // (lpf beta * lpf2 feedback) + (hpf beta * hpf1 feedback)
-    const __m128 s35 = A(M(f->C[k35_lb], f->R[k35_2z]), M(f->C[k35_hb], f->R[k35_hz]));
+    const auto s35 = A(M(f->C[k35_lb], f->R[k35_2z]), M(f->C[k35_hb], f->R[k35_hz]));
     // alpha * (y1 + s35)
-    const __m128 u_clean = M(f->C[k35_alpha], A(y1, s35));
-    const __m128 u_driven = basic_blocks::dsp::fasttanhSSEclamped(M(u_clean, f->C[k35_saturation]));
-    const __m128 u =
+    const auto u_clean = M(f->C[k35_alpha], A(y1, s35));
+    const auto u_driven = basic_blocks::dsp::fasttanhSSEclamped(M(u_clean, f->C[k35_saturation]));
+    const auto u =
         A(M(u_clean, f->C[k35_saturation_blend_inv]), M(u_driven, f->C[k35_saturation_blend]));
 
     // mk * lpf2(u)
-    const __m128 y = M(f->C[k35_k], doLpf(f->C[k35_G], u, f->R[k35_2z]));
+    const auto y = M(f->C[k35_k], doLpf(f->C[k35_G], u, f->R[k35_2z]));
     doHpf(f->C[k35_G], y, f->R[k35_hz]);
 
-    const __m128 result = D(y, f->C[k35_k]);
+    const auto result = D(y, f->C[k35_k]);
 
     return result;
 }
 
-inline __m128 process_hp(QuadFilterUnitState *__restrict f, __m128 input)
+inline SIMD_M128 process_hp(QuadFilterUnitState *__restrict f, SIMD_M128 input)
 {
     processCoeffs(f);
 
-    const __m128 y1 = doHpf(f->C[k35_G], input, f->R[k35_hz]);
+    const auto y1 = doHpf(f->C[k35_G], input, f->R[k35_hz]);
     // (lpf beta * lpf2 feedback) + (hpf beta * hpf1 feedback)
-    const __m128 s35 = A(M(f->C[k35_hb], f->R[k35_2z]), M(f->C[k35_lb], f->R[k35_lz]));
+    const auto s35 = A(M(f->C[k35_hb], f->R[k35_2z]), M(f->C[k35_lb], f->R[k35_lz]));
     // alpha * (y1 + s35)
-    const __m128 u = M(f->C[k35_alpha], A(y1, s35));
+    const auto u = M(f->C[k35_alpha], A(y1, s35));
 
     // mk * lpf2(u)
-    const __m128 y_clean = M(f->C[k35_k], u);
-    const __m128 y_driven = basic_blocks::dsp::fasttanhSSEclamped(M(y_clean, f->C[k35_saturation]));
-    const __m128 y =
+    const auto y_clean = M(f->C[k35_k], u);
+    const auto y_driven = basic_blocks::dsp::fasttanhSSEclamped(M(y_clean, f->C[k35_saturation]));
+    const auto y =
         A(M(y_clean, f->C[k35_saturation_blend_inv]), M(y_driven, f->C[k35_saturation_blend]));
 
     doLpf(f->C[k35_G], doHpf(f->C[k35_G], y, f->R[k35_2z]), f->R[k35_lz]);
 
-    const __m128 result = D(y, f->C[k35_k]);
+    const auto result = D(y, f->C[k35_k]);
 
     return result;
 }
diff --git a/include/sst/filters/OBXDFilter.h b/include/sst/filters/OBXDFilter.h
index 5c72973..2fdfcdb 100644
--- a/include/sst/filters/OBXDFilter.h
+++ b/include/sst/filters/OBXDFilter.h
@@ -63,20 +63,20 @@ enum Params
 
 static constexpr int ssew = 4;
 
-const __m128 zero = _mm_set1_ps(0.0f);
-const __m128 nine_two_zero = _mm_set1_ps(0.00920833f);
-const __m128 zero_zero_five = _mm_set1_ps(0.05f);
-const __m128 eight_seven_six = _mm_set1_ps(0.0876f);
-const __m128 one_zero_three = _mm_set1_ps(0.0103592f);
-const __m128 one_eight_five = _mm_set1_ps(0.185f);
-const __m128 zero_four_five = _mm_set1_ps(0.45f);
-const __m128 zero_five = _mm_set1_ps(0.5f);
-const __m128 one = _mm_set1_ps(1.0f);
-const __m128 one_three_five = _mm_set1_ps(1.035f);
-const __m128 two = _mm_set1_ps(2.0f);
-const __m128 three = _mm_set1_ps(3.0f);
-const __m128 gainAdjustment2Pole = _mm_set1_ps(0.74f);
-const __m128 gainAdjustment4Pole = _mm_set1_ps(0.6f);
+const auto zero = SIMD_MM(set1_ps)(0.0f);
+const auto nine_two_zero = SIMD_MM(set1_ps)(0.00920833f);
+const auto zero_zero_five = SIMD_MM(set1_ps)(0.05f);
+const auto eight_seven_six = SIMD_MM(set1_ps)(0.0876f);
+const auto one_zero_three = SIMD_MM(set1_ps)(0.0103592f);
+const auto one_eight_five = SIMD_MM(set1_ps)(0.185f);
+const auto zero_four_five = SIMD_MM(set1_ps)(0.45f);
+const auto zero_five = SIMD_MM(set1_ps)(0.5f);
+const auto one = SIMD_MM(set1_ps)(1.0f);
+const auto one_three_five = SIMD_MM(set1_ps)(1.035f);
+const auto two = SIMD_MM(set1_ps)(2.0f);
+const auto three = SIMD_MM(set1_ps)(3.0f);
+const auto gainAdjustment2Pole = SIMD_MM(set1_ps)(0.74f);
+const auto gainAdjustment4Pole = SIMD_MM(set1_ps)(0.6f);
 
 template <typename TuningProvider>
 inline void makeCoefficients(FilterCoefficientMaker<TuningProvider> *cm, Poles p, float freq,
@@ -133,141 +133,152 @@ inline void makeCoefficients(FilterCoefficientMaker<TuningProvider> *cm, Poles p
     cm->FromDirect(lC);
 }
 
-inline __m128 diodePairResistanceApprox(__m128 x)
+inline SIMD_M128 diodePairResistanceApprox(SIMD_M128 x)
 {
     // return (((((0.0103592f * x) + 0.00920833f) * x + 0.185f) * x + 0.05f) * x + 1.0f);
-    return _mm_add_ps(
-        _mm_mul_ps(
-            _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(one_zero_three, x),
-                                                                   nine_two_zero),
-                                                        x),
-                                             one_eight_five),
-                                  x),
-                       zero_zero_five),
+    return SIMD_MM(add_ps)(
+        SIMD_MM(mul_ps)(
+            SIMD_MM(add_ps)(
+                SIMD_MM(mul_ps)(
+                    SIMD_MM(add_ps)(
+                        SIMD_MM(mul_ps)(
+                            SIMD_MM(add_ps)(SIMD_MM(mul_ps)(one_zero_three, x), nine_two_zero), x),
+                        one_eight_five),
+                    x),
+                zero_zero_five),
             x),
         one);
     // Taylor approximation of a slightly mismatched diode pair
 }
 
 // resolve 0-delay feedback
-inline __m128 NewtonRaphson12dB(__m128 sample, QuadFilterUnitState *__restrict f)
+inline SIMD_M128 NewtonRaphson12dB(SIMD_M128 sample, QuadFilterUnitState *__restrict f)
 {
     // calculating feedback non-linear transconducance and compensated for R (-1)
     // boosting non-linearity
-    __m128 tCfb;
-    __m128 selfOscEnabledMask = _mm_cmpeq_ps(f->C[self_osc_push], one);
-    __m128 selfOscOffVal =
-        _mm_sub_ps(diodePairResistanceApprox(_mm_mul_ps(f->R[s1], eight_seven_six)), one);
-    __m128 selfOscOnVal = _mm_sub_ps(
-        diodePairResistanceApprox(_mm_mul_ps(f->R[s1], eight_seven_six)), one_three_five);
-    tCfb = _mm_add_ps(_mm_and_ps(selfOscEnabledMask, selfOscOnVal),
-                      _mm_andnot_ps(selfOscEnabledMask, selfOscOffVal));
+    SIMD_M128 tCfb;
+    auto selfOscEnabledMask = SIMD_MM(cmpeq_ps)(f->C[self_osc_push], one);
+    auto selfOscOffVal =
+        SIMD_MM(sub_ps)(diodePairResistanceApprox(SIMD_MM(mul_ps)(f->R[s1], eight_seven_six)), one);
+    auto selfOscOnVal = SIMD_MM(sub_ps)(
+        diodePairResistanceApprox(SIMD_MM(mul_ps)(f->R[s1], eight_seven_six)), one_three_five);
+    tCfb = SIMD_MM(add_ps)(SIMD_MM(and_ps)(selfOscEnabledMask, selfOscOnVal),
+                           SIMD_MM(andnot_ps)(selfOscEnabledMask, selfOscOffVal));
 
     // resolve linear feedback
     // float y = ((sample - 2*(s1*(R+tCfb)) - g*s1  - s2)/(1+ g*(2*(R+tCfb)+ g)));
-    __m128 y = _mm_div_ps(
-        _mm_sub_ps(
-            _mm_sub_ps(_mm_sub_ps(sample, _mm_mul_ps(two, _mm_mul_ps(f->R[s1],
-                                                                     _mm_add_ps(f->C[R12], tCfb)))),
-                       _mm_mul_ps(f->C[g12], f->R[s1])),
+    auto y = SIMD_MM(div_ps)(
+        SIMD_MM(sub_ps)(
+            SIMD_MM(sub_ps)(
+                SIMD_MM(sub_ps)(
+                    sample, SIMD_MM(mul_ps)(
+                                two, SIMD_MM(mul_ps)(f->R[s1], SIMD_MM(add_ps)(f->C[R12], tCfb)))),
+                SIMD_MM(mul_ps)(f->C[g12], f->R[s1])),
             f->R[s2]),
-        _mm_add_ps(
-            one, _mm_mul_ps(f->C[g12],
-                            _mm_add_ps(_mm_mul_ps(two, _mm_add_ps(f->C[R12], tCfb)), f->C[g12]))));
+        SIMD_MM(add_ps)(
+            one,
+            SIMD_MM(mul_ps)(f->C[g12],
+                            SIMD_MM(add_ps)(SIMD_MM(mul_ps)(two, SIMD_MM(add_ps)(f->C[R12], tCfb)),
+                                            f->C[g12]))));
 
     return y;
 }
 
-inline __m128 process_2_pole(QuadFilterUnitState *__restrict f, __m128 sample)
+inline SIMD_M128 process_2_pole(QuadFilterUnitState *__restrict f, SIMD_M128 sample)
 {
     for (int i = 0; i < n_obxd12_coeff; i++)
     {
-        f->C[i] = _mm_add_ps(f->C[i], f->dC[i]);
+        f->C[i] = SIMD_MM(add_ps)(f->C[i], f->dC[i]);
     }
 
     // float v = ((sample- R * s1*2 - g2*s1 - s2)/(1+ R*g1*2 + g1*g2));
-    __m128 v = NewtonRaphson12dB(sample, f);
+    auto v = NewtonRaphson12dB(sample, f);
     // float y1 = v * g + s1;
-    __m128 y1 = _mm_add_ps(_mm_mul_ps(v, f->C[g12]), f->R[s1]);
+    auto y1 = SIMD_MM(add_ps)(SIMD_MM(mul_ps)(v, f->C[g12]), f->R[s1]);
     // s1 = v * g + y1;
-    f->R[s1] = _mm_add_ps(_mm_mul_ps(v, f->C[g12]), y1);
+    f->R[s1] = SIMD_MM(add_ps)(SIMD_MM(mul_ps)(v, f->C[g12]), y1);
     // float y2 = y1 * g + s2;
-    __m128 y2 = _mm_add_ps(_mm_mul_ps(y1, f->C[g12]), f->R[s2]);
+    auto y2 = SIMD_MM(add_ps)(SIMD_MM(mul_ps)(y1, f->C[g12]), f->R[s2]);
     // s2 = y1 * g + y2;
-    f->R[s2] = _mm_add_ps(_mm_mul_ps(y1, f->C[g12]), y2);
-
-    __m128 mc;
-    __m128 mask_bp = _mm_cmpeq_ps(f->C[bandpass], zero);
-    __m128 bp_false = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(one, f->C[multimode]), y2),
-                                 _mm_mul_ps(f->C[multimode], v));
-    __m128 mask = _mm_cmplt_ps(f->C[multimode], zero_five);
-    __m128 val1 = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(zero_five, f->C[multimode]), y2),
-                             _mm_mul_ps(f->C[multimode], y1));
-    __m128 val2 = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(one, f->C[multimode]), y1),
-                             _mm_mul_ps(_mm_sub_ps(f->C[multimode], zero_five), v));
-    __m128 bp_true = _mm_add_ps(_mm_and_ps(mask, val1), _mm_andnot_ps(mask, val2));
-    mc = _mm_add_ps(_mm_and_ps(mask_bp, bp_false), _mm_andnot_ps(mask_bp, bp_true));
-    return _mm_mul_ps(mc, gainAdjustment2Pole);
+    f->R[s2] = SIMD_MM(add_ps)(SIMD_MM(mul_ps)(y1, f->C[g12]), y2);
+
+    SIMD_M128 mc;
+    auto mask_bp = SIMD_MM(cmpeq_ps)(f->C[bandpass], zero);
+    auto bp_false = SIMD_MM(add_ps)(SIMD_MM(mul_ps)(SIMD_MM(sub_ps)(one, f->C[multimode]), y2),
+                                    SIMD_MM(mul_ps)(f->C[multimode], v));
+    auto mask = SIMD_MM(cmplt_ps)(f->C[multimode], zero_five);
+    auto val1 = SIMD_MM(add_ps)(SIMD_MM(mul_ps)(SIMD_MM(sub_ps)(zero_five, f->C[multimode]), y2),
+                                SIMD_MM(mul_ps)(f->C[multimode], y1));
+    auto val2 = SIMD_MM(add_ps)(SIMD_MM(mul_ps)(SIMD_MM(sub_ps)(one, f->C[multimode]), y1),
+                                SIMD_MM(mul_ps)(SIMD_MM(sub_ps)(f->C[multimode], zero_five), v));
+    auto bp_true = SIMD_MM(add_ps)(SIMD_MM(and_ps)(mask, val1), SIMD_MM(andnot_ps)(mask, val2));
+    mc = SIMD_MM(add_ps)(SIMD_MM(and_ps)(mask_bp, bp_false), SIMD_MM(andnot_ps)(mask_bp, bp_true));
+    return SIMD_MM(mul_ps)(mc, gainAdjustment2Pole);
 }
 
-inline __m128 NewtonRaphsonR24dB(__m128 sample, __m128 lpc, QuadFilterUnitState *__restrict f)
+inline SIMD_M128 NewtonRaphsonR24dB(SIMD_M128 sample, SIMD_M128 lpc,
+                                    QuadFilterUnitState *__restrict f)
 {
     // float ml = 1 / (1+g24);
-    __m128 ml = _mm_div_ps(one, _mm_add_ps(one, f->C[g24]));
+    auto ml = SIMD_MM(div_ps)(one, SIMD_MM(add_ps)(one, f->C[g24]));
     // float S = (lpc * (lpc * (lpc * f->R[s1] + f->R[s2]) + f->R[s3]) + f->R[s4]) * ml;
-    __m128 S = _mm_mul_ps(
-        _mm_add_ps(_mm_mul_ps(lpc, _mm_add_ps(_mm_mul_ps(lpc, _mm_add_ps(_mm_mul_ps(lpc, f->R[s1]),
-                                                                         f->R[s2])),
-                                              f->R[s3])),
-                   f->R[s4]),
+    auto S = SIMD_MM(mul_ps)(
+        SIMD_MM(add_ps)(
+            SIMD_MM(mul_ps)(
+                lpc,
+                SIMD_MM(add_ps)(
+                    SIMD_MM(mul_ps)(lpc, SIMD_MM(add_ps)(SIMD_MM(mul_ps)(lpc, f->R[s1]), f->R[s2])),
+                    f->R[s3])),
+            f->R[s4]),
         ml);
     // float G = lpc * lpc * lpc * lpc;
-    __m128 G = _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(lpc, lpc), lpc), lpc);
+    auto G = SIMD_MM(mul_ps)(SIMD_MM(mul_ps)(SIMD_MM(mul_ps)(lpc, lpc), lpc), lpc);
     // float y = (sample - f->C[R24] * S) / (1 + f->C[R24] * G);
-    __m128 y = _mm_div_ps(_mm_sub_ps(sample, _mm_mul_ps(f->C[R24], S)),
-                          _mm_add_ps(one, _mm_mul_ps(f->C[R24], G)));
+    auto y = SIMD_MM(div_ps)(SIMD_MM(sub_ps)(sample, SIMD_MM(mul_ps)(f->C[R24], S)),
+                             SIMD_MM(add_ps)(one, SIMD_MM(mul_ps)(f->C[R24], G)));
 
     return y;
 }
 
-inline static __m128 tptpc(__m128 &state, __m128 inp, __m128 cutoff)
+inline static SIMD_M128 tptpc(SIMD_M128 &state, SIMD_M128 inp, SIMD_M128 cutoff)
 {
-    __m128 v = _mm_div_ps(_mm_mul_ps(_mm_sub_ps(inp, state), cutoff), _mm_add_ps(one, cutoff));
-    __m128 res = _mm_add_ps(v, state);
-    state = _mm_add_ps(res, v);
+    auto v = SIMD_MM(div_ps)(SIMD_MM(mul_ps)(SIMD_MM(sub_ps)(inp, state), cutoff),
+                             SIMD_MM(add_ps)(one, cutoff));
+    auto res = SIMD_MM(add_ps)(v, state); // returned value: state + v
+    state = SIMD_MM(add_ps)(res, v);      // next state: old state + 2 * v
     return res;
 }
 
-inline __m128 process_4_pole(QuadFilterUnitState *__restrict f, __m128 sample)
+inline SIMD_M128 process_4_pole(QuadFilterUnitState *__restrict f, SIMD_M128 sample)
 {
     for (int i = 0; i < n_obxd24_coeff; i++)
     {
-        f->C[i] = _mm_add_ps(f->C[i], f->dC[i]);
+        f->C[i] = SIMD_MM(add_ps)(f->C[i], f->dC[i]);
     }
 
     // float lpc = f->C[g] / (1 + f->C[g]);
-    __m128 lpc = _mm_div_ps(f->C[g24], _mm_add_ps(one, f->C[g24]));
+    auto lpc = SIMD_MM(div_ps)(f->C[g24], SIMD_MM(add_ps)(one, f->C[g24]));
 
     // float y0 = NewtonRaphsonR24dB(sample,f->C[g],lpc);
-    __m128 y0 = NewtonRaphsonR24dB(sample, lpc, f);
+    auto y0 = NewtonRaphsonR24dB(sample, lpc, f);
 
     // first lowpass in cascade
     // double v = (y0 - f->R[s1]) * lpc;
-    __m128 v = _mm_mul_ps(_mm_sub_ps(y0, f->R[s1]), lpc);
+    auto v = SIMD_MM(mul_ps)(SIMD_MM(sub_ps)(y0, f->R[s1]), lpc);
 
     // double res = v + f->R[s1];
-    __m128 res = _mm_add_ps(v, f->R[s1]);
+    auto res = SIMD_MM(add_ps)(v, f->R[s1]);
 
     // f->R[s1] = res + v;
-    f->R[s1] = _mm_add_ps(res, v);
+    f->R[s1] = SIMD_MM(add_ps)(res, v);
 
     // damping
     // f->R[s1] =atan(s1*rcor24)*rcor24inv;
-    __m128 s1_rcor24 = _mm_mul_ps(f->R[s1], f->C[rcor24]);
+    auto s1_rcor24 = SIMD_MM(mul_ps)(f->R[s1], f->C[rcor24]);
 
     // this array must be aligned to a 16-byte boundary for SSE store/load
     float s1_rcor24_arr alignas(16)[ssew];
-    _mm_store_ps(s1_rcor24_arr, s1_rcor24);
+    SIMD_MM(store_ps)(s1_rcor24_arr, s1_rcor24);
 
     for (int i = 0; i < ssew; i++)
     {
@@ -277,34 +288,36 @@ inline __m128 process_4_pole(QuadFilterUnitState *__restrict f, __m128 sample)
             s1_rcor24_arr[i] = 0.f;
     }
 
-    s1_rcor24 = _mm_load_ps(s1_rcor24_arr);
-    f->R[s1] = _mm_mul_ps(s1_rcor24, f->C[rcor24inv]);
-
-    __m128 y1 = res;
-    __m128 y2 = tptpc(f->R[s2], y1, f->C[g24]);
-    __m128 y3 = tptpc(f->R[s3], y2, f->C[g24]);
-    __m128 y4 = tptpc(f->R[s4], y3, f->C[g24]);
-
-    __m128 mc;
-
-    __m128 zero_val = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(one, f->C[pole_mix_scaled]), y4),
-                                 _mm_add_ps(f->C[pole_mix_scaled], y3));
-    __m128 zero_mask = _mm_cmpeq_ps(f->C[pole_mix_inv_int], zero);
-    __m128 one_mask = _mm_cmpeq_ps(f->C[pole_mix_inv_int], one);
-    __m128 one_val = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(one, f->C[pole_mix_scaled]), y3),
-                                _mm_mul_ps(f->C[pole_mix_scaled], y2));
-    __m128 two_mask = _mm_cmpeq_ps(f->C[pole_mix_inv_int], two);
-    __m128 two_val = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(one, f->C[pole_mix_scaled]), y2),
-                                _mm_mul_ps(f->C[pole_mix_scaled], y1));
-    __m128 three_mask = _mm_cmpeq_ps(f->C[pole_mix_inv_int], three);
-    __m128 three_val = y1;
-    mc = _mm_add_ps(_mm_and_ps(zero_mask, zero_val), _mm_and_ps(one_mask, one_val));
-    mc = _mm_add_ps(mc,
-                    _mm_add_ps(_mm_and_ps(two_mask, two_val), _mm_and_ps(three_mask, three_val)));
+    s1_rcor24 = SIMD_MM(load_ps)(s1_rcor24_arr);
+    f->R[s1] = SIMD_MM(mul_ps)(s1_rcor24, f->C[rcor24inv]);
+
+    auto y1 = res;
+    auto y2 = tptpc(f->R[s2], y1, f->C[g24]);
+    auto y3 = tptpc(f->R[s3], y2, f->C[g24]);
+    auto y4 = tptpc(f->R[s4], y3, f->C[g24]);
+
+    SIMD_M128 mc;
+
+    auto zero_val =
+        SIMD_MM(add_ps)(SIMD_MM(mul_ps)(SIMD_MM(sub_ps)(one, f->C[pole_mix_scaled]), y4),
+                        SIMD_MM(mul_ps)(f->C[pole_mix_scaled], y3)); // (1 - mix) * y4 + mix * y3
+    auto zero_mask = SIMD_MM(cmpeq_ps)(f->C[pole_mix_inv_int], zero);
+    auto one_mask = SIMD_MM(cmpeq_ps)(f->C[pole_mix_inv_int], one);
+    auto one_val = SIMD_MM(add_ps)(SIMD_MM(mul_ps)(SIMD_MM(sub_ps)(one, f->C[pole_mix_scaled]), y3),
+                                   SIMD_MM(mul_ps)(f->C[pole_mix_scaled], y2));
+    auto two_mask = SIMD_MM(cmpeq_ps)(f->C[pole_mix_inv_int], two);
+    auto two_val = SIMD_MM(add_ps)(SIMD_MM(mul_ps)(SIMD_MM(sub_ps)(one, f->C[pole_mix_scaled]), y2),
+                                   SIMD_MM(mul_ps)(f->C[pole_mix_scaled], y1));
+    auto three_mask = SIMD_MM(cmpeq_ps)(f->C[pole_mix_inv_int], three);
+    auto three_val = y1;
+    mc = SIMD_MM(add_ps)(SIMD_MM(and_ps)(zero_mask, zero_val), SIMD_MM(and_ps)(one_mask, one_val));
+    mc = SIMD_MM(add_ps)(mc, SIMD_MM(add_ps)(SIMD_MM(and_ps)(two_mask, two_val),
+                                             SIMD_MM(and_ps)(three_mask, three_val)));
 
     // half volume compensation
-    auto out = _mm_mul_ps(mc, _mm_add_ps(one, _mm_mul_ps(f->C[R24], zero_four_five)));
-    return _mm_mul_ps(out, gainAdjustment4Pole);
+    auto out =
+        SIMD_MM(mul_ps)(mc, SIMD_MM(add_ps)(one, SIMD_MM(mul_ps)(f->C[R24], zero_four_five)));
+    return SIMD_MM(mul_ps)(out, gainAdjustment4Pole);
 }
 } // namespace sst::filters::OBXDFilter
 
diff --git a/include/sst/filters/QuadFilterUnit.h b/include/sst/filters/QuadFilterUnit.h
index 379d998..c4895ec 100644
--- a/include/sst/filters/QuadFilterUnit.h
+++ b/include/sst/filters/QuadFilterUnit.h
@@ -29,13 +29,13 @@ constexpr int n_filter_registers = 16;
 struct alignas(16) QuadFilterUnitState
 {
     /** Filter coefficients */
-    __m128 C[n_cm_coeffs];
+    SIMD_M128 C[n_cm_coeffs];
 
     /** Filter coefficients "delta" */
-    __m128 dC[n_cm_coeffs];
+    SIMD_M128 dC[n_cm_coeffs];
 
     /** Filter state */
-    __m128 R[n_filter_registers];
+    SIMD_M128 R[n_filter_registers];
 
     /** Array of pointers to the filter's delay buffers */
     float *DB[4];
@@ -54,7 +54,7 @@ struct alignas(16) QuadFilterUnitState
 };
 
 /** Typedef alias for a filter unit processing method. */
-typedef __m128 (*FilterUnitQFPtr)(QuadFilterUnitState *__restrict, __m128 in);
+typedef SIMD_M128 (*FilterUnitQFPtr)(QuadFilterUnitState *__restrict, SIMD_M128 in);
 
 /**
  * Returns a filter unit pointer and optionally applies gain scaling. The gain
diff --git a/include/sst/filters/QuadFilterUnit_Impl.h b/include/sst/filters/QuadFilterUnit_Impl.h
index f03294c..6f033ea 100644
--- a/include/sst/filters/QuadFilterUnit_Impl.h
+++ b/include/sst/filters/QuadFilterUnit_Impl.h
@@ -30,212 +30,223 @@
 namespace sst::filters
 {
 
-inline __m128 SVFLP12Aquad(QuadFilterUnitState *__restrict f, __m128 in)
+inline SIMD_M128 SVFLP12Aquad(QuadFilterUnitState *__restrict f, SIMD_M128 in)
 {
-    f->C[0] = _mm_add_ps(f->C[0], f->dC[0]); // F1
-    f->C[1] = _mm_add_ps(f->C[1], f->dC[1]); // Q1
+    f->C[0] = SIMD_MM(add_ps)(f->C[0], f->dC[0]); // F1
+    f->C[1] = SIMD_MM(add_ps)(f->C[1], f->dC[1]); // Q1
 
-    __m128 L = _mm_add_ps(f->R[1], _mm_mul_ps(f->C[0], f->R[0]));
-    __m128 H = _mm_sub_ps(_mm_sub_ps(in, L), _mm_mul_ps(f->C[1], f->R[0]));
-    __m128 B = _mm_add_ps(f->R[0], _mm_mul_ps(f->C[0], H));
+    auto L = SIMD_MM(add_ps)(f->R[1], SIMD_MM(mul_ps)(f->C[0], f->R[0]));
+    auto H = SIMD_MM(sub_ps)(SIMD_MM(sub_ps)(in, L), SIMD_MM(mul_ps)(f->C[1], f->R[0]));
+    auto B = SIMD_MM(add_ps)(f->R[0], SIMD_MM(mul_ps)(f->C[0], H));
 
-    __m128 L2 = _mm_add_ps(L, _mm_mul_ps(f->C[0], B));
-    __m128 H2 = _mm_sub_ps(_mm_sub_ps(in, L2), _mm_mul_ps(f->C[1], B));
-    __m128 B2 = _mm_add_ps(B, _mm_mul_ps(f->C[0], H2));
+    auto L2 = SIMD_MM(add_ps)(L, SIMD_MM(mul_ps)(f->C[0], B));
+    auto H2 = SIMD_MM(sub_ps)(SIMD_MM(sub_ps)(in, L2), SIMD_MM(mul_ps)(f->C[1], B));
+    auto B2 = SIMD_MM(add_ps)(B, SIMD_MM(mul_ps)(f->C[0], H2));
 
-    f->R[0] = _mm_mul_ps(B2, f->R[2]);
-    f->R[1] = _mm_mul_ps(L2, f->R[2]);
+    f->R[0] = SIMD_MM(mul_ps)(B2, f->R[2]);
+    f->R[1] = SIMD_MM(mul_ps)(L2, f->R[2]);
 
-    f->C[2] = _mm_add_ps(f->C[2], f->dC[2]);
-    const __m128 m01 = _mm_set1_ps(0.1f);
-    const __m128 m1 = _mm_set1_ps(1.0f);
-    f->R[2] = _mm_max_ps(m01, _mm_sub_ps(m1, _mm_mul_ps(f->C[2], _mm_mul_ps(B, B))));
+    f->C[2] = SIMD_MM(add_ps)(f->C[2], f->dC[2]);
+    const auto m01 = SIMD_MM(set1_ps)(0.1f);
+    const auto m1 = SIMD_MM(set1_ps)(1.0f);
+    f->R[2] =
+        SIMD_MM(max_ps)(m01, SIMD_MM(sub_ps)(m1, SIMD_MM(mul_ps)(f->C[2], SIMD_MM(mul_ps)(B, B))));
 
-    f->C[3] = _mm_add_ps(f->C[3], f->dC[3]); // Gain
-    return _mm_mul_ps(L2, f->C[3]);
+    f->C[3] = SIMD_MM(add_ps)(f->C[3], f->dC[3]); // Gain
+    return SIMD_MM(mul_ps)(L2, f->C[3]);
 }
 
-inline __m128 SVFLP24Aquad(QuadFilterUnitState *__restrict f, __m128 in)
+inline SIMD_M128 SVFLP24Aquad(QuadFilterUnitState *__restrict f, SIMD_M128 in)
 {
-    f->C[0] = _mm_add_ps(f->C[0], f->dC[0]); // F1
-    f->C[1] = _mm_add_ps(f->C[1], f->dC[1]); // Q1
+    f->C[0] = SIMD_MM(add_ps)(f->C[0], f->dC[0]); // F1
+    f->C[1] = SIMD_MM(add_ps)(f->C[1], f->dC[1]); // Q1
 
-    __m128 L = _mm_add_ps(f->R[1], _mm_mul_ps(f->C[0], f->R[0]));
-    __m128 H = _mm_sub_ps(_mm_sub_ps(in, L), _mm_mul_ps(f->C[1], f->R[0]));
-    __m128 B = _mm_add_ps(f->R[0], _mm_mul_ps(f->C[0], H));
+    auto L = SIMD_MM(add_ps)(f->R[1], SIMD_MM(mul_ps)(f->C[0], f->R[0]));
+    auto H = SIMD_MM(sub_ps)(SIMD_MM(sub_ps)(in, L), SIMD_MM(mul_ps)(f->C[1], f->R[0]));
+    auto B = SIMD_MM(add_ps)(f->R[0], SIMD_MM(mul_ps)(f->C[0], H));
 
-    L = _mm_add_ps(L, _mm_mul_ps(f->C[0], B));
-    H = _mm_sub_ps(_mm_sub_ps(in, L), _mm_mul_ps(f->C[1], B));
-    B = _mm_add_ps(B, _mm_mul_ps(f->C[0], H));
+    L = SIMD_MM(add_ps)(L, SIMD_MM(mul_ps)(f->C[0], B));
+    H = SIMD_MM(sub_ps)(SIMD_MM(sub_ps)(in, L), SIMD_MM(mul_ps)(f->C[1], B));
+    B = SIMD_MM(add_ps)(B, SIMD_MM(mul_ps)(f->C[0], H));
 
-    f->R[0] = _mm_mul_ps(B, f->R[2]);
-    f->R[1] = _mm_mul_ps(L, f->R[2]);
+    f->R[0] = SIMD_MM(mul_ps)(B, f->R[2]);
+    f->R[1] = SIMD_MM(mul_ps)(L, f->R[2]);
 
     in = L;
 
-    L = _mm_add_ps(f->R[4], _mm_mul_ps(f->C[0], f->R[3]));
-    H = _mm_sub_ps(_mm_sub_ps(in, L), _mm_mul_ps(f->C[1], f->R[3]));
-    B = _mm_add_ps(f->R[3], _mm_mul_ps(f->C[0], H));
+    L = SIMD_MM(add_ps)(f->R[4], SIMD_MM(mul_ps)(f->C[0], f->R[3]));
+    H = SIMD_MM(sub_ps)(SIMD_MM(sub_ps)(in, L), SIMD_MM(mul_ps)(f->C[1], f->R[3]));
+    B = SIMD_MM(add_ps)(f->R[3], SIMD_MM(mul_ps)(f->C[0], H));
 
-    L = _mm_add_ps(L, _mm_mul_ps(f->C[0], B));
-    H = _mm_sub_ps(_mm_sub_ps(in, L), _mm_mul_ps(f->C[1], B));
-    B = _mm_add_ps(B, _mm_mul_ps(f->C[0], H));
+    L = SIMD_MM(add_ps)(L, SIMD_MM(mul_ps)(f->C[0], B));
+    H = SIMD_MM(sub_ps)(SIMD_MM(sub_ps)(in, L), SIMD_MM(mul_ps)(f->C[1], B));
+    B = SIMD_MM(add_ps)(B, SIMD_MM(mul_ps)(f->C[0], H));
 
-    f->R[3] = _mm_mul_ps(B, f->R[2]);
-    f->R[4] = _mm_mul_ps(L, f->R[2]);
+    f->R[3] = SIMD_MM(mul_ps)(B, f->R[2]);
+    f->R[4] = SIMD_MM(mul_ps)(L, f->R[2]);
 
-    f->C[2] = _mm_add_ps(f->C[2], f->dC[2]);
-    const __m128 m01 = _mm_set1_ps(0.1f);
-    const __m128 m1 = _mm_set1_ps(1.0f);
-    f->R[2] = _mm_max_ps(m01, _mm_sub_ps(m1, _mm_mul_ps(f->C[2], _mm_mul_ps(B, B))));
+    f->C[2] = SIMD_MM(add_ps)(f->C[2], f->dC[2]);
+    const auto m01 = SIMD_MM(set1_ps)(0.1f);
+    const auto m1 = SIMD_MM(set1_ps)(1.0f);
+    f->R[2] =
+        SIMD_MM(max_ps)(m01, SIMD_MM(sub_ps)(m1, SIMD_MM(mul_ps)(f->C[2], SIMD_MM(mul_ps)(B, B))));
 
-    f->C[3] = _mm_add_ps(f->C[3], f->dC[3]); // Gain
-    return _mm_mul_ps(L, f->C[3]);
+    f->C[3] = SIMD_MM(add_ps)(f->C[3], f->dC[3]); // Gain
+    return SIMD_MM(mul_ps)(L, f->C[3]);
 }
 
-inline __m128 SVFHP24Aquad(QuadFilterUnitState *__restrict f, __m128 in)
+inline SIMD_M128 SVFHP24Aquad(QuadFilterUnitState *__restrict f, SIMD_M128 in)
 {
-    f->C[0] = _mm_add_ps(f->C[0], f->dC[0]); // F1
-    f->C[1] = _mm_add_ps(f->C[1], f->dC[1]); // Q1
+    f->C[0] = SIMD_MM(add_ps)(f->C[0], f->dC[0]); // F1
+    f->C[1] = SIMD_MM(add_ps)(f->C[1], f->dC[1]); // Q1
 
-    __m128 L = _mm_add_ps(f->R[1], _mm_mul_ps(f->C[0], f->R[0]));
-    __m128 H = _mm_sub_ps(_mm_sub_ps(in, L), _mm_mul_ps(f->C[1], f->R[0]));
-    __m128 B = _mm_add_ps(f->R[0], _mm_mul_ps(f->C[0], H));
+    auto L = SIMD_MM(add_ps)(f->R[1], SIMD_MM(mul_ps)(f->C[0], f->R[0]));
+    auto H = SIMD_MM(sub_ps)(SIMD_MM(sub_ps)(in, L), SIMD_MM(mul_ps)(f->C[1], f->R[0]));
+    auto B = SIMD_MM(add_ps)(f->R[0], SIMD_MM(mul_ps)(f->C[0], H));
 
-    L = _mm_add_ps(L, _mm_mul_ps(f->C[0], B));
-    H = _mm_sub_ps(_mm_sub_ps(in, L), _mm_mul_ps(f->C[1], B));
-    B = _mm_add_ps(B, _mm_mul_ps(f->C[0], H));
+    L = SIMD_MM(add_ps)(L, SIMD_MM(mul_ps)(f->C[0], B));
+    H = SIMD_MM(sub_ps)(SIMD_MM(sub_ps)(in, L), SIMD_MM(mul_ps)(f->C[1], B));
+    B = SIMD_MM(add_ps)(B, SIMD_MM(mul_ps)(f->C[0], H));
 
-    f->R[0] = _mm_mul_ps(B, f->R[2]);
-    f->R[1] = _mm_mul_ps(L, f->R[2]);
+    f->R[0] = SIMD_MM(mul_ps)(B, f->R[2]);
+    f->R[1] = SIMD_MM(mul_ps)(L, f->R[2]);
 
     in = H;
 
-    L = _mm_add_ps(f->R[4], _mm_mul_ps(f->C[0], f->R[3]));
-    H = _mm_sub_ps(_mm_sub_ps(in, L), _mm_mul_ps(f->C[1], f->R[3]));
-    B = _mm_add_ps(f->R[3], _mm_mul_ps(f->C[0], H));
+    L = SIMD_MM(add_ps)(f->R[4], SIMD_MM(mul_ps)(f->C[0], f->R[3]));
+    H = SIMD_MM(sub_ps)(SIMD_MM(sub_ps)(in, L), SIMD_MM(mul_ps)(f->C[1], f->R[3]));
+    B = SIMD_MM(add_ps)(f->R[3], SIMD_MM(mul_ps)(f->C[0], H));
 
-    L = _mm_add_ps(L, _mm_mul_ps(f->C[0], B));
-    H = _mm_sub_ps(_mm_sub_ps(in, L), _mm_mul_ps(f->C[1], B));
-    B = _mm_add_ps(B, _mm_mul_ps(f->C[0], H));
+    L = SIMD_MM(add_ps)(L, SIMD_MM(mul_ps)(f->C[0], B));
+    H = SIMD_MM(sub_ps)(SIMD_MM(sub_ps)(in, L), SIMD_MM(mul_ps)(f->C[1], B));
+    B = SIMD_MM(add_ps)(B, SIMD_MM(mul_ps)(f->C[0], H));
 
-    f->R[3] = _mm_mul_ps(B, f->R[2]);
-    f->R[4] = _mm_mul_ps(L, f->R[2]);
+    f->R[3] = SIMD_MM(mul_ps)(B, f->R[2]);
+    f->R[4] = SIMD_MM(mul_ps)(L, f->R[2]);
 
-    f->C[2] = _mm_add_ps(f->C[2], f->dC[2]);
-    const __m128 m01 = _mm_set1_ps(0.1f);
-    const __m128 m1 = _mm_set1_ps(1.0f);
-    f->R[2] = _mm_max_ps(m01, _mm_sub_ps(m1, _mm_mul_ps(f->C[2], _mm_mul_ps(B, B))));
+    f->C[2] = SIMD_MM(add_ps)(f->C[2], f->dC[2]);
+    const auto m01 = SIMD_MM(set1_ps)(0.1f);
+    const auto m1 = SIMD_MM(set1_ps)(1.0f);
+    f->R[2] =
+        SIMD_MM(max_ps)(m01, SIMD_MM(sub_ps)(m1, SIMD_MM(mul_ps)(f->C[2], SIMD_MM(mul_ps)(B, B))));
 
-    f->C[3] = _mm_add_ps(f->C[3], f->dC[3]); // Gain
-    return _mm_mul_ps(H, f->C[3]);
+    f->C[3] = SIMD_MM(add_ps)(f->C[3], f->dC[3]); // Gain
+    return SIMD_MM(mul_ps)(H, f->C[3]);
 }
 
-inline __m128 SVFBP24Aquad(QuadFilterUnitState *__restrict f, __m128 in)
+inline SIMD_M128 SVFBP24Aquad(QuadFilterUnitState *__restrict f, SIMD_M128 in)
 {
-    f->C[0] = _mm_add_ps(f->C[0], f->dC[0]); // F1
-    f->C[1] = _mm_add_ps(f->C[1], f->dC[1]); // Q1
+    f->C[0] = SIMD_MM(add_ps)(f->C[0], f->dC[0]); // F1
+    f->C[1] = SIMD_MM(add_ps)(f->C[1], f->dC[1]); // Q1
 
-    __m128 L = _mm_add_ps(f->R[1], _mm_mul_ps(f->C[0], f->R[0]));
-    __m128 H = _mm_sub_ps(_mm_sub_ps(in, L), _mm_mul_ps(f->C[1], f->R[0]));
-    __m128 B = _mm_add_ps(f->R[0], _mm_mul_ps(f->C[0], H));
+    auto L = SIMD_MM(add_ps)(f->R[1], SIMD_MM(mul_ps)(f->C[0], f->R[0]));
+    auto H = SIMD_MM(sub_ps)(SIMD_MM(sub_ps)(in, L), SIMD_MM(mul_ps)(f->C[1], f->R[0]));
+    auto B = SIMD_MM(add_ps)(f->R[0], SIMD_MM(mul_ps)(f->C[0], H));
 
-    L = _mm_add_ps(L, _mm_mul_ps(f->C[0], B));
-    H = _mm_sub_ps(_mm_sub_ps(in, L), _mm_mul_ps(f->C[1], B));
-    B = _mm_add_ps(B, _mm_mul_ps(f->C[0], H));
+    L = SIMD_MM(add_ps)(L, SIMD_MM(mul_ps)(f->C[0], B));
+    H = SIMD_MM(sub_ps)(SIMD_MM(sub_ps)(in, L), SIMD_MM(mul_ps)(f->C[1], B));
+    B = SIMD_MM(add_ps)(B, SIMD_MM(mul_ps)(f->C[0], H));
 
-    f->R[0] = _mm_mul_ps(B, f->R[2]);
-    f->R[1] = _mm_mul_ps(L, f->R[2]);
+    f->R[0] = SIMD_MM(mul_ps)(B, f->R[2]);
+    f->R[1] = SIMD_MM(mul_ps)(L, f->R[2]);
 
     in = B;
 
-    L = _mm_add_ps(f->R[4], _mm_mul_ps(f->C[0], f->R[3]));
-    H = _mm_sub_ps(_mm_sub_ps(in, L), _mm_mul_ps(f->C[1], f->R[3]));
-    B = _mm_add_ps(f->R[3], _mm_mul_ps(f->C[0], H));
+    L = SIMD_MM(add_ps)(f->R[4], SIMD_MM(mul_ps)(f->C[0], f->R[3]));
+    H = SIMD_MM(sub_ps)(SIMD_MM(sub_ps)(in, L), SIMD_MM(mul_ps)(f->C[1], f->R[3]));
+    B = SIMD_MM(add_ps)(f->R[3], SIMD_MM(mul_ps)(f->C[0], H));
 
-    L = _mm_add_ps(L, _mm_mul_ps(f->C[0], B));
-    H = _mm_sub_ps(_mm_sub_ps(in, L), _mm_mul_ps(f->C[1], B));
-    B = _mm_add_ps(B, _mm_mul_ps(f->C[0], H));
+    L = SIMD_MM(add_ps)(L, SIMD_MM(mul_ps)(f->C[0], B));
+    H = SIMD_MM(sub_ps)(SIMD_MM(sub_ps)(in, L), SIMD_MM(mul_ps)(f->C[1], B));
+    B = SIMD_MM(add_ps)(B, SIMD_MM(mul_ps)(f->C[0], H));
 
-    f->R[3] = _mm_mul_ps(B, f->R[2]);
-    f->R[4] = _mm_mul_ps(L, f->R[2]);
+    f->R[3] = SIMD_MM(mul_ps)(B, f->R[2]);
+    f->R[4] = SIMD_MM(mul_ps)(L, f->R[2]);
 
-    f->C[2] = _mm_add_ps(f->C[2], f->dC[2]);
-    const __m128 m01 = _mm_set1_ps(0.1f);
-    const __m128 m1 = _mm_set1_ps(1.0f);
-    f->R[2] = _mm_max_ps(m01, _mm_sub_ps(m1, _mm_mul_ps(f->C[2], _mm_mul_ps(B, B))));
+    f->C[2] = SIMD_MM(add_ps)(f->C[2], f->dC[2]);
+    const auto m01 = SIMD_MM(set1_ps)(0.1f);
+    const auto m1 = SIMD_MM(set1_ps)(1.0f);
+    f->R[2] =
+        SIMD_MM(max_ps)(m01, SIMD_MM(sub_ps)(m1, SIMD_MM(mul_ps)(f->C[2], SIMD_MM(mul_ps)(B, B))));
 
-    f->C[3] = _mm_add_ps(f->C[3], f->dC[3]); // Gain
-    return _mm_mul_ps(B, f->C[3]);
+    f->C[3] = SIMD_MM(add_ps)(f->C[3], f->dC[3]); // Gain
+    return SIMD_MM(mul_ps)(B, f->C[3]);
 }
 
-inline __m128 SVFHP12Aquad(QuadFilterUnitState *__restrict f, __m128 in)
+inline SIMD_M128 SVFHP12Aquad(QuadFilterUnitState *__restrict f, SIMD_M128 in)
 {
-    f->C[0] = _mm_add_ps(f->C[0], f->dC[0]); // F1
-    f->C[1] = _mm_add_ps(f->C[1], f->dC[1]); // Q1
+    f->C[0] = SIMD_MM(add_ps)(f->C[0], f->dC[0]); // F1
+    f->C[1] = SIMD_MM(add_ps)(f->C[1], f->dC[1]); // Q1
 
-    __m128 L = _mm_add_ps(f->R[1], _mm_mul_ps(f->C[0], f->R[0]));
-    __m128 H = _mm_sub_ps(_mm_sub_ps(in, L), _mm_mul_ps(f->C[1], f->R[0]));
-    __m128 B = _mm_add_ps(f->R[0], _mm_mul_ps(f->C[0], H));
+    auto L = SIMD_MM(add_ps)(f->R[1], SIMD_MM(mul_ps)(f->C[0], f->R[0]));
+    auto H = SIMD_MM(sub_ps)(SIMD_MM(sub_ps)(in, L), SIMD_MM(mul_ps)(f->C[1], f->R[0]));
+    auto B = SIMD_MM(add_ps)(f->R[0], SIMD_MM(mul_ps)(f->C[0], H));
 
-    __m128 L2 = _mm_add_ps(L, _mm_mul_ps(f->C[0], B));
-    __m128 H2 = _mm_sub_ps(_mm_sub_ps(in, L2), _mm_mul_ps(f->C[1], B));
-    __m128 B2 = _mm_add_ps(B, _mm_mul_ps(f->C[0], H2));
+    auto L2 = SIMD_MM(add_ps)(L, SIMD_MM(mul_ps)(f->C[0], B));
+    auto H2 = SIMD_MM(sub_ps)(SIMD_MM(sub_ps)(in, L2), SIMD_MM(mul_ps)(f->C[1], B));
+    auto B2 = SIMD_MM(add_ps)(B, SIMD_MM(mul_ps)(f->C[0], H2));
 
-    f->R[0] = _mm_mul_ps(B2, f->R[2]);
-    f->R[1] = _mm_mul_ps(L2, f->R[2]);
+    f->R[0] = SIMD_MM(mul_ps)(B2, f->R[2]);
+    f->R[1] = SIMD_MM(mul_ps)(L2, f->R[2]);
 
-    f->C[2] = _mm_add_ps(f->C[2], f->dC[2]);
-    const __m128 m01 = _mm_set1_ps(0.1f);
-    const __m128 m1 = _mm_set1_ps(1.0f);
-    f->R[2] = _mm_max_ps(m01, _mm_sub_ps(m1, _mm_mul_ps(f->C[2], _mm_mul_ps(B, B))));
+    f->C[2] = SIMD_MM(add_ps)(f->C[2], f->dC[2]);
+    const auto m01 = SIMD_MM(set1_ps)(0.1f);
+    const auto m1 = SIMD_MM(set1_ps)(1.0f);
+    f->R[2] =
+        SIMD_MM(max_ps)(m01, SIMD_MM(sub_ps)(m1, SIMD_MM(mul_ps)(f->C[2], SIMD_MM(mul_ps)(B, B))));
 
-    f->C[3] = _mm_add_ps(f->C[3], f->dC[3]); // Gain
-    return _mm_mul_ps(H2, f->C[3]);
+    f->C[3] = SIMD_MM(add_ps)(f->C[3], f->dC[3]); // Gain
+    return SIMD_MM(mul_ps)(H2, f->C[3]);
 }
 
-inline __m128 SVFBP12Aquad(QuadFilterUnitState *__restrict f, __m128 in)
+inline SIMD_M128 SVFBP12Aquad(QuadFilterUnitState *__restrict f, SIMD_M128 in)
 {
-    f->C[0] = _mm_add_ps(f->C[0], f->dC[0]); // F1
-    f->C[1] = _mm_add_ps(f->C[1], f->dC[1]); // Q1
+    f->C[0] = SIMD_MM(add_ps)(f->C[0], f->dC[0]); // F1
+    f->C[1] = SIMD_MM(add_ps)(f->C[1], f->dC[1]); // Q1
 
-    __m128 L = _mm_add_ps(f->R[1], _mm_mul_ps(f->C[0], f->R[0]));
-    __m128 H = _mm_sub_ps(_mm_sub_ps(in, L), _mm_mul_ps(f->C[1], f->R[0]));
-    __m128 B = _mm_add_ps(f->R[0], _mm_mul_ps(f->C[0], H));
+    auto L = SIMD_MM(add_ps)(f->R[1], SIMD_MM(mul_ps)(f->C[0], f->R[0]));
+    auto H = SIMD_MM(sub_ps)(SIMD_MM(sub_ps)(in, L), SIMD_MM(mul_ps)(f->C[1], f->R[0]));
+    auto B = SIMD_MM(add_ps)(f->R[0], SIMD_MM(mul_ps)(f->C[0], H));
 
-    __m128 L2 = _mm_add_ps(L, _mm_mul_ps(f->C[0], B));
-    __m128 H2 = _mm_sub_ps(_mm_sub_ps(in, L2), _mm_mul_ps(f->C[1], B));
-    __m128 B2 = _mm_add_ps(B, _mm_mul_ps(f->C[0], H2));
+    auto L2 = SIMD_MM(add_ps)(L, SIMD_MM(mul_ps)(f->C[0], B));
+    auto H2 = SIMD_MM(sub_ps)(SIMD_MM(sub_ps)(in, L2), SIMD_MM(mul_ps)(f->C[1], B));
+    auto B2 = SIMD_MM(add_ps)(B, SIMD_MM(mul_ps)(f->C[0], H2));
 
-    f->R[0] = _mm_mul_ps(B2, f->R[2]);
-    f->R[1] = _mm_mul_ps(L2, f->R[2]);
+    f->R[0] = SIMD_MM(mul_ps)(B2, f->R[2]);
+    f->R[1] = SIMD_MM(mul_ps)(L2, f->R[2]);
 
-    f->C[2] = _mm_add_ps(f->C[2], f->dC[2]);
-    const __m128 m01 = _mm_set1_ps(0.1f);
-    const __m128 m1 = _mm_set1_ps(1.0f);
-    f->R[2] = _mm_max_ps(m01, _mm_sub_ps(m1, _mm_mul_ps(f->C[2], _mm_mul_ps(B, B))));
+    f->C[2] = SIMD_MM(add_ps)(f->C[2], f->dC[2]);
+    const auto m01 = SIMD_MM(set1_ps)(0.1f);
+    const auto m1 = SIMD_MM(set1_ps)(1.0f);
+    f->R[2] =
+        SIMD_MM(max_ps)(m01, SIMD_MM(sub_ps)(m1, SIMD_MM(mul_ps)(f->C[2], SIMD_MM(mul_ps)(B, B))));
 
-    f->C[3] = _mm_add_ps(f->C[3], f->dC[3]); // Gain
-    return _mm_mul_ps(B2, f->C[3]);
+    f->C[3] = SIMD_MM(add_ps)(f->C[3], f->dC[3]); // Gain
+    return SIMD_MM(mul_ps)(B2, f->C[3]);
 }
 
-inline __m128 IIR12Aquad(QuadFilterUnitState *__restrict f, __m128 in)
+inline SIMD_M128 IIR12Aquad(QuadFilterUnitState *__restrict f, SIMD_M128 in)
 {
-    f->C[1] = _mm_add_ps(f->C[1], f->dC[1]);                                       // K2
-    f->C[3] = _mm_add_ps(f->C[3], f->dC[3]);                                       // Q2
-    __m128 f2 = _mm_sub_ps(_mm_mul_ps(f->C[3], in), _mm_mul_ps(f->C[1], f->R[1])); // Q2*in - K2*R1
-    __m128 g2 = _mm_add_ps(_mm_mul_ps(f->C[1], in), _mm_mul_ps(f->C[3], f->R[1])); // K2*in + Q2*R1
-
-    f->C[0] = _mm_add_ps(f->C[0], f->dC[0]);                                       // K1
-    f->C[2] = _mm_add_ps(f->C[2], f->dC[2]);                                       // Q1
-    __m128 f1 = _mm_sub_ps(_mm_mul_ps(f->C[2], f2), _mm_mul_ps(f->C[0], f->R[0])); // Q1*f2 - K1*R0
-    __m128 g1 = _mm_add_ps(_mm_mul_ps(f->C[0], f2), _mm_mul_ps(f->C[2], f->R[0])); // K1*f2 + Q1*R0
-
-    f->C[4] = _mm_add_ps(f->C[4], f->dC[4]); // V1
-    f->C[5] = _mm_add_ps(f->C[5], f->dC[5]); // V2
-    f->C[6] = _mm_add_ps(f->C[6], f->dC[6]); // V3
-    __m128 y = _mm_add_ps(_mm_add_ps(_mm_mul_ps(f->C[6], g2), _mm_mul_ps(f->C[5], g1)),
-                          _mm_mul_ps(f->C[4], f1));
+    f->C[1] = SIMD_MM(add_ps)(f->C[1], f->dC[1]); // K2
+    f->C[3] = SIMD_MM(add_ps)(f->C[3], f->dC[3]); // Q2
+    auto f2 = SIMD_MM(sub_ps)(SIMD_MM(mul_ps)(f->C[3], in),
+                              SIMD_MM(mul_ps)(f->C[1], f->R[1])); // Q2*in - K2*R1
+    auto g2 = SIMD_MM(add_ps)(SIMD_MM(mul_ps)(f->C[1], in),
+                              SIMD_MM(mul_ps)(f->C[3], f->R[1])); // K2*in + Q2*R1
+
+    f->C[0] = SIMD_MM(add_ps)(f->C[0], f->dC[0]); // K1
+    f->C[2] = SIMD_MM(add_ps)(f->C[2], f->dC[2]); // Q1
+    auto f1 = SIMD_MM(sub_ps)(SIMD_MM(mul_ps)(f->C[2], f2),
+                              SIMD_MM(mul_ps)(f->C[0], f->R[0])); // Q1*f2 - K1*R0
+    auto g1 = SIMD_MM(add_ps)(SIMD_MM(mul_ps)(f->C[0], f2),
+                              SIMD_MM(mul_ps)(f->C[2], f->R[0])); // K1*f2 + Q1*R0
+
+    f->C[4] = SIMD_MM(add_ps)(f->C[4], f->dC[4]); // V1
+    f->C[5] = SIMD_MM(add_ps)(f->C[5], f->dC[5]); // V2
+    f->C[6] = SIMD_MM(add_ps)(f->C[6], f->dC[6]); // V3
+    auto y =
+        SIMD_MM(add_ps)(SIMD_MM(add_ps)(SIMD_MM(mul_ps)(f->C[6], g2), SIMD_MM(mul_ps)(f->C[5], g1)),
+                        SIMD_MM(mul_ps)(f->C[4], f1));
 
     f->R[0] = f1;
     f->R[1] = g1;
@@ -243,122 +254,136 @@ inline __m128 IIR12Aquad(QuadFilterUnitState *__restrict f, __m128 in)
     return y;
 }
 
-inline __m128 IIR12Bquad(QuadFilterUnitState *__restrict f, __m128 in)
+inline SIMD_M128 IIR12Bquad(QuadFilterUnitState *__restrict f, SIMD_M128 in)
 {
-    __m128 f2 = _mm_sub_ps(_mm_mul_ps(f->C[3], in), _mm_mul_ps(f->C[1], f->R[1])); // Q2*in - K2*R1
-    f->C[1] = _mm_add_ps(f->C[1], f->dC[1]);                                       // K2
-    f->C[3] = _mm_add_ps(f->C[3], f->dC[3]);                                       // Q2
-    __m128 g2 = _mm_add_ps(_mm_mul_ps(f->C[1], in), _mm_mul_ps(f->C[3], f->R[1])); // K2*in + Q2*R1
-
-    __m128 f1 = _mm_sub_ps(_mm_mul_ps(f->C[2], f2), _mm_mul_ps(f->C[0], f->R[0])); // Q1*f2 - K1*R0
-    f->C[0] = _mm_add_ps(f->C[0], f->dC[0]);                                       // K1
-    f->C[2] = _mm_add_ps(f->C[2], f->dC[2]);                                       // Q1
-    __m128 g1 = _mm_add_ps(_mm_mul_ps(f->C[0], f2), _mm_mul_ps(f->C[2], f->R[0])); // K1*f2 + Q1*R0
-
-    f->C[4] = _mm_add_ps(f->C[4], f->dC[4]); // V1
-    f->C[5] = _mm_add_ps(f->C[5], f->dC[5]); // V2
-    f->C[6] = _mm_add_ps(f->C[6], f->dC[6]); // V3
-    __m128 y = _mm_add_ps(_mm_add_ps(_mm_mul_ps(f->C[6], g2), _mm_mul_ps(f->C[5], g1)),
-                          _mm_mul_ps(f->C[4], f1));
-
-    f->R[0] = _mm_mul_ps(f1, f->R[2]);
-    f->R[1] = _mm_mul_ps(g1, f->R[2]);
-
-    f->C[7] = _mm_add_ps(f->C[7], f->dC[7]); // Clipgain
-    const __m128 m01 = _mm_set1_ps(0.1f);
-    const __m128 m1 = _mm_set1_ps(1.0f);
-
-    f->R[2] = _mm_max_ps(m01, _mm_sub_ps(m1, _mm_mul_ps(f->C[7], _mm_mul_ps(y, y))));
+    auto f2 = SIMD_MM(sub_ps)(SIMD_MM(mul_ps)(f->C[3], in),
+                              SIMD_MM(mul_ps)(f->C[1], f->R[1])); // Q2*in - K2*R1
+    f->C[1] = SIMD_MM(add_ps)(f->C[1], f->dC[1]);                 // K2
+    f->C[3] = SIMD_MM(add_ps)(f->C[3], f->dC[3]);                 // Q2
+    auto g2 = SIMD_MM(add_ps)(SIMD_MM(mul_ps)(f->C[1], in),
+                              SIMD_MM(mul_ps)(f->C[3], f->R[1])); // K2*in + Q2*R1
+
+    auto f1 = SIMD_MM(sub_ps)(SIMD_MM(mul_ps)(f->C[2], f2),
+                              SIMD_MM(mul_ps)(f->C[0], f->R[0])); // Q1*f2 - K1*R0
+    f->C[0] = SIMD_MM(add_ps)(f->C[0], f->dC[0]);                 // K1
+    f->C[2] = SIMD_MM(add_ps)(f->C[2], f->dC[2]);                 // Q1
+    auto g1 = SIMD_MM(add_ps)(SIMD_MM(mul_ps)(f->C[0], f2),
+                              SIMD_MM(mul_ps)(f->C[2], f->R[0])); // K1*f2 + Q1*R0
+
+    f->C[4] = SIMD_MM(add_ps)(f->C[4], f->dC[4]); // V1
+    f->C[5] = SIMD_MM(add_ps)(f->C[5], f->dC[5]); // V2
+    f->C[6] = SIMD_MM(add_ps)(f->C[6], f->dC[6]); // V3
+    auto y =
+        SIMD_MM(add_ps)(SIMD_MM(add_ps)(SIMD_MM(mul_ps)(f->C[6], g2), SIMD_MM(mul_ps)(f->C[5], g1)),
+                        SIMD_MM(mul_ps)(f->C[4], f1));
+
+    f->R[0] = SIMD_MM(mul_ps)(f1, f->R[2]);
+    f->R[1] = SIMD_MM(mul_ps)(g1, f->R[2]);
+
+    f->C[7] = SIMD_MM(add_ps)(f->C[7], f->dC[7]); // Clipgain
+    const auto m01 = SIMD_MM(set1_ps)(0.1f);
+    const auto m1 = SIMD_MM(set1_ps)(1.0f);
+
+    f->R[2] =
+        SIMD_MM(max_ps)(m01, SIMD_MM(sub_ps)(m1, SIMD_MM(mul_ps)(f->C[7], SIMD_MM(mul_ps)(y, y))));
 
     return y;
 }
 
-inline __m128 IIR12WDFquad(QuadFilterUnitState *__restrict f, __m128 in)
+inline SIMD_M128 IIR12WDFquad(QuadFilterUnitState *__restrict f, SIMD_M128 in)
 {
-    f->C[0] = _mm_add_ps(f->C[0], f->dC[0]); // E1 * sc
-    f->C[1] = _mm_add_ps(f->C[1], f->dC[1]); // E2 * sc
-    f->C[2] = _mm_add_ps(f->C[2], f->dC[2]); // -E1 / sc
-    f->C[3] = _mm_add_ps(f->C[3], f->dC[3]); // -E2 / sc
-    f->C[4] = _mm_add_ps(f->C[4], f->dC[4]); // C1
-    f->C[5] = _mm_add_ps(f->C[5], f->dC[5]); // C2
-    f->C[6] = _mm_add_ps(f->C[6], f->dC[6]); // D
-
-    __m128 y = _mm_add_ps(_mm_add_ps(_mm_mul_ps(f->C[4], f->R[0]), _mm_mul_ps(f->C[6], in)),
-                          _mm_mul_ps(f->C[5], f->R[1]));
-    __m128 t =
-        _mm_add_ps(in, _mm_add_ps(_mm_mul_ps(f->C[2], f->R[0]), _mm_mul_ps(f->C[3], f->R[1])));
-
-    __m128 s1 = _mm_add_ps(_mm_mul_ps(t, f->C[0]), f->R[0]);
-    __m128 s2 = _mm_sub_ps(_mm_setzero_ps(), _mm_add_ps(_mm_mul_ps(t, f->C[1]), f->R[1]));
+    f->C[0] = SIMD_MM(add_ps)(f->C[0], f->dC[0]); // E1 * sc
+    f->C[1] = SIMD_MM(add_ps)(f->C[1], f->dC[1]); // E2 * sc
+    f->C[2] = SIMD_MM(add_ps)(f->C[2], f->dC[2]); // -E1 / sc
+    f->C[3] = SIMD_MM(add_ps)(f->C[3], f->dC[3]); // -E2 / sc
+    f->C[4] = SIMD_MM(add_ps)(f->C[4], f->dC[4]); // C1
+    f->C[5] = SIMD_MM(add_ps)(f->C[5], f->dC[5]); // C2
+    f->C[6] = SIMD_MM(add_ps)(f->C[6], f->dC[6]); // D
+
+    auto y = SIMD_MM(add_ps)(
+        SIMD_MM(add_ps)(SIMD_MM(mul_ps)(f->C[4], f->R[0]), SIMD_MM(mul_ps)(f->C[6], in)),
+        SIMD_MM(mul_ps)(f->C[5], f->R[1]));
+    auto t = SIMD_MM(add_ps)(
+        in, SIMD_MM(add_ps)(SIMD_MM(mul_ps)(f->C[2], f->R[0]), SIMD_MM(mul_ps)(f->C[3], f->R[1])));
+
+    auto s1 = SIMD_MM(add_ps)(SIMD_MM(mul_ps)(t, f->C[0]), f->R[0]);
+    auto s2 = SIMD_MM(sub_ps)(SIMD_MM(setzero_ps)(),
+                              SIMD_MM(add_ps)(SIMD_MM(mul_ps)(t, f->C[1]), f->R[1]));
 
     // f->R[0] = s1;
     // f->R[1] = s2;
 
-    f->R[0] = _mm_mul_ps(s1, f->R[2]);
-    f->R[1] = _mm_mul_ps(s2, f->R[2]);
+    f->R[0] = SIMD_MM(mul_ps)(s1, f->R[2]);
+    f->R[1] = SIMD_MM(mul_ps)(s2, f->R[2]);
 
-    f->C[7] = _mm_add_ps(f->C[7], f->dC[7]); // Clipgain
-    const __m128 m01 = _mm_set1_ps(0.1f);
-    const __m128 m1 = _mm_set1_ps(1.0f);
-    f->R[2] = _mm_max_ps(m01, _mm_sub_ps(m1, _mm_mul_ps(f->C[7], _mm_mul_ps(y, y))));
+    f->C[7] = SIMD_MM(add_ps)(f->C[7], f->dC[7]); // Clipgain
+    const auto m01 = SIMD_MM(set1_ps)(0.1f);
+    const auto m1 = SIMD_MM(set1_ps)(1.0f);
+    f->R[2] =
+        SIMD_MM(max_ps)(m01, SIMD_MM(sub_ps)(m1, SIMD_MM(mul_ps)(f->C[7], SIMD_MM(mul_ps)(y, y))));
 
     return y;
 }
 
-inline __m128 IIR12CFCquad(QuadFilterUnitState *__restrict f, __m128 in)
+inline SIMD_M128 IIR12CFCquad(QuadFilterUnitState *__restrict f, SIMD_M128 in)
 {
     // State-space with clipgain (2nd order, limit within register)
 
-    f->C[0] = _mm_add_ps(f->C[0], f->dC[0]); // ar
-    f->C[1] = _mm_add_ps(f->C[1], f->dC[1]); // ai
-    f->C[2] = _mm_add_ps(f->C[2], f->dC[2]); // b1
-    f->C[4] = _mm_add_ps(f->C[4], f->dC[4]); // c1
-    f->C[5] = _mm_add_ps(f->C[5], f->dC[5]); // c2
-    f->C[6] = _mm_add_ps(f->C[6], f->dC[6]); // d
+    f->C[0] = SIMD_MM(add_ps)(f->C[0], f->dC[0]); // ar
+    f->C[1] = SIMD_MM(add_ps)(f->C[1], f->dC[1]); // ai
+    f->C[2] = SIMD_MM(add_ps)(f->C[2], f->dC[2]); // b1
+    f->C[4] = SIMD_MM(add_ps)(f->C[4], f->dC[4]); // c1
+    f->C[5] = SIMD_MM(add_ps)(f->C[5], f->dC[5]); // c2
+    f->C[6] = SIMD_MM(add_ps)(f->C[6], f->dC[6]); // d
 
     // y(i) = c1.*s(1) + c2.*s(2) + d.*x(i);
     // s1 = ar.*s(1) - ai.*s(2) + x(i);
     // s2 = ai.*s(1) + ar.*s(2);
 
-    __m128 y = _mm_add_ps(_mm_add_ps(_mm_mul_ps(f->C[4], f->R[0]), _mm_mul_ps(f->C[6], in)),
-                          _mm_mul_ps(f->C[5], f->R[1]));
-    __m128 s1 = _mm_add_ps(_mm_mul_ps(in, f->C[2]),
-                           _mm_sub_ps(_mm_mul_ps(f->C[0], f->R[0]), _mm_mul_ps(f->C[1], f->R[1])));
-    __m128 s2 = _mm_add_ps(_mm_mul_ps(f->C[1], f->R[0]), _mm_mul_ps(f->C[0], f->R[1]));
+    auto y = SIMD_MM(add_ps)(
+        SIMD_MM(add_ps)(SIMD_MM(mul_ps)(f->C[4], f->R[0]), SIMD_MM(mul_ps)(f->C[6], in)),
+        SIMD_MM(mul_ps)(f->C[5], f->R[1]));
+    auto s1 = SIMD_MM(add_ps)(
+        SIMD_MM(mul_ps)(in, f->C[2]),
+        SIMD_MM(sub_ps)(SIMD_MM(mul_ps)(f->C[0], f->R[0]), SIMD_MM(mul_ps)(f->C[1], f->R[1])));
+    auto s2 = SIMD_MM(add_ps)(SIMD_MM(mul_ps)(f->C[1], f->R[0]), SIMD_MM(mul_ps)(f->C[0], f->R[1]));
 
-    f->R[0] = _mm_mul_ps(s1, f->R[2]);
-    f->R[1] = _mm_mul_ps(s2, f->R[2]);
+    f->R[0] = SIMD_MM(mul_ps)(s1, f->R[2]);
+    f->R[1] = SIMD_MM(mul_ps)(s2, f->R[2]);
 
-    f->C[7] = _mm_add_ps(f->C[7], f->dC[7]); // Clipgain
-    const __m128 m01 = _mm_set1_ps(0.1f);
-    const __m128 m1 = _mm_set1_ps(1.0f);
-    f->R[2] = _mm_max_ps(m01, _mm_sub_ps(m1, _mm_mul_ps(f->C[7], _mm_mul_ps(y, y))));
+    f->C[7] = SIMD_MM(add_ps)(f->C[7], f->dC[7]); // Clipgain
+    const auto m01 = SIMD_MM(set1_ps)(0.1f);
+    const auto m1 = SIMD_MM(set1_ps)(1.0f);
+    f->R[2] =
+        SIMD_MM(max_ps)(m01, SIMD_MM(sub_ps)(m1, SIMD_MM(mul_ps)(f->C[7], SIMD_MM(mul_ps)(y, y))));
 
     return y;
 }
 
-inline __m128 IIR12CFLquad(QuadFilterUnitState *__restrict f, __m128 in)
+inline SIMD_M128 IIR12CFLquad(QuadFilterUnitState *__restrict f, SIMD_M128 in)
 {
     // State-space with softer limiter
 
-    f->C[0] = _mm_add_ps(f->C[0], f->dC[0]); // (ar)
-    f->C[1] = _mm_add_ps(f->C[1], f->dC[1]); // (ai)
-    f->C[2] = _mm_add_ps(f->C[2], f->dC[2]); // b1
-    f->C[4] = _mm_add_ps(f->C[4], f->dC[4]); // c1
-    f->C[5] = _mm_add_ps(f->C[5], f->dC[5]); // c2
-    f->C[6] = _mm_add_ps(f->C[6], f->dC[6]); // d
+    f->C[0] = SIMD_MM(add_ps)(f->C[0], f->dC[0]); // (ar)
+    f->C[1] = SIMD_MM(add_ps)(f->C[1], f->dC[1]); // (ai)
+    f->C[2] = SIMD_MM(add_ps)(f->C[2], f->dC[2]); // b1
+    f->C[4] = SIMD_MM(add_ps)(f->C[4], f->dC[4]); // c1
+    f->C[5] = SIMD_MM(add_ps)(f->C[5], f->dC[5]); // c2
+    f->C[6] = SIMD_MM(add_ps)(f->C[6], f->dC[6]); // d
 
     // y(i) = c1.*s(1) + c2.*s(2) + d.*x(i);
     // s1 = ar.*s(1) - ai.*s(2) + x(i);
     // s2 = ai.*s(1) + ar.*s(2);
 
-    __m128 y = _mm_add_ps(_mm_add_ps(_mm_mul_ps(f->C[4], f->R[0]), _mm_mul_ps(f->C[6], in)),
-                          _mm_mul_ps(f->C[5], f->R[1]));
-    __m128 ar = _mm_mul_ps(f->C[0], f->R[2]);
-    __m128 ai = _mm_mul_ps(f->C[1], f->R[2]);
-    __m128 s1 = _mm_add_ps(_mm_mul_ps(in, f->C[2]),
-                           _mm_sub_ps(_mm_mul_ps(ar, f->R[0]), _mm_mul_ps(ai, f->R[1])));
-    __m128 s2 = _mm_add_ps(_mm_mul_ps(ai, f->R[0]), _mm_mul_ps(ar, f->R[1]));
+    auto y = SIMD_MM(add_ps)(
+        SIMD_MM(add_ps)(SIMD_MM(mul_ps)(f->C[4], f->R[0]), SIMD_MM(mul_ps)(f->C[6], in)),
+        SIMD_MM(mul_ps)(f->C[5], f->R[1]));
+    auto ar = SIMD_MM(mul_ps)(f->C[0], f->R[2]);
+    auto ai = SIMD_MM(mul_ps)(f->C[1], f->R[2]);
+    auto s1 = SIMD_MM(add_ps)(
+        SIMD_MM(mul_ps)(in, f->C[2]),
+        SIMD_MM(sub_ps)(SIMD_MM(mul_ps)(ar, f->R[0]), SIMD_MM(mul_ps)(ai, f->R[1])));
+    auto s2 = SIMD_MM(add_ps)(SIMD_MM(mul_ps)(ai, f->R[0]), SIMD_MM(mul_ps)(ar, f->R[1]));
 
     f->R[0] = s1;
     f->R[1] = s2;
@@ -367,84 +392,93 @@ inline __m128 IIR12CFLquad(QuadFilterUnitState *__restrict f, __m128 in)
     mr = mr.*0.99 + m.*0.01;*/
 
     // Limiter
-    const __m128 m001 = _mm_set1_ps(0.001f);
-    const __m128 m099 = _mm_set1_ps(0.999f);
-    const __m128 m1 = _mm_set1_ps(1.0f);
-    const __m128 m2 = _mm_set1_ps(2.0f);
+    const auto m001 = SIMD_MM(set1_ps)(0.001f);
+    const auto m099 = SIMD_MM(set1_ps)(0.999f);
+    const auto m1 = SIMD_MM(set1_ps)(1.0f);
+    const auto m2 = SIMD_MM(set1_ps)(2.0f);
 
-    __m128 m = _mm_rsqrt_ps(
-        _mm_max_ps(m1, _mm_mul_ps(m2, _mm_and_ps(y, basic_blocks::mechanics::m128_mask_absval))));
-    f->R[2] = _mm_add_ps(_mm_mul_ps(f->R[2], m099), _mm_mul_ps(m, m001));
+    auto m = SIMD_MM(rsqrt_ps)(SIMD_MM(max_ps)(
+        m1, SIMD_MM(mul_ps)(m2, SIMD_MM(and_ps)(y, basic_blocks::mechanics::m128_mask_absval))));
+    f->R[2] = SIMD_MM(add_ps)(SIMD_MM(mul_ps)(f->R[2], m099), SIMD_MM(mul_ps)(m, m001));
 
     return y;
 }
 
-inline __m128 IIR24CFCquad(QuadFilterUnitState *__restrict f, __m128 in)
+inline SIMD_M128 IIR24CFCquad(QuadFilterUnitState *__restrict f, SIMD_M128 in)
 {
     // State-space with clipgain (2nd order, limit within register)
 
-    f->C[0] = _mm_add_ps(f->C[0], f->dC[0]); // ar
-    f->C[1] = _mm_add_ps(f->C[1], f->dC[1]); // ai
-    f->C[2] = _mm_add_ps(f->C[2], f->dC[2]); // b1
-
-    f->C[4] = _mm_add_ps(f->C[4], f->dC[4]); // c1
-    f->C[5] = _mm_add_ps(f->C[5], f->dC[5]); // c2
-    f->C[6] = _mm_add_ps(f->C[6], f->dC[6]); // d
-
-    __m128 y = _mm_add_ps(_mm_add_ps(_mm_mul_ps(f->C[4], f->R[0]), _mm_mul_ps(f->C[6], in)),
-                          _mm_mul_ps(f->C[5], f->R[1]));
-    __m128 s1 = _mm_add_ps(_mm_mul_ps(in, f->C[2]),
-                           _mm_sub_ps(_mm_mul_ps(f->C[0], f->R[0]), _mm_mul_ps(f->C[1], f->R[1])));
-    __m128 s2 = _mm_add_ps(_mm_mul_ps(f->C[1], f->R[0]), _mm_mul_ps(f->C[0], f->R[1]));
-
-    f->R[0] = _mm_mul_ps(s1, f->R[2]);
-    f->R[1] = _mm_mul_ps(s2, f->R[2]);
-
-    __m128 y2 = _mm_add_ps(_mm_add_ps(_mm_mul_ps(f->C[4], f->R[3]), _mm_mul_ps(f->C[6], y)),
-                           _mm_mul_ps(f->C[5], f->R[4]));
-    __m128 s3 = _mm_add_ps(_mm_mul_ps(y, f->C[2]),
-                           _mm_sub_ps(_mm_mul_ps(f->C[0], f->R[3]), _mm_mul_ps(f->C[1], f->R[4])));
-    __m128 s4 = _mm_add_ps(_mm_mul_ps(f->C[1], f->R[3]), _mm_mul_ps(f->C[0], f->R[4]));
-
-    f->R[3] = _mm_mul_ps(s3, f->R[2]);
-    f->R[4] = _mm_mul_ps(s4, f->R[2]);
-
-    f->C[7] = _mm_add_ps(f->C[7], f->dC[7]); // Clipgain
-    const __m128 m01 = _mm_set1_ps(0.1f);
-    const __m128 m1 = _mm_set1_ps(1.0f);
-    f->R[2] = _mm_max_ps(m01, _mm_sub_ps(m1, _mm_mul_ps(f->C[7], _mm_mul_ps(y2, y2))));
+    f->C[0] = SIMD_MM(add_ps)(f->C[0], f->dC[0]); // ar
+    f->C[1] = SIMD_MM(add_ps)(f->C[1], f->dC[1]); // ai
+    f->C[2] = SIMD_MM(add_ps)(f->C[2], f->dC[2]); // b1
+
+    f->C[4] = SIMD_MM(add_ps)(f->C[4], f->dC[4]); // c1
+    f->C[5] = SIMD_MM(add_ps)(f->C[5], f->dC[5]); // c2
+    f->C[6] = SIMD_MM(add_ps)(f->C[6], f->dC[6]); // d
+
+    auto y = SIMD_MM(add_ps)(
+        SIMD_MM(add_ps)(SIMD_MM(mul_ps)(f->C[4], f->R[0]), SIMD_MM(mul_ps)(f->C[6], in)),
+        SIMD_MM(mul_ps)(f->C[5], f->R[1]));
+    auto s1 = SIMD_MM(add_ps)(
+        SIMD_MM(mul_ps)(in, f->C[2]),
+        SIMD_MM(sub_ps)(SIMD_MM(mul_ps)(f->C[0], f->R[0]), SIMD_MM(mul_ps)(f->C[1], f->R[1])));
+    auto s2 = SIMD_MM(add_ps)(SIMD_MM(mul_ps)(f->C[1], f->R[0]), SIMD_MM(mul_ps)(f->C[0], f->R[1]));
+
+    f->R[0] = SIMD_MM(mul_ps)(s1, f->R[2]);
+    f->R[1] = SIMD_MM(mul_ps)(s2, f->R[2]);
+
+    auto y2 = SIMD_MM(add_ps)(
+        SIMD_MM(add_ps)(SIMD_MM(mul_ps)(f->C[4], f->R[3]), SIMD_MM(mul_ps)(f->C[6], y)),
+        SIMD_MM(mul_ps)(f->C[5], f->R[4]));
+    auto s3 = SIMD_MM(add_ps)(
+        SIMD_MM(mul_ps)(y, f->C[2]),
+        SIMD_MM(sub_ps)(SIMD_MM(mul_ps)(f->C[0], f->R[3]), SIMD_MM(mul_ps)(f->C[1], f->R[4])));
+    auto s4 = SIMD_MM(add_ps)(SIMD_MM(mul_ps)(f->C[1], f->R[3]), SIMD_MM(mul_ps)(f->C[0], f->R[4]));
+
+    f->R[3] = SIMD_MM(mul_ps)(s3, f->R[2]);
+    f->R[4] = SIMD_MM(mul_ps)(s4, f->R[2]);
+
+    f->C[7] = SIMD_MM(add_ps)(f->C[7], f->dC[7]); // Clipgain
+    const auto m01 = SIMD_MM(set1_ps)(0.1f);
+    const auto m1 = SIMD_MM(set1_ps)(1.0f);
+    f->R[2] = SIMD_MM(max_ps)(
+        m01, SIMD_MM(sub_ps)(m1, SIMD_MM(mul_ps)(f->C[7], SIMD_MM(mul_ps)(y2, y2))));
 
     return y2;
 }
 
-inline __m128 IIR24CFLquad(QuadFilterUnitState *__restrict f, __m128 in)
+inline SIMD_M128 IIR24CFLquad(QuadFilterUnitState *__restrict f, SIMD_M128 in)
 {
     // State-space with softer limiter
 
-    f->C[0] = _mm_add_ps(f->C[0], f->dC[0]); // (ar)
-    f->C[1] = _mm_add_ps(f->C[1], f->dC[1]); // (ai)
-    f->C[2] = _mm_add_ps(f->C[2], f->dC[2]); // b1
-    f->C[4] = _mm_add_ps(f->C[4], f->dC[4]); // c1
-    f->C[5] = _mm_add_ps(f->C[5], f->dC[5]); // c2
-    f->C[6] = _mm_add_ps(f->C[6], f->dC[6]); // d
+    f->C[0] = SIMD_MM(add_ps)(f->C[0], f->dC[0]); // (ar)
+    f->C[1] = SIMD_MM(add_ps)(f->C[1], f->dC[1]); // (ai)
+    f->C[2] = SIMD_MM(add_ps)(f->C[2], f->dC[2]); // b1
+    f->C[4] = SIMD_MM(add_ps)(f->C[4], f->dC[4]); // c1
+    f->C[5] = SIMD_MM(add_ps)(f->C[5], f->dC[5]); // c2
+    f->C[6] = SIMD_MM(add_ps)(f->C[6], f->dC[6]); // d
 
-    __m128 ar = _mm_mul_ps(f->C[0], f->R[2]);
-    __m128 ai = _mm_mul_ps(f->C[1], f->R[2]);
+    auto ar = SIMD_MM(mul_ps)(f->C[0], f->R[2]);
+    auto ai = SIMD_MM(mul_ps)(f->C[1], f->R[2]);
 
-    __m128 y = _mm_add_ps(_mm_add_ps(_mm_mul_ps(f->C[4], f->R[0]), _mm_mul_ps(f->C[6], in)),
-                          _mm_mul_ps(f->C[5], f->R[1]));
-    __m128 s1 = _mm_add_ps(_mm_mul_ps(in, f->C[2]),
-                           _mm_sub_ps(_mm_mul_ps(ar, f->R[0]), _mm_mul_ps(ai, f->R[1])));
-    __m128 s2 = _mm_add_ps(_mm_mul_ps(ai, f->R[0]), _mm_mul_ps(ar, f->R[1]));
+    auto y = SIMD_MM(add_ps)(
+        SIMD_MM(add_ps)(SIMD_MM(mul_ps)(f->C[4], f->R[0]), SIMD_MM(mul_ps)(f->C[6], in)),
+        SIMD_MM(mul_ps)(f->C[5], f->R[1]));
+    auto s1 = SIMD_MM(add_ps)(
+        SIMD_MM(mul_ps)(in, f->C[2]),
+        SIMD_MM(sub_ps)(SIMD_MM(mul_ps)(ar, f->R[0]), SIMD_MM(mul_ps)(ai, f->R[1])));
+    auto s2 = SIMD_MM(add_ps)(SIMD_MM(mul_ps)(ai, f->R[0]), SIMD_MM(mul_ps)(ar, f->R[1]));
 
     f->R[0] = s1;
     f->R[1] = s2;
 
-    __m128 y2 = _mm_add_ps(_mm_add_ps(_mm_mul_ps(f->C[4], f->R[3]), _mm_mul_ps(f->C[6], y)),
-                           _mm_mul_ps(f->C[5], f->R[4]));
-    __m128 s3 = _mm_add_ps(_mm_mul_ps(y, f->C[2]),
-                           _mm_sub_ps(_mm_mul_ps(ar, f->R[3]), _mm_mul_ps(ai, f->R[4])));
-    __m128 s4 = _mm_add_ps(_mm_mul_ps(ai, f->R[3]), _mm_mul_ps(ar, f->R[4]));
+    auto y2 = SIMD_MM(add_ps)(
+        SIMD_MM(add_ps)(SIMD_MM(mul_ps)(f->C[4], f->R[3]), SIMD_MM(mul_ps)(f->C[6], y)),
+        SIMD_MM(mul_ps)(f->C[5], f->R[4]));
+    auto s3 =
+        SIMD_MM(add_ps)(SIMD_MM(mul_ps)(y, f->C[2]), SIMD_MM(sub_ps)(SIMD_MM(mul_ps)(ar, f->R[3]),
+                                                                     SIMD_MM(mul_ps)(ai, f->R[4])));
+    auto s4 = SIMD_MM(add_ps)(SIMD_MM(mul_ps)(ai, f->R[3]), SIMD_MM(mul_ps)(ar, f->R[4]));
 
     f->R[3] = s3;
     f->R[4] = s4;
@@ -453,114 +487,126 @@ inline __m128 IIR24CFLquad(QuadFilterUnitState *__restrict f, __m128 in)
     mr = mr.*0.99 + m.*0.01;*/
 
     // Limiter
-    const __m128 m001 = _mm_set1_ps(0.001f);
-    const __m128 m099 = _mm_set1_ps(0.999f);
-    const __m128 m1 = _mm_set1_ps(1.0f);
-    const __m128 m2 = _mm_set1_ps(2.0f);
+    const auto m001 = SIMD_MM(set1_ps)(0.001f);
+    const auto m099 = SIMD_MM(set1_ps)(0.999f);
+    const auto m1 = SIMD_MM(set1_ps)(1.0f);
+    const auto m2 = SIMD_MM(set1_ps)(2.0f);
 
-    __m128 m = _mm_rsqrt_ps(
-        _mm_max_ps(m1, _mm_mul_ps(m2, _mm_and_ps(y2, basic_blocks::mechanics::m128_mask_absval))));
-    f->R[2] = _mm_add_ps(_mm_mul_ps(f->R[2], m099), _mm_mul_ps(m, m001));
+    auto m = SIMD_MM(rsqrt_ps)(SIMD_MM(max_ps)(
+        m1, SIMD_MM(mul_ps)(m2, SIMD_MM(and_ps)(y2, basic_blocks::mechanics::m128_mask_absval))));
+    f->R[2] = SIMD_MM(add_ps)(SIMD_MM(mul_ps)(f->R[2], m099), SIMD_MM(mul_ps)(m, m001));
 
     return y2;
 }
 
-inline __m128 IIR24Bquad(QuadFilterUnitState *__restrict f, __m128 in)
+inline SIMD_M128 IIR24Bquad(QuadFilterUnitState *__restrict f, SIMD_M128 in)
 {
-    f->C[1] = _mm_add_ps(f->C[1], f->dC[1]); // K2
-    f->C[3] = _mm_add_ps(f->C[3], f->dC[3]); // Q2
-    f->C[0] = _mm_add_ps(f->C[0], f->dC[0]); // K1
-    f->C[2] = _mm_add_ps(f->C[2], f->dC[2]); // Q1
-    f->C[4] = _mm_add_ps(f->C[4], f->dC[4]); // V1
-    f->C[5] = _mm_add_ps(f->C[5], f->dC[5]); // V2
-    f->C[6] = _mm_add_ps(f->C[6], f->dC[6]); // V3
-
-    __m128 f2 = _mm_sub_ps(_mm_mul_ps(f->C[3], in), _mm_mul_ps(f->C[1], f->R[1])); // Q2*in - K2*R1
-    __m128 g2 = _mm_add_ps(_mm_mul_ps(f->C[1], in), _mm_mul_ps(f->C[3], f->R[1])); // K2*in + Q2*R1
-    __m128 f1 = _mm_sub_ps(_mm_mul_ps(f->C[2], f2), _mm_mul_ps(f->C[0], f->R[0])); // Q1*f2 - K1*R0
-    __m128 g1 = _mm_add_ps(_mm_mul_ps(f->C[0], f2), _mm_mul_ps(f->C[2], f->R[0])); // K1*f2 + Q1*R0
-    f->R[0] = _mm_mul_ps(f1, f->R[4]);
-    f->R[1] = _mm_mul_ps(g1, f->R[4]);
-    __m128 y1 = _mm_add_ps(_mm_add_ps(_mm_mul_ps(f->C[6], g2), _mm_mul_ps(f->C[5], g1)),
-                           _mm_mul_ps(f->C[4], f1));
-
-    f2 = _mm_sub_ps(_mm_mul_ps(f->C[3], y1), _mm_mul_ps(f->C[1], f->R[3])); // Q2*in - K2*R1
-    g2 = _mm_add_ps(_mm_mul_ps(f->C[1], y1), _mm_mul_ps(f->C[3], f->R[3])); // K2*in + Q2*R1
-    f1 = _mm_sub_ps(_mm_mul_ps(f->C[2], f2), _mm_mul_ps(f->C[0], f->R[2])); // Q1*f2 - K1*R0
-    g1 = _mm_add_ps(_mm_mul_ps(f->C[0], f2), _mm_mul_ps(f->C[2], f->R[2])); // K1*f2 + Q1*R0
-    f->R[2] = _mm_mul_ps(f1, f->R[4]);
-    f->R[3] = _mm_mul_ps(g1, f->R[4]);
-    __m128 y2 = _mm_add_ps(_mm_add_ps(_mm_mul_ps(f->C[6], g2), _mm_mul_ps(f->C[5], g1)),
-                           _mm_mul_ps(f->C[4], f1));
-
-    f->C[7] = _mm_add_ps(f->C[7], f->dC[7]); // Clipgain
-    const __m128 m01 = _mm_set1_ps(0.1f);
-    const __m128 m1 = _mm_set1_ps(1.0f);
-    f->R[4] = _mm_max_ps(m01, _mm_sub_ps(m1, _mm_mul_ps(f->C[7], _mm_mul_ps(y2, y2))));
+    f->C[1] = SIMD_MM(add_ps)(f->C[1], f->dC[1]); // K2
+    f->C[3] = SIMD_MM(add_ps)(f->C[3], f->dC[3]); // Q2
+    f->C[0] = SIMD_MM(add_ps)(f->C[0], f->dC[0]); // K1
+    f->C[2] = SIMD_MM(add_ps)(f->C[2], f->dC[2]); // Q1
+    f->C[4] = SIMD_MM(add_ps)(f->C[4], f->dC[4]); // V1
+    f->C[5] = SIMD_MM(add_ps)(f->C[5], f->dC[5]); // V2
+    f->C[6] = SIMD_MM(add_ps)(f->C[6], f->dC[6]); // V3
+
+    auto f2 = SIMD_MM(sub_ps)(SIMD_MM(mul_ps)(f->C[3], in),
+                              SIMD_MM(mul_ps)(f->C[1], f->R[1])); // Q2*in - K2*R1
+    auto g2 = SIMD_MM(add_ps)(SIMD_MM(mul_ps)(f->C[1], in),
+                              SIMD_MM(mul_ps)(f->C[3], f->R[1])); // K2*in + Q2*R1
+    auto f1 = SIMD_MM(sub_ps)(SIMD_MM(mul_ps)(f->C[2], f2),
+                              SIMD_MM(mul_ps)(f->C[0], f->R[0])); // Q1*f2 - K1*R0
+    auto g1 = SIMD_MM(add_ps)(SIMD_MM(mul_ps)(f->C[0], f2),
+                              SIMD_MM(mul_ps)(f->C[2], f->R[0])); // K1*f2 + Q1*R0
+    f->R[0] = SIMD_MM(mul_ps)(f1, f->R[4]); // stage-1 state, scaled by R[4]
+    f->R[1] = SIMD_MM(mul_ps)(g1, f->R[4]);
+    auto y1 =
+        SIMD_MM(add_ps)(SIMD_MM(add_ps)(SIMD_MM(mul_ps)(f->C[6], g2), SIMD_MM(mul_ps)(f->C[5], g1)),
+                        SIMD_MM(mul_ps)(f->C[4], f1));
+
+    f2 = SIMD_MM(sub_ps)(SIMD_MM(mul_ps)(f->C[3], y1),
+                         SIMD_MM(mul_ps)(f->C[1], f->R[3])); // Q2*y1 - K2*R3
+    g2 = SIMD_MM(add_ps)(SIMD_MM(mul_ps)(f->C[1], y1),
+                         SIMD_MM(mul_ps)(f->C[3], f->R[3])); // K2*y1 + Q2*R3
+    f1 = SIMD_MM(sub_ps)(SIMD_MM(mul_ps)(f->C[2], f2),
+                         SIMD_MM(mul_ps)(f->C[0], f->R[2])); // Q1*f2 - K1*R2
+    g1 = SIMD_MM(add_ps)(SIMD_MM(mul_ps)(f->C[0], f2),
+                         SIMD_MM(mul_ps)(f->C[2], f->R[2])); // K1*f2 + Q1*R2
+    f->R[2] = SIMD_MM(mul_ps)(f1, f->R[4]); // stage-2 state, scaled by R[4]
+    f->R[3] = SIMD_MM(mul_ps)(g1, f->R[4]);
+    auto y2 =
+        SIMD_MM(add_ps)(SIMD_MM(add_ps)(SIMD_MM(mul_ps)(f->C[6], g2), SIMD_MM(mul_ps)(f->C[5], g1)),
+                        SIMD_MM(mul_ps)(f->C[4], f1));
+
+    f->C[7] = SIMD_MM(add_ps)(f->C[7], f->dC[7]); // Clipgain
+    const auto m01 = SIMD_MM(set1_ps)(0.1f);
+    const auto m1 = SIMD_MM(set1_ps)(1.0f);
+    f->R[4] = SIMD_MM(max_ps)(
+        m01, SIMD_MM(sub_ps)(m1, SIMD_MM(mul_ps)(f->C[7], SIMD_MM(mul_ps)(y2, y2))));
 
     return y2;
 }
 
 template <FilterSubType subtype>
-inline __m128 LPMOOGquad(QuadFilterUnitState *__restrict f, __m128 in)
+inline SIMD_M128 LPMOOGquad(QuadFilterUnitState *__restrict f, SIMD_M128 in)
 {
-    f->C[0] = _mm_add_ps(f->C[0], f->dC[0]);
-    f->C[1] = _mm_add_ps(f->C[1], f->dC[1]);
-    f->C[2] = _mm_add_ps(f->C[2], f->dC[2]);
-
-    f->R[0] = basic_blocks::dsp::softclip8_ps(_mm_add_ps(
-        f->R[0],
-        _mm_mul_ps(f->C[1],
-                   _mm_sub_ps(_mm_sub_ps(_mm_mul_ps(in, f->C[0]),
-                                         _mm_mul_ps(f->C[2], _mm_add_ps(f->R[3], f->R[4]))),
-                              f->R[0]))));
-    f->R[1] = _mm_add_ps(f->R[1], _mm_mul_ps(f->C[1], _mm_sub_ps(f->R[0], f->R[1])));
-    f->R[2] = _mm_add_ps(f->R[2], _mm_mul_ps(f->C[1], _mm_sub_ps(f->R[1], f->R[2])));
+    f->C[0] = SIMD_MM(add_ps)(f->C[0], f->dC[0]);
+    f->C[1] = SIMD_MM(add_ps)(f->C[1], f->dC[1]);
+    f->C[2] = SIMD_MM(add_ps)(f->C[2], f->dC[2]);
+
+    f->R[0] = basic_blocks::dsp::softclip8_ps(SIMD_MM(add_ps)(
+        f->R[0], SIMD_MM(mul_ps)(
+                     f->C[1], SIMD_MM(sub_ps)(
+                                  SIMD_MM(sub_ps)(
+                                      SIMD_MM(mul_ps)(in, f->C[0]),
+                                      SIMD_MM(mul_ps)(f->C[2], SIMD_MM(add_ps)(f->R[3], f->R[4]))),
+                                  f->R[0]))));
+    f->R[1] = SIMD_MM(add_ps)(f->R[1], SIMD_MM(mul_ps)(f->C[1], SIMD_MM(sub_ps)(f->R[0], f->R[1])));
+    f->R[2] = SIMD_MM(add_ps)(f->R[2], SIMD_MM(mul_ps)(f->C[1], SIMD_MM(sub_ps)(f->R[1], f->R[2])));
     f->R[4] = f->R[3];
-    f->R[3] = _mm_add_ps(f->R[3], _mm_mul_ps(f->C[1], _mm_sub_ps(f->R[2], f->R[3])));
+    f->R[3] = SIMD_MM(add_ps)(f->R[3], SIMD_MM(mul_ps)(f->C[1], SIMD_MM(sub_ps)(f->R[2], f->R[3])));
 
     return f->R[subtype];
 }
 
-inline __m128 SNHquad(QuadFilterUnitState *__restrict f, __m128 in)
+inline SIMD_M128 SNHquad(QuadFilterUnitState *__restrict f, SIMD_M128 in)
 {
-    f->C[0] = _mm_add_ps(f->C[0], f->dC[0]);
-    f->C[1] = _mm_add_ps(f->C[1], f->dC[1]);
+    f->C[0] = SIMD_MM(add_ps)(f->C[0], f->dC[0]); // step coefficients by their deltas
+    f->C[1] = SIMD_MM(add_ps)(f->C[1], f->dC[1]);
 
-    f->R[0] = _mm_add_ps(f->R[0], f->C[0]);
+    f->R[0] = SIMD_MM(add_ps)(f->R[0], f->C[0]); // accumulate C[0] into the R[0] counter
 
-    __m128 mask = _mm_cmpgt_ps(f->R[0], _mm_setzero_ps());
+    auto mask = SIMD_MM(cmpgt_ps)(f->R[0], SIMD_MM(setzero_ps)()); // lanes where R[0] > 0
 
-    f->R[1] = _mm_or_ps(_mm_andnot_ps(mask, f->R[1]),
-                        _mm_and_ps(mask, basic_blocks::dsp::softclip_ps(
-                                             _mm_sub_ps(in, _mm_mul_ps(f->C[1], f->R[1])))));
+    f->R[1] = SIMD_MM(or_ps)(SIMD_MM(andnot_ps)(mask, f->R[1]),
+                             SIMD_MM(and_ps)(mask, basic_blocks::dsp::softclip_ps(SIMD_MM(sub_ps)(
+                                                       in, SIMD_MM(mul_ps)(f->C[1], f->R[1])))));
 
-    const __m128 m1 = _mm_set1_ps(-1.f);
-    f->R[0] = _mm_add_ps(f->R[0], _mm_and_ps(m1, mask));
+    const auto m1 = SIMD_MM(set1_ps)(-1.f); // masked below: -1 in lanes where mask is set
+    f->R[0] = SIMD_MM(add_ps)(f->R[0], SIMD_MM(and_ps)(m1, mask));
 
     return f->R[1];
 }
 
 template <int COMB_SIZE> // COMB_SIZE must be a power of 2
-__m128 COMBquad_SSE2(QuadFilterUnitState *__restrict f, __m128 in)
+SIMD_M128 COMBquad_SSE2(QuadFilterUnitState *__restrict f, SIMD_M128 in)
 {
     static_assert(utilities::SincTable::FIRipol_M ==
                   256); // changing the constant requires updating the code below
-    const __m128 m256 = _mm_set1_ps(256.f);
-    const __m128i m0xff = _mm_set1_epi32(0xff);
+    const auto m256 = SIMD_MM(set1_ps)(256.f);
+    const SIMD_M128I m0xff = SIMD_MM(set1_epi32)(0xff);
 
-    f->C[0] = _mm_add_ps(f->C[0], f->dC[0]);
-    f->C[1] = _mm_add_ps(f->C[1], f->dC[1]);
+    f->C[0] = SIMD_MM(add_ps)(f->C[0], f->dC[0]);
+    f->C[1] = SIMD_MM(add_ps)(f->C[1], f->dC[1]);
 
-    __m128 a = _mm_mul_ps(f->C[0], m256);
-    __m128i e = _mm_cvtps_epi32(a);
+    auto a = SIMD_MM(mul_ps)(f->C[0], m256);
+    SIMD_M128I e = SIMD_MM(cvtps_epi32)(a);
     int DTi alignas(16)[4], SEi alignas(16)[4];
-    __m128i DT = _mm_srli_epi32(e, 8);
-    _mm_store_si128((__m128i *)DTi, DT);
-    __m128i SE = _mm_and_si128(e, m0xff);
-    SE = _mm_sub_epi32(m0xff, SE);
-    _mm_store_si128((__m128i *)SEi, SE);
-    __m128 DBRead = _mm_setzero_ps();
+    SIMD_M128I DT = SIMD_MM(srli_epi32)(e, 8);
+    SIMD_MM(store_si128)((SIMD_M128I *)DTi, DT);
+    SIMD_M128I SE = SIMD_MM(and_si128)(e, m0xff);
+    SE = SIMD_MM(sub_epi32)(m0xff, SE);
+    SIMD_MM(store_si128)((SIMD_M128I *)SEi, SE);
+    auto DBRead = SIMD_MM(setzero_ps)();
 
     for (int i = 0; i < 4; i++)
     {
@@ -569,24 +615,24 @@ __m128 COMBquad_SSE2(QuadFilterUnitState *__restrict f, __m128 in)
             int RP = (f->WP[i] - DTi[i] - utilities::SincTable::FIRoffset) & (COMB_SIZE - 1);
 
             // SINC interpolation (12 samples)
-            __m128 a = _mm_loadu_ps(&f->DB[i][RP]);
+            auto a = SIMD_MM(loadu_ps)(&f->DB[i][RP]);
             SEi[i] *= (utilities::SincTable::FIRipol_N << 1);
-            __m128 b = _mm_load_ps(&utilities::globalSincTable.sinctable[SEi[i]]);
-            __m128 o = _mm_mul_ps(a, b);
+            auto b = SIMD_MM(load_ps)(&utilities::globalSincTable.sinctable[SEi[i]]);
+            auto o = SIMD_MM(mul_ps)(a, b);
 
-            a = _mm_loadu_ps(&f->DB[i][RP + 4]);
-            b = _mm_load_ps(&utilities::globalSincTable.sinctable[SEi[i] + 4]);
-            o = _mm_add_ps(o, _mm_mul_ps(a, b));
+            a = SIMD_MM(loadu_ps)(&f->DB[i][RP + 4]);
+            b = SIMD_MM(load_ps)(&utilities::globalSincTable.sinctable[SEi[i] + 4]);
+            o = SIMD_MM(add_ps)(o, SIMD_MM(mul_ps)(a, b));
 
-            a = _mm_loadu_ps(&f->DB[i][RP + 8]);
-            b = _mm_load_ps(&utilities::globalSincTable.sinctable[SEi[i] + 8]);
-            o = _mm_add_ps(o, _mm_mul_ps(a, b));
+            a = SIMD_MM(loadu_ps)(&f->DB[i][RP + 8]);
+            b = SIMD_MM(load_ps)(&utilities::globalSincTable.sinctable[SEi[i] + 8]);
+            o = SIMD_MM(add_ps)(o, SIMD_MM(mul_ps)(a, b));
 
-            _mm_store_ss((float *)&DBRead + i, sst::basic_blocks::mechanics::sum_ps_to_ss(o));
+            SIMD_MM(store_ss)((float *)&DBRead + i, sst::basic_blocks::mechanics::sum_ps_to_ss(o));
         }
     }
 
-    __m128 d = _mm_add_ps(in, _mm_mul_ps(DBRead, f->C[1]));
+    auto d = SIMD_MM(add_ps)(in, SIMD_MM(mul_ps)(DBRead, f->C[1]));
     d = basic_blocks::dsp::softclip_ps(d);
 
     for (int i = 0; i < 4; i++)
@@ -594,24 +640,24 @@ __m128 COMBquad_SSE2(QuadFilterUnitState *__restrict f, __m128 in)
         if (f->active[i])
         {
             // Write to delaybuffer (with "anti-wrapping")
-            __m128 t = _mm_load_ss((float *)&d + i);
-            _mm_store_ss(&f->DB[i][f->WP[i]], t);
+            auto t = SIMD_MM(load_ss)((float *)&d + i);
+            SIMD_MM(store_ss)(&f->DB[i][f->WP[i]], t);
             if (f->WP[i] < utilities::SincTable::FIRipol_N)
-                _mm_store_ss(&f->DB[i][f->WP[i] + COMB_SIZE], t);
+                SIMD_MM(store_ss)(&f->DB[i][f->WP[i] + COMB_SIZE], t);
 
             // Increment write position
             f->WP[i] = (f->WP[i] + 1) & (COMB_SIZE - 1);
         }
     }
-    return _mm_add_ps(_mm_mul_ps(f->C[3], DBRead), _mm_mul_ps(f->C[2], in));
+    return SIMD_MM(add_ps)(SIMD_MM(mul_ps)(f->C[3], DBRead), SIMD_MM(mul_ps)(f->C[2], in));
 }
 
-template <int32_t scaleTimes1000, __m128 (*F)(QuadFilterUnitState *__restrict, __m128)>
-__m128 ScaleQFPtr(QuadFilterUnitState *__restrict s, __m128 in)
+template <int32_t scaleTimes1000, SIMD_M128 (*F)(QuadFilterUnitState *__restrict, SIMD_M128)>
+SIMD_M128 ScaleQFPtr(QuadFilterUnitState *__restrict s, SIMD_M128 in)
 {
-    const auto scale = _mm_set1_ps(scaleTimes1000 / 1000.f);
+    const auto scale = SIMD_MM(set1_ps)(scaleTimes1000 / 1000.f); // gain = scaleTimes1000 / 1000
     auto res = F(s, in);
-    return _mm_mul_ps(res, scale);
+    return SIMD_MM(mul_ps)(res, scale); // F's output times the gain
 }
 
 template <bool Compensated>
diff --git a/include/sst/filters/ResonanceWarp.h b/include/sst/filters/ResonanceWarp.h
index 33ddca4..57070c8 100644
--- a/include/sst/filters/ResonanceWarp.h
+++ b/include/sst/filters/ResonanceWarp.h
@@ -40,10 +40,10 @@ static float clampedFrequency(float pitch, float sampleRate, TuningProvider *pro
     return freq;
 }
 
-#define F(a) _mm_set_ps1(a)
-#define M(a, b) _mm_mul_ps(a, b)
-#define A(a, b) _mm_add_ps(a, b)
-#define S(a, b) _mm_sub_ps(a, b)
+#define F(a) SIMD_MM(set_ps1)(a)
+#define M(a, b) SIMD_MM(mul_ps)(a, b)
+#define A(a, b) SIMD_MM(add_ps)(a, b)
+#define S(a, b) SIMD_MM(sub_ps)(a, b)
 
 enum Saturator
 {
@@ -51,12 +51,12 @@ enum Saturator
     SAT_SOFT
 };
 
-static inline __m128 doNLFilter(const __m128 input, const __m128 a1, const __m128 a2,
-                                const __m128 b0, const __m128 b1, const __m128 b2, const int sat,
-                                __m128 &z1, __m128 &z2) noexcept
+static inline SIMD_M128 doNLFilter(const SIMD_M128 input, const SIMD_M128 a1, const SIMD_M128 a2,
+                                   const SIMD_M128 b0, const SIMD_M128 b1, const SIMD_M128 b2,
+                                   const int sat, SIMD_M128 &z1, SIMD_M128 &z2) noexcept
 {
     // out = z1 + b0 * input
-    const __m128 out = A(z1, M(b0, input));
+    const auto out = A(z1, M(b0, input));
 
     // z1 = z2 + b1 * input - a1 * out
     z1 = A(z2, S(M(b1, input), M(a1, out)));
@@ -157,7 +157,7 @@ void makeCoefficients(FilterCoefficientMaker<TuningProvider> *cm, float freq, fl
 }
 
 template <FilterSubType subtype>
-inline __m128 process(QuadFilterUnitState *__restrict f, __m128 input)
+inline SIMD_M128 process(QuadFilterUnitState *__restrict f, SIMD_M128 input)
 {
     // lower 2 bits of subtype is the stage count
     const int stages = subtype & 3;
diff --git a/include/sst/filters/TriPoleFilter.h b/include/sst/filters/TriPoleFilter.h
index 0a6d4df..bde1913 100644
--- a/include/sst/filters/TriPoleFilter.h
+++ b/include/sst/filters/TriPoleFilter.h
@@ -66,23 +66,23 @@ static float clampedFrequency(float pitch, float sampleRate, TuningProvider *pro
     return freq;
 }
 
-#define F(a) _mm_set_ps1(a)
-#define M(a, b) _mm_mul_ps(a, b)
-#define D(a, b) _mm_div_ps(a, b)
-#define A(a, b) _mm_add_ps(a, b)
-#define S(a, b) _mm_sub_ps(a, b)
+#define F(a) SIMD_MM(set_ps1)(a)
+#define M(a, b) SIMD_MM(mul_ps)(a, b)
+#define D(a, b) SIMD_MM(div_ps)(a, b)
+#define A(a, b) SIMD_MM(add_ps)(a, b)
+#define S(a, b) SIMD_MM(sub_ps)(a, b)
 #define N(a) S(F(0.0f), a)
 
 /** inverse square root sigmoid */
-static inline __m128 thr_sigmoid(__m128 x, float beta)
+static inline SIMD_M128 thr_sigmoid(SIMD_M128 x, float beta)
 {
-    __m128 vtmp = _mm_mul_ps(x, x);           // calculate in*in
-    __m128 vtmp2 = _mm_add_ps(vtmp, F(beta)); // in*in+1.f
-    vtmp = _mm_rsqrt_ps(vtmp2);               // 1/sqrt(in*in+1.f)
-    return _mm_mul_ps(vtmp, x);               // in*1/sqrt(in*in+1)
+    auto vtmp = SIMD_MM(mul_ps)(x, x);           // calculate in*in
+    auto vtmp2 = SIMD_MM(add_ps)(vtmp, F(beta)); // in*in+beta
+    vtmp = SIMD_MM(rsqrt_ps)(vtmp2);             // approx. 1/sqrt(in*in+beta)
+    return SIMD_MM(mul_ps)(vtmp, x);             // in/sqrt(in*in+beta)
 }
 
-static inline __m128 sech2_with_tanh(__m128 tanh_value)
+static inline SIMD_M128 sech2_with_tanh(SIMD_M128 tanh_value)
 {
     const auto one = F(1.0f);
     return S(one, M(tanh_value, tanh_value));
@@ -90,29 +90,30 @@ static inline __m128 sech2_with_tanh(__m128 tanh_value)
 
 namespace OnePoleLPF
 {
-static inline __m128 linOutput(__m128 x, __m128 z, __m128 b_coeff, __m128 a_coeff)
+static inline SIMD_M128 linOutput(SIMD_M128 x, SIMD_M128 z, SIMD_M128 b_coeff, SIMD_M128 a_coeff)
 {
     return M(a_coeff, A(M(b_coeff, x), z));
 }
 
-static inline __m128 nonlinOutput(__m128 tanh_x, __m128 tanh_y, __m128 z, __m128 b_coeff)
+static inline SIMD_M128 nonlinOutput(SIMD_M128 tanh_x, SIMD_M128 tanh_y, SIMD_M128 z,
+                                     SIMD_M128 b_coeff)
 {
     return A(M(b_coeff, S(tanh_x, tanh_y)), z);
 }
 
-static inline __m128 getDerivative(__m128 tanh_y, __m128 b_coeff)
+static inline SIMD_M128 getDerivative(SIMD_M128 tanh_y, SIMD_M128 b_coeff)
 {
     const auto one = F(1.0f);
     return S(M(N(b_coeff), sech2_with_tanh(tanh_y)), one);
 }
 
-static inline __m128 getXDerivative(__m128 tanh_x, __m128 b_coeff)
+static inline SIMD_M128 getXDerivative(SIMD_M128 tanh_x, SIMD_M128 b_coeff)
 {
     return M(b_coeff, sech2_with_tanh(tanh_x));
 }
 
-static inline __m128 process(__m128 tanh_x, __m128 z, __m128 estimate, __m128 b_coeff,
-                             __m128 a_coeff, float beta)
+static inline SIMD_M128 process(SIMD_M128 tanh_x, SIMD_M128 z, SIMD_M128 estimate,
+                                SIMD_M128 b_coeff, SIMD_M128 a_coeff, float beta)
 {
     estimate = linOutput(tanh_x, z, b_coeff, a_coeff);
     for (int i = 0; i < nIterStage; ++i)
@@ -128,26 +129,27 @@ static inline __m128 process(__m128 tanh_x, __m128 z, __m128 estimate, __m128 b_
 
 namespace OnePoleHPF
 {
-static inline __m128 linOutput(__m128 x_minus_x1_plus_z, __m128 a_coeff)
+static inline SIMD_M128 linOutput(SIMD_M128 x_minus_x1_plus_z, SIMD_M128 a_coeff)
 {
     return M(a_coeff, x_minus_x1_plus_z);
 }
 
-static inline __m128 nonlinOutput(__m128 x_minus_x1_plus_z, __m128 tanh_y, __m128 b_coeff)
+static inline SIMD_M128 nonlinOutput(SIMD_M128 x_minus_x1_plus_z, SIMD_M128 tanh_y,
+                                     SIMD_M128 b_coeff)
 {
     return A(M(N(b_coeff), tanh_y), x_minus_x1_plus_z);
 }
 
-static inline __m128 getDerivative(__m128 tanh_y, __m128 b_coeff)
+static inline SIMD_M128 getDerivative(SIMD_M128 tanh_y, SIMD_M128 b_coeff)
 {
     const auto neg_one = F(-1.0f);
     return A(M(N(b_coeff), sech2_with_tanh(tanh_y)), neg_one);
 }
 
-static inline __m128 getXDerivative() { return F(2.0f); }
+static inline SIMD_M128 getXDerivative() { return F(2.0f); }
 
-static inline __m128 process(__m128 x, __m128 x1, __m128 z, __m128 estimate, __m128 b_coeff,
-                             __m128 a_coeff, float beta)
+static inline SIMD_M128 process(SIMD_M128 x, SIMD_M128 x1, SIMD_M128 z, SIMD_M128 estimate,
+                                SIMD_M128 b_coeff, SIMD_M128 a_coeff, float beta)
 {
     auto x_minus_x1_plus_z = A(S(x, x1), z);
     estimate = linOutput(x_minus_x1_plus_z, a_coeff);
@@ -164,30 +166,31 @@ static inline __m128 process(__m128 x, __m128 x1, __m128 z, __m128 estimate, __m
 
 namespace OnePoleLPF_FB
 {
-static inline __m128 linOutput(__m128 bx, __m128 z_minus_fb_plus_fb1, __m128 a_coeff)
+static inline SIMD_M128 linOutput(SIMD_M128 bx, SIMD_M128 z_minus_fb_plus_fb1, SIMD_M128 a_coeff)
 {
     return M(a_coeff, A(bx, z_minus_fb_plus_fb1));
 }
 
-static inline __m128 nonlinOutput(__m128 tanh_x, __m128 tanh_y, __m128 z_minus_fb_plus_fb1,
-                                  __m128 b_coeff)
+static inline SIMD_M128 nonlinOutput(SIMD_M128 tanh_x, SIMD_M128 tanh_y,
+                                     SIMD_M128 z_minus_fb_plus_fb1, SIMD_M128 b_coeff)
 {
     return A(M(b_coeff, S(tanh_x, tanh_y)), z_minus_fb_plus_fb1);
 }
 
-static inline __m128 getDerivative(__m128 tanh_y, __m128 b_coeff)
+static inline SIMD_M128 getDerivative(SIMD_M128 tanh_y, SIMD_M128 b_coeff)
 {
     return OnePoleLPF::getDerivative(tanh_y, b_coeff);
 }
 
-static inline __m128 getXDerivative()
+static inline SIMD_M128 getXDerivative()
 {
     const auto two = F(2.0f);
     return two;
 }
 
-static inline __m128 process(__m128 tanh_x, __m128 z, __m128 fb, __m128 fb1, __m128 estimate,
-                             __m128 b_coeff, __m128 a_coeff, __m128 bx)
+static inline SIMD_M128 process(SIMD_M128 tanh_x, SIMD_M128 z, SIMD_M128 fb, SIMD_M128 fb1,
+                                SIMD_M128 estimate, SIMD_M128 b_coeff, SIMD_M128 a_coeff,
+                                SIMD_M128 bx)
 {
     auto z_minus_fb_plus_fb1 = A(S(z, fb), fb1);
     estimate = linOutput(bx, z_minus_fb_plus_fb1, a_coeff);
@@ -204,31 +207,31 @@ static inline __m128 process(__m128 tanh_x, __m128 z, __m128 fb, __m128 fb1, __m
 
 namespace OnePoleHPF_FB
 {
-static inline __m128 linOutput(__m128 x_minus_x1_plus_z, __m128 tanh_fb, __m128 a_coeff,
-                               __m128 b_coeff)
+static inline SIMD_M128 linOutput(SIMD_M128 x_minus_x1_plus_z, SIMD_M128 tanh_fb, SIMD_M128 a_coeff,
+                                  SIMD_M128 b_coeff)
 {
     return M(a_coeff, A(M(b_coeff, tanh_fb), x_minus_x1_plus_z));
 }
 
-static inline __m128 nonlinOutput(__m128 x_minus_x1_plus_z, __m128 tanh_y, __m128 tanh_fb,
-                                  __m128 b_coeff)
+static inline SIMD_M128 nonlinOutput(SIMD_M128 x_minus_x1_plus_z, SIMD_M128 tanh_y,
+                                     SIMD_M128 tanh_fb, SIMD_M128 b_coeff)
 {
     return A(M(b_coeff, S(tanh_fb, tanh_y)), x_minus_x1_plus_z);
 }
 
-static inline __m128 getDerivative(__m128 tanh_y, __m128 b_coeff)
+static inline SIMD_M128 getDerivative(SIMD_M128 tanh_y, SIMD_M128 b_coeff)
 {
     const auto neg_one = F(-1.0f);
     return A(M(N(b_coeff), sech2_with_tanh(tanh_y)), neg_one);
 }
 
-static inline __m128 getFBDerivative(__m128 tanh_fb, __m128 b_coeff)
+static inline SIMD_M128 getFBDerivative(SIMD_M128 tanh_fb, SIMD_M128 b_coeff)
 {
     return M(b_coeff, sech2_with_tanh(tanh_fb));
 }
 
-static inline __m128 process(__m128 x_minus_x1_plus_z, __m128 tanh_fb, __m128 estimate,
-                             __m128 b_coeff, __m128 a_coeff)
+static inline SIMD_M128 process(SIMD_M128 x_minus_x1_plus_z, SIMD_M128 tanh_fb, SIMD_M128 estimate,
+                                SIMD_M128 b_coeff, SIMD_M128 a_coeff)
 {
     estimate = linOutput(x_minus_x1_plus_z, tanh_fb, a_coeff, b_coeff);
     for (int i = 0; i < nIterStage; ++i)
@@ -257,45 +260,46 @@ constexpr float one = 0.99f;
 constexpr float oneOverMult = one / mult;
 const float betaExpOverMult = beta_exp / mult;
 
-static inline __m128 sign_ps(__m128 x)
+static inline SIMD_M128 sign_ps(SIMD_M128 x)
 {
-    const __m128 zero = _mm_setzero_ps();
-    const __m128 one = _mm_set1_ps(1.0f);
-    const __m128 neg_one = _mm_set1_ps(-1.0f);
+    const auto zero = SIMD_MM(setzero_ps)();
+    const auto one = SIMD_MM(set1_ps)(1.0f);
+    const auto neg_one = SIMD_MM(set1_ps)(-1.0f);
 
-    __m128 positive = _mm_and_ps(_mm_cmpgt_ps(x, zero), one);
-    __m128 negative = _mm_and_ps(_mm_cmplt_ps(x, zero), neg_one);
+    auto positive = SIMD_MM(and_ps)(SIMD_MM(cmpgt_ps)(x, zero), one);
+    auto negative = SIMD_MM(and_ps)(SIMD_MM(cmplt_ps)(x, zero), neg_one);
 
-    return _mm_or_ps(positive, negative);
+    return SIMD_MM(or_ps)(positive, negative);
 }
 
-static inline __m128 res_func_ps(__m128 x)
+static inline SIMD_M128 res_func_ps(SIMD_M128 x)
 {
     x = M(F(mult), x);
 
     auto x_abs = basic_blocks::mechanics::abs_ps(x);
-    auto x_less_than = _mm_cmplt_ps(x_abs, F(max_val));
+    auto x_less_than = SIMD_MM(cmplt_ps)(x_abs, F(max_val));
 
     auto y = A(N(basic_blocks::dsp::fastexpSSE(
                    M(F(beta_exp), N(basic_blocks::mechanics::abs_ps(A(x, F(c))))))),
                F(bias));
     y = M(sign_ps(x), M(y, F(oneOverMult)));
 
-    return _mm_or_ps(_mm_and_ps(x_less_than, M(x, F(oneOverMult))), _mm_andnot_ps(x_less_than, y));
+    return SIMD_MM(or_ps)(SIMD_MM(and_ps)(x_less_than, M(x, F(oneOverMult))),
+                          SIMD_MM(andnot_ps)(x_less_than, y));
 }
 
-static inline __m128 res_deriv_ps(__m128 x)
+static inline SIMD_M128 res_deriv_ps(SIMD_M128 x)
 {
     x = M(F(mult), x);
 
     auto x_abs = basic_blocks::mechanics::abs_ps(x);
-    auto x_less_than = _mm_cmplt_ps(x_abs, F(max_val));
+    auto x_less_than = SIMD_MM(cmplt_ps)(x_abs, F(max_val));
 
     auto y = A(basic_blocks::dsp::fastexpSSE(
                    M(F(beta_exp), N(basic_blocks::mechanics::abs_ps(A(x, F(c)))))),
                F(betaExpOverMult));
 
-    return _mm_or_ps(_mm_and_ps(x_less_than, F(one)), _mm_andnot_ps(x_less_than, y));
+    return SIMD_MM(or_ps)(SIMD_MM(and_ps)(x_less_than, F(one)), SIMD_MM(andnot_ps)(x_less_than, y));
 }
 } // namespace ResWaveshaper
 
@@ -354,7 +358,8 @@ void makeCoefficients(FilterCoefficientMaker<TuningProvider> *cm, float freq, fl
     cm->FromDirect(C);
 }
 
-template <FilterSubType subtype> inline __m128 process(QuadFilterUnitState *__restrict f, __m128 in)
+template <FilterSubType subtype>
+inline SIMD_M128 process(QuadFilterUnitState *__restrict f, SIMD_M128 in)
 {
     // input gain
     in = M(F(in_gain), in);
@@ -389,7 +394,7 @@ template <FilterSubType subtype> inline __m128 process(QuadFilterUnitState *__re
     const auto k_ps = f->C[thr_k];
 
     // define local variables
-    __m128 tanh_x0, tanh_x1, tanh_x2, tanh_fb, f0_deriv, f1_deriv, f2_deriv, bx, hpf_in;
+    SIMD_M128 tanh_x0, tanh_x1, tanh_x2, tanh_fb, f0_deriv, f1_deriv, f2_deriv, bx, hpf_in;
     switch (mode)
     {
     case 0: // lowpass
diff --git a/include/sst/filters/VintageLadders.h b/include/sst/filters/VintageLadders.h
index 0f92634..d7240bb 100644
--- a/include/sst/filters/VintageLadders.h
+++ b/include/sst/filters/VintageLadders.h
@@ -140,16 +140,16 @@ inline void makeCoefficients(FilterCoefficientMaker<TuningProvider> *cm, float f
     cm->FromDirect(lc);
 }
 
-#define F(a) _mm_set_ps1(a)
-#define M(a, b) _mm_mul_ps(a, b)
-#define A(a, b) _mm_add_ps(a, b)
-#define S(a, b) _mm_sub_ps(a, b)
+#define F(a) SIMD_MM(set_ps1)(a)
+#define M(a, b) SIMD_MM(mul_ps)(a, b)
+#define A(a, b) SIMD_MM(add_ps)(a, b)
+#define S(a, b) SIMD_MM(sub_ps)(a, b)
 
-inline __m128 clip(__m128 value, __m128 _saturation, __m128 _saturationinverse)
+inline SIMD_M128 clip(SIMD_M128 value, SIMD_M128 _saturation, SIMD_M128 _saturationinverse)
 {
-    const __m128 minusone = F(-1), one = F(1), onethird = F(1.f / 3.f);
+    const auto minusone = F(-1), one = F(1), onethird = F(1.f / 3.f);
     auto vtsi = M(value, _saturationinverse);
-    auto v2 = _mm_min_ps(one, _mm_max_ps(minusone, vtsi));
+    auto v2 = SIMD_MM(min_ps)(one, SIMD_MM(max_ps)(minusone, vtsi));
     auto v23 = M(v2, M(v2, v2));
     auto vkern = S(v2, M(onethird, v23));
     auto res = M(_saturation, vkern);
@@ -157,9 +157,9 @@ inline __m128 clip(__m128 value, __m128 _saturation, __m128 _saturationinverse)
     return res;
 }
 
-inline void calculateDerivatives(__m128 input, __m128 *dstate, __m128 *state, __m128 cutoff,
-                                 __m128 resonance, __m128 _saturation, __m128 _saturationInv,
-                                 __m128 gComp)
+inline void calculateDerivatives(SIMD_M128 input, SIMD_M128 *dstate, SIMD_M128 *state,
+                                 SIMD_M128 cutoff, SIMD_M128 resonance, SIMD_M128 _saturation,
+                                 SIMD_M128 _saturationInv, SIMD_M128 gComp)
 {
     auto satstate0 = clip(state[0], _saturation, _saturationInv);
     auto satstate1 = clip(state[1], _saturation, _saturationInv);
@@ -182,20 +182,20 @@ inline void calculateDerivatives(__m128 input, __m128 *dstate, __m128 *state, __
     dstate[3] = M(cutoff, S(satstate2, clip(state[3], _saturation, _saturationInv)));
 }
 
-inline __m128 process(QuadFilterUnitState *__restrict f, __m128 input)
+inline SIMD_M128 process(QuadFilterUnitState *__restrict f, SIMD_M128 input)
 {
     int i;
-    __m128 deriv1[4], deriv2[4], deriv3[4], deriv4[4], tempState[4];
+    SIMD_M128 deriv1[4], deriv2[4], deriv3[4], deriv4[4], tempState[4];
 
-    __m128 *state = &(f->R[0]);
+    auto *state = &(f->R[0]);
 
     auto stepSize = F(f->sampleRateInv * extraOversampleInv),
          halfStepSize = F(0.5f * f->sampleRateInv * extraOversampleInv);
 
-    const __m128 oneoversix = F(1.f / 6.f), two = F(2.f), dFac = F(extraOversampleInv),
-                 sat = F(saturation), satInv = F(saturationInverse);
+    const auto oneoversix = F(1.f / 6.f), two = F(2.f), dFac = F(extraOversampleInv),
+               sat = F(saturation), satInv = F(saturationInverse);
 
-    __m128 outputOS[extraOversample];
+    SIMD_M128 outputOS[extraOversample];
 
     for (int osi = 0; osi < extraOversample; ++osi)
     {
@@ -204,9 +204,9 @@ inline __m128 process(QuadFilterUnitState *__restrict f, __m128 input)
             f->C[j] = A(f->C[j], M(dFac, f->dC[j]));
         }
 
-        __m128 cutoff = f->C[rkm_cutoff];
-        __m128 resonance = f->C[rkm_reso];
-        __m128 gComp = f->C[rkm_gComp];
+        auto cutoff = f->C[rkm_cutoff];
+        auto resonance = f->C[rkm_reso];
+        auto gComp = f->C[rkm_gComp];
 
         calculateDerivatives(input, deriv1, state, cutoff, resonance, sat, satInv, gComp);
         for (i = 0; i < 4; i++)
@@ -240,7 +240,7 @@ inline __m128 process(QuadFilterUnitState *__restrict f, __m128 input)
         outputOS[osi] = state[3];
 
         // Zero stuffing
-        input = _mm_setzero_ps();
+        input = SIMD_MM(setzero_ps)();
     }
 
     /*
@@ -254,10 +254,10 @@ inline __m128 process(QuadFilterUnitState *__restrict f, __m128 input)
     ** Anyway: (2 * sin(pi * x) * sin((pi * x) / 2)) / (pi^2 * x^2), for points -1.5, -1, 0.5, and 0
     **
     */
-    auto ov = _mm_setzero_ps();
-    __m128 windowFactors[4];
+    auto ov = SIMD_MM(setzero_ps)();
+    SIMD_M128 windowFactors[4];
     windowFactors[0] = F(-0.0636844f);
-    windowFactors[1] = _mm_setzero_ps();
+    windowFactors[1] = SIMD_MM(setzero_ps)();
     windowFactors[2] = F(0.57315917f);
     windowFactors[3] = F(1);
 
@@ -347,20 +347,19 @@ inline void makeCoefficients(FilterCoefficientMaker<TuningProvider> *cm, float f
     cm->FromDirect(lC);
 }
 
-inline __m128 process(QuadFilterUnitState *__restrict f, __m128 in)
+inline SIMD_M128 process(QuadFilterUnitState *__restrict f, SIMD_M128 in)
 {
-#define F(a) _mm_set_ps1(a)
-#define M(a, b) _mm_mul_ps(a, b)
-#define A(a, b) _mm_add_ps(a, b)
-#define S(a, b) _mm_sub_ps(a, b)
+#define F(a) SIMD_MM(set_ps1)(a)
+#define M(a, b) SIMD_MM(mul_ps)(a, b)
+#define A(a, b) SIMD_MM(add_ps)(a, b)
+#define S(a, b) SIMD_MM(sub_ps)(a, b)
 
-    const __m128 dFac = F(0.5f), half = F(0.5f), one = F(1.0f), four = F(4.0f), m18730 = F(1.8730f),
-                 m04955 = F(0.4995f), mneg06490 = F(-0.6490f), m09988 = F(0.9988f),
-                 mneg39364 = F(-3.9364f), m18409 = F(1.8409f), m09968 = F(0.9968f),
-                 thermal = F(1.f / 70.f), oneoverthermal = F(70.0f),
-                 neg2pi = F(-2.0f * (float)M_PI);
+    const auto dFac = F(0.5f), half = F(0.5f), one = F(1.0f), four = F(4.0f), m18730 = F(1.8730f),
+               m04955 = F(0.4995f), mneg06490 = F(-0.6490f), m09988 = F(0.9988f),
+               mneg39364 = F(-3.9364f), m18409 = F(1.8409f), m09968 = F(0.9968f),
+               thermal = F(1.f / 70.f), oneoverthermal = F(70.0f), neg2pi = F(-2.0f * (float)M_PI);
 
-    __m128 outputOS[2];
+    SIMD_M128 outputOS[2];
 
     for (int j = 0; j < 2; ++j)
     {
@@ -383,12 +382,12 @@ inline __m128 process(QuadFilterUnitState *__restrict f, __m128 in)
 
         for (int k = 0; k < n_hcoeffs; ++k)
         {
-            f->C[k] = _mm_add_ps(f->C[k], _mm_mul_ps(dFac, f->dC[k]));
+            f->C[k] = SIMD_MM(add_ps)(f->C[k], SIMD_MM(mul_ps)(dFac, f->dC[k]));
         }
 
         // float input = in - resQuad * ( delay[5] - gComp * in )   // Model as an impulse stream
-        auto input =
-            _mm_sub_ps(in, _mm_mul_ps(resquad, S(f->R[h_delay + 5], M(f->C[h_gComp], in))));
+        auto input = SIMD_MM(sub_ps)(
+            in, SIMD_MM(mul_ps)(resquad, S(f->R[h_delay + 5], M(f->C[h_gComp], in))));
 
         // delay[0] = stage[0] = delay[0] + tune * (tanh(input * thermal) - stageTanh[0]);
         f->R[h_stage + 0] =
@@ -416,7 +415,7 @@ inline __m128 process(QuadFilterUnitState *__restrict f, __m128 in)
 
         // 0.5 sample delay for phase compensation
         // delay[5] = (stage[3] + delay[4]) * 0.5;
-        f->R[h_delay + 5] = M(_mm_set_ps1(0.5), A(f->R[h_stage + 3], f->R[h_delay + 4]));
+        f->R[h_delay + 5] = M(SIMD_MM(set_ps1)(0.5), A(f->R[h_stage + 3], f->R[h_delay + 4]));
 
         // delay[4] = stage[3];
         f->R[h_delay + 4] = f->R[h_stage + 3];
diff --git a/include/sst/utilities/globals.h b/include/sst/utilities/globals.h
index b9a360a..53dbcc3 100644
--- a/include/sst/utilities/globals.h
+++ b/include/sst/utilities/globals.h
@@ -21,28 +21,7 @@
 #include <algorithm>
 #include <cstring> // needed for memset/memcpy on GCC
 
-#if MAC
-
-#if defined(__x86_64__)
-#else
-#define ARM_NEON 1
-#endif
-
-#endif
-
-#if LINUX
-#if defined(__aarch64__) || defined(__arm__)
-#define ARM_NEON 1
-#endif
-#endif
-
-#if defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) ||                                   \
-    (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
-#include <emmintrin.h>
-#else
-#define SIMDE_ENABLE_NATIVE_ALIASES
-#include "simde/x86/sse2.h"
-#endif
+#include "sst/basic-blocks/simd/setup.h"
 
 namespace sst::filters::utilities
 {
diff --git a/include/sst/utilities/shared.h b/include/sst/utilities/shared.h
index 22dd02b..d0e9d72 100644
--- a/include/sst/utilities/shared.h
+++ b/include/sst/utilities/shared.h
@@ -26,15 +26,15 @@ inline float i2f_binary_cast(int i)
     return *f;
 }
 
-const __m128 m128_mask_signbit = _mm_set1_ps(i2f_binary_cast(0x80000000));
-const __m128 m128_mask_absval = _mm_set1_ps(i2f_binary_cast(0x7fffffff));
-const __m128 m128_zero = _mm_set1_ps(0.0f);
-const __m128 m128_half = _mm_set1_ps(0.5f);
-const __m128 m128_one = _mm_set1_ps(1.0f);
-const __m128 m128_two = _mm_set1_ps(2.0f);
-const __m128 m128_four = _mm_set1_ps(4.0f);
-const __m128 m128_1234 = _mm_set_ps(1.f, 2.f, 3.f, 4.f);
-const __m128 m128_0123 = _mm_set_ps(0.f, 1.f, 2.f, 3.f);
+const auto m128_mask_signbit = SIMD_MM(set1_ps)(i2f_binary_cast(0x80000000));
+const auto m128_mask_absval = SIMD_MM(set1_ps)(i2f_binary_cast(0x7fffffff));
+const auto m128_zero = SIMD_MM(set1_ps)(0.0f);
+const auto m128_half = SIMD_MM(set1_ps)(0.5f);
+const auto m128_one = SIMD_MM(set1_ps)(1.0f);
+const auto m128_two = SIMD_MM(set1_ps)(2.0f);
+const auto m128_four = SIMD_MM(set1_ps)(4.0f);
+const auto m128_1234 = SIMD_MM(set_ps)(1.f, 2.f, 3.f, 4.f);
+const auto m128_0123 = SIMD_MM(set_ps)(0.f, 1.f, 2.f, 3.f);
 
 } // namespace sst::filters::utilities
 
diff --git a/scripts/fix_file_comments.pl b/scripts/fix_file_comments.pl
index baca832..d95a964 100644
--- a/scripts/fix_file_comments.pl
+++ b/scripts/fix_file_comments.pl
@@ -11,11 +11,11 @@
     'include'
 );
 
-ind(
+find(
     {
         wanted => \&findfiles,
     },
-    'include0extras'
+    'include-extras'
 );
 
 
diff --git a/tests/BasicFiltersTest.cpp b/tests/BasicFiltersTest.cpp
index 1d1955c..e1f7ed4 100644
--- a/tests/BasicFiltersTest.cpp
+++ b/tests/BasicFiltersTest.cpp
@@ -1,3 +1,17 @@
+/*
+ * sst-filters - A header-only collection of SIMD filter
+ * implementations by the Surge Synth Team
+ *
+ * Copyright 2019-2024, various authors, as described in the GitHub
+ * transaction log.
+ *
+ * sst-filters is released under the GNU General Public License
+ * version 3 or later. Some of the filters in this package
+ * originated in the version of Surge open sourced in 2018.
+ *
+ * All source in sst-filters available at
+ * https://github.com/surge-synthesizer/sst-filters
+ */
 #include "TestUtils.h"
 
 TEST_CASE("Basic Filters")
diff --git a/tests/BiquadTest.cpp b/tests/BiquadTest.cpp
index 882c205..089e234 100644
--- a/tests/BiquadTest.cpp
+++ b/tests/BiquadTest.cpp
@@ -1,3 +1,17 @@
+/*
+ * sst-filters - A header-only collection of SIMD filter
+ * implementations by the Surge Synth Team
+ *
+ * Copyright 2019-2024, various authors, as described in the GitHub
+ * transaction log.
+ *
+ * sst-filters is released under the GNU General Public License
+ * version 3 or later. Some of the filters in this package
+ * originated in the version of Surge open sourced in 2018.
+ *
+ * All source in sst-filters available at
+ * https://github.com/surge-synthesizer/sst-filters
+ */
 #include "sst/filters/BiquadFilter.h"
 
 #include "TestUtils.h"
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index c7bf43d..d758b91 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -1,3 +1,4 @@
+message(STATUS "Add sst-filters-tests")
 add_executable(sst-filters-tests)
 target_include_directories(sst-filters-tests PRIVATE . ../libs)
 target_link_libraries(sst-filters-tests PRIVATE ${PROJECT_NAME} simde)
diff --git a/tests/CutoffWarpTest.cpp b/tests/CutoffWarpTest.cpp
index e78f43d..616d49b 100644
--- a/tests/CutoffWarpTest.cpp
+++ b/tests/CutoffWarpTest.cpp
@@ -1,3 +1,17 @@
+/*
+ * sst-filters - A header-only collection of SIMD filter
+ * implementations by the Surge Synth Team
+ *
+ * Copyright 2019-2024, various authors, as described in the GitHub
+ * transaction log.
+ *
+ * sst-filters is released under the GNU General Public License
+ * version 3 or later. Some of the filters in this package
+ * originated in the version of Surge open sourced in 2018.
+ *
+ * All source in sst-filters available at
+ * https://github.com/surge-synthesizer/sst-filters
+ */
 #include "TestUtils.h"
 
 TEST_CASE("Cutoff Warp")
diff --git a/tests/CytomicSVFTests.cpp b/tests/CytomicSVFTests.cpp
index b26c2ad..bef926b 100644
--- a/tests/CytomicSVFTests.cpp
+++ b/tests/CytomicSVFTests.cpp
@@ -1,6 +1,17 @@
-//
-// Created by Paul Walker on 4/7/24.
-//
+/*
+ * sst-filters - A header-only collection of SIMD filter
+ * implementations by the Surge Synth Team
+ *
+ * Copyright 2019-2024, various authors, as described in the GitHub
+ * transaction log.
+ *
+ * sst-filters is released under the GNU General Public License
+ * version 3 or later. Some of the filters in this package
+ * originated in the version of Surge open sourced in 2018.
+ *
+ * All source in sst-filters available at
+ * https://github.com/surge-synthesizer/sst-filters
+ */
 
 #include "catch2/catch2.hpp"
 #include "sst/filters/CytomicSVF.h"
diff --git a/tests/DiodeLadderTest.cpp b/tests/DiodeLadderTest.cpp
index f09e0f3..bbae286 100644
--- a/tests/DiodeLadderTest.cpp
+++ b/tests/DiodeLadderTest.cpp
@@ -1,3 +1,17 @@
+/*
+ * sst-filters - A header-only collection of SIMD filter
+ * implementations by the Surge Synth Team
+ *
+ * Copyright 2019-2024, various authors, as described in the GitHub
+ * transaction log.
+ *
+ * sst-filters is released under the GNU General Public License
+ * version 3 or later. Some of the filters in this package
+ * originated in the version of Surge open sourced in 2018.
+ *
+ * All source in sst-filters available at
+ * https://github.com/surge-synthesizer/sst-filters
+ */
 #include "TestUtils.h"
 
 TEST_CASE("Diode Ladder")
diff --git a/tests/HalfRateTest.cpp b/tests/HalfRateTest.cpp
index 984f5dd..84f0274 100644
--- a/tests/HalfRateTest.cpp
+++ b/tests/HalfRateTest.cpp
@@ -1,5 +1,18 @@
+/*
+ * sst-filters - A header-only collection of SIMD filter
+ * implementations by the Surge Synth Team
+ *
+ * Copyright 2019-2024, various authors, as described in the GitHub
+ * transaction log.
+ *
+ * sst-filters is released under the GNU General Public License
+ * version 3 or later. Some of the filters in this package
+ * originated in the version of Surge open sourced in 2018.
+ *
+ * All source in sst-filters available at
+ * https://github.com/surge-synthesizer/sst-filters
+ */
 #include "sst/filters/HalfRateFilter.h"
-
 #include "TestUtils.h"
 
 template <int BS = 32>
diff --git a/tests/K35FilterTest.cpp b/tests/K35FilterTest.cpp
index d336218..8920714 100644
--- a/tests/K35FilterTest.cpp
+++ b/tests/K35FilterTest.cpp
@@ -1,3 +1,17 @@
+/*
+ * sst-filters - A header-only collection of SIMD filter
+ * implementations by the Surge Synth Team
+ *
+ * Copyright 2019-2024, various authors, as described in the GitHub
+ * transaction log.
+ *
+ * sst-filters is released under the GNU General Public License
+ * version 3 or later. Some of the filters in this package
+ * originated in the version of Surge open sourced in 2018.
+ *
+ * All source in sst-filters available at
+ * https://github.com/surge-synthesizer/sst-filters
+ */
 #include "TestUtils.h"
 
 TEST_CASE("K35 Filter")
diff --git a/tests/OBXDFilterTest.cpp b/tests/OBXDFilterTest.cpp
index 3f8b11d..8544a56 100644
--- a/tests/OBXDFilterTest.cpp
+++ b/tests/OBXDFilterTest.cpp
@@ -1,3 +1,17 @@
+/*
+ * sst-filters - A header-only collection of SIMD filter
+ * implementations by the Surge Synth Team
+ *
+ * Copyright 2019-2024, various authors, as described in the GitHub
+ * transaction log.
+ *
+ * sst-filters is released under the GNU General Public License
+ * version 3 or later. Some of the filters in this package
+ * originated in the version of Surge open sourced in 2018.
+ *
+ * All source in sst-filters available at
+ * https://github.com/surge-synthesizer/sst-filters
+ */
 #include "TestUtils.h"
 
 TEST_CASE("OBXD Filter")
diff --git a/tests/ResonanceWarpTest.cpp b/tests/ResonanceWarpTest.cpp
index 968e9a3..7f39f01 100644
--- a/tests/ResonanceWarpTest.cpp
+++ b/tests/ResonanceWarpTest.cpp
@@ -1,3 +1,17 @@
+/*
+ * sst-filters - A header-only collection of SIMD filter
+ * implementations by the Surge Synth Team
+ *
+ * Copyright 2019-2024, various authors, as described in the GitHub
+ * transaction log.
+ *
+ * sst-filters is released under the GNU General Public License
+ * version 3 or later. Some of the filters in this package
+ * originated in the version of Surge open sourced in 2018.
+ *
+ * All source in sst-filters available at
+ * https://github.com/surge-synthesizer/sst-filters
+ */
 
 #include "TestUtils.h"
 
diff --git a/tests/TestUtils.h b/tests/TestUtils.h
index 477a73b..976fdab 100644
--- a/tests/TestUtils.h
+++ b/tests/TestUtils.h
@@ -1,3 +1,17 @@
+/*
+ * sst-filters - A header-only collection of SIMD filter
+ * implementations by the Surge Synth Team
+ *
+ * Copyright 2019-2024, various authors, as described in the GitHub
+ * transaction log.
+ *
+ * sst-filters is released under the GNU General Public License
+ * version 3 or later. Some of the filters in this package
+ * originated in the version of Surge open sourced in 2018.
+ *
+ * All source in sst-filters available at
+ * https://github.com/surge-synthesizer/sst-filters
+ */
 #ifndef TESTS_TESTUTILS_H
 #define TESTS_TESTUTILS_H
 
@@ -27,17 +41,18 @@ inline float runSine(sst::filters::QuadFilterUnitState &filterState,
                      sst::filters::FilterUnitQFPtr &filterUnitPtr, float testFreq, int numSamples)
 {
     // reset filter state
-    std::fill(filterState.R, &filterState.R[sst::filters::n_filter_registers], _mm_setzero_ps());
+    std::fill(filterState.R, &filterState.R[sst::filters::n_filter_registers],
+              SIMD_MM(setzero_ps)());
 
     std::vector<float> y(numSamples, 0.0f);
     for (int i = 0; i < numSamples; ++i)
     {
         auto x = (float)std::sin(2.0 * M_PI * (double)i * testFreq / sampleRate);
 
-        auto yVec = filterUnitPtr(&filterState, _mm_set_ps1(x));
+        auto yVec = filterUnitPtr(&filterState, SIMD_MM(set_ps1)(x));
 
         float yArr alignas(16)[4];
-        _mm_store_ps(yArr, yVec);
+        SIMD_MM(store_ps)(yArr, yVec);
         y[i] = yArr[0];
     }
 
diff --git a/tests/TriPoleFilterTest.cpp b/tests/TriPoleFilterTest.cpp
index e31be89..08967e6 100644
--- a/tests/TriPoleFilterTest.cpp
+++ b/tests/TriPoleFilterTest.cpp
@@ -1,3 +1,17 @@
+/*
+ * sst-filters - A header-only collection of SIMD filter
+ * implementations by the Surge Synth Team
+ *
+ * Copyright 2019-2024, various authors, as described in the GitHub
+ * transaction log.
+ *
+ * sst-filters is released under the GNU General Public License
+ * version 3 or later. Some of the filters in this package
+ * originated in the version of Surge open sourced in 2018.
+ *
+ * All source in sst-filters available at
+ * https://github.com/surge-synthesizer/sst-filters
+ */
 #include "TestUtils.h"
 
 TEST_CASE("TriPole Filter")
diff --git a/tests/VintageLaddersTest.cpp b/tests/VintageLaddersTest.cpp
index ee9cf54..954817b 100644
--- a/tests/VintageLaddersTest.cpp
+++ b/tests/VintageLaddersTest.cpp
@@ -1,3 +1,17 @@
+/*
+ * sst-filters - A header-only collection of SIMD filter
+ * implementations by the Surge Synth Team
+ *
+ * Copyright 2019-2024, various authors, as described in the GitHub
+ * transaction log.
+ *
+ * sst-filters is released under the GNU General Public License
+ * version 3 or later. Some of the filters in this package
+ * originated in the version of Surge open sourced in 2018.
+ *
+ * All source in sst-filters available at
+ * https://github.com/surge-synthesizer/sst-filters
+ */
 #include "TestUtils.h"
 
 TEST_CASE("Vintage Ladders")
diff --git a/tests/tests.cpp b/tests/tests.cpp
index e70a825..98c8424 100644
--- a/tests/tests.cpp
+++ b/tests/tests.cpp
@@ -1,3 +1,17 @@
+/*
+ * sst-filters - A header-only collection of SIMD filter
+ * implementations by the Surge Synth Team
+ *
+ * Copyright 2019-2024, various authors, as described in the GitHub
+ * transaction log.
+ *
+ * sst-filters is released under the GNU General Public License
+ * version 3 or later. Some of the filters in this package
+ * originated in the version of Surge open sourced in 2018.
+ *
+ * All source in sst-filters available at
+ * https://github.com/surge-synthesizer/sst-filters
+ */
 #define CATCH_CONFIG_RUNNER
 #include "catch2/catch2.hpp"