From 12995820478df9d2d388f1c633202f6a9ef8dcbd Mon Sep 17 00:00:00 2001
From: Lucas Saavedra Vaz <32426024+lucasssvaz@users.noreply.github.com>
Date: Tue, 7 May 2024 08:16:13 -0300
Subject: [PATCH] ci(performance): Add performance tests to CI (#9560)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* ci(performance): Add performance tests to CI

* ci(req): Fix requirements

* ci(pre-commit): Apply automatic fixes

* ci(pre-commit): Increase maximum allowed complexity for python

---------

Co-authored-by: pre-commit-ci-lite[bot] <117423508+pre-commit-ci-lite[bot]@users.noreply.github.com>
Co-authored-by: Jan Procházka <90197375+P-R-O-C-H-Y@users.noreply.github.com>
---
 .flake8                                       |    4 +-
 .github/scripts/sketch_utils.sh               |   17 +-
 .github/scripts/tests_build.sh                |   32 +-
 .github/scripts/tests_run.sh                  |   41 +-
 .github/workflows/hil.yml                     |   88 +-
 .pre-commit-config.yaml                       |    6 +-
 tests/.gitignore                              |    1 +
 tests/performance/coremark/core_list_join.c   |  495 ++++
 tests/performance/coremark/core_main.c        |  356 +++
 tests/performance/coremark/core_matrix.c      |  308 +++
 tests/performance/coremark/core_portme.c      |  168 ++
 tests/performance/coremark/core_portme.h      |  217 ++
 tests/performance/coremark/core_state.c       |  277 ++
 tests/performance/coremark/core_util.c        |  210 ++
 tests/performance/coremark/coremark.h         |  174 ++
 tests/performance/coremark/coremark.ino       |  118 +
 tests/performance/coremark/test_coremark.py   |   58 +
 tests/performance/fibonacci/fibonacci.ino     |   48 +
 tests/performance/fibonacci/test_fibonacci.py |   78 +
 tests/performance/psramspeed/.skip.esp32c3    |    0
 tests/performance/psramspeed/.skip.esp32c6    |    0
 tests/performance/psramspeed/.skip.esp32h2    |    0
 tests/performance/psramspeed/psramspeed.ino   |  266 ++
 .../performance/psramspeed/test_psramspeed.py |  105 +
 tests/performance/ramspeed/cfg.json           |   40 +
 tests/performance/ramspeed/ramspeed.ino       |  262 ++
 tests/performance/ramspeed/test_ramspeed.py   |  105 +
 tests/performance/superpi/fftsg_h.cpp         | 2329 +++++++++++++++++
 tests/performance/superpi/fftsg_h.h           |   88 +
 tests/performance/superpi/pi_fftcs.cpp        | 2214 ++++++++++++++++
 tests/performance/superpi/pi_fftcs.h          |   47 +
 tests/performance/superpi/superpi.ino         |   41 +
 tests/performance/superpi/test_superpi.py     |   53 +
 tests/requirements.txt                        |    4 +-
 tests/{ => validation}/democfg/cfg.json       |    0
 tests/{ => validation}/democfg/democfg.ino    |    0
 .../{ => validation}/democfg/test_democfg.py  |    0
 .../hello_world/hello_world.ino               |    0
 .../hello_world/test_hello_world.py           |    0
 tests/{ => validation}/nvs/cfg.json           |    0
 tests/{ => validation}/nvs/nvs.ino            |    0
 tests/{ => validation}/nvs/test_nvs.py        |    0
 tests/{ => validation}/periman/periman.ino    |    0
 .../{ => validation}/periman/test_periman.py  |    0
 tests/{ => validation}/timer/test_timer.py    |    0
 tests/{ => validation}/timer/timer.ino        |    0
 tests/{ => validation}/touch/test_touch.py    |    0
 tests/{ => validation}/touch/touch.ino        |    0
 tests/{ => validation}/uart/test_uart.py      |    0
 tests/{ => validation}/uart/uart.ino          |    0
 tests/{ => validation}/unity/test_unity.py    |    0
 tests/{ => validation}/unity/unity.ino        |    0
 52 files changed, 8193 insertions(+), 57 deletions(-)
 create mode 100644 tests/performance/coremark/core_list_join.c
 create mode 100644 tests/performance/coremark/core_main.c
 create mode 100644 tests/performance/coremark/core_matrix.c
 create mode 100644 tests/performance/coremark/core_portme.c
 create mode 100644 tests/performance/coremark/core_portme.h
 create mode 100644 tests/performance/coremark/core_state.c
 create mode 100644 tests/performance/coremark/core_util.c
 create mode 100644 tests/performance/coremark/coremark.h
 create mode 100644 tests/performance/coremark/coremark.ino
 create mode 100644 tests/performance/coremark/test_coremark.py
 create mode 100644 tests/performance/fibonacci/fibonacci.ino
 create mode 100644 tests/performance/fibonacci/test_fibonacci.py
 create mode 100644 tests/performance/psramspeed/.skip.esp32c3
 create mode 100644 tests/performance/psramspeed/.skip.esp32c6
 create mode 100644 tests/performance/psramspeed/.skip.esp32h2
 create mode 100644 tests/performance/psramspeed/psramspeed.ino
 create mode 100644 tests/performance/psramspeed/test_psramspeed.py
 create mode 100644 tests/performance/ramspeed/cfg.json
 create mode 100644 tests/performance/ramspeed/ramspeed.ino
 create mode 100644 tests/performance/ramspeed/test_ramspeed.py
 create mode 100644 tests/performance/superpi/fftsg_h.cpp
 create mode 100644 tests/performance/superpi/fftsg_h.h
 create mode 100644 tests/performance/superpi/pi_fftcs.cpp
 create mode 100644 tests/performance/superpi/pi_fftcs.h
 create mode 100644 tests/performance/superpi/superpi.ino
 create mode 100644 tests/performance/superpi/test_superpi.py
 rename tests/{ => validation}/democfg/cfg.json (100%)
 rename tests/{ => validation}/democfg/democfg.ino (100%)
 rename tests/{ => validation}/democfg/test_democfg.py (100%)
 rename tests/{ => validation}/hello_world/hello_world.ino (100%)
 rename tests/{ => validation}/hello_world/test_hello_world.py (100%)
 rename tests/{ => validation}/nvs/cfg.json (100%)
 rename tests/{ => validation}/nvs/nvs.ino (100%)
 rename tests/{ => validation}/nvs/test_nvs.py (100%)
 rename tests/{ => validation}/periman/periman.ino (100%)
 rename tests/{ => validation}/periman/test_periman.py (100%)
 rename tests/{ => validation}/timer/test_timer.py (100%)
 rename tests/{ => validation}/timer/timer.ino (100%)
 rename tests/{ => validation}/touch/test_touch.py (100%)
 rename tests/{ => validation}/touch/touch.ino (100%)
 rename tests/{ => validation}/uart/test_uart.py (100%)
 rename tests/{ => validation}/uart/uart.ino (100%)
 rename tests/{ => validation}/unity/test_unity.py (100%)
 rename tests/{ => validation}/unity/unity.ino (100%)

diff --git a/.flake8 b/.flake8
index 881c4c629..5a2ed0b5b 100644
--- a/.flake8
+++ b/.flake8
@@ -1,12 +1,10 @@
 # Source: https://github.com/arduino/tooling-project-assets/blob/main/workflow-templates/assets/check-python/.flake8
 # See: https://flake8.pycqa.org/en/latest/user/configuration.html
-# The code style defined in this file is the official standardized style to be used in all Arduino tooling projects and
-# should not be modified.
 
 [flake8]
 doctests = True
 # W503 and W504 are mutually exclusive. PEP 8 recommends line break before.
 ignore = W503,E203
-max-complexity = 10
+max-complexity = 20
 max-line-length = 120
 select = E,W,F,C,N
diff --git a/.github/scripts/sketch_utils.sh b/.github/scripts/sketch_utils.sh
index e8da865a0..73a9ef8a7 100755
--- a/.github/scripts/sketch_utils.sh
+++ b/.github/scripts/sketch_utils.sh
@@ -121,7 +121,7 @@ function build_sketch(){ # build_sketch <ide_path> <user_path> <path-to-ino> [ex
     fi
 
     if [ -z "$fqbn" ]; then
-        echo "No FQBN passed or unvalid chip: $target"
+        echo "No FQBN passed or invalid chip: $target"
         exit 1
     fi
 
@@ -139,7 +139,7 @@ function build_sketch(){ # build_sketch <ide_path> <user_path> <path-to-ino> [ex
         echo "Skipping $sketchname for target $target"
         exit 0
     fi
-    
+
     ARDUINO_CACHE_DIR="$HOME/.arduino/cache.tmp"
     if [ -n "$ARDUINO_BUILD_DIR" ]; then
         build_dir="$ARDUINO_BUILD_DIR"
@@ -177,7 +177,7 @@ function build_sketch(){ # build_sketch <ide_path> <user_path> <path-to-ino> [ex
                 --build-path "$build_dir" \
                 $xtra_opts "${sketchdir}" \
                 > $output_file
-            
+
             exit_status=$?
             if [ $exit_status -ne 0 ]; then
                 echo ""ERROR: Compilation failed with error code $exit_status""
@@ -198,11 +198,11 @@ function build_sketch(){ # build_sketch <ide_path> <user_path> <path-to-ino> [ex
                 # Extract the desired substring using sed
                 lib_sketch_name=$(echo "$directory_path" | sed "s|$constant_part||")
                 #append json file where key is fqbn, sketch name, sizes -> extracted values
-                echo "{\"name\": \"$lib_sketch_name\", 
+                echo "{\"name\": \"$lib_sketch_name\",
                     \"sizes\": [{
-                            \"flash_bytes\": $flash_bytes, 
-                            \"flash_percentage\": $flash_percentage, 
-                            \"ram_bytes\": $ram_bytes, 
+                            \"flash_bytes\": $flash_bytes,
+                            \"flash_percentage\": $flash_percentage,
+                            \"ram_bytes\": $ram_bytes,
                             \"ram_percentage\": $ram_percentage
                             }]
                     }," >> "$sizes_file"
@@ -365,6 +365,7 @@ function build_sketches(){ # build_sketches <ide_path> <user_path> <target> <pat
         start_index=$(( $chunk_index * $chunk_size ))
         if [ "$sketchcount" -le "$start_index" ]; then
             echo "Skipping job"
+            touch ~/.build_skipped
             return 0
         fi
 
@@ -386,7 +387,7 @@ function build_sketches(){ # build_sketches <ide_path> <user_path> <target> <pat
     if [ $log_compilation ]; then
         #echo board,target and start of sketches to sizes_file json
         echo "{ \"board\": \"$fqbn\",
-                \"target\": \"$target\", 
+                \"target\": \"$target\",
                 \"sketches\": [" >> "$sizes_file"
     fi
 
diff --git a/.github/scripts/tests_build.sh b/.github/scripts/tests_build.sh
index 724e2171b..54778fab2 100755
--- a/.github/scripts/tests_build.sh
+++ b/.github/scripts/tests_build.sh
@@ -2,8 +2,8 @@
 
 USAGE="
 USAGE:
-    ${0} -c <chunk_build_opts>
-       Example: ${0} -c -t esp32 -i 0 -m 15
+    ${0} -c -type <test_type> <chunk_build_opts>
+       Example: ${0} -c -type validation -t esp32 -i 0 -m 15
     ${0} -s sketch_name <build_opts>
        Example: ${0} -s hello_world -t esp32
     ${0} -clean
@@ -11,10 +11,11 @@ USAGE:
 "
 
 function clean(){
-    rm -rf tests/*/build*/
     rm -rf tests/.pytest_cache
-    rm -rf tests/*/__pycache__/
-    rm -rf tests/*/*.xml
+    find tests/ -type d -name 'build*' -exec rm -rf "{}" \+
+    find tests/ -type d -name '__pycache__' -exec rm -rf "{}" \+
+    find tests/ -name '*.xml' -exec rm -rf "{}" \+
+    find tests/ -name 'result_*.json' -exec rm -rf "{}" \+
 }
 
 SCRIPTS_DIR="./.github/scripts"
@@ -35,6 +36,10 @@ while [ ! -z "$1" ]; do
         echo "$USAGE"
         exit 0
         ;;
+    -type )
+        shift
+        test_type=$1
+        ;;
     -clean )
         clean
         exit 0
@@ -52,12 +57,25 @@ source ${SCRIPTS_DIR}/install-arduino-core-esp32.sh
 
 args="-ai $ARDUINO_IDE_PATH -au $ARDUINO_USR_PATH"
 
+if [[ $test_type == "all" ]] || [[ -z $test_type ]]; then
+    if [ -n "$sketch" ]; then
+        tmp_sketch_path=$(find tests -name "$sketch.ino")  # quoted so the name is never split or glob-expanded
+        test_type=$(basename "$(dirname "$(dirname "$tmp_sketch_path")")")  # tests/<type>/<sketch>/<sketch>.ino -> <type>
+        echo "Sketch $sketch test type: $test_type"
+        test_folder="$PWD/tests/$test_type"
+    else
+        test_folder="$PWD/tests"
+    fi
+else
+    test_folder="$PWD/tests/$test_type"
+fi
+
 if [ $chunk_build -eq 1 ]; then
     BUILD_CMD="${SCRIPTS_DIR}/sketch_utils.sh chunk_build"
-    args+=" -p $PWD/tests"
+    args+=" -p $test_folder"
 else
     BUILD_CMD="${SCRIPTS_DIR}/sketch_utils.sh build"
-    args+=" -s $PWD/tests/$sketch"
+    args+=" -s $test_folder/$sketch"
 fi
 
 ${BUILD_CMD} ${args} $*
diff --git a/.github/scripts/tests_run.sh b/.github/scripts/tests_run.sh
index ef56fcf2d..0e2d8b01f 100755
--- a/.github/scripts/tests_run.sh
+++ b/.github/scripts/tests_run.sh
@@ -15,9 +15,9 @@ function run_test() {
     fi
 
     if [ $len -eq 1 ]; then
-      # build_dir="tests/$sketchname/build"
+      # build_dir="$sketchdir/build"
       build_dir="$HOME/.arduino/tests/$sketchname/build.tmp"
-      report_file="tests/$sketchname/$sketchname.xml"
+      report_file="$sketchdir/$sketchname.xml"
     fi
 
     for i in `seq 0 $(($len - 1))`
@@ -28,9 +28,9 @@ function run_test() {
         fi
 
         if [ $len -ne 1 ]; then
-            # build_dir="tests/$sketchname/build$i"
+            # build_dir="$sketchdir/build$i"
             build_dir="$HOME/.arduino/tests/$sketchname/build$i.tmp"
-            report_file="tests/$sketchname/$sketchname$i.xml"
+            report_file="$sketchdir/$sketchname$i.xml"
         fi
 
         pytest tests --build-dir $build_dir -k test_$sketchname --junit-xml=$report_file
@@ -79,6 +79,10 @@ while [ ! -z "$1" ]; do
         echo "$USAGE"
         exit 0
         ;;
+    -type )
+        shift
+        test_type=$1
+        ;;
     * )
       break
       ;;
@@ -88,21 +92,39 @@ done
 
 source ${SCRIPTS_DIR}/install-arduino-ide.sh
 
+# If sketch is provided and test type is not, test type is inferred from the sketch path
+if [[ $test_type == "all" ]] || [[ -z $test_type ]]; then
+    if [ -n "$sketch" ]; then
+        tmp_sketch_path=$(find tests -name "$sketch.ino")  # quoted so the name is never split or glob-expanded
+        test_type=$(basename "$(dirname "$(dirname "$tmp_sketch_path")")")  # tests/<type>/<sketch>/<sketch>.ino -> <type>
+        echo "Sketch $sketch test type: $test_type"
+        test_folder="$PWD/tests/$test_type"
+    else
+        test_folder="$PWD/tests"
+    fi
+else
+    test_folder="$PWD/tests/$test_type"
+fi
+
 if [ $chunk_run -eq 0 ]; then
-    run_test $target $PWD/tests/$sketch/$sketch.ino $options $erase
+    if [ -z "$sketch" ]; then  # quoted: the test must see exactly one (possibly empty) operand
+        echo "ERROR: Sketch name is required for single test run"
+        exit 1
+    fi
+    run_test $target $test_folder/$sketch/$sketch.ino $options $erase
 else
   if [ "$chunk_max" -le 0 ]; then
       echo "ERROR: Chunks count must be positive number"
-      return 1
+      exit 1
   fi
 
   if [ "$chunk_index" -ge "$chunk_max" ] && [ "$chunk_max" -ge 2 ]; then
       echo "ERROR: Chunk index must be less than chunks count"
-      return 1
+      exit 1
   fi
 
   set +e
-  ${COUNT_SKETCHES} $PWD/tests $target
+  ${COUNT_SKETCHES} $test_folder $target
   sketchcount=$?
   set -e
   sketches=$(cat sketches.txt)
@@ -123,7 +145,8 @@ else
       start_index=$(( $chunk_index * $chunk_size ))
       if [ "$sketchcount" -le "$start_index" ]; then
           echo "Skipping job"
-          return 0
+          touch ~/.test_skipped
+          exit 0
       fi
 
       end_index=$(( $(( $chunk_index + 1 )) * $chunk_size ))
diff --git a/.github/workflows/hil.yml b/.github/workflows/hil.yml
index bc3afe419..f86bf8017 100644
--- a/.github/workflows/hil.yml
+++ b/.github/workflows/hil.yml
@@ -18,11 +18,14 @@ jobs:
   gen_chunks:
     if: |
       contains(github.event.pull_request.labels.*.name, 'hil_test') ||
+      contains(github.event.pull_request.labels.*.name, 'perf_test') ||
       (github.event_name == 'schedule' && github.repository == 'espressif/arduino-esp32')
     name: Generate Chunks matrix
     runs-on: ubuntu-latest
     outputs:
       chunks: ${{ steps.gen-chunks.outputs.chunks }}
+      test_folder: ${{ steps.gen-chunks.outputs.test_folder }}
+      test_type: ${{ steps.gen-chunks.outputs.test_type }}
     steps:
       - name: Checkout Repository
         uses: actions/checkout@v4
@@ -31,7 +34,19 @@ jobs:
         id: gen-chunks
         run: |
           set +e
-          .github/scripts/sketch_utils.sh count tests
+          if [ "${{contains(github.event.pull_request.labels.*.name, 'hil_test')}}" == "true" ] && \
+             [ "${{contains(github.event.pull_request.labels.*.name, 'perf_test')}}" == "false" ]; then
+            test_folder="tests/validation"
+            test_type="validation"
+          elif [ "${{contains(github.event.pull_request.labels.*.name, 'hil_test')}}" == "false" ] && \
+               [ "${{contains(github.event.pull_request.labels.*.name, 'perf_test')}}" == "true" ]; then
+            test_folder="tests/performance"
+            test_type="performance"
+          else
+            test_folder="tests"
+            test_type="all"
+          fi
+          .github/scripts/sketch_utils.sh count $test_folder
           sketches=$?
           if [[ $sketches -ge ${{env.MAX_CHUNKS}} ]]; then
             $sketches=${{env.MAX_CHUNKS}}
@@ -39,7 +54,9 @@ jobs:
           set -e
           rm sketches.txt
           CHUNKS=$(jq -c -n '$ARGS.positional' --args `seq 0 1 $((sketches - 1))`)
-          echo "chunks=${CHUNKS}" >>$GITHUB_OUTPUT
+          echo "chunks=${CHUNKS}" >> $GITHUB_OUTPUT
+          echo "test_folder=${test_folder}" >> $GITHUB_OUTPUT
+          echo "test_type=${test_type}" >> $GITHUB_OUTPUT
 
   Build:
     needs: gen_chunks
@@ -52,17 +69,21 @@ jobs:
     steps:
       - name: Checkout Repository
         uses: actions/checkout@v4
+
       - name: Build sketches
         run: |
-          bash .github/scripts/tests_build.sh -c -t ${{matrix.chip}} -i ${{matrix.chunks}} -m ${{env.MAX_CHUNKS}}
+          bash .github/scripts/tests_build.sh -c -type ${{ needs.gen_chunks.outputs.test_type }} -t ${{matrix.chip}} -i ${{matrix.chunks}} -m ${{env.MAX_CHUNKS}}
+
       - name: Upload ${{matrix.chip}}-${{matrix.chunks}} artifacts
         uses: actions/upload-artifact@v4
         with:
           name: ${{matrix.chip}}-${{matrix.chunks}}.artifacts
-          path: |
-             ~/.arduino/tests/*/build*.tmp/*.bin
-             ~/.arduino/tests/*/build*.tmp/*.json
           if-no-files-found: error
+          path: |
+            ~/.build_skipped
+            ~/.arduino/tests/**/build*.tmp/*.bin
+            ~/.arduino/tests/**/build*.tmp/*.json
+
   Test:
     needs: [gen_chunks, Build]
     name: ${{matrix.chip}}-Test#${{matrix.chunks}}
@@ -77,36 +98,49 @@ jobs:
       options: --privileged
 
     steps:
-       - name: Checkout repository
-         uses: actions/checkout@v4
+      - name: Checkout repository
+        uses: actions/checkout@v4
 
-       - name: Download ${{matrix.chip}}-${{matrix.chunks}} artifacts
-         uses: actions/download-artifact@v4
-         with:
-           name: ${{matrix.chip}}-${{matrix.chunks}}.artifacts
-           path: ~/.arduino/tests/
+      - name: Download ${{matrix.chip}}-${{matrix.chunks}} artifacts
+        uses: actions/download-artifact@v4
+        with:
+          name: ${{matrix.chip}}-${{matrix.chunks}}.artifacts
+          path: ~/
 
-       - name: Install dependencies
-         run: |
-           pip install -U pip
-           pip install -r tests/requirements.txt --extra-index-url https://dl.espressif.com/pypi
-           apt update && apt install -y -qq jq
+      - name: Install dependencies
+        run: |
+          pip install -U pip
+          pip install -r tests/requirements.txt --extra-index-url https://dl.espressif.com/pypi
+          apt update && apt install -y -qq jq
 
-       - name: Run Tests
-         run: |
-           bash .github/scripts/tests_run.sh -c -t ${{matrix.chip}} -i ${{matrix.chunks}} -m ${{env.MAX_CHUNKS}} -e
+      - name: Run Tests
+        run: |
+          bash .github/scripts/tests_run.sh -c -type ${{ needs.gen_chunks.outputs.test_type }} -t ${{matrix.chip}} -i ${{matrix.chunks}} -m ${{env.MAX_CHUNKS}} -e
 
-       - name: Upload test result artifacts
-         uses: actions/upload-artifact@v4
-         if: always()
-         with:
-           name: test_results-${{matrix.chip}}-${{matrix.chunks}}
-           path: tests/*/*.xml
+      - name: Check if tests were skipped
+        id: check-test-skipped
+        run: |
+          if [ -f ~/.test_skipped ]; then
+            echo "skipped=true" >> $GITHUB_OUTPUT
+          else
+            echo "skipped=false" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Upload test result artifacts
+        uses: actions/upload-artifact@v4
+        if: ${{ always() && steps.check-test-skipped.outputs.skipped != 'true' }} # the check step is skipped (empty output) when tests fail; still upload then
+        with:
+          name: test_results-${{matrix.chip}}-${{matrix.chunks}}
+          if-no-files-found: error
+          path: |
+            tests/**/*.xml
+            tests/**/result_*.json
 
   event_file:
     name: "Event File"
     if: |
       contains(github.event.pull_request.labels.*.name, 'hil_test') ||
+      contains(github.event.pull_request.labels.*.name, 'perf_test') ||
       github.event_name == 'schedule'
     needs: Test
     runs-on: ubuntu-latest
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 3fa63413a..b06629896 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,4 +1,8 @@
-exclude: ".github/.*"
+exclude: |
+  (?x)(
+      ^\.github\/|
+      ^tests\/performance\/coremark\/.*\.[ch]$
+  )
 
 default_language_version:
   # force all unspecified python hooks to run python3
diff --git a/tests/.gitignore b/tests/.gitignore
index d9333804a..4b548d270 100644
--- a/tests/.gitignore
+++ b/tests/.gitignore
@@ -1,3 +1,4 @@
 build*/
 __pycache__/
 *.xml
+result_*.json
diff --git a/tests/performance/coremark/core_list_join.c b/tests/performance/coremark/core_list_join.c
new file mode 100644
index 000000000..a5154284a
--- /dev/null
+++ b/tests/performance/coremark/core_list_join.c
@@ -0,0 +1,495 @@
+/*
+Copyright 2018 Embedded Microprocessor Benchmark Consortium (EEMBC)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+Original Author: Shay Gal-on
+*/
+
+#include "coremark.h"
+/*
+Topic: Description
+	Benchmark using a linked list.
+
+	Linked list is a common data structure used in many applications.
+	
+	For our purposes, this will exercise the memory units of the processor.
+	In particular, usage of the list pointers to find and alter data.
+	
+	We are not using Malloc since some platforms do not support this library.
+	
+	Instead, the memory block being passed in is used to create a list,
+	and the benchmark takes care not to add more items than can be
+	accommodated by the memory block. The porting layer will make sure
+	that we have a valid memory block.
+	
+	All operations are done in place, without using any extra memory.
+	
+	The list itself contains list pointers and pointers to data items.
+	Data items contain the following:
+	
+	idx - An index that captures the initial order of the list.
+	data - Variable data initialized based on the input parameters. The 16b are divided as follows:
+	o Upper 8b are backup of original data.
+	o Bit 7 indicates if the lower 7 bits are to be used as is or calculated.
+	o Bits 0-2 indicate type of operation to perform to get a 7b value.
+	o Bits 3-6 provide input for the operation.
+	
+*/
+
+/* local functions */
+
+list_head *core_list_find(list_head *list,list_data *info);
+list_head *core_list_reverse(list_head *list);
+list_head *core_list_remove(list_head *item);
+list_head *core_list_undo_remove(list_head *item_removed, list_head *item_modified);
+list_head *core_list_insert_new(list_head *insert_point
+	, list_data *info, list_head **memblock, list_data **datablock
+	, list_head *memblock_end, list_data *datablock_end);
+typedef ee_s32(*list_cmp)(list_data *a, list_data *b, core_results *res);
+list_head *core_list_mergesort(list_head *list, list_cmp cmp, core_results *res);
+
+ee_s16 calc_func(ee_s16 *pdata, core_results *res) { /* return the 7b value for *pdata; on first use compute it and cache it in *pdata */
+	ee_s16 data=*pdata;
+	ee_s16 retval;
+	ee_u8 optype=(data>>7) & 1; /* bit 7 indicates if the function result has been cached */
+	if (optype) /* if cached, use cache */
+		return (data & 0x007f);
+	else { /* otherwise calculate and cache the result */
+		ee_s16 flag=data & 0x7; /* bits 0-2 is type of function to perform */
+		ee_s16 dtype=((data>>3) & 0xf); /* bits 3-6 is specific data for the operation */
+		dtype |= dtype << 4; /* replicate the lower 4 bits to get an 8b value */
+		switch (flag) {
+			case 0: /* type 0: result comes from core_bench_state */
+				if (dtype<0x22) /* set min period for bit corruption */
+					dtype=0x22;
+				retval=core_bench_state(res->size,res->memblock[3],res->seed1,res->seed2,dtype,res->crc);
+				if (res->crcstate==0) /* only recorded while crcstate is still 0 */
+					res->crcstate=retval;
+				break;
+			case 1: /* type 1: result comes from core_bench_matrix */
+				retval=core_bench_matrix(&(res->mat),dtype,res->crc);
+				if (res->crcmatrix==0) /* only recorded while crcmatrix is still 0 */
+					res->crcmatrix=retval;
+				break;
+			default: /* any other type: use the data word itself */
+				retval=data;
+				break;
+		}
+		res->crc=crcu16(retval,res->crc); /* update res->crc with the result before it is truncated to 7 bits */
+		retval &= 0x007f; 
+		*pdata = (data & 0xff00) | 0x0080 | retval; /* cache the result */
+		return retval;
+	}
+}
+/* Function: cmp_complex
+	Compare two list cells by the 7b values <calc_func> yields for their data16 fields.
+	<calc_func> may rewrite data16 (to cache the value) and updates res->crc.
+	Returns the value for a minus the value for b. Can be used by mergesort.
+*/
+ee_s32 cmp_complex(list_data *a, list_data *b, core_results *res) {
+	ee_s16 val1=calc_func(&(a->data16),res);
+	ee_s16 val2=calc_func(&(b->data16),res);
+	return val1 - val2;
+}
+
+/* Function: cmp_idx
+	Compare the idx item in a list cell; returns a->idx - b->idx.
+	When res is NULL, also regen the data: the low byte of data16 in both
+	cells is overwritten with a copy of its high byte. Can be used by mergesort.
+*/
+ee_s32 cmp_idx(list_data *a, list_data *b, core_results *res) {
+	if (res==NULL) {
+		a->data16 = (a->data16 & 0xff00) | (0x00ff & (a->data16>>8));
+		b->data16 = (b->data16 & 0xff00) | (0x00ff & (b->data16>>8));
+	}
+	return a->idx - b->idx;
+}
+
+void copy_info(list_data *to,list_data *from) { /* copy both fields of *from (data16 and idx) into *to */
+	to->data16=from->data16;
+	to->idx=from->idx;
+}
+
+/* Benchmark for linked list:
+	- Try to find multiple data items (res->seed3 lookups: by idx from finder_idx up, or by data if finder_idx is negative).
+	- List sort
+	- Operate on data from list (crc)
+	- Single remove/reinsert
+	* At the end of this function, the list is back to original state; the accumulated checksum is returned
+*/
+ee_u16 core_bench_list(core_results *res, ee_s16 finder_idx) {
+	ee_u16 retval=0;
+	ee_u16 found=0,missed=0;
+	list_head *list=res->list;
+	ee_s16 find_num=res->seed3;
+	list_head *this_find;
+	list_head *finder, *remover;
+	list_data info;
+	ee_s16 i;
+
+	info.idx=finder_idx;
+	/* find <find_num> values in the list, and change the list each time (reverse and cache if value found) */
+	for (i=0; i<find_num; i++) {
+		info.data16= (i & 0xff) ; /* low byte of i: the value matched when the search is by data */
+		this_find=core_list_find(list,&info);
+		list=core_list_reverse(list);
+		if (this_find==NULL) {
+			missed++;
+			retval+=(list->next->info->data16 >> 8) & 1;
+		}
+		else {
+			found++;
+			if (this_find->info->data16 & 0x1) /* use found value */
+				retval+=(this_find->info->data16 >> 9) & 1;
+			/* and cache next item at the head of the list (if any) */
+			if (this_find->next != NULL) {
+				finder = this_find->next;
+				this_find->next = finder->next;
+				finder->next=list->next;
+				list->next=finder;
+			}
+		}
+		if (info.idx>=0) /* searching by idx: look for the next idx on the following pass */
+			info.idx++;
+#if CORE_DEBUG
+	ee_printf("List find %d: [%d,%d,%d]\n",i,retval,missed,found);
+#endif
+	}
+	retval+=found*4-missed;
+	/* sort the list by data content and remove one item*/
+	if (finder_idx>0) /* the sort by data is skipped when finder_idx is 0 or negative */
+		list=core_list_mergesort(list,cmp_complex,res);
+	remover=core_list_remove(list->next);
+	/* CRC data content of list from location of index N forward, and then undo remove */
+	finder=core_list_find(list,&info);
+	if (!finder)
+		finder=list->next;
+	while (finder) {
+		retval=crc16(list->info->data16,retval); /* NOTE(review): reads the head cell (list), not finder, on every pass - presumably must stay as is to match the reference CRCs; confirm */
+		finder=finder->next;
+	}
+#if CORE_DEBUG
+	ee_printf("List sort 1: %04x\n",retval);
+#endif
+	remover=core_list_undo_remove(remover,list->next);
+	/* sort the list by index, in effect returning the list to original state */
+	list=core_list_mergesort(list,cmp_idx,NULL);
+	/* CRC data content of list */
+	finder=list->next;
+	while (finder) {
+		retval=crc16(list->info->data16,retval); /* same head-cell read as in the loop above */
+		finder=finder->next;
+	}
+#if CORE_DEBUG
+	ee_printf("List sort 2: %04x\n",retval);
+#endif
+	return retval;
+}
+/* Function: core_list_init
+	Initialize list with data. List headers are taken from the start of memblock, data records from the area right after them.
+
+	Parameters:
+	blksize - Size of memory to be initialized.
+	memblock - Pointer to memory block.
+	seed - 	Actual values chosen depend on the seed parameter.
+		The seed parameter MUST be supplied from a source that cannot be determined at compile time
+
+	Returns:
+	Pointer to the head of the list.
+
+*/
+list_head *core_list_init(ee_u32 blksize, list_head *memblock, ee_s16 seed) {
+	/* calculated pointers for the list */
+	ee_u32 per_item=16+sizeof(struct list_data_s); /* presumably 16 bytes budgeted for one list header, plus one data record */
+	ee_u32 size=(blksize/per_item)-2; /* to accommodate systems with 64b pointers, and make sure same code is executed, set max list elements */
+	list_head *memblock_end=memblock+size;
+	list_data *datablock=(list_data *)(memblock_end);
+	list_data *datablock_end=datablock+size;
+	/* some useful variables */
+	ee_u32 i;
+	list_head *finder,*list=memblock;
+	list_data info;
+
+	/* create fake items for the list head and tail */
+	list->next=NULL;
+	list->info=datablock;
+	list->info->idx=0x0000;
+	list->info->data16=(ee_s16)0x8080;
+	memblock++;
+	datablock++;
+	info.idx=0x7fff;
+	info.data16=(ee_s16)0xffff;
+	core_list_insert_new(list,&info,&memblock,&datablock,memblock_end,datablock_end);
+	
+	/* then insert size items */
+	for (i=0; i<size; i++) {
+		ee_u16 datpat=((ee_u16)(seed^i) & 0xf);
+		ee_u16 dat=(datpat<<3) | (i&0x7); /* alternate between algorithms */
+		info.data16=(dat<<8) | dat;		/* fill the data with actual data and upper bits with rebuild value */
+		core_list_insert_new(list,&info,&memblock,&datablock,memblock_end,datablock_end); /* the return value (NULL once a region is full) is ignored */
+	}
+	/* and now index the list so we know initial seed order of the list */
+	finder=list->next;
+	i=1;
+	while (finder->next!=NULL) {
+		if (i<size/5) /* first 20% of the list in order */
+			finder->info->idx=i++;
+		else { 
+			ee_u16 pat=(ee_u16)(i++ ^ seed); /* get a pseudo random number */
+			finder->info->idx=0x3fff & (((i & 0x07) << 8) | pat); /* make sure the mixed items end up after the ones in sequence */
+		}
+		finder=finder->next;
+	}
+	list = core_list_mergesort(list,cmp_idx,NULL);
+#if CORE_DEBUG
+	ee_printf("Initialized list:\n");
+	finder=list;
+	while (finder) {
+		ee_printf("[%04x,%04x]",finder->info->idx,(ee_u16)finder->info->data16);
+		finder=finder->next;
+	}
+	ee_printf("\n");
+#endif
+	return list;
+}
+
+/* Function: core_list_insert_new
+	Insert a new item into the list, right after insert_point
+
+	Parameters:
+	insert_point - where to insert the item.
+	info - data for the cell (copied into the new cell).
+	memblock - in/out pointer to the next free list header; advanced on success
+	datablock - in/out pointer to the next free list data; advanced on success
+	memblock_end - end of region for list headers
+	datablock_end - end of region for list data
+
+	Returns:
+	Pointer to new item, or NULL (nothing changed) if either region is full.
+*/
+list_head *core_list_insert_new(list_head *insert_point, list_data *info, list_head **memblock, list_data **datablock
+	, list_head *memblock_end, list_data *datablock_end) {
+	list_head *newitem;
+	
+	if ((*memblock+1) >= memblock_end)
+		return NULL;
+	if ((*datablock+1) >= datablock_end)
+		return NULL;
+		
+	newitem=*memblock;
+	(*memblock)++;
+	newitem->next=insert_point->next;
+	insert_point->next=newitem;
+	
+	newitem->info=*datablock;
+	(*datablock)++;
+	copy_info(newitem->info,info);
+	
+	return newitem;
+}
+
+/* Function: core_list_remove
+	Remove an item from the list.
+
+	Operation:
+	For a singly linked list, remove by swapping the info pointers of the item and
+	the next cell, and then unlinking the next cell (which now carries the item's info).
+
+	Note:
+	since there is always a fake item at the end of the list, no need to check for NULL.
+
+	Returns:
+	The unlinked cell, holding the removed item's info; its next pointer is set to NULL.
+*/
+list_head *core_list_remove(list_head *item) {
+	list_data *tmp;
+	list_head *ret=item->next;
+	/* swap data pointers */
+	tmp=item->info;
+	item->info=ret->info;
+	ret->info=tmp;
+	/* and eliminate item */
+	item->next=item->next->next;
+	ret->next=NULL;
+	return ret;
+}
+
+/* Function: core_list_undo_remove
+	Undo a remove operation.
+
+	Operation:
+	Since we want each iteration of the benchmark to be exactly the same,
+	we need to be able to undo a remove.
+	Swap the info pointers back, then link the removed cell in right after item_modified.
+
+	Parameters:
+	item_removed - Return value from the <core_list_remove>
+	item_modified - List item that was modified during <core_list_remove>
+
+	Returns:
+	The item that was linked back to the list.
+	
+*/
+list_head *core_list_undo_remove(list_head *item_removed, list_head *item_modified) {
+	list_data *tmp;
+	/* swap data pointers */
+	tmp=item_removed->info;
+	item_removed->info=item_modified->info;
+	item_modified->info=tmp;
+	/* and insert item */
+	item_removed->next=item_modified->next;
+	item_modified->next=item_removed;
+	return item_removed;
+}
+
+/* Function: core_list_find
+	Find an item in the list
+
+	Operation:
+	Find an item by idx (if info->idx is not negative), otherwise by the low 8 bits of its data16 value
+
+	Parameters:
+	list - list head
+	info - idx or data to find
+
+	Returns:
+	Found item, or NULL if not found.
+*/
+list_head *core_list_find(list_head *list,list_data *info) {
+	if (info->idx>=0) {
+		while (list && (list->info->idx != info->idx))
+			list=list->next;
+		return list;
+	} else {
+		while (list && ((list->info->data16 & 0xff) != info->data16))
+			list=list->next;
+		return list;
+	}
+}
+/* Function: core_list_reverse
+	Reverse a list
+
+	Operation:
+	Rearrange the pointers so the list is reversed.
+
+	Parameters:
+	list - head of the list to reverse;
+	its cells are relinked in place.
+
+	Returns:
+	New head of the list (the former last item), or NULL if list was NULL.
+*/
+
+list_head *core_list_reverse(list_head *list) {
+	list_head *next=NULL, *tmp;
+	while (list) {
+		tmp=list->next;
+		list->next=next;
+		next=list;
+		list=tmp;
+	}
+	return next;
+}
+/* Function: core_list_mergesort
+	Sort the list in place without recursion.
+
+	Description:
+	Use mergesort, as for linked list this is a realistic solution.
+	Also, since this is aimed at embedded, care was taken to use iterative rather than recursive algorithm.
+	The sort can either return the list to original order (by idx) ,
+	or use the data item to invoke other algorithms and change the order of the list.
+
+	Parameters:
+	list - list to be sorted.
+	cmp - cmp function to use; res is passed through to every cmp call unchanged.
+
+	Returns:
+	New head of the list.
+
+	Note:
+	We have a special header for the list that will always be first, but the algorithm could theoretically modify where the list starts.
+	The list must not be empty: tail->next is written after every pass, and tail stays NULL when no element was visited.
+
+ */
+list_head *core_list_mergesort(list_head *list, list_cmp cmp, core_results *res) {
+    list_head *p, *q, *e, *tail;
+    ee_s32 insize, nmerges, psize, qsize, i;
+
+    insize = 1;
+
+    while (1) {
+        p = list;
+        list = NULL;
+        tail = NULL;
+
+        nmerges = 0;  /* count number of merges we do in this pass */
+
+        while (p) {
+            nmerges++;  /* there exists a merge to be done */
+            /* step `insize' places along from p */
+            q = p;
+            psize = 0;
+            for (i = 0; i < insize; i++) {
+                psize++;
+			    q = q->next;
+                if (!q) break;
+            }
+
+            /* if q hasn't fallen off end, we have two lists to merge */
+            qsize = insize;
+
+            /* now we have two lists; merge them */
+            while (psize > 0 || (qsize > 0 && q)) {
+
+				/* decide whether next element of merge comes from p or q */
+				if (psize == 0) {
+				    /* p is empty; e must come from q. */
+				    e = q; q = q->next; qsize--;
+				} else if (qsize == 0 || !q) {
+				    /* q is empty; e must come from p. */
+				    e = p; p = p->next; psize--;
+				} else if (cmp(p->info,q->info,res) <= 0) {
+				    /* First element of p is lower (or same); e must come from p. */
+				    e = p; p = p->next; psize--;
+				} else {
+				    /* First element of q is lower; e must come from q. */
+				    e = q; q = q->next; qsize--;
+				}
+
+		        /* add the next element to the merged list */
+				if (tail) {
+				    tail->next = e;
+				} else {
+				    list = e;
+				}
+				tail = e;
+	        }
+
+			/* now p has stepped `insize' places along, and q has too */
+			p = q;
+        }
+		
+	    tail->next = NULL;
+
+        /* If we have done only one merge, we're finished. */
+        if (nmerges <= 1)   /* allow for nmerges==0, the empty list case */
+            return list;
+
+        /* Otherwise repeat, merging lists twice the size */
+        insize *= 2;
+    }
+#if COMPILER_REQUIRES_SORT_RETURN
+	return list;
+#endif
+}
diff --git a/tests/performance/coremark/core_main.c b/tests/performance/coremark/core_main.c
new file mode 100644
index 000000000..61619744e
--- /dev/null
+++ b/tests/performance/coremark/core_main.c
@@ -0,0 +1,356 @@
+/*
+Copyright 2018 Embedded Microprocessor Benchmark Consortium (EEMBC)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+Original Author: Shay Gal-on
+*/
+
+/* File: core_main.c
+	This file contains the framework to acquire a block of memory, seed initial parameters, run the benchmark and report the results.
+*/
+#include "coremark.h"
+
+/* Function: iterate
+	Run the benchmark for a specified number of iterations.
+
+	Operation:
+	Clear the crc fields of the results (pres is a core_results pointer), then for each of res->iterations:
+		a - Run core_bench_list with 1 and then with -1, folding each result into res->crc.
+		b - On the first iteration only, record res->crc in res->crclist.
+
+	Returns:
+	NULL.
+*/
+static ee_u16 list_known_crc[]   =      {(ee_u16)0xd4b0,(ee_u16)0x3340,(ee_u16)0x6a79,(ee_u16)0xe714,(ee_u16)0xe3c1};
+static ee_u16 matrix_known_crc[] =      {(ee_u16)0xbe52,(ee_u16)0x1199,(ee_u16)0x5608,(ee_u16)0x1fd7,(ee_u16)0x0747};
+static ee_u16 state_known_crc[]  =      {(ee_u16)0x5e47,(ee_u16)0x39bf,(ee_u16)0xe5a4,(ee_u16)0x8e3a,(ee_u16)0x8d84};
+void *iterate(void *pres) {
+	ee_u32 i;
+	ee_u16 crc;
+	core_results *res=(core_results *)pres;
+	ee_u32 iterations=res->iterations;
+	res->crc=0;
+	res->crclist=0;
+	res->crcmatrix=0;
+	res->crcstate=0;
+
+	for (i=0; i<iterations; i++) {
+		crc=core_bench_list(res,1);
+		res->crc=crcu16(crc,res->crc);
+		crc=core_bench_list(res,-1);
+		res->crc=crcu16(crc,res->crc);
+		if (i==0) res->crclist=res->crc;
+	}
+	return NULL;
+}
+
+#if (SEED_METHOD==SEED_ARG)
+ee_s32 get_seed_args(int i, int argc, char *argv[]);
+#define get_seed(x) (ee_s16)get_seed_args(x,argc,argv)
+#define get_seed_32(x) get_seed_args(x,argc,argv)
+#else /* via function or volatile */
+ee_s32 get_seed_32(int i);
+#define get_seed(x) (ee_s16)get_seed_32(x)
+#endif
+
+#if (MEM_METHOD==MEM_STATIC)
+ee_u8 static_memblk[TOTAL_DATA_SIZE];
+#endif
+char *mem_name[3] = {"Static","Heap","Stack"};
+/* Function: main
+	Main entry routine for the benchmark.
+	This function is responsible for the following steps:
+
+	1 - Initialize input seeds from a source that cannot be determined at compile time.
+	2 - Initialize memory block for use.
+	3 - Run and time the benchmark.
+	4 - Report results, testing the validity of the output if the seeds are known.
+
+	Arguments:
+	1 - first seed  : Any value
+	2 - second seed : Must be identical to first for iterations to be identical
+	3 - third seed  : Any value, should be at least an order of magnitude less than the input size, but bigger than 32.
+	4 - Iterations  : Special, if set to 0, iterations will be automatically determined such that the benchmark will run between 10 to 100 secs
+
+*/
+
+#if MAIN_HAS_NOARGC
+MAIN_RETURN_TYPE main(void) {
+	int argc=0;
+	char *argv[1];
+#else
+MAIN_RETURN_TYPE main(int argc, char *argv[]) {
+#endif
+	ee_u16 i,j=0,num_algorithms=0;
+	ee_s16 known_id=-1,total_errors=0;
+	ee_u16 seedcrc=0;
+	CORE_TICKS total_time;
+	core_results results[MULTITHREAD];
+#if (MEM_METHOD==MEM_STACK)
+	ee_u8 stack_memblock[TOTAL_DATA_SIZE*MULTITHREAD];
+#endif
+	/* first call any initializations needed */
+	portable_init(&(results[0].port), &argc, argv);
+	/* First some checks to make sure benchmark will run ok */
+	if (sizeof(struct list_head_s)>128) {
+		ee_printf("list_head structure too big for comparable data!\n");
+		return MAIN_RETURN_VAL;
+	}
+	results[0].seed1=get_seed(1);
+	results[0].seed2=get_seed(2);
+	results[0].seed3=get_seed(3);
+	results[0].iterations=get_seed_32(4);
+#if CORE_DEBUG
+	results[0].iterations=1;
+#endif
+	results[0].execs=get_seed_32(5);
+	if (results[0].execs==0) { /* if not supplied, execute all algorithms */
+		results[0].execs=ALL_ALGORITHMS_MASK;
+	}
+		/* put in some default values based on one seed only for easy testing */
+	if ((results[0].seed1==0) && (results[0].seed2==0) && (results[0].seed3==0)) { /* performance run */
+		results[0].seed1=0;
+		results[0].seed2=0;
+		results[0].seed3=0x66;
+	}
+	if ((results[0].seed1==1) && (results[0].seed2==0) && (results[0].seed3==0)) { /* validation run */
+		results[0].seed1=0x3415;
+		results[0].seed2=0x3415;
+		results[0].seed3=0x66;
+	}
+#if (MEM_METHOD==MEM_STATIC)
+	results[0].memblock[0]=(void *)static_memblk;
+	results[0].size=TOTAL_DATA_SIZE;
+	results[0].err=0;
+	#if (MULTITHREAD>1)
+	#error "Cannot use a static data area with multiple contexts!"
+	#endif
+#elif (MEM_METHOD==MEM_MALLOC)
+	for (i=0 ; i<MULTITHREAD; i++) {
+		ee_s32 malloc_override=get_seed(7);
+		if (malloc_override != 0) 
+			results[i].size=malloc_override;
+		else
+			results[i].size=TOTAL_DATA_SIZE;
+		results[i].memblock[0]=portable_malloc(results[i].size);
+		results[i].seed1=results[0].seed1;
+		results[i].seed2=results[0].seed2;
+		results[i].seed3=results[0].seed3;
+		results[i].err=0;
+		results[i].execs=results[0].execs;
+	}
+#elif (MEM_METHOD==MEM_STACK)
+	for (i=0 ; i<MULTITHREAD; i++) {
+		results[i].memblock[0]=stack_memblock+i*TOTAL_DATA_SIZE;
+		results[i].size=TOTAL_DATA_SIZE;
+		results[i].seed1=results[0].seed1;
+		results[i].seed2=results[0].seed2;
+		results[i].seed3=results[0].seed3;
+		results[i].err=0;
+		results[i].execs=results[0].execs;
+	}
+#else
+#error "Please define a way to initialize a memory block."
+#endif
+	/* Data init */ 
+	/* Find out how much space we have based on number of algorithms */
+	for (i=0; i<NUM_ALGORITHMS; i++) {
+		if ((1<<(ee_u32)i) & results[0].execs)
+			num_algorithms++;
+	}
+	for (i=0 ; i<MULTITHREAD; i++) 
+		results[i].size=results[i].size/num_algorithms;
+	/* Assign pointers */
+	for (i=0; i<NUM_ALGORITHMS; i++) {
+		ee_u32 ctx;
+		if ((1<<(ee_u32)i) & results[0].execs) {
+			for (ctx=0 ; ctx<MULTITHREAD; ctx++)
+				results[ctx].memblock[i+1]=(char *)(results[ctx].memblock[0])+results[0].size*j;
+			j++;
+		}
+	}
+	/* call inits */
+	for (i=0 ; i<MULTITHREAD; i++) {
+		if (results[i].execs & ID_LIST) {
+			results[i].list=core_list_init(results[0].size,results[i].memblock[1],results[i].seed1);
+		}
+		if (results[i].execs & ID_MATRIX) {
+			core_init_matrix(results[0].size, results[i].memblock[2], (ee_s32)results[i].seed1 | (((ee_s32)results[i].seed2) << 16), &(results[i].mat) );
+		}
+		if (results[i].execs & ID_STATE) {
+			core_init_state(results[0].size,results[i].seed1,results[i].memblock[3]);
+		}
+	}
+	
+	/* automatically determine number of iterations if not set */
+	if (results[0].iterations==0) { 
+		secs_ret secs_passed=0;
+		ee_u32 divisor;
+		results[0].iterations=1;
+		while (secs_passed < (secs_ret)1) {
+			results[0].iterations*=10;
+			start_time();
+			iterate(&results[0]);
+			stop_time();
+			secs_passed=time_in_secs(get_time());
+		}
+		/* now we know it executes for at least 1 sec, set actual run time at about 10 secs */
+		divisor=(ee_u32)secs_passed;
+		if (divisor==0) /* some machines cast float to int as 0 since this conversion is not defined by ANSI, but we know at least one second passed */
+			divisor=1;
+		results[0].iterations*=1+10/divisor;
+	}
+	/* perform actual benchmark */
+	start_time();
+#if (MULTITHREAD>1)
+	if (default_num_contexts>MULTITHREAD) {
+		default_num_contexts=MULTITHREAD;
+	}
+	for (i=0 ; i<default_num_contexts; i++) {
+		results[i].iterations=results[0].iterations;
+		results[i].execs=results[0].execs;
+		core_start_parallel(&results[i]);
+	}
+	for (i=0 ; i<default_num_contexts; i++) {
+		core_stop_parallel(&results[i]);
+	}
+#else
+	iterate(&results[0]);
+#endif
+	stop_time();
+	total_time=get_time();
+	/* get a function of the input to report */
+	seedcrc=crc16(results[0].seed1,seedcrc);
+	seedcrc=crc16(results[0].seed2,seedcrc);
+	seedcrc=crc16(results[0].seed3,seedcrc);
+	seedcrc=crc16(results[0].size,seedcrc);
+	
+	switch (seedcrc) { /* test known output for common seeds */
+		case 0x8a02: /* seed1=0, seed2=0, seed3=0x66, size 2000 per algorithm */
+			known_id=0;
+			ee_printf("6k performance run parameters for coremark.\n");
+			break;
+		case 0x7b05: /*  seed1=0x3415, seed2=0x3415, seed3=0x66, size 2000 per algorithm */
+			known_id=1;
+			ee_printf("6k validation run parameters for coremark.\n");
+			break;
+		case 0x4eaf: /* seed1=0x8, seed2=0x8, seed3=0x8, size 400 per algorithm */
+			known_id=2;
+			ee_printf("Profile generation run parameters for coremark.\n");
+			break;
+		case 0xe9f5: /* seed1=0, seed2=0, seed3=0x66, size 666 per algorithm */
+			known_id=3;
+			ee_printf("2K performance run parameters for coremark.\n");
+			break;
+		case 0x18f2: /*  seed1=0x3415, seed2=0x3415, seed3=0x66, size 666 per algorithm */
+			known_id=4;
+			ee_printf("2K validation run parameters for coremark.\n");
+			break;
+		default:
+			total_errors=-1;
+			break;
+	}
+	if (known_id>=0) {
+		for (i=0 ; i<default_num_contexts; i++) {
+			results[i].err=0;
+			if ((results[i].execs & ID_LIST) && 
+				(results[i].crclist!=list_known_crc[known_id])) {
+				ee_printf("[%u]ERROR! list crc 0x%04x - should be 0x%04x\n",i,results[i].crclist,list_known_crc[known_id]);
+				results[i].err++;
+			}
+			if ((results[i].execs & ID_MATRIX) &&
+				(results[i].crcmatrix!=matrix_known_crc[known_id])) {
+				ee_printf("[%u]ERROR! matrix crc 0x%04x - should be 0x%04x\n",i,results[i].crcmatrix,matrix_known_crc[known_id]);
+				results[i].err++;
+			}
+			if ((results[i].execs & ID_STATE) &&
+				(results[i].crcstate!=state_known_crc[known_id])) {
+				ee_printf("[%u]ERROR! state crc 0x%04x - should be 0x%04x\n",i,results[i].crcstate,state_known_crc[known_id]);
+				results[i].err++;
+			}
+			total_errors+=results[i].err;
+		}
+	}
+	total_errors+=check_data_types();
+	/* and report results */
+	ee_printf("CoreMark Size    : %lu\n", (long unsigned) results[0].size);
+	ee_printf("Total ticks      : %lu\n", (long unsigned) total_time);
+#if HAS_FLOAT
+	ee_printf("Total time (secs): %f\n",time_in_secs(total_time));
+	if (time_in_secs(total_time) > 0)
+		ee_printf("Iterations/Sec   : %f\n",default_num_contexts*results[0].iterations/time_in_secs(total_time));
+#else 
+	ee_printf("Total time (secs): %d\n",time_in_secs(total_time));
+	if (time_in_secs(total_time) > 0)
+		ee_printf("Iterations/Sec   : %d\n",default_num_contexts*results[0].iterations/time_in_secs(total_time));
+#endif
+	if (time_in_secs(total_time) < 10) {
+		ee_printf("ERROR! Must execute for at least 10 secs for a valid result!\n");
+		total_errors++;
+	}
+
+	ee_printf("Iterations       : %lu\n", (long unsigned) default_num_contexts*results[0].iterations);
+	ee_printf("Compiler version : %s\n",COMPILER_VERSION);
+	ee_printf("Compiler flags   : %s\n",COMPILER_FLAGS);
+#if (MULTITHREAD>1)
+	ee_printf("Parallel %s : %d\n",PARALLEL_METHOD,default_num_contexts);
+#endif
+	ee_printf("Memory location  : %s\n",MEM_LOCATION);
+	/* output for verification */
+	ee_printf("seedcrc          : 0x%04x\n",seedcrc);
+	if (results[0].execs & ID_LIST)
+		for (i=0 ; i<default_num_contexts; i++) 
+			ee_printf("[%d]crclist       : 0x%04x\n",i,results[i].crclist);
+	if (results[0].execs & ID_MATRIX) 
+		for (i=0 ; i<default_num_contexts; i++) 
+			ee_printf("[%d]crcmatrix     : 0x%04x\n",i,results[i].crcmatrix);
+	if (results[0].execs & ID_STATE)
+		for (i=0 ; i<default_num_contexts; i++) 
+			ee_printf("[%d]crcstate      : 0x%04x\n",i,results[i].crcstate);
+	for (i=0 ; i<default_num_contexts; i++) 
+		ee_printf("[%d]crcfinal      : 0x%04x\n",i,results[i].crc);
+	if (total_errors==0) {
+		ee_printf("Correct operation validated. See README.md for run and reporting rules.\n");
+#if HAS_FLOAT
+		if (known_id==3) {
+			ee_printf("CoreMark 1.0 : %f / %s %s",default_num_contexts*results[0].iterations/time_in_secs(total_time),COMPILER_VERSION,COMPILER_FLAGS);
+#if defined(MEM_LOCATION) && !defined(MEM_LOCATION_UNSPEC)
+			ee_printf(" / %s",MEM_LOCATION);
+#else
+			ee_printf(" / %s",mem_name[MEM_METHOD]);
+#endif
+
+#if (MULTITHREAD>1)
+			ee_printf(" / %d:%s",default_num_contexts,PARALLEL_METHOD);
+#endif
+			ee_printf("\n");
+		}
+#endif
+	}
+	if (total_errors>0)
+		ee_printf("Errors detected\n");
+	if (total_errors<0)
+		ee_printf("Cannot validate operation for these seed values, please compare with results on a known platform.\n");
+
+#if (MEM_METHOD==MEM_MALLOC)
+	for (i=0 ; i<MULTITHREAD; i++) 
+		portable_free(results[i].memblock[0]);
+#endif
+	/* And last call any target specific code for finalizing */
+	portable_fini(&(results[0].port));
+
+	return MAIN_RETURN_VAL;	
+}
+
+
diff --git a/tests/performance/coremark/core_matrix.c b/tests/performance/coremark/core_matrix.c
new file mode 100644
index 000000000..ebfa1d7e5
--- /dev/null
+++ b/tests/performance/coremark/core_matrix.c
@@ -0,0 +1,308 @@
+/*
+Copyright 2018 Embedded Microprocessor Benchmark Consortium (EEMBC)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+Original Author: Shay Gal-on
+*/
+
+#include "coremark.h"
+/*
+Topic: Description
+	Matrix manipulation benchmark
+	
+	This very simple algorithm forms the basis of many more complex algorithms. 
+	
+	The tight inner loop is the focus of many optimizations (compiler as well as hardware based) 
+	and is thus relevant for embedded processing. 
+	
+	The total available data space will be divided to 3 parts:
+	NxN Matrix A - initialized with small values (upper 3/4 of the bits all zero).
+	NxN Matrix B - initialized with medium values (upper half of the bits all zero).
+	NxN Matrix C - used for the result.
+
+	The actual values for A and B must be derived based on input that is not available at compile time.
+*/
+ee_s16 matrix_test(ee_u32 N, MATRES *C, MATDAT *A, MATDAT *B, MATDAT val);
+ee_s16 matrix_sum(ee_u32 N, MATRES *C, MATDAT clipval);
+void matrix_mul_const(ee_u32 N, MATRES *C, MATDAT *A, MATDAT val);
+void matrix_mul_vect(ee_u32 N, MATRES *C, MATDAT *A, MATDAT *B);
+void matrix_mul_matrix(ee_u32 N, MATRES *C, MATDAT *A, MATDAT *B);
+void matrix_mul_matrix_bitextract(ee_u32 N, MATRES *C, MATDAT *A, MATDAT *B);
+void matrix_add_const(ee_u32 N, MATDAT *A, MATDAT val);
+
+#define matrix_test_next(x) (x+1)
+#define matrix_clip(x,y) ((y) ? (x) & 0x0ff : (x) & 0x0ffff)
+#define matrix_big(x) (0xf000 | (x))
+#define bit_extract(x,from,to) (((x)>>(from)) & (~(0xffffffff << (to))))
+
+#if CORE_DEBUG /* debug-only helpers: dump an NxN matrix through ee_printf, one comma-separated row per line */
+void printmat(MATDAT *A, ee_u32 N, char *name) {
+	ee_u32 i,j;
+	ee_printf("Matrix %s [%dx%d]:\n",name,N,N);
+	for (i=0; i<N; i++) {
+		for (j=0; j<N; j++) {
+			if (j!=0)
+				ee_printf(",");
+			ee_printf("%d",A[i*N+j]);
+		}
+		ee_printf("\n");
+	}
+}
+void printmatC(MATRES *C, ee_u32 N, char *name) { /* same as printmat, for a MATRES result matrix */
+	ee_u32 i,j;
+	ee_printf("Matrix %s [%dx%d]:\n",name,N,N);
+	for (i=0; i<N; i++) {
+		for (j=0; j<N; j++) {
+			if (j!=0)
+				ee_printf(",");
+			ee_printf("%d",C[i*N+j]);
+		}
+		ee_printf("\n");
+	}
+}
+#endif
+/* Function: core_bench_matrix
+	Benchmark function
+
+	Run <matrix_test> once on the matrices in p, using seed as the constant value,
+	and fold its result into crc with crc16. Returns the updated crc.
+*/
+ee_u16 core_bench_matrix(mat_params *p, ee_s16 seed, ee_u16 crc) {
+	ee_u32 N=p->N;
+	MATRES *C=p->C;
+	MATDAT *A=p->A;
+	MATDAT *B=p->B;
+	MATDAT val=(MATDAT)seed;
+
+	crc=crc16(matrix_test(N,C,A,B,val),crc);
+
+	return crc;
+}
+
+/* Function: matrix_test
+	Perform matrix manipulation.
+
+	Parameters:
+	N - Dimensions of the matrix.
+	C - memory for result matrix.
+	A - input matrix
+	B - operator matrix (not changed during operations)
+	val - constant added to A and used as the multiplier; also sets the clip value for <matrix_sum>.
+
+	Returns:
+	A CRC value that captures all results calculated in the function.
+	In particular, crc of the value calculated on the result matrix after each multiply step by <matrix_sum>.
+
+	Operation:
+	1 - Add a constant value to all elements of a matrix.
+	2 - Multiply a matrix by a constant.
+	3 - Multiply a matrix by a vector.
+	4 - Multiply a matrix by a matrix.
+	5 - Multiply a matrix by a matrix, extracting bit fields from each product.
+	6 - Subtract the constant from all elements of the matrix.
+
+	After the last step, matrix A is back to original contents.
+*/
+ee_s16 matrix_test(ee_u32 N, MATRES *C, MATDAT *A, MATDAT *B, MATDAT val) {
+	ee_u16 crc=0;
+	MATDAT clipval=matrix_big(val);
+
+	matrix_add_const(N,A,val); /* make sure data changes  */
+#if CORE_DEBUG
+	printmat(A,N,"matrix_add_const");
+#endif
+	matrix_mul_const(N,C,A,val);
+	crc=crc16(matrix_sum(N,C,clipval),crc);
+#if CORE_DEBUG
+	printmatC(C,N,"matrix_mul_const");
+#endif
+	matrix_mul_vect(N,C,A,B);
+	crc=crc16(matrix_sum(N,C,clipval),crc);
+#if CORE_DEBUG
+	printmatC(C,N,"matrix_mul_vect");
+#endif
+	matrix_mul_matrix(N,C,A,B);
+	crc=crc16(matrix_sum(N,C,clipval),crc);
+#if CORE_DEBUG
+	printmatC(C,N,"matrix_mul_matrix");
+#endif
+	matrix_mul_matrix_bitextract(N,C,A,B);
+	crc=crc16(matrix_sum(N,C,clipval),crc);
+#if CORE_DEBUG
+	printmatC(C,N,"matrix_mul_matrix_bitextract");
+#endif
+	
+	matrix_add_const(N,A,-val); /* return matrix to initial value */
+	return crc;
+}
+
+/* Function : core_init_matrix
+	Initialize the memory block for matrix benchmarking.
+
+	Parameters:
+	blksize - Size of memory to be initialized.
+	memblk - Pointer to memory block (aligned with align_mem before use).
+	seed - Actual values chosen depend on the seed parameter; a seed of 0 is replaced by 1.
+	p - pointers to <mat_params> containing initialized matrixes (A, B, C and N are filled in).
+
+	Returns:
+	Matrix dimension N, the largest N for which N*N*2*4 is still below blksize.
+	
+	Note:
+	The seed parameter MUST be supplied from a source that cannot be determined at compile time
+*/
+ee_u32 core_init_matrix(ee_u32 blksize, void *memblk, ee_s32 seed, mat_params *p) {
+	ee_u32 N=0;
+	MATDAT *A;
+	MATDAT *B;
+	ee_s32 order=1;
+	MATDAT val;
+	ee_u32 i=0,j=0;
+	if (seed==0)
+		seed=1;
+	while (j<blksize) {
+		i++;
+		j=i*i*2*4;		
+	}
+	N=i-1;
+	A=(MATDAT *)align_mem(memblk);
+	B=A+N*N;
+
+	for (i=0; i<N; i++) {
+		for (j=0; j<N; j++) {
+			seed = ( ( order * seed ) % 65536 );
+			val = (seed + order);
+			val=matrix_clip(val,0);
+			B[i*N+j] = val;
+			val =  (val + order);
+			val=matrix_clip(val,1);
+			A[i*N+j] = val;
+			order++;
+		}
+	}
+
+	p->A=A;
+	p->B=B;
+	p->C=(MATRES *)align_mem(B+N*N);
+	p->N=N;
+#if CORE_DEBUG
+	printmat(A,N,"A");
+	printmat(B,N,"B");
+#endif
+	return N;
+}
+
+/* Function: matrix_sum
+	Calculate a function that depends on the values of elements in the matrix.
+
+	For each element, accumulate into a temporary variable.
+
+	As long as this value is not above the parameter clipval,
+	add 1 to the result if the element is bigger than the previous.
+
+	Otherwise, reset the accumulator and add 10 to the result.
+*/
+ee_s16 matrix_sum(ee_u32 N, MATRES *C, MATDAT clipval) {
+	MATRES tmp=0,prev=0,cur=0;
+	ee_s16 ret=0;
+	ee_u32 i,j;
+	for (i=0; i<N; i++) {
+		for (j=0; j<N; j++) {
+			cur=C[i*N+j];
+			tmp+=cur;
+			if (tmp>clipval) {
+				ret+=10;
+				tmp=0;
+			} else {
+				ret += (cur>prev) ? 1 : 0;
+			}
+			prev=cur;
+		}
+	}
+	return ret;
+}
+
+/* Function: matrix_mul_const
+	Multiply every element of the NxN matrix A by the constant val and store the products in C.
+	This could be used as a scaler for instance.
+*/
+void matrix_mul_const(ee_u32 N, MATRES *C, MATDAT *A, MATDAT val) {
+	ee_u32 i,j;
+	for (i=0; i<N; i++) {
+		for (j=0; j<N; j++) {
+			C[i*N+j]=(MATRES)A[i*N+j] * (MATRES)val;
+		}
+	}
+}
+
+/* Function: matrix_add_const
+	Add a constant value to all elements of the NxN matrix A, in place.
+*/
+void matrix_add_const(ee_u32 N, MATDAT *A, MATDAT val) {
+	ee_u32 i,j;
+	for (i=0; i<N; i++) {
+		for (j=0; j<N; j++) {
+			A[i*N+j] += val;
+		}
+	}
+}
+
+/* Function: matrix_mul_vect
+	Multiply the NxN matrix A by the vector held in the first N elements of B; only C[0..N-1] is written.
+	This is common in many simple filters (e.g. fir where a vector of coefficients is applied to the matrix.)
+*/
+void matrix_mul_vect(ee_u32 N, MATRES *C, MATDAT *A, MATDAT *B) {
+	ee_u32 i,j;
+	for (i=0; i<N; i++) {
+		C[i]=0;
+		for (j=0; j<N; j++) {
+			C[i]+=(MATRES)A[i*N+j] * (MATRES)B[j];
+		}
+	}
+}
+
+/* Function: matrix_mul_matrix
+	Multiply the NxN matrix A by the NxN matrix B and store the result in C (row-major indexing).
+	Basic code is used in many algorithms, mostly with minor changes such as scaling.
+*/
+void matrix_mul_matrix(ee_u32 N, MATRES *C, MATDAT *A, MATDAT *B) {
+	ee_u32 i,j,k;
+	for (i=0; i<N; i++) {
+		for (j=0; j<N; j++) {
+			C[i*N+j]=0;
+			for(k=0;k<N;k++)
+			{
+				C[i*N+j]+=(MATRES)A[i*N+k] * (MATRES)B[k*N+j];
+			}
+		}
+	}
+}
+
+/* Function: matrix_mul_matrix_bitextract
+	Multiply a matrix by a matrix, but accumulate the product of two bit fields taken from each A*B term.
+	Per the bit_extract macro, the fields are 4 bits starting at bit 2 and 7 bits starting at bit 5.
+*/
+void matrix_mul_matrix_bitextract(ee_u32 N, MATRES *C, MATDAT *A, MATDAT *B) {
+	ee_u32 i,j,k;
+	for (i=0; i<N; i++) {
+		for (j=0; j<N; j++) {
+			C[i*N+j]=0;
+			for(k=0;k<N;k++)
+			{
+				MATRES tmp=(MATRES)A[i*N+k] * (MATRES)B[k*N+j];
+				C[i*N+j]+=bit_extract(tmp,2,4)*bit_extract(tmp,5,7);
+			}
+		}
+	}
+}
diff --git a/tests/performance/coremark/core_portme.c b/tests/performance/coremark/core_portme.c
new file mode 100644
index 000000000..198d75dfe
--- /dev/null
+++ b/tests/performance/coremark/core_portme.c
@@ -0,0 +1,168 @@
+/*
+Copyright 2018 Embedded Microprocessor Benchmark Consortium (EEMBC)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+Original Author: Shay Gal-on
+*/
+#include "coremark.h"
+#include "core_portme.h"
+
+extern uint32_t Arduino_millis();
+
+#if (MULTITHREAD > 1)
+static uint8_t next_core = 0;
+#endif
+
+#if VALIDATION_RUN
+	volatile ee_s32 seed1_volatile=0x3415;
+	volatile ee_s32 seed2_volatile=0x3415;
+	volatile ee_s32 seed3_volatile=0x66;
+#endif
+#if PERFORMANCE_RUN
+	volatile ee_s32 seed1_volatile=0x0;
+	volatile ee_s32 seed2_volatile=0x0;
+	volatile ee_s32 seed3_volatile=0x66;
+#endif
+#if PROFILE_RUN
+	volatile ee_s32 seed1_volatile=0x8;
+	volatile ee_s32 seed2_volatile=0x8;
+	volatile ee_s32 seed3_volatile=0x8;
+#endif
+	volatile ee_s32 seed4_volatile=ITERATIONS;
+	volatile ee_s32 seed5_volatile=0;
+/* Porting : Timing functions
+	How to capture time and convert to seconds must be ported to whatever is supported by the platform.
+	e.g. Read value from on board RTC, read value from cpu clock cycles performance counter etc.
+	This port returns the value of Arduino_millis() as the raw tick count.
+*/
+CORETIMETYPE barebones_clock() {
+	return Arduino_millis();
+}
+/* Define : TIMER_RES_DIVIDER
+	Divider to trade off timer resolution and total time that can be measured.
+
+	Use lower values to increase resolution, but make sure that overflow does not occur.
+	If there are issues with the return value overflowing, increase this value.
+	*/
+#define CLOCKS_PER_SEC 1000.0
+#define TIMER_RES_DIVIDER 1
+
+#define GETMYTIME(_t) (*_t=barebones_clock())
+#define MYTIMEDIFF(fin,ini) ((fin)-(ini))
+#define TIMER_RES_DIVIDER 1
+#define SAMPLE_TIME_IMPLEMENTATION 1
+#define EE_TICKS_PER_SEC (CLOCKS_PER_SEC / TIMER_RES_DIVIDER)
+
+/** Define Host specific (POSIX), or target specific global time variables. */
+static CORETIMETYPE start_time_val, stop_time_val;
+
+/* Function : start_time
+	This function will be called right before starting the timed portion of the benchmark.
+
+	Implementation may capture a system timer or zero some system parameters (e.g. a cpu cycle counter).
+	This port captures barebones_clock() into start_time_val.
+*/
+void start_time(void) {
+	GETMYTIME(&start_time_val );
+}
+/* Function : stop_time
+	This function will be called right after ending the timed portion of the benchmark.
+
+	Implementation may capture a system timer or other system parameters (e.g. a cpu cycle counter).
+	This port captures barebones_clock() into stop_time_val.
+*/
+void stop_time(void) {
+	GETMYTIME(&stop_time_val );
+}
+/* Function : get_time
+	Return an abstract "ticks" number that signifies time on the system.
+
+	Actual value returned may be cpu cycles, milliseconds or any other value,
+	as long as it can be converted to seconds by <time_in_secs>.
+	This methodology is taken to accommodate any hardware or simulated platform.
+	Here the value is stop_time_val minus start_time_val, i.e. the ticks between
+	the last <start_time> and <stop_time> calls.
+*/
+CORE_TICKS get_time(void) {
+	CORE_TICKS elapsed=(CORE_TICKS)(MYTIMEDIFF(stop_time_val, start_time_val));
+	return elapsed;
+}
+/* Function : time_in_secs
+	Convert the value returned by get_time to seconds.
+
+	The <secs_ret> type is used to accommodate systems with no support for floating point.
+	The conversion divides ticks by the EE_TICKS_PER_SEC macro defined above.
+*/
+secs_ret time_in_secs(CORE_TICKS ticks) {
+	secs_ret retval=((secs_ret)ticks) / (secs_ret)EE_TICKS_PER_SEC;
+	return retval;
+}
+
+ee_u32 default_num_contexts = MULTITHREAD;
+
+/* Function : portable_init
+	Target specific initialization code
+	Reports an error if ee_ptr_int or ee_u32 has the wrong size, then sets p->portable_id to 1.
+	argc and argv are not used by this port.
+*/
+void portable_init(core_portable *p, int *argc, char *argv[])
+{
+	// Serial.begin(9600);
+	// #error "Call board initialization routines in portable init (if needed), in particular initialize UART!\n"
+	if (sizeof(ee_ptr_int) != sizeof(ee_u8 *)) {
+		ee_printf("ERROR! Please define ee_ptr_int to a type that holds a pointer!\n");
+	}
+	if (sizeof(ee_u32) != 4) {
+		ee_printf("ERROR! Please define ee_u32 to a 32b unsigned type!\n");
+	}
+	p->portable_id=1;
+}
+/* Function : portable_fini
+	Target specific final code: resets p->portable_id to 0.
+*/
+void portable_fini(core_portable *p)
+{
+	p->portable_id=0;
+}
+
+void iterate_task(void *arg)
+{ /* task entry used by core_start_parallel: run the benchmark on arg, then delete the calling task */
+  iterate(arg);
+  vTaskDelete(NULL);
+}
+
+#if (MULTITHREAD > 1)
+ee_u8 core_start_parallel(core_results *res)
+{ /* start iterate_task for res on core next_core; returns the xTaskCreatePinnedToCore result cast to ee_u8 */
+    int ret;
+    res->port.task = NULL;
+    ret = xTaskCreatePinnedToCore(iterate_task,      /* Function to implement the task */
+                                  "CoreMarkTask",    /* Name of the task */
+                                  10000,             /* Stack size (NOTE(review): ESP-IDF counts bytes, not words - confirm) */
+                                  (void *)res,       /* Task input parameter */
+                                  20,                /* Priority of the task */
+                                  &(res->port.task), /* Task handle */
+                                  next_core);        /* Core where the task should run */
+
+    next_core = (next_core + 1) % MULTITHREAD; /* rotate through cores 0..MULTITHREAD-1 for successive tasks */
+    return (ee_u8) ret;
+}
+
+ee_u8 core_stop_parallel(core_results *res)
+{ /* wait for the task started by core_start_parallel to finish; always returns 0 */
+    while (eTaskGetState(res->port.task) != eDeleted); /* busy-wait until the task reports eDeleted */
+    res->port.task = NULL;
+    return 0;
+}
+#endif
+
diff --git a/tests/performance/coremark/core_portme.h b/tests/performance/coremark/core_portme.h
new file mode 100644
index 000000000..9511aafba
--- /dev/null
+++ b/tests/performance/coremark/core_portme.h
@@ -0,0 +1,217 @@
+#include "Arduino.h"
+#include <stdint.h>
+#include <stdio.h>
+
+// a minor hack to rename the main function, so we can call it from C++
+#define main(ignore) coremark_main(void)
+
+#define FLAGS_STR "(flags unknown)"
+
+#define PERFORMANCE_RUN 1
+
+// 0 means auto-detect number of iterations for 10 second test
+#define ITERATIONS 0
+
+/*
+Copyright 2018 Embedded Microprocessor Benchmark Consortium (EEMBC)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+Original Author: Shay Gal-on
+*/
+/* Topic : Description
+	This file contains configuration constants required to execute on different platforms
+*/
+#ifndef CORE_PORTME_H
+#define CORE_PORTME_H
+/************************/
+/* Data types and settings */
+/************************/
+/* Configuration : HAS_FLOAT
+	Define to 1 if the platform supports floating point.
+*/
+#ifndef HAS_FLOAT
+#define HAS_FLOAT 1
+#endif
+/* Configuration : HAS_TIME_H
+	Define to 1 if platform has the time.h header file,
+	and implementation of functions thereof.
+*/
+#ifndef HAS_TIME_H
+#define HAS_TIME_H 0
+#endif
+/* Configuration : USE_CLOCK
+	Define to 1 if platform has the time.h header file,
+	and implementation of functions thereof.
+*/
+#ifndef USE_CLOCK
+#define USE_CLOCK 0
+#endif
+/* Configuration : HAS_STDIO
+	Define to 1 if the platform has stdio.h.
+*/
+#ifndef HAS_STDIO
+#define HAS_STDIO 1
+#endif
+/* Configuration : HAS_PRINTF
+	Define to 1 if the platform has stdio.h and implements the printf function.
+*/
+#ifndef HAS_PRINTF
+#define HAS_PRINTF 0
+#endif
+
+
+/* Definitions : COMPILER_VERSION, COMPILER_FLAGS, MEM_LOCATION
+	Initialize these strings per platform
+*/
+#ifndef COMPILER_VERSION
+ #ifdef __GNUC__
+ #define COMPILER_VERSION "GCC"__VERSION__
+ #else
+ #define COMPILER_VERSION "Please put compiler version here (e.g. gcc 4.1)"
+ #endif
+#endif
+#ifndef COMPILER_FLAGS
+ #define COMPILER_FLAGS FLAGS_STR /* "Please put compiler flags here (e.g. -o3)" */
+#endif
+#ifndef MEM_LOCATION
+ #define MEM_LOCATION "STACK"
+#endif
+
+/* Data Types :
+	To avoid compiler issues, define the data types that need to be used for 8b, 16b and 32b in <core_portme.h>.
+
+	*Important* :
+	ee_ptr_int needs to be the data type used to hold pointers, otherwise coremark may fail!!!
+*/
+typedef int16_t  ee_s16;
+typedef uint16_t ee_u16;
+typedef int32_t  ee_s32;
+typedef double   ee_f32;
+typedef uint8_t  ee_u8;
+typedef uint32_t ee_u32;
+typedef uintptr_t ee_ptr_int;
+typedef size_t ee_size_t;
+#define NULL ((void *)0)
+/* align_mem :
+	This macro is used to align an offset to point to a 32b value. It is used in the Matrix algorithm to initialize the input memory blocks.
+*/
+#define align_mem(x) (void *)(4 + (((ee_ptr_int)(x) - 1) & ~3))
+
+/* Configuration : CORE_TICKS
+	Define type of return from the timing functions.
+ */
+#define CORETIMETYPE ee_u32
+typedef ee_u32 CORE_TICKS;
+
+/* Configuration : SEED_METHOD
+	Defines method to get seed values that cannot be computed at compile time.
+
+	Valid values :
+	SEED_ARG - from command line.
+	SEED_FUNC - from a system function.
+	SEED_VOLATILE - from volatile variables.
+*/
+#ifndef SEED_METHOD
+#define SEED_METHOD SEED_VOLATILE
+#endif
+
+/* Configuration : MEM_METHOD
+	Defines method to get a block of memory.
+
+	Valid values :
+	MEM_MALLOC - for platforms that implement malloc and have malloc.h.
+	MEM_STATIC - to use a static memory array.
+	MEM_STACK - to allocate the data block on the stack (NYI).
+*/
+#ifndef MEM_METHOD
+#define MEM_METHOD MEM_STACK
+#endif
+
+/* Configuration : MULTITHREAD
+	Define for parallel execution
+
+	Valid values :
+	1 - only one context (default).
+	N>1 - will execute N copies in parallel.
+
+	Note :
+	If this flag is defined to more than 1, an implementation for launching parallel contexts must be defined.
+
+	Two sample implementations are provided. Use <USE_PTHREAD> or <USE_FORK> to enable them.
+
+	It is valid to have a different implementation of <core_start_parallel> and <core_stop_parallel> in <core_portme.c>,
+	to fit a particular architecture.
+*/
+#ifndef MULTITHREAD
+#define MULTITHREAD CONFIG_SOC_CPU_CORES_NUM
+#define PARALLEL_METHOD "FreeRTOS"
+#define USE_PTHREAD 0
+#define USE_FORK 0
+#define USE_SOCKET 0
+#endif
+
+/* Configuration : MAIN_HAS_NOARGC
+	Needed if platform does not support getting arguments to main.
+
+	Valid values :
+	0 - argc/argv to main is supported
+	1 - argc/argv to main is not supported
+
+	Note :
+	This flag only matters if MULTITHREAD has been defined to a value greater than 1.
+*/
+#ifndef MAIN_HAS_NOARGC
+#define MAIN_HAS_NOARGC 1
+#endif
+
+/* Configuration : MAIN_HAS_NORETURN
+	Needed if platform does not support returning a value from main.
+
+	Valid values :
+	0 - main returns an int, and return value will be 0.
+	1 - platform does not support returning a value from main
+*/
+#ifndef MAIN_HAS_NORETURN
+#define MAIN_HAS_NORETURN 0
+#endif
+
+/* Variable : default_num_contexts
+	Default number of benchmark contexts; core_portme.c initializes it to MULTITHREAD.
+*/
+extern ee_u32 default_num_contexts;
+
+typedef struct CORE_PORTABLE_S {
+#if (MULTITHREAD > 1)
+  TaskHandle_t task; /* FreeRTOS task running this context; set by core_start_parallel, reset to NULL by core_stop_parallel */
+#endif
+	ee_u8	portable_id; /* 1 after portable_init, 0 after portable_fini */
+} core_portable;
+
+/* target specific init/fini */
+void portable_init(core_portable *p, int *argc, char *argv[]);
+void portable_fini(core_portable *p);
+
+#if !defined(PROFILE_RUN) && !defined(PERFORMANCE_RUN) && !defined(VALIDATION_RUN)
+#if (TOTAL_DATA_SIZE==1200)
+#define PROFILE_RUN 1
+#elif (TOTAL_DATA_SIZE==2000)
+#define PERFORMANCE_RUN 1
+#else
+#define VALIDATION_RUN 1
+#endif
+#endif
+
+int ee_printf(const char *fmt, ...);
+
+#endif /* CORE_PORTME_H */
diff --git a/tests/performance/coremark/core_state.c b/tests/performance/coremark/core_state.c
new file mode 100644
index 000000000..bb3193308
--- /dev/null
+++ b/tests/performance/coremark/core_state.c
@@ -0,0 +1,277 @@
+/*
+Copyright 2018 Embedded Microprocessor Benchmark Consortium (EEMBC)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+Original Author: Shay Gal-on
+*/
+
+#include "coremark.h"
+/* local functions */
+enum CORE_STATE core_state_transition( ee_u8 **instr , ee_u32 *transition_count);
+
+/*
+Topic: Description
+	Simple state machines like this one are used in many embedded products.
+	
+	For more complex state machines, sometimes a state transition table implementation is used instead, 
+	trading speed of direct coding for ease of maintenance.
+	
+	Since the main goal of using a state machine in CoreMark is to exercise the switch/if behaviour,
+	we are using a small moore machine. 
+	
+	In particular, this machine tests type of string input,
+	trying to determine whether the input is a number or something else.
+	(see core_state.png).
+*/
+
+/* Function: core_bench_state
+	Benchmark function: returns crc updated (crcu32) with the final-state and transition counts.
+
+	Go over the input twice, once direct, and once after introducing some corruption.
+*/
+ee_u16 core_bench_state(ee_u32 blksize, ee_u8 *memblock, 
+		ee_s16 seed1, ee_s16 seed2, ee_s16 step, ee_u16 crc) 
+{
+	ee_u32 final_counts[NUM_CORE_STATES];
+	ee_u32 track_counts[NUM_CORE_STATES];
+	ee_u8 *p=memblock;
+	ee_u32 i;
+
+
+#if CORE_DEBUG
+	ee_printf("State Bench: %d,%d,%d,%04x\n",seed1,seed2,step,crc);
+#endif
+	for (i=0; i<NUM_CORE_STATES; i++) {
+		final_counts[i]=track_counts[i]=0;
+	}
+	/* run the state machine over the input */
+	while (*p!=0) {
+		enum CORE_STATE fstate=core_state_transition(&p,track_counts);
+		final_counts[fstate]++;
+#if CORE_DEBUG
+	ee_printf("%d,",fstate);
+	}
+	ee_printf("\n");
+#else
+	}
+#endif
+	p=memblock;
+	while (p < (memblock+blksize)) { /* insert some corruption */
+		if (*p!=',')
+			*p^=(ee_u8)seed1;
+		p+=step;
+	}
+	p=memblock;
+	/* run the state machine over the input again */
+	while (*p!=0) {
+		enum CORE_STATE fstate=core_state_transition(&p,track_counts);
+		final_counts[fstate]++;
+#if CORE_DEBUG
+	ee_printf("%d,",fstate);
+	}
+	ee_printf("\n");
+#else
+	}
+#endif
+	p=memblock;
+	while (p < (memblock+blksize)) { /* undo corruption if seed1 and seed2 are equal */
+		if (*p!=',')
+			*p^=(ee_u8)seed2;
+		p+=step;
+	}
+	/* fold the final-state and transition counts into the crc */
+	for (i=0; i<NUM_CORE_STATES; i++) {
+		crc=crcu32(final_counts[i],crc);
+		crc=crcu32(track_counts[i],crc);
+	}
+	return crc;
+}
+
+/* Default initialization patterns */
+static ee_u8 *intpat[4]  ={(ee_u8 *)"5012",(ee_u8 *)"1234",(ee_u8 *)"-874",(ee_u8 *)"+122"};
+static ee_u8 *floatpat[4]={(ee_u8 *)"35.54400",(ee_u8 *)".1234500",(ee_u8 *)"-110.700",(ee_u8 *)"+0.64400"};
+static ee_u8 *scipat[4]  ={(ee_u8 *)"5.500e+3",(ee_u8 *)"-.123e-2",(ee_u8 *)"-87e+832",(ee_u8 *)"+0.6e-12"};
+static ee_u8 *errpat[4]  ={(ee_u8 *)"T0.3e-1F",(ee_u8 *)"-T.T++Tq",(ee_u8 *)"1T3.4e4z",(ee_u8 *)"34.0e-T^"};
+
+/* Function: core_init_state
+	Initialize the input data for the state machine.
+
+	Populate the input with several predetermined strings, interspersed.
+	Actual patterns chosen depend on the seed parameter.
+	
+	Note:
+	The seed parameter MUST be supplied from a source that cannot be determined at compile time
+*/
+void core_init_state(ee_u32 size, ee_s16 seed, ee_u8 *p) {
+	ee_u32 total=0,next=0,i;
+	ee_u8 *buf=0;
+#if CORE_DEBUG
+	ee_u8 *start=p;
+	ee_printf("State: %d,%d\n",size,seed);
+#endif
+	size--;
+	next=0;
+	while ((total+next+1)<size) {
+		if (next>0) {
+			for(i=0;i<next;i++)
+				*(p+total+i)=buf[i];
+			*(p+total+i)=',';
+			total+=next+1;
+		}
+		seed++;
+		switch (seed & 0x7) {
+			case 0: /* int */
+			case 1: /* int */
+			case 2: /* int */
+				buf=intpat[(seed>>3) & 0x3];
+				next=4;
+			break;
+			case 3: /* float */
+			case 4: /* float */
+				buf=floatpat[(seed>>3) & 0x3];
+				next=8;
+			break;
+			case 5: /* scientific */
+			case 6: /* scientific */
+				buf=scipat[(seed>>3) & 0x3];
+				next=8;
+			break;
+			case 7: /* invalid */
+				buf=errpat[(seed>>3) & 0x3];
+				next=8;
+			break;
+			default: /* Never happen, just to make some compilers happy */
+			break;
+		}
+	}
+	size++;
+	while (total<size) { /* fill the rest with 0 */
+		*(p+total)=0;
+		total++;
+	}
+#if CORE_DEBUG
+	ee_printf("State Input: %s\n",start);
+#endif
+}
+
+static ee_u8 ee_isdigit(ee_u8 c) { /* returns 1 if c is an ASCII decimal digit ('0'..'9'), else 0 */
+	ee_u8 retval;
+	retval = ((c>='0') & (c<='9')) ? 1 : 0; /* bitwise & of two 0/1 comparison results: same value as &&, but both sides are always evaluated */
+	return retval;
+}
+
+/* Function: core_state_transition
+	Actual state machine.
+
+	The state machine will continue scanning until either:
+	1 - an invalid input is detected.
+	2 - a valid number has been detected.
+	
+	The input pointer is updated to point to the end of the token, and the end state is returned (either specific format determined or invalid).
+*/
+
+enum CORE_STATE core_state_transition( ee_u8 **instr , ee_u32 *transition_count) {
+	ee_u8 *str=*instr;
+	ee_u8 NEXT_SYMBOL;
+	enum CORE_STATE state=CORE_START;
+	for( ; *str && state != CORE_INVALID; str++ ) {
+		NEXT_SYMBOL = *str;
+		if (NEXT_SYMBOL==',') /* end of this input */ {
+			str++;
+			break;
+		}
+		switch(state) {
+		case CORE_START:
+			if(ee_isdigit(NEXT_SYMBOL)) {
+				state = CORE_INT;
+			}
+			else if( NEXT_SYMBOL == '+' || NEXT_SYMBOL == '-' ) {
+				state = CORE_S1;
+			}
+			else if( NEXT_SYMBOL == '.' ) {
+				state = CORE_FLOAT;
+			}
+			else {
+				state = CORE_INVALID;
+				transition_count[CORE_INVALID]++;
+			}
+			transition_count[CORE_START]++;
+			break;
+		case CORE_S1:
+			if(ee_isdigit(NEXT_SYMBOL)) {
+				state = CORE_INT;
+				transition_count[CORE_S1]++;
+			}
+			else if( NEXT_SYMBOL == '.' ) {
+				state = CORE_FLOAT;
+				transition_count[CORE_S1]++;
+			}
+			else {
+				state = CORE_INVALID;
+				transition_count[CORE_S1]++;
+			}
+			break;
+		case CORE_INT:
+			if( NEXT_SYMBOL == '.' ) {
+				state = CORE_FLOAT;
+				transition_count[CORE_INT]++;
+			}
+			else if(!ee_isdigit(NEXT_SYMBOL)) {
+				state = CORE_INVALID;
+				transition_count[CORE_INT]++;
+			}
+			break;
+		case CORE_FLOAT:
+			if( NEXT_SYMBOL == 'E' || NEXT_SYMBOL == 'e' ) {
+				state = CORE_S2;
+				transition_count[CORE_FLOAT]++;
+			}
+			else if(!ee_isdigit(NEXT_SYMBOL)) {
+				state = CORE_INVALID;
+				transition_count[CORE_FLOAT]++;
+			}
+			break;
+		case CORE_S2:
+			if( NEXT_SYMBOL == '+' || NEXT_SYMBOL == '-' ) {
+				state = CORE_EXPONENT;
+				transition_count[CORE_S2]++;
+			}
+			else {
+				state = CORE_INVALID;
+				transition_count[CORE_S2]++;
+			}
+			break;
+		case CORE_EXPONENT:
+			if(ee_isdigit(NEXT_SYMBOL)) {
+				state = CORE_SCIENTIFIC;
+				transition_count[CORE_EXPONENT]++;
+			}
+			else {
+				state = CORE_INVALID;
+				transition_count[CORE_EXPONENT]++;
+			}
+			break;
+		case CORE_SCIENTIFIC:
+			if(!ee_isdigit(NEXT_SYMBOL)) {
+				state = CORE_INVALID;
+				transition_count[CORE_INVALID]++;
+			}
+			break;
+		default:
+			break;
+		}
+	}
+	*instr=str;
+	return state;
+}
diff --git a/tests/performance/coremark/core_util.c b/tests/performance/coremark/core_util.c
new file mode 100644
index 000000000..581adcc24
--- /dev/null
+++ b/tests/performance/coremark/core_util.c
@@ -0,0 +1,210 @@
+/*
+Copyright 2018 Embedded Microprocessor Benchmark Consortium (EEMBC)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+Original Author: Shay Gal-on
+*/
+
+#include "coremark.h"
+/* Function: get_seed
+	Get a value that cannot be determined at compile time.
+
+	Since different embedded systems and compilers are used, 3 different methods are provided:
+	1 - Using a volatile variable. This method is only valid if the compiler is forced to generate code that
+	reads the value of a volatile variable from memory at run time. 
+	Please note, if using this method, you would need to modify core_portme.c to generate training profile.
+	2 - Command line arguments. This is the preferred method if command line arguments are supported.
+	3 - System function. If none of the first 2 methods is available on the platform,
+	a system function which is not a stub can be used.
+	
+	e.g. read the value on GPIO pins connected to switches, or invoke special simulator functions.
+*/
+#if (SEED_METHOD==SEED_VOLATILE)
+	extern volatile ee_s32 seed1_volatile;
+	extern volatile ee_s32 seed2_volatile;
+	extern volatile ee_s32 seed3_volatile;
+	extern volatile ee_s32 seed4_volatile;
+	extern volatile ee_s32 seed5_volatile;
+	ee_s32 get_seed_32(int i) {
+		ee_s32 retval;
+		switch (i) {
+			case 1:
+				retval=seed1_volatile;
+				break;
+			case 2:
+				retval=seed2_volatile;
+				break;
+			case 3:
+				retval=seed3_volatile;
+				break;
+			case 4:
+				retval=seed4_volatile;
+				break;
+			case 5:
+				retval=seed5_volatile;
+				break;
+			default:
+				retval=0;
+				break;
+		}
+		return retval;
+	}
+#elif (SEED_METHOD==SEED_ARG)
+ee_s32 parseval(char *valstring) {
+	ee_s32 retval=0;
+	ee_s32 neg=1;
+	int hexmode=0;
+	if (*valstring == '-') {
+		neg=-1;
+		valstring++;
+	}
+	if ((valstring[0] == '0') && (valstring[1] == 'x')) {
+		hexmode=1;
+		valstring+=2;
+	}
+		/* first look for digits */
+	if (hexmode) {
+		while (((*valstring >= '0') && (*valstring <= '9')) || ((*valstring >= 'a') && (*valstring <= 'f'))) {
+			ee_s32 digit=*valstring-'0';
+			if (digit>9)
+				digit=10+*valstring-'a';
+			retval*=16;
+			retval+=digit;
+			valstring++;
+		}
+	} else {
+		while ((*valstring >= '0') && (*valstring <= '9')) {
+			ee_s32 digit=*valstring-'0';
+			retval*=10;
+			retval+=digit;
+			valstring++;
+		}
+	}
+	/* now add qualifiers */
+	if (*valstring=='K')
+		retval*=1024;
+	if (*valstring=='M')
+		retval*=1024*1024;
+
+	retval*=neg;
+	return retval;
+}
+
+ee_s32 get_seed_args(int i, int argc, char *argv[]) {
+	if (argc>i)
+		return parseval(argv[i]);
+	return 0;
+}
+
+#elif (SEED_METHOD==SEED_FUNC)
+/* If using OS based function, you must define and implement the functions below in core_portme.h and core_portme.c ! */
+ee_s32 get_seed_32(int i) {
+	ee_s32 retval;
+	switch (i) {
+		case 1:
+			retval=portme_sys1();
+			break;
+		case 2:
+			retval=portme_sys2();
+			break;
+		case 3:
+			retval=portme_sys3();
+			break;
+		case 4:
+			retval=portme_sys4();
+			break;
+		case 5:
+			retval=portme_sys5();
+			break;
+		default:
+			retval=0;
+			break;
+	}
+	return retval;
+}
+#endif
+
+/* Function: crc*
+	Service functions to calculate 16b CRC code.
+
+*/
+ee_u16 crcu8(ee_u8 data, ee_u16 crc )
+{
+	ee_u8 i=0,x16=0,carry=0;
+
+	for (i = 0; i < 8; i++)
+    {
+		x16 = (ee_u8)((data & 1) ^ ((ee_u8)crc & 1));
+		data >>= 1;
+
+		if (x16 == 1)
+		{
+		   crc ^= 0x4002;
+		   carry = 1;
+		}
+		else 
+			carry = 0;
+		crc >>= 1;
+		if (carry)
+		   crc |= 0x8000;
+		else
+		   crc &= 0x7fff;
+    }
+	return crc;
+} 
+ee_u16 crcu16(ee_u16 newval, ee_u16 crc) {
+	crc=crcu8( (ee_u8) (newval)				,crc);
+	crc=crcu8( (ee_u8) ((newval)>>8)	,crc);
+	return crc;
+}
+ee_u16 crcu32(ee_u32 newval, ee_u16 crc) {
+	crc=crc16((ee_s16) newval		,crc);
+	crc=crc16((ee_s16) (newval>>16)	,crc);
+	return crc;
+}
+ee_u16 crc16(ee_s16 newval, ee_u16 crc) {
+	return crcu16((ee_u16)newval, crc);
+}
+
+ee_u8 check_data_types() {
+	ee_u8 retval=0;
+	if (sizeof(ee_u8) != 1) {
+		ee_printf("ERROR: ee_u8 is not an 8b datatype!\n");
+		retval++;
+	}
+	if (sizeof(ee_u16) != 2) {
+		ee_printf("ERROR: ee_u16 is not a 16b datatype!\n");
+		retval++;
+	}
+	if (sizeof(ee_s16) != 2) {
+		ee_printf("ERROR: ee_s16 is not a 16b datatype!\n");
+		retval++;
+	}
+	if (sizeof(ee_s32) != 4) {
+		ee_printf("ERROR: ee_s32 is not a 32b datatype!\n");
+		retval++;
+	}
+	if (sizeof(ee_u32) != 4) {
+		ee_printf("ERROR: ee_u32 is not a 32b datatype!\n");
+		retval++;
+	}
+	if (sizeof(ee_ptr_int) != sizeof(int *)) {
+		ee_printf("ERROR: ee_ptr_int is not a datatype that holds an int pointer!\n");
+		retval++;
+	}
+	if (retval>0) {
+		ee_printf("ERROR: Please modify the datatypes in core_portme.h!\n");
+	}
+	return retval;
+}
diff --git a/tests/performance/coremark/coremark.h b/tests/performance/coremark/coremark.h
new file mode 100644
index 000000000..dc9f8c7ae
--- /dev/null
+++ b/tests/performance/coremark/coremark.h
@@ -0,0 +1,174 @@
+/*
+Copyright 2018 Embedded Microprocessor Benchmark Consortium (EEMBC)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+Original Author: Shay Gal-on
+*/
+
+/* Topic: Description
+	This file contains  declarations of the various benchmark functions.
+*/
+
+/* Configuration: TOTAL_DATA_SIZE
+	Define total size for data algorithms will operate on
+*/
+#ifndef TOTAL_DATA_SIZE 
+#define TOTAL_DATA_SIZE 2*1000
+#endif
+
+#define SEED_ARG 0
+#define SEED_FUNC 1
+#define SEED_VOLATILE 2
+
+#define MEM_STATIC 0
+#define MEM_MALLOC 1
+#define MEM_STACK 2
+
+#include "core_portme.h"
+
+#if HAS_STDIO
+#include <stdio.h>
+#endif
+#if HAS_PRINTF
+#define ee_printf printf
+#endif
+
+/* Actual benchmark execution in iterate */
+void *iterate(void *pres);
+
+/* Typedef: secs_ret
+	For machines that have floating point support, get number of seconds as a double. 
+	Otherwise an unsigned int.
+*/
+#if HAS_FLOAT
+typedef double secs_ret;
+#else
+typedef ee_u32 secs_ret;
+#endif
+
+#if MAIN_HAS_NORETURN
+#define MAIN_RETURN_VAL 
+#define MAIN_RETURN_TYPE void
+#else
+#define MAIN_RETURN_VAL 0
+#define MAIN_RETURN_TYPE int
+#endif 
+
+void start_time(void);
+void stop_time(void);
+CORE_TICKS get_time(void);
+secs_ret time_in_secs(CORE_TICKS ticks);
+
+/* Misc useful functions */
+ee_u16 crcu8(ee_u8 data, ee_u16 crc);
+ee_u16 crc16(ee_s16 newval, ee_u16 crc);
+ee_u16 crcu16(ee_u16 newval, ee_u16 crc);
+ee_u16 crcu32(ee_u32 newval, ee_u16 crc);
+ee_u8 check_data_types();
+void *portable_malloc(ee_size_t size);
+void portable_free(void *p);
+ee_s32 parseval(char *valstring);
+
+/* Algorithm IDS */
+#define ID_LIST 	(1<<0)
+#define ID_MATRIX 	(1<<1)
+#define ID_STATE 	(1<<2)
+#define ALL_ALGORITHMS_MASK (ID_LIST|ID_MATRIX|ID_STATE)
+#define NUM_ALGORITHMS 3
+
+/* list data structures */
+typedef struct list_data_s {
+	ee_s16 data16;
+	ee_s16 idx;
+} list_data;
+
+typedef struct list_head_s {
+	struct list_head_s *next;
+	struct list_data_s *info;
+} list_head;
+
+
+/*matrix benchmark related stuff */
+#define MATDAT_INT 1
+#if MATDAT_INT
+typedef ee_s16 MATDAT;
+typedef ee_s32 MATRES;
+#else
+typedef ee_f16 MATDAT;
+typedef ee_f32 MATRES;
+#endif
+
+typedef struct MAT_PARAMS_S {
+	int N;
+	MATDAT *A;
+	MATDAT *B;
+	MATRES *C;
+} mat_params;
+
+/* state machine related stuff */
+/* List of all the possible states for the FSM */
+typedef enum CORE_STATE {
+	CORE_START=0,
+	CORE_INVALID,
+	CORE_S1,
+	CORE_S2,
+	CORE_INT,
+	CORE_FLOAT,
+	CORE_EXPONENT,
+	CORE_SCIENTIFIC,
+	NUM_CORE_STATES
+} core_state_e ;
+
+		
+/* Helper structure to hold results */
+typedef struct RESULTS_S {
+	/* inputs */
+	ee_s16	seed1;		/* Initializing seed */
+	ee_s16	seed2;		/* Initializing seed */
+	ee_s16	seed3;		/* Initializing seed */
+	void	*memblock[4];	/* Pointer to safe memory location */
+	ee_u32	size;		/* Size of the data */
+	ee_u32 iterations;		/* Number of iterations to execute */
+	ee_u32	execs;		/* Bitmask of operations to execute */
+	struct list_head_s *list;
+	mat_params mat;
+	/* outputs */
+	ee_u16	crc;
+	ee_u16	crclist;
+	ee_u16	crcmatrix;
+	ee_u16	crcstate;
+	ee_s16	err;
+	/* multithread specific */
+	core_portable port;	/* port-defined per-context data, see core_portable in core_portme.h */
+} core_results;
+
+/* Multicore execution handling */
+#if (MULTITHREAD>1)
+ee_u8 core_start_parallel(core_results *res);
+ee_u8 core_stop_parallel(core_results *res);
+#endif
+
+/* list benchmark functions */
+list_head *core_list_init(ee_u32 blksize, list_head *memblock, ee_s16 seed);
+ee_u16 core_bench_list(core_results *res, ee_s16 finder_idx);
+
+/* state benchmark functions */
+void core_init_state(ee_u32 size, ee_s16 seed, ee_u8 *p);
+ee_u16 core_bench_state(ee_u32 blksize, ee_u8 *memblock, 
+		ee_s16 seed1, ee_s16 seed2, ee_s16 step, ee_u16 crc);
+
+/* matrix benchmark functions */
+ee_u32 core_init_matrix(ee_u32 blksize, void *memblk, ee_s32 seed, mat_params *p);
+ee_u16 core_bench_matrix(mat_params *p, ee_s16 seed, ee_u16 crc);
+
diff --git a/tests/performance/coremark/coremark.ino b/tests/performance/coremark/coremark.ino
new file mode 100644
index 000000000..776db7874
--- /dev/null
+++ b/tests/performance/coremark/coremark.ino
@@ -0,0 +1,118 @@
+/*
+    CoreMark benchmark for ESP32 using Arduino's C++ environment with multithreading support.
+
+    Based on https://github.com/PaulStoffregen/CoreMark/tree/master
+    Modified to run on ESP32 by Lucas Saavedra Vaz, 2024.
+*/
+
+#include <Arduino.h>
+#include <stdarg.h>
+
+#include <esp_task_wdt.h>
+
+// Timeout for the task watchdog timer
+#define TWDT_TIMEOUT_S 20
+
+// Number of runs to average
+#define N_RUNS 3
+
+// A way to call the C-only coremark function from Arduino's C++ environment
+extern "C" int coremark_main(void);
+
+void setup() {  // runs the whole test once: prints the header, then N_RUNS CoreMark runs
+  Serial.begin(115200);
+  while (!Serial) {
+    delay(10);  // poll until the serial port reports ready
+  }
+
+  // To avoid the watchdog timer from resetting the ESP32 while running CoreMark we
+  // need to reconfigure it to have a longer timeout.
+  esp_task_wdt_config_t config = {
+    .timeout_ms = TWDT_TIMEOUT_S * 1000,
+    .idle_core_mask = 0,
+    .trigger_panic = false,
+  };
+
+  esp_task_wdt_reconfigure(&config);
+
+  log_d("Starting CoreMark test");
+  Serial.printf("Runs: %d\n", N_RUNS);  // "Runs:", "Cores:" and "Run <i>" are parsed by test_coremark.py
+  Serial.printf("Cores: %d\n", CONFIG_SOC_CPU_CORES_NUM);
+  Serial.flush();
+  for (int i = 0; i < N_RUNS; i++) {
+    Serial.printf("Run %d\n", i);  // end the line so the index is not glued to CoreMark's own output
+    coremark_main();
+    Serial.flush();
+  }
+  log_d("CoreMark test finished");
+}
+
+void loop() {
+  vTaskDelete(NULL);  // all work is done in setup(); delete the calling task instead of looping
+}
+
+// CoreMark calls this function to print results. It is a minimal printf replacement:
+// supports %%, %s, %c, %f and %d/%u/%x (optionally prefixed with 'l'); a '-' flag, width
+// and precision are accepted but ignored. Unknown conversions print nothing. Returns 1.
+extern "C" int ee_printf(const char *format, ...) {
+  va_list args;
+  va_start(args, format);
+
+  for (; *format; format++) {
+    if (*format != '%') {
+      if (*format == '\n') {
+        Serial.print('\r');  // send CR before every LF
+      }
+      Serial.print(*format);
+      continue;
+    }
+
+    bool islong = false;
+    format++;
+    if (*format == '%') {
+      Serial.print(*format);  // "%%" prints a literal percent sign
+      continue;
+    }
+    // Skip what this implementation does not honour: '-' flag, width and precision.
+    while (*format == '-' || *format == '.' || (*format >= '0' && *format <= '9')) {
+      format++;
+    }
+    if (*format == 'l') {
+      islong = true;
+      format++;
+    }
+    if (*format == '\0') {
+      break;  // format string ended in the middle of a conversion
+    }
+
+    if (*format == 's') {
+      // Fetch the argument as a pointer; reading it as int is only correct by accident
+      // on targets where int and pointers have the same size.
+      Serial.print(va_arg(args, const char *));
+    } else if (*format == 'f') {
+      Serial.print(va_arg(args, double));
+    } else if (*format == 'c') {
+      Serial.print((char)va_arg(args, int));  // print the character, not its numeric code
+    } else if (*format == 'd') {
+      if (islong) {
+        Serial.print(va_arg(args, long));
+      } else {
+        Serial.print(va_arg(args, int));
+      }
+    } else if (*format == 'u' || *format == 'x') {
+      const int base = (*format == 'x') ? HEX : DEC;
+      if (islong) {
+        Serial.print(va_arg(args, unsigned long), base);
+      } else {
+        Serial.print(va_arg(args, unsigned int), base);
+      }
+    }
+  }
+  va_end(args);
+  return 1;
+}
+
+// C-callable wrapper around millis(); CoreMark calls this function to measure elapsed time
+extern "C" uint32_t Arduino_millis(void) {
+  return millis();
+}
diff --git a/tests/performance/coremark/test_coremark.py b/tests/performance/coremark/test_coremark.py
new file mode 100644
index 000000000..befd7c3a1
--- /dev/null
+++ b/tests/performance/coremark/test_coremark.py
@@ -0,0 +1,58 @@
+import json
+import logging
+import os
+
+
+def test_coremark(dut, request):
+    """Parse the CoreMark sketch's serial output, validate it and save the average as JSON."""
+    LOGGER = logging.getLogger(__name__)
+
+    # Match "Runs: %d"
+    res = dut.expect(r"Runs: (\d+)", timeout=60)
+    runs = int(res.group(1).decode("utf-8"))
+    LOGGER.info("Number of runs: {}".format(runs))
+    assert runs > 0, "Invalid number of runs"
+
+    # Match "Cores: %d"
+    res = dut.expect(r"Cores: (\d+)", timeout=60)
+    cores = int(res.group(1).decode("utf-8"))
+    LOGGER.info("Number of cores: {}".format(cores))
+    assert cores > 0, "Invalid number of cores"
+
+    total_score = 0
+
+    for i in range(runs):
+        # Match "Run %d"
+        res = dut.expect(r"Run (\d+)", timeout=120)
+        run = int(res.group(1).decode("utf-8"))
+        LOGGER.info("Run {}".format(run))
+        assert run == i, "Invalid run number"
+
+        # Match "CoreMark 1.0 : %f" (literal dot escaped, score captured as one group)
+        res = dut.expect(r"CoreMark 1\.0 : (\d+\.\d+)", timeout=120)
+        score = float(res.group(1).decode("utf-8"))
+        LOGGER.info("CoreMark score: {}".format(score))
+        assert 0 < score < 10000, "Impossible CoreMark score"
+        total_score += score
+
+    avg_score = round(total_score / runs, 2)
+    LOGGER.info("Average CoreMark score: {}".format(avg_score))
+    assert 0 < avg_score < 10000, "Impossible CoreMark score"
+
+    # Create JSON with results and write it to file
+    # Always create a JSON with this format (so it can be merged later on):
+    # { TEST_NAME_STR: TEST_RESULTS_DICT }
+    results = {"coremark": {"runs": runs, "cores": cores, "avg_score": avg_score}}
+
+    current_folder = os.path.dirname(request.path)
+    file_index = 0  # use the first "result_coremark<N>.json" that does not exist yet
+    report_file = os.path.join(current_folder, "result_coremark{}.json".format(file_index))
+    while os.path.exists(report_file):
+        file_index += 1
+        report_file = os.path.join(current_folder, "result_coremark{}.json".format(file_index))
+
+    with open(report_file, "w") as f:
+        try:
+            f.write(json.dumps(results))
+        except Exception as e:
+            LOGGER.warning("Failed to write results to file: {}".format(e))
diff --git a/tests/performance/fibonacci/fibonacci.ino b/tests/performance/fibonacci/fibonacci.ino
new file mode 100644
index 000000000..01fd6f7be
--- /dev/null
+++ b/tests/performance/fibonacci/fibonacci.ino
@@ -0,0 +1,48 @@
+/*
+  Fibonacci calculation test for Arduino and ESP32.
+  Created by Lucas Saavedra Vaz, 2024
+*/
+
+#include <Arduino.h>
+
+// Number of runs to average
+#define N_RUNS 3
+
+// Fibonacci number to calculate. Keep between 35 and 44 (test_fibonacci.py only lists fib(35)..fib(44)).
+#define FIB_N 40
+
+uint64_t fib(uint32_t n) {  // returns the n-th Fibonacci number (fib(0) = 0, fib(1) = 1)
+  if (n < 2) {
+    return n;
+  }
+  return fib(n - 1) + fib(n - 2);  // plain double recursion, no memoization; presumably deliberate so the run time is measurable
+}
+
+void setup() {  // runs the whole test once: prints the header, then N_RUNS timed fib(FIB_N) calls
+  uint64_t fibonacci;
+
+  Serial.begin(115200);
+  while (!Serial) {
+    delay(10);  // poll until the serial port reports ready
+  }
+
+  log_d("Starting fibonacci calculation");
+  Serial.printf("Runs: %d\n", N_RUNS);  // "Runs:", "N:", "Run <i>", "Fibonacci(N):" and "Time:" are parsed by test_fibonacci.py
+  Serial.printf("N: %d\n", FIB_N);
+  Serial.flush();
+  for (int i = 0; i < N_RUNS; i++) {
+    Serial.printf("Run %d\n", i);  // end the line so the index is not glued to the "Fibonacci(N)" line
+    unsigned long start = millis();
+    fibonacci = fib(FIB_N);
+    unsigned long elapsed = millis() - start;  // only the fib() call is timed, in milliseconds
+    Serial.printf("Fibonacci(N): %llu\n", fibonacci);
+    Serial.printf("Time: %lu.%03lu s\n", elapsed / 1000, elapsed % 1000);
+    Serial.flush();
+  }
+
+  log_d("Fibonacci calculation test done");
+}
+
+void loop() {
+  vTaskDelete(NULL);  // all work is done in setup(); delete the calling task instead of looping
+}
diff --git a/tests/performance/fibonacci/test_fibonacci.py b/tests/performance/fibonacci/test_fibonacci.py
new file mode 100644
index 000000000..622ea77ee
--- /dev/null
+++ b/tests/performance/fibonacci/test_fibonacci.py
@@ -0,0 +1,78 @@
+import json
+import logging
+import os
+
+
+def test_fibonacci(dut, request):
+    LOGGER = logging.getLogger(__name__)
+
+    # Fibonacci results starting from fib(35) to fib(45)
+    fib_results = [
+        9227465,
+        14930352,
+        24157817,
+        39088169,
+        63245986,
+        102334155,
+        165580141,
+        267914296,
+        433494437,
+        701408733,
+    ]
+
+    # Match "Runs: %d"
+    res = dut.expect(r"Runs: (\d+)", timeout=60)
+    runs = int(res.group(0).decode("utf-8").split(" ")[1])
+    LOGGER.info("Number of runs: {}".format(runs))
+    assert runs > 0, "Invalid number of runs"
+
+    # Match "N: %d"
+    res = dut.expect(r"N: (\d+)", timeout=300)
+    fib_n = int(res.group(0).decode("utf-8").split(" ")[1])
+    LOGGER.info("Calculating Fibonacci({})".format(fib_n))
+    assert fib_n > 30 and fib_n < 50, "Invalid Fibonacci number"
+
+    list_time = []
+
+    for i in range(runs):
+        # Match "Run %d"
+        res = dut.expect(r"Run (\d+)", timeout=120)
+        run = int(res.group(0).decode("utf-8").split(" ")[1])
+        LOGGER.info("Run {}".format(run))
+        assert run == i, "Invalid run number"
+
+        # Match "Fibonacci(N): %llu"
+        res = dut.expect(r"Fibonacci\(N\): (\d+)", timeout=300)
+        fib_result = int(res.group(0).decode("utf-8").split(" ")[1])
+        LOGGER.info("Fibonacci({}) = {}".format(fib_n, fib_result))
+        assert fib_result > 0, "Invalid Fibonacci result"
+
+        # Check if the result is correct
+        assert fib_result == fib_results[fib_n - 35]
+
+        # Match "Time: %lu.%03lu s"
+        res = dut.expect(r"Time: (\d+)\.(\d+) s", timeout=300)
+        time = float(res.group(0).decode("utf-8").split(" ")[1])
+        LOGGER.info("Time on run {}: {} s".format(i, time))
+        assert time > 0 and time < 1000, "Invalid time"
+        list_time.append(time)
+
+    avg_time = round(sum(list_time) / len(list_time), 3)
+
+    # Create JSON with results and write it to file
+    # Always create a JSON with this format (so it can be merged later on):
+    # { TEST_NAME_STR: TEST_RESULTS_DICT }
+    results = {"fibonacci": {"runs": runs, "fib_n": fib_n, "avg_time": avg_time}}
+
+    current_folder = os.path.dirname(request.path)
+    file_index = 0
+    report_file = os.path.join(current_folder, "result_fibonacci" + str(file_index) + ".json")
+    while os.path.exists(report_file):
+        report_file = report_file.replace(str(file_index) + ".json", str(file_index + 1) + ".json")
+        file_index += 1
+
+    with open(report_file, "w") as f:
+        try:
+            f.write(json.dumps(results))
+        except Exception as e:
+            LOGGER.warning("Failed to write results to file: {}".format(e))
diff --git a/tests/performance/psramspeed/.skip.esp32c3 b/tests/performance/psramspeed/.skip.esp32c3
new file mode 100644
index 000000000..e69de29bb
diff --git a/tests/performance/psramspeed/.skip.esp32c6 b/tests/performance/psramspeed/.skip.esp32c6
new file mode 100644
index 000000000..e69de29bb
diff --git a/tests/performance/psramspeed/.skip.esp32h2 b/tests/performance/psramspeed/.skip.esp32h2
new file mode 100644
index 000000000..e69de29bb
diff --git a/tests/performance/psramspeed/psramspeed.ino b/tests/performance/psramspeed/psramspeed.ino
new file mode 100644
index 000000000..81175e6b3
--- /dev/null
+++ b/tests/performance/psramspeed/psramspeed.ino
@@ -0,0 +1,266 @@
+/*
+  Based on the ramspeed test from NuttX.
+  https://github.com/apache/nuttx-apps/blob/master/benchmarks/ramspeed/ramspeed_main.c
+  Modified for Arduino and ESP32 by Lucas Saavedra Vaz, 2024
+*/
+
+#include <Arduino.h>
+
+// Test settings
+
+// Number of runs to average
+#define N_RUNS 3
+
+// Value to fill the memory with
+#define FILL_VALUE 0x00
+
+// Number of copies to be performed in each test
+#define N_COPIES 400
+
+// Start size for the tests. Value must be a power of 2.
+// Values lower than or equal to 32 KB may cause the operations to use the cache instead of the PSRAM.
+#define START_SIZE 65536
+
+// Max size to be copied. Must not be smaller than START_SIZE and it will be floored to the nearest power of 2
+#define MAX_TEST_SIZE (512 * 1024)  // 512KB; parenthesised so the macro expands safely inside larger expressions
+
+// Implementation macros
+
+#if defined(UINTPTR_MAX) && UINTPTR_MAX > 0xFFFFFFFF
+#define MEM_UNIT   uint64_t
+#define ALIGN_MASK 0x7
+#else
+#define MEM_UNIT   uint32_t
+#define ALIGN_MASK 0x3
+#endif
+
+#define COPY32 \
+  *d32 = *s32; \
+  d32++;       \
+  s32++;
+#define COPY8 \
+  *d8 = *s8;  \
+  d8++;       \
+  s8++;
+#define SET32(x) \
+  *d32 = x;      \
+  d32++;
+#define SET8(x) \
+  *d8 = x;      \
+  d8++;
+#define REPEAT8(expr) expr expr expr expr expr expr expr expr
+
+/* Functions */
+
+static void *mock_memcpy(void *dst, const void *src, size_t len) {  // hand-written memcpy() stand-in; returns dst
+  uint8_t *d8 = (uint8_t *)dst;
+  const uint8_t *s8 = (uint8_t *)src;
+
+  uintptr_t d_align = (uintptr_t)d8 & ALIGN_MASK;  // offset of dst inside an aligned unit
+  uintptr_t s_align = (uintptr_t)s8 & ALIGN_MASK;  // offset of src inside an aligned unit
+  uint32_t *d32;
+  const uint32_t *s32;
+
+  /* Different offsets: the two pointers cannot be aligned together, so copy byte by byte */
+
+  if (s_align != d_align) {
+    while (len > 32) {  // unrolled: 4 x 8 single-byte copies per iteration
+      REPEAT8(COPY8);
+      REPEAT8(COPY8);
+      REPEAT8(COPY8);
+      REPEAT8(COPY8);
+      len -= 32;
+    }
+
+    while (len) {  // remaining 0..32 bytes
+      COPY8;
+      len--;
+    }
+
+    return dst;
+  }
+
+  /* Same offset: copy single bytes until both pointers reach an aligned address */
+
+  if (d_align) {
+    d_align = ALIGN_MASK + 1 - d_align;  // number of bytes up to the next aligned address
+    while (d_align && len) {
+      COPY8;
+      d_align--;
+      len--;
+    }
+  }
+
+  d32 = (uint32_t *)d8;
+  s32 = (uint32_t *)s8;
+  while (len > 32) {  // unrolled: 8 x 32-bit copies (32 bytes) per iteration
+    REPEAT8(COPY32);
+    len -= 32;
+  }
+
+  while (len > 4) {  // one 32-bit word at a time; the strict '>' leaves up to 4 bytes for the loop below
+    COPY32;
+    len -= 4;
+  }
+
+  d8 = (uint8_t *)d32;
+  s8 = (const uint8_t *)s32;
+  while (len) {  // trailing bytes
+    COPY8;
+    len--;
+  }
+
+  return dst;
+}
+
+static void mock_memset(void *dst, uint8_t v, size_t len) {  // hand-written memset() stand-in: fills len bytes with v
+  uint8_t *d8 = (uint8_t *)dst;
+  uintptr_t d_align = (uintptr_t)d8 & ALIGN_MASK;  // offset of dst inside an aligned unit
+  uint32_t v32;
+  uint32_t *d32;
+
+  /* Write single bytes until the destination reaches an aligned address */
+
+  if (d_align) {
+    d_align = ALIGN_MASK + 1 - d_align;  // number of bytes up to the next aligned address
+    while (d_align && len) {
+      SET8(v);
+      len--;
+      d_align--;
+    }
+  }
+
+  v32 = (uint32_t)v + ((uint32_t)v << 8) + ((uint32_t)v << 16) + ((uint32_t)v << 24);  // v replicated into all 4 bytes
+
+  d32 = (uint32_t *)d8;
+
+  while (len > 32) {  // unrolled: 8 x 32-bit stores (32 bytes) per iteration
+    REPEAT8(SET32(v32));
+    len -= 32;
+  }
+
+  while (len > 4) {  // one 32-bit word at a time; the strict '>' leaves up to 4 bytes for the loop below
+    SET32(v32);
+    len -= 4;
+  }
+
+  d8 = (uint8_t *)d32;
+  while (len) {  // trailing bytes
+    SET8(v);
+    len--;
+  }
+}
+
+static void print_rate(const char *name, uint64_t bytes, uint32_t cost_time) {
+  // A zero duration cannot be converted into a rate: report it instead of dividing by zero.
+  if (cost_time == 0) {
+    Serial.println("Error: Too little time taken, please increase N_COPIES");
+    return;
+  }
+  // bytes per ms -> bytes per s -> KB/s; computed in 64 bits, then narrowed to 32 bits for printing
+  const uint32_t rate = bytes * 1000 / cost_time / 1024;
+  Serial.printf("%s Rate = %" PRIu32 " KB/s Time: %" PRIu32 " ms\n", name, rate, cost_time);
+}
+
+static void memcpy_speed_test(void *dest, const void *src, size_t size, uint32_t repeat_cnt) {  // times memcpy() vs mock_memcpy()
+  uint32_t start_time;
+  uint32_t cost_time_system;
+  uint32_t cost_time_mock;
+  uint32_t cnt;
+  uint32_t step;
+  uint64_t total_size;
+
+  for (step = START_SIZE; step <= size; step <<= 1) {  // copy size doubles each pass, up to and including size
+    total_size = (uint64_t)step * (uint64_t)repeat_cnt;  // bytes moved by each timed loop, widened before multiplying
+
+    Serial.printf("Memcpy %" PRIu32 " Bytes test\n", step);
+
+    start_time = millis();
+
+    for (cnt = 0; cnt < repeat_cnt; cnt++) {
+      memcpy(dest, src, step);
+    }
+
+    cost_time_system = millis() - start_time;  // elapsed ms for repeat_cnt library memcpy() calls
+
+    start_time = millis();
+
+    for (cnt = 0; cnt < repeat_cnt; cnt++) {
+      mock_memcpy(dest, src, step);
+    }
+
+    cost_time_mock = millis() - start_time;  // elapsed ms for repeat_cnt mock_memcpy() calls
+
+    print_rate("System memcpy():", total_size, cost_time_system);
+    print_rate("Mock memcpy():", total_size, cost_time_mock);
+  }
+}
+
+static void memset_speed_test(void *dest, uint8_t value, size_t size, uint32_t repeat_num) {  // times memset() vs mock_memset()
+  uint32_t start_time;
+  uint32_t cost_time_system;
+  uint32_t cost_time_mock;
+  uint32_t cnt;
+  uint32_t step;
+  uint64_t total_size;
+
+  for (step = START_SIZE; step <= size; step <<= 1) {  // fill size doubles each pass, up to and including size
+    total_size = (uint64_t)step * (uint64_t)repeat_num;  // bytes written by each timed loop, widened before multiplying
+
+    Serial.printf("Memset %" PRIu32 " Bytes test\n", step);
+
+    start_time = millis();
+
+    for (cnt = 0; cnt < repeat_num; cnt++) {
+      memset(dest, value, step);
+    }
+
+    cost_time_system = millis() - start_time;  // elapsed ms for repeat_num library memset() calls
+
+    start_time = millis();
+
+    for (cnt = 0; cnt < repeat_num; cnt++) {
+      mock_memset(dest, value, step);
+    }
+
+    cost_time_mock = millis() - start_time;  // elapsed ms for repeat_num mock_memset() calls
+
+    print_rate("System memset():", total_size, cost_time_system);
+    print_rate("Mock memset():", total_size, cost_time_mock);
+  }
+}
+
+/* Main */
+
+// Allocates two buffers with ps_malloc(), prints the test parameters and runs both speed tests N_RUNS times.
+void setup() {
+  Serial.begin(115200);
+  while (!Serial) {
+    delay(10);
+  }
+
+  void *dest = ps_malloc(MAX_TEST_SIZE);
+  const void *src = ps_malloc(MAX_TEST_SIZE);  // never written: only the copy speed matters, not the contents
+  if (!dest || !src) {
+    Serial.println("Memory allocation failed");
+    return;
+  }
+
+  log_d("Starting PSRAM speed test");
+  Serial.printf("Runs: %d\n", N_RUNS);
+  Serial.printf("Copies: %d\n", N_COPIES);
+  Serial.printf("Max test size: %d\n", MAX_TEST_SIZE);
+  Serial.flush();
+  for (int i = 0; i < N_RUNS; i++) {
+    Serial.printf("Run %d\n", i);  // newline keeps the run header from fusing with the first test line
+    memcpy_speed_test(dest, src, MAX_TEST_SIZE, N_COPIES);
+    Serial.flush();
+    memset_speed_test(dest, FILL_VALUE, MAX_TEST_SIZE, N_COPIES);
+    Serial.flush();
+  }
+  log_d("PSRAM speed test done");
+}
+
+void loop() {
+  vTaskDelete(NULL);  // all work happens in setup(); NULL asks FreeRTOS to delete the calling task
+}
diff --git a/tests/performance/psramspeed/test_psramspeed.py b/tests/performance/psramspeed/test_psramspeed.py
new file mode 100644
index 000000000..8d0515807
--- /dev/null
+++ b/tests/performance/psramspeed/test_psramspeed.py
@@ -0,0 +1,105 @@
+import json
+import logging
+import os
+
+from collections import defaultdict
+
+
+def test_psramspeed(dut, request):
+    LOGGER = logging.getLogger(__name__)
+
+    runs_results = []
+
+    # Match "Runs: %d"
+    res = dut.expect(r"Runs: (\d+)", timeout=60)
+    runs = int(res.group(0).decode("utf-8").split(" ")[1])
+    LOGGER.info("Number of runs: {}".format(runs))
+    assert runs > 0, "Invalid number of runs"
+
+    # Match "Copies: %d"
+    res = dut.expect(r"Copies: (\d+)", timeout=60)
+    copies = int(res.group(0).decode("utf-8").split(" ")[1])
+    LOGGER.info("Number of copies in each test: {}".format(copies))
+    assert copies > 0, "Invalid number of copies"
+
+    # Match "Max test size: %lu"
+    res = dut.expect(r"Max test size: (\d+)", timeout=60)
+    max_test_size = int(res.group(0).decode("utf-8").split(" ")[3])
+    LOGGER.info("Max test size: {}".format(max_test_size))
+    assert max_test_size > 0, "Invalid max test size"
+
+    for i in range(runs):
+        # Match "Run %d"
+        res = dut.expect(r"Run (\d+)", timeout=120)
+        run = int(res.group(0).decode("utf-8").split(" ")[1])
+        LOGGER.info("Run {}".format(run))
+        assert run == i, "Invalid run number"
+
+        for j in range(2):
+            while True:
+                # Match "Memcpy/Memtest %d Bytes test"
+                res = dut.expect(r"(Memcpy|Memset) (\d+) Bytes test", timeout=60)
+                current_test = res.group(0).decode("utf-8").split(" ")[0].lower()
+                current_test_size = int(res.group(0).decode("utf-8").split(" ")[1])
+                LOGGER.info("Current {} test size: {}".format(current_test, current_test_size))
+                assert current_test_size > 0, "Invalid test size"
+
+                for k in range(2):
+                    # Match "System/Mock memcpy/memtest(): Rate = %d KB/s Time: %d ms" or "Error: %s"
+                    res = dut.expect(
+                        r"((System|Mock) (memcpy|memset)\(\): Rate = (\d+) KB/s Time: (\d+) ms|^Error)", timeout=90
+                    )
+                    implementation = res.group(0).decode("utf-8").split(" ")[0].lower()
+                    assert implementation != "error:", "Error detected in test output"
+                    test_type = res.group(0).decode("utf-8").split(" ")[1].lower()[:-3]
+                    rate = int(res.group(0).decode("utf-8").split(" ")[4])
+                    time = int(res.group(0).decode("utf-8").split(" ")[7])
+                    assert rate > 0, "Invalid rate"
+                    assert time > 0, "Invalid time"
+                    assert test_type == current_test, "Missing test output"
+                    LOGGER.info("{} {}: Rate = {} KB/s. Time = {} ms".format(implementation, test_type, rate, time))
+
+                    runs_results.append(((current_test, str(current_test_size), implementation), (rate, time)))
+
+                if current_test_size == max_test_size:
+                    break
+
+            LOGGER.info("=============================================================")
+
+    # Calculate average rate and time for each test size
+    sums = defaultdict(lambda: {"rate_sum": 0, "time_sum": 0})
+
+    for (test, size, impl), (rate, time) in runs_results:
+        sums[(test, size, impl)]["rate_sum"] += rate
+        sums[(test, size, impl)]["time_sum"] += time
+
+    avg_results = {}
+    for (test, size, impl) in sums:
+        rate_avg = round(sums[(test, size, impl)]["rate_sum"] / runs, 2)
+        time_avg = round(sums[(test, size, impl)]["time_sum"] / runs, 2)
+        LOGGER.info(
+            "Test: {}-{}-{}: Average rate = {} KB/s. Average time = {} ms".format(test, size, impl, rate_avg, time_avg)
+        )
+        if test not in avg_results:
+            avg_results[test] = {}
+        if size not in avg_results[test]:
+            avg_results[test][size] = {}
+        avg_results[test][size][impl] = {"avg_rate": rate_avg, "avg_time": time_avg}
+
+    # Create JSON with results and write it to file
+    # Always create a JSON with this format (so it can be merged later on):
+    # { TEST_NAME_STR: TEST_RESULTS_DICT }
+    results = {"psramspeed": {"runs": runs, "copies": copies, "max_test_size": max_test_size, "results": avg_results}}
+
+    current_folder = os.path.dirname(request.path)
+    file_index = 0
+    report_file = os.path.join(current_folder, "result_psramspeed" + str(file_index) + ".json")
+    while os.path.exists(report_file):
+        report_file = report_file.replace(str(file_index) + ".json", str(file_index + 1) + ".json")
+        file_index += 1
+
+    with open(report_file, "w") as f:
+        try:
+            f.write(json.dumps(results))
+        except Exception as e:
+            LOGGER.warning("Failed to write results to file: {}".format(e))
diff --git a/tests/performance/ramspeed/cfg.json b/tests/performance/ramspeed/cfg.json
new file mode 100644
index 000000000..dc67d4016
--- /dev/null
+++ b/tests/performance/ramspeed/cfg.json
@@ -0,0 +1,40 @@
+{
+  "targets": [
+    {
+      "name": "esp32",
+      "fqbn":[
+        "espressif:esp32:esp32:PSRAM=disabled,PartitionScheme=huge_app"
+      ]
+    },
+    {
+      "name": "esp32s2",
+      "fqbn": [
+        "espressif:esp32:esp32s2:PSRAM=disabled,PartitionScheme=huge_app"
+      ]
+    },
+    {
+      "name": "esp32c3",
+      "fqbn": [
+        "espressif:esp32:esp32c3:PartitionScheme=huge_app"
+      ]
+    },
+    {
+      "name": "esp32s3",
+      "fqbn": [
+        "espressif:esp32:esp32s3:PSRAM=disabled,USBMode=default,PartitionScheme=huge_app"
+      ]
+    },
+    {
+      "name": "esp32c6",
+      "fqbn": [
+        "espressif:esp32:esp32c6:PartitionScheme=huge_app"
+      ]
+    },
+    {
+      "name": "esp32h2",
+      "fqbn": [
+        "espressif:esp32:esp32h2:PartitionScheme=huge_app"
+      ]
+    }
+  ]
+}
diff --git a/tests/performance/ramspeed/ramspeed.ino b/tests/performance/ramspeed/ramspeed.ino
new file mode 100644
index 000000000..e0ab0db4c
--- /dev/null
+++ b/tests/performance/ramspeed/ramspeed.ino
@@ -0,0 +1,262 @@
+/*
+  Based on the ramspeed test from NuttX.
+  https://github.com/apache/nuttx-apps/blob/master/benchmarks/ramspeed/ramspeed_main.c
+  Modified for Arduino and ESP32 by Lucas Saavedra Vaz, 2024
+*/
+
+#include <Arduino.h>
+
+// Test settings
+
+// Number of runs to average
+#define N_RUNS 3
+
+// Value to fill the memory with
+#define FILL_VALUE 0x00
+
+// Number of copies to be performed in each test
+#define N_COPIES 50000
+
+// Max size to be copied. Must be bigger than 32 and it will be floored to the nearest power of 2
+#define MAX_TEST_SIZE (64 * 1024)  // 64KB; parenthesised so the macro expands safely inside larger expressions
+
+// Implementation macros
+
+#if defined(UINTPTR_MAX) && UINTPTR_MAX > 0xFFFFFFFF
+#define MEM_UNIT   uint64_t
+#define ALIGN_MASK 0x7
+#else
+#define MEM_UNIT   uint32_t
+#define ALIGN_MASK 0x3
+#endif
+
+#define COPY32 \
+  *d32 = *s32; \
+  d32++;       \
+  s32++;
+#define COPY8 \
+  *d8 = *s8;  \
+  d8++;       \
+  s8++;
+#define SET32(x) \
+  *d32 = x;      \
+  d32++;
+#define SET8(x) \
+  *d8 = x;      \
+  d8++;
+#define REPEAT8(expr) expr expr expr expr expr expr expr expr
+
+/* Functions */
+
+static void *mock_memcpy(void *dst, const void *src, size_t len) {  // hand-written memcpy() stand-in; returns dst
+  uint8_t *d8 = (uint8_t *)dst;
+  const uint8_t *s8 = (uint8_t *)src;
+
+  uintptr_t d_align = (uintptr_t)d8 & ALIGN_MASK;  // offset of dst inside an aligned unit
+  uintptr_t s_align = (uintptr_t)s8 & ALIGN_MASK;  // offset of src inside an aligned unit
+  uint32_t *d32;
+  const uint32_t *s32;
+
+  /* Different offsets: the two pointers cannot be aligned together, so copy byte by byte */
+
+  if (s_align != d_align) {
+    while (len > 32) {  // unrolled: 4 x 8 single-byte copies per iteration
+      REPEAT8(COPY8);
+      REPEAT8(COPY8);
+      REPEAT8(COPY8);
+      REPEAT8(COPY8);
+      len -= 32;
+    }
+
+    while (len) {  // remaining 0..32 bytes
+      COPY8;
+      len--;
+    }
+
+    return dst;
+  }
+
+  /* Same offset: copy single bytes until both pointers reach an aligned address */
+
+  if (d_align) {
+    d_align = ALIGN_MASK + 1 - d_align;  // number of bytes up to the next aligned address
+    while (d_align && len) {
+      COPY8;
+      d_align--;
+      len--;
+    }
+  }
+
+  d32 = (uint32_t *)d8;
+  s32 = (uint32_t *)s8;
+  while (len > 32) {  // unrolled: 8 x 32-bit copies (32 bytes) per iteration
+    REPEAT8(COPY32);
+    len -= 32;
+  }
+
+  while (len > 4) {  // one 32-bit word at a time; the strict '>' leaves up to 4 bytes for the loop below
+    COPY32;
+    len -= 4;
+  }
+
+  d8 = (uint8_t *)d32;
+  s8 = (const uint8_t *)s32;
+  while (len) {  // trailing bytes
+    COPY8;
+    len--;
+  }
+
+  return dst;
+}
+
+static void mock_memset(void *dst, uint8_t v, size_t len) {  // hand-written memset() stand-in: fills len bytes with v
+  uint8_t *d8 = (uint8_t *)dst;
+  uintptr_t d_align = (uintptr_t)d8 & ALIGN_MASK;  // offset of dst inside an aligned unit
+  uint32_t v32;
+  uint32_t *d32;
+
+  /* Write single bytes until the destination reaches an aligned address */
+
+  if (d_align) {
+    d_align = ALIGN_MASK + 1 - d_align;  // number of bytes up to the next aligned address
+    while (d_align && len) {
+      SET8(v);
+      len--;
+      d_align--;
+    }
+  }
+
+  v32 = (uint32_t)v + ((uint32_t)v << 8) + ((uint32_t)v << 16) + ((uint32_t)v << 24);  // v replicated into all 4 bytes
+
+  d32 = (uint32_t *)d8;
+
+  while (len > 32) {  // unrolled: 8 x 32-bit stores (32 bytes) per iteration
+    REPEAT8(SET32(v32));
+    len -= 32;
+  }
+
+  while (len > 4) {  // one 32-bit word at a time; the strict '>' leaves up to 4 bytes for the loop below
+    SET32(v32);
+    len -= 4;
+  }
+
+  d8 = (uint8_t *)d32;
+  while (len) {  // trailing bytes
+    SET8(v);
+    len--;
+  }
+}
+
+static void print_rate(const char *name, uint64_t bytes, uint32_t cost_time) {
+  // A zero duration cannot be converted into a rate: report it instead of dividing by zero.
+  if (cost_time == 0) {
+    Serial.println("Error: Too little time taken, please increase N_COPIES");
+    return;
+  }
+  // bytes per ms -> bytes per s -> KB/s; computed in 64 bits, then narrowed to 32 bits for printing
+  const uint32_t rate = bytes * 1000 / cost_time / 1024;
+  Serial.printf("%s Rate = %" PRIu32 " KB/s Time: %" PRIu32 " ms\n", name, rate, cost_time);
+}
+
+static void memcpy_speed_test(void *dest, const void *src, size_t size, uint32_t repeat_cnt) {  // times memcpy() vs mock_memcpy()
+  uint32_t start_time;
+  uint32_t cost_time_system;
+  uint32_t cost_time_mock;
+  uint32_t cnt;
+  uint32_t step;
+  uint64_t total_size;
+
+  for (step = 32; step <= size; step <<= 1) {  // copy size starts at 32 bytes and doubles, up to and including size
+    total_size = (uint64_t)step * (uint64_t)repeat_cnt;  // bytes moved by each timed loop, widened before multiplying
+
+    Serial.printf("Memcpy %" PRIu32 " Bytes test\n", step);
+
+    start_time = millis();
+
+    for (cnt = 0; cnt < repeat_cnt; cnt++) {
+      memcpy(dest, src, step);
+    }
+
+    cost_time_system = millis() - start_time;  // elapsed ms for repeat_cnt library memcpy() calls
+
+    start_time = millis();
+
+    for (cnt = 0; cnt < repeat_cnt; cnt++) {
+      mock_memcpy(dest, src, step);
+    }
+
+    cost_time_mock = millis() - start_time;  // elapsed ms for repeat_cnt mock_memcpy() calls
+
+    print_rate("System memcpy():", total_size, cost_time_system);
+    print_rate("Mock memcpy():", total_size, cost_time_mock);
+  }
+}
+
+static void memset_speed_test(void *dest, uint8_t value, size_t size, uint32_t repeat_num) {  // times memset() vs mock_memset()
+  uint32_t start_time;
+  uint32_t cost_time_system;
+  uint32_t cost_time_mock;
+  uint32_t cnt;
+  uint32_t step;
+  uint64_t total_size;
+
+  for (step = 32; step <= size; step <<= 1) {  // fill size starts at 32 bytes and doubles, up to and including size
+    total_size = (uint64_t)step * (uint64_t)repeat_num;  // bytes written by each timed loop, widened before multiplying
+
+    Serial.printf("Memset %" PRIu32 " Bytes test\n", step);
+
+    start_time = millis();
+
+    for (cnt = 0; cnt < repeat_num; cnt++) {
+      memset(dest, value, step);
+    }
+
+    cost_time_system = millis() - start_time;  // elapsed ms for repeat_num library memset() calls
+
+    start_time = millis();
+
+    for (cnt = 0; cnt < repeat_num; cnt++) {
+      mock_memset(dest, value, step);
+    }
+
+    cost_time_mock = millis() - start_time;  // elapsed ms for repeat_num mock_memset() calls
+
+    print_rate("System memset():", total_size, cost_time_system);
+    print_rate("Mock memset():", total_size, cost_time_mock);
+  }
+}
+
+/* Main */
+
+// Allocates two buffers with malloc(), prints the test parameters and runs both speed tests N_RUNS times.
+void setup() {
+  Serial.begin(115200);
+  while (!Serial) {
+    delay(10);
+  }
+
+  void *dest = malloc(MAX_TEST_SIZE);
+  const void *src = malloc(MAX_TEST_SIZE);  // never written: only the copy speed matters, not the contents
+  if (!dest || !src) {
+    Serial.println("Memory allocation failed");
+    return;
+  }
+
+  log_d("Starting RAM speed test");
+  Serial.printf("Runs: %d\n", N_RUNS);
+  Serial.printf("Copies: %d\n", N_COPIES);
+  Serial.printf("Max test size: %d\n", MAX_TEST_SIZE);
+  Serial.flush();
+  for (int i = 0; i < N_RUNS; i++) {
+    Serial.printf("Run %d\n", i);  // newline keeps the run header from fusing with the first test line
+    memcpy_speed_test(dest, src, MAX_TEST_SIZE, N_COPIES);
+    Serial.flush();
+    memset_speed_test(dest, FILL_VALUE, MAX_TEST_SIZE, N_COPIES);
+    Serial.flush();
+  }
+  log_d("RAM speed test done");
+}
+
+void loop() {
+  vTaskDelete(NULL);  // all work happens in setup(); NULL asks FreeRTOS to delete the calling task
+}
diff --git a/tests/performance/ramspeed/test_ramspeed.py b/tests/performance/ramspeed/test_ramspeed.py
new file mode 100644
index 000000000..b4c3cee7f
--- /dev/null
+++ b/tests/performance/ramspeed/test_ramspeed.py
@@ -0,0 +1,105 @@
+import json
+import logging
+import os
+
+from collections import defaultdict
+
+
+def test_ramspeed(dut, request):
+    LOGGER = logging.getLogger(__name__)
+
+    runs_results = []
+
+    # Match "Runs: %d"
+    res = dut.expect(r"Runs: (\d+)", timeout=60)
+    runs = int(res.group(0).decode("utf-8").split(" ")[1])
+    LOGGER.info("Number of runs: {}".format(runs))
+    assert runs > 0, "Invalid number of runs"
+
+    # Match "Copies: %d"
+    res = dut.expect(r"Copies: (\d+)", timeout=60)
+    copies = int(res.group(0).decode("utf-8").split(" ")[1])
+    LOGGER.info("Number of copies in each test: {}".format(copies))
+    assert copies > 0, "Invalid number of copies"
+
+    # Match "Max test size: %lu"
+    res = dut.expect(r"Max test size: (\d+)", timeout=60)
+    max_test_size = int(res.group(0).decode("utf-8").split(" ")[3])
+    LOGGER.info("Max test size: {}".format(max_test_size))
+    assert max_test_size > 0, "Invalid max test size"
+
+    for i in range(runs):
+        # Match "Run %d"
+        res = dut.expect(r"Run (\d+)", timeout=120)
+        run = int(res.group(0).decode("utf-8").split(" ")[1])
+        LOGGER.info("Run {}".format(run))
+        assert run == i, "Invalid run number"
+
+        for j in range(2):
+            while True:
+                # Match "Memcpy/Memtest %d Bytes test"
+                res = dut.expect(r"(Memcpy|Memset) (\d+) Bytes test", timeout=60)
+                current_test = res.group(0).decode("utf-8").split(" ")[0].lower()
+                current_test_size = int(res.group(0).decode("utf-8").split(" ")[1])
+                LOGGER.info("Current {} test size: {}".format(current_test, current_test_size))
+                assert current_test_size > 0, "Invalid test size"
+
+                for k in range(2):
+                    # Match "System/Mock memcpy/memtest(): Rate = %d KB/s Time: %d ms" or "Error: %s"
+                    res = dut.expect(
+                        r"((System|Mock) (memcpy|memset)\(\): Rate = (\d+) KB/s Time: (\d+) ms|^Error)", timeout=90
+                    )
+                    implementation = res.group(0).decode("utf-8").split(" ")[0].lower()
+                    assert implementation != "error:", "Error detected in test output"
+                    test_type = res.group(0).decode("utf-8").split(" ")[1].lower()[:-3]
+                    rate = int(res.group(0).decode("utf-8").split(" ")[4])
+                    time = int(res.group(0).decode("utf-8").split(" ")[7])
+                    assert rate > 0, "Invalid rate"
+                    assert time > 0, "Invalid time"
+                    assert test_type == current_test, "Missing test output"
+                    LOGGER.info("{} {}: Rate = {} KB/s. Time = {} ms".format(implementation, test_type, rate, time))
+
+                    runs_results.append(((current_test, str(current_test_size), implementation), (rate, time)))
+
+                if current_test_size == max_test_size:
+                    break
+
+            LOGGER.info("=============================================================")
+
+    # Calculate average rate and time for each test size
+    sums = defaultdict(lambda: {"rate_sum": 0, "time_sum": 0})
+
+    for (test, size, impl), (rate, time) in runs_results:
+        sums[(test, size, impl)]["rate_sum"] += rate
+        sums[(test, size, impl)]["time_sum"] += time
+
+    avg_results = {}
+    for (test, size, impl) in sums:
+        rate_avg = round(sums[(test, size, impl)]["rate_sum"] / runs, 2)
+        time_avg = round(sums[(test, size, impl)]["time_sum"] / runs, 2)
+        LOGGER.info(
+            "Test: {}-{}-{}: Average rate = {} KB/s. Average time = {} ms".format(test, size, impl, rate_avg, time_avg)
+        )
+        if test not in avg_results:
+            avg_results[test] = {}
+        if size not in avg_results[test]:
+            avg_results[test][size] = {}
+        avg_results[test][size][impl] = {"avg_rate": rate_avg, "avg_time": time_avg}
+
+    # Create JSON with results and write it to file
+    # Always create a JSON with this format (so it can be merged later on):
+    # { TEST_NAME_STR: TEST_RESULTS_DICT }
+    results = {"ramspeed": {"runs": runs, "copies": copies, "max_test_size": max_test_size, "results": avg_results}}
+
+    current_folder = os.path.dirname(request.path)
+    file_index = 0
+    report_file = os.path.join(current_folder, "result_ramspeed" + str(file_index) + ".json")
+    while os.path.exists(report_file):
+        report_file = report_file.replace(str(file_index) + ".json", str(file_index + 1) + ".json")
+        file_index += 1
+
+    with open(report_file, "w") as f:
+        try:
+            f.write(json.dumps(results))
+        except Exception as e:
+            LOGGER.warning("Failed to write results to file: {}".format(e))
diff --git a/tests/performance/superpi/fftsg_h.cpp b/tests/performance/superpi/fftsg_h.cpp
new file mode 100644
index 000000000..8361b5a57
--- /dev/null
+++ b/tests/performance/superpi/fftsg_h.cpp
@@ -0,0 +1,2329 @@
+/*
+  Based on "Calculation of PI(= 3.14159...) using FFT and AGM" by T.Ooura, Nov. 1999.
+  https://github.com/Fibonacci43/SuperPI
+  Modified for Arduino by Lucas Saavedra Vaz, 2024.
+*/
+
+#include <fftsg_h.h>
+
+void cdft(int n, int isgn, double *a) {  // complex DFT entry point (name as in Ooura's FFT package); works on a[] in place
+  if (isgn >= 0) {
+    cftfsub(n, a);  // non-negative isgn selects the "f" kernel (presumably the forward transform)
+  } else {
+    cftbsub(n, a);  // negative isgn selects the "b" kernel (presumably the backward transform)
+  }
+}
+
+void rdft(int n, int isgn, double *a) {  // real DFT entry point (name as in Ooura's FFT package); works on a[] in place
+  double xi;
+
+  if (isgn >= 0) {  // non-negative isgn: complex kernel first, then the rftfsub pass, then combine a[0] and a[1]
+    if (n > 4) {
+      cftfsub(n, a);
+      rftfsub(n, a);
+    } else if (n == 4) {
+      cftfsub(n, a);
+    }
+    xi = a[0] - a[1];
+    a[0] += a[1];
+    a[1] = xi;
+  } else {  // negative isgn: same steps mirrored, a[0]/a[1] are split first and the "b" kernels run afterwards
+    a[1] = 0.5 * (a[0] - a[1]);
+    a[0] -= a[1];
+    if (n > 4) {
+      rftbsub(n, a);
+      cftbsub(n, a);
+    } else if (n == 4) {
+      cftbsub(n, a);
+    }
+  }
+}
+
+void ddct(int n, int isgn, double *a) {  // discrete cosine transform entry point (name as in Ooura's FFT package)
+  int j;
+  double xr;
+
+  if (isgn < 0) {  // negative isgn: recombine neighbouring elements, then run the "b" kernels before dctsub
+    xr = a[n - 1];
+    for (j = n - 2; j >= 2; j -= 2) {
+      a[j + 1] = a[j] - a[j - 1];
+      a[j] += a[j - 1];
+    }
+    a[1] = a[0] - xr;
+    a[0] += xr;
+    if (n > 4) {
+      rftbsub(n, a);
+      cftbsub(n, a);
+    } else if (n == 4) {
+      cftbsub(n, a);
+    }
+  }
+  if (n > 4) {  // the DCT-specific pass runs for both signs; lengths up to 4 use the dedicated small routine
+    dctsub(n, a);
+  } else {
+    dctsub4(n, a);
+  }
+  if (isgn >= 0) {  // non-negative isgn: run the "f" kernels after dctsub, then recombine neighbouring elements
+    if (n > 4) {
+      cftfsub(n, a);
+      rftfsub(n, a);
+    } else if (n == 4) {
+      cftfsub(n, a);
+    }
+    xr = a[0] - a[1];
+    a[0] += a[1];
+    for (j = 2; j < n; j += 2) {
+      a[j - 1] = a[j] - a[j + 1];
+      a[j] += a[j + 1];
+    }
+    a[n - 1] = xr;
+  }
+}
+
+void ddst(int n, int isgn, double *a) {  // discrete sine transform entry point (name as in Ooura's FFT package)
+  int j;
+  double xr;
+
+  if (isgn < 0) {  // negative isgn: recombine neighbouring elements, then run the "b" kernels before dstsub
+    xr = a[n - 1];
+    for (j = n - 2; j >= 2; j -= 2) {
+      a[j + 1] = -a[j] - a[j - 1];
+      a[j] -= a[j - 1];
+    }
+    a[1] = a[0] + xr;
+    a[0] -= xr;
+    if (n > 4) {
+      rftbsub(n, a);
+      cftbsub(n, a);
+    } else if (n == 4) {
+      cftbsub(n, a);
+    }
+  }
+  if (n > 4) {  // the DST-specific pass runs for both signs; lengths up to 4 use the dedicated small routine
+    dstsub(n, a);
+  } else {
+    dstsub4(n, a);
+  }
+  if (isgn >= 0) {  // non-negative isgn: run the "f" kernels after dstsub, then recombine neighbouring elements
+    if (n > 4) {
+      cftfsub(n, a);
+      rftfsub(n, a);
+    } else if (n == 4) {
+      cftfsub(n, a);
+    }
+    xr = a[0] - a[1];
+    a[0] += a[1];
+    for (j = 2; j < n; j += 2) {
+      a[j - 1] = -a[j] - a[j + 1];
+      a[j] -= a[j + 1];
+    }
+    a[n - 1] = -xr;
+  }
+}
+
+void dfct(int n, double *a) {  // cosine transform built on repeated ddct() calls of halving length; works on a[] in place
+  int j, k, m, mh;
+  double xr, xi, yr, yi, an;
+
+  m = n >> 1;
+  for (j = 0; j < m; j++) {  // fold the array: sums go to the upper half, differences stay in the lower half
+    k = n - j;
+    xr = a[j] + a[k];
+    a[j] -= a[k];
+    a[k] = xr;
+  }
+  an = a[n];  // a[n] is read and written below, so the array must hold n + 1 elements
+  while (m >= 2) {  // m is halved at the end of every pass
+    ddct(m, 1, a);
+    bitrv1(m, a);
+    mh = m >> 1;
+    xi = a[m];
+    a[m] = a[0];
+    a[0] = an - xi;
+    an += xi;
+    for (j = 1; j < mh; j++) {
+      k = m - j;
+      xr = a[m + k];
+      xi = a[m + j];
+      yr = a[j];
+      yi = a[k];
+      a[m + j] = yr;
+      a[m + k] = yi;
+      a[j] = xr - xi;
+      a[k] = xr + xi;
+    }
+    xr = a[mh];  // swap a[mh] with a[m + mh]
+    a[mh] = a[m + mh];
+    a[m + mh] = xr;
+    m = mh;
+  }
+  xi = a[1];
+  a[1] = a[0];
+  a[0] = an + xi;
+  a[n] = an - xi;
+  bitrv1(n, a);
+}
+
+void dfst(int n, double *a) {  // sine transform built on repeated ddst() calls of halving length; works on a[] in place
+  int j, k, m, mh;
+  double xr, xi, yr, yi;
+
+  m = n >> 1;
+  for (j = 1; j < m; j++) {  // fold the array: sums stay in the lower half, differences go to the upper half
+    k = n - j;
+    xr = a[j] - a[k];
+    a[j] += a[k];
+    a[k] = xr;
+  }
+  a[0] = a[m];
+  while (m >= 2) {  // m is halved at the end of every pass
+    ddst(m, 1, a);
+    bitrv1(m, a);
+    mh = m >> 1;
+    for (j = 1; j < mh; j++) {
+      k = m - j;
+      xr = a[m + k];
+      xi = a[m + j];
+      yr = a[j];
+      yi = a[k];
+      a[m + j] = yr;
+      a[m + k] = yi;
+      a[j] = xr + xi;
+      a[k] = xr - xi;
+    }
+    a[m] = a[0];
+    a[0] = a[m + mh];
+    a[m + mh] = a[mh];
+    m = mh;
+  }
+  a[1] = a[0];
+  a[0] = 0;
+  bitrv1(n, a);
+}
+
+/* -------- child routines -------- */
+
+void cftfsub(int n, double *a) {  // kernel used by the isgn >= 0 paths: picks a routine by length n, then reorders a[]
+  int m;
+
+  if (n > 32) {
+    m = n >> 2;  // quarter length: the recursive branch processes a[] as four blocks of m elements
+    cftmdl1(n, a);
+    if (n > CDFT_RECURSIVE_N) {  // above the CDFT_RECURSIVE_N threshold, use the recursive routines
+      cftrec1(m, a);
+      cftrec2(m, &a[m]);
+      cftrec1(m, &a[2 * m]);
+      cftrec1(m, &a[3 * m]);
+    } else if (m > 32) {
+      cftexp1(n, a);
+    } else {
+      cftfx41(n, a);
+    }
+    bitrv2(n, a);
+  } else if (n > 8) {  // n is 32 or, presumably, 16: fixed-size kernels with matching reorder routines
+    if (n == 32) {
+      cftf161(a);
+      bitrv216(a);
+    } else {
+      cftf081(a);
+      bitrv208(a);
+    }
+  } else if (n == 8) {
+    cftf040(a);
+  } else if (n == 4) {
+    cftx020(a);
+  }
+}
+
+void cftbsub(int n, double *a) {  // kernel used by the isgn < 0 paths: same layout as cftfsub with conj/neg reorder routines
+  int m;
+
+  if (n > 32) {
+    m = n >> 2;  // quarter length: the recursive branch processes a[] as four blocks of m elements
+    cftb1st(n, a);
+    if (n > CDFT_RECURSIVE_N) {  // above the CDFT_RECURSIVE_N threshold, use the recursive routines
+      cftrec1(m, a);
+      cftrec2(m, &a[m]);
+      cftrec1(m, &a[2 * m]);
+      cftrec1(m, &a[3 * m]);
+    } else if (m > 32) {
+      cftexp1(n, a);
+    } else {
+      cftfx41(n, a);
+    }
+    bitrv2conj(n, a);
+  } else if (n > 8) {  // n is 32 or, presumably, 16: fixed-size kernels with matching reorder routines
+    if (n == 32) {
+      cftf161(a);
+      bitrv216neg(a);
+    } else {
+      cftf081(a);
+      bitrv208neg(a);
+    }
+  } else if (n == 8) {
+    cftb040(a);
+  } else if (n == 4) {
+    cftx020(a);
+  }
+}
+
+void bitrv2(int n, double *a) {  // reorders a[] in place by swapping pairs of adjacent doubles (looks like a bit-reversal permutation)
+  int j0, k0, j1, k1, l, m, i, j, k;
+  double xr, xi, yr, yi;
+
+  l = n >> 2;
+  m = 2;
+  while (m < l) {  // grow m and shrink l by factors of two until they meet or cross
+    l >>= 1;
+    m <<= 1;
+  }
+  if (m == l) {  // m met l exactly: each inner step swaps four pairs at offsets that are multiples of m
+    j0 = 0;
+    for (k0 = 0; k0 < m; k0 += 2) {
+      k = k0;
+      for (j = j0; j < j0 + k0; j += 2) {
+        xr = a[j];  // swap the pair at j with the pair at k
+        xi = a[j + 1];
+        yr = a[k];
+        yi = a[k + 1];
+        a[j] = yr;
+        a[j + 1] = yi;
+        a[k] = xr;
+        a[k + 1] = xi;
+        j1 = j + m;
+        k1 = k + 2 * m;
+        xr = a[j1];
+        xi = a[j1 + 1];
+        yr = a[k1];
+        yi = a[k1 + 1];
+        a[j1] = yr;
+        a[j1 + 1] = yi;
+        a[k1] = xr;
+        a[k1 + 1] = xi;
+        j1 += m;
+        k1 -= m;
+        xr = a[j1];
+        xi = a[j1 + 1];
+        yr = a[k1];
+        yi = a[k1 + 1];
+        a[j1] = yr;
+        a[j1 + 1] = yi;
+        a[k1] = xr;
+        a[k1 + 1] = xi;
+        j1 += m;
+        k1 += 2 * m;
+        xr = a[j1];
+        xi = a[j1 + 1];
+        yr = a[k1];
+        yi = a[k1 + 1];
+        a[j1] = yr;
+        a[j1 + 1] = yi;
+        a[k1] = xr;
+        a[k1 + 1] = xi;
+        for (i = n >> 1; i > (k ^= i); i >>= 1);  // empty-body loop: advances k by toggling bits from the top down
+      }
+      j1 = j0 + k0 + m;  // one extra swap per k0 step, outside the inner loop
+      k1 = j1 + m;
+      xr = a[j1];
+      xi = a[j1 + 1];
+      yr = a[k1];
+      yi = a[k1 + 1];
+      a[j1] = yr;
+      a[j1 + 1] = yi;
+      a[k1] = xr;
+      a[k1 + 1] = xi;
+      for (i = n >> 1; i > (j0 ^= i); i >>= 1);  // same top-down bit toggling, applied to j0
+    }
+  } else {  // m and l crossed: each inner step swaps two pairs only
+    j0 = 0;
+    for (k0 = 2; k0 < m; k0 += 2) {
+      for (i = n >> 1; i > (j0 ^= i); i >>= 1);  // here j0 is advanced before the inner loop instead of after it
+      k = k0;
+      for (j = j0; j < j0 + k0; j += 2) {
+        xr = a[j];
+        xi = a[j + 1];
+        yr = a[k];
+        yi = a[k + 1];
+        a[j] = yr;
+        a[j + 1] = yi;
+        a[k] = xr;
+        a[k + 1] = xi;
+        j1 = j + m;
+        k1 = k + m;
+        xr = a[j1];
+        xi = a[j1 + 1];
+        yr = a[k1];
+        yi = a[k1 + 1];
+        a[j1] = yr;
+        a[j1 + 1] = yi;
+        a[k1] = xr;
+        a[k1 + 1] = xi;
+        for (i = n >> 1; i > (k ^= i); i >>= 1);  // empty-body loop: advances k by toggling bits from the top down
+      }
+    }
+  }
+}
+
+void bitrv2conj(int n, double *a) { /*
+   * Same pair-swapping index walk as bitrv2, but every odd-index
+   * ("imaginary") value is negated while it is moved, and entries that stay
+   * in place have their odd-index value negated explicitly.
+   *
+   * n: size parameter; the bit-reversed counter starts from its top bit
+   *    n >> 1.  NOTE(review): looks like the number of doubles in a[] and a
+   *    power of two -- confirm against the callers.
+   * a: buffer that is permuted and sign-flipped in place.
+   */
+  int j0, k0, j1, k1, l, m, i, j, k;
+  double xr, xi, yr, yi;
+
+  l = n >> 2;
+  m = 2;
+  while (m < l) { /* l halves while m doubles, so the product m * l is preserved */
+    l >>= 1;
+    m <<= 1;
+  }
+  if (m == l) { /* balanced split: each (j, k) pair is swapped at four offsets built from m */
+    j0 = 0;
+    for (k0 = 0; k0 < m; k0 += 2) {
+      k = k0;
+      for (j = j0; j < j0 + k0; j += 2) {
+        xr = a[j];
+        xi = -a[j + 1]; /* negate the odd-index value on read */
+        yr = a[k];
+        yi = -a[k + 1];
+        a[j] = yr;
+        a[j + 1] = yi;
+        a[k] = xr;
+        a[k + 1] = xi;
+        j1 = j + m;
+        k1 = k + 2 * m;
+        xr = a[j1];
+        xi = -a[j1 + 1];
+        yr = a[k1];
+        yi = -a[k1 + 1];
+        a[j1] = yr;
+        a[j1 + 1] = yi;
+        a[k1] = xr;
+        a[k1 + 1] = xi;
+        j1 += m;
+        k1 -= m;
+        xr = a[j1];
+        xi = -a[j1 + 1];
+        yr = a[k1];
+        yi = -a[k1 + 1];
+        a[j1] = yr;
+        a[j1 + 1] = yi;
+        a[k1] = xr;
+        a[k1 + 1] = xi;
+        j1 += m;
+        k1 += 2 * m;
+        xr = a[j1];
+        xi = -a[j1 + 1];
+        yr = a[k1];
+        yi = -a[k1 + 1];
+        a[j1] = yr;
+        a[j1 + 1] = yi;
+        a[k1] = xr;
+        a[k1 + 1] = xi;
+        for (i = n >> 1; i > (k ^= i); i >>= 1); /* bit-reversed increment of k: toggle bits from the top down, stop after the first 0 -> 1 flip */
+      }
+      k1 = j0 + k0;
+      a[k1 + 1] = -a[k1 + 1]; /* entry is not swapped here, so only its sign is flipped */
+      j1 = k1 + m;
+      k1 = j1 + m;
+      xr = a[j1];
+      xi = -a[j1 + 1];
+      yr = a[k1];
+      yi = -a[k1 + 1];
+      a[j1] = yr;
+      a[j1 + 1] = yi;
+      a[k1] = xr;
+      a[k1 + 1] = xi;
+      k1 += m;
+      a[k1 + 1] = -a[k1 + 1]; /* second in-place entry: sign flip only */
+      for (i = n >> 1; i > (j0 ^= i); i >>= 1); /* same bit-reversed increment, applied to j0 */
+    }
+  } else { /* m != l: each (j, k) pair is swapped at two offsets (0 and m) */
+    a[1] = -a[1]; /* entries 0 and m are handled up front because the loop starts at k0 = 2 */
+    a[m + 1] = -a[m + 1];
+    j0 = 0;
+    for (k0 = 2; k0 < m; k0 += 2) {
+      for (i = n >> 1; i > (j0 ^= i); i >>= 1); /* advance j0 before use */
+      k = k0;
+      for (j = j0; j < j0 + k0; j += 2) {
+        xr = a[j];
+        xi = -a[j + 1];
+        yr = a[k];
+        yi = -a[k + 1];
+        a[j] = yr;
+        a[j + 1] = yi;
+        a[k] = xr;
+        a[k + 1] = xi;
+        j1 = j + m;
+        k1 = k + m;
+        xr = a[j1];
+        xi = -a[j1 + 1];
+        yr = a[k1];
+        yi = -a[k1 + 1];
+        a[j1] = yr;
+        a[j1 + 1] = yi;
+        a[k1] = xr;
+        a[k1 + 1] = xi;
+        for (i = n >> 1; i > (k ^= i); i >>= 1); /* bit-reversed increment of k */
+      }
+      k1 = j0 + k0;
+      a[k1 + 1] = -a[k1 + 1]; /* in-place entries: sign flip only */
+      a[k1 + m + 1] = -a[k1 + m + 1];
+    }
+  }
+}
+
+/*
+ * Fixed reordering of 16 two-double entries stored in a[0..31].
+ * Entry e occupies a[2e] and a[2e + 1].  Six disjoint pairs of entries
+ * trade places; entries 0, 6, 9 and 15 are left where they are.
+ */
+void bitrv216(double *a) {
+  static const int exchange[6][2] = {{1, 8}, {2, 4}, {3, 12}, {5, 10}, {7, 14}, {11, 13}};
+  int p;
+
+  for (p = 0; p < 6; p++) {
+    double *lo = a + 2 * exchange[p][0];
+    double *hi = a + 2 * exchange[p][1];
+    double keep_r = lo[0];
+    double keep_i = lo[1];
+
+    lo[0] = hi[0];
+    lo[1] = hi[1];
+    hi[0] = keep_r;
+    hi[1] = keep_i;
+  }
+}
+
+/*
+ * Fixed reordering of 16 two-double entries stored in a[0..31].
+ * Entry e occupies a[2e] and a[2e + 1].  Entry 0 stays put; every other
+ * entry e receives the old contents of entry origin[e].
+ */
+void bitrv216neg(double *a) {
+  static const int origin[16] = {0, 15, 7, 11, 3, 13, 5, 9, 1, 14, 6, 10, 2, 12, 4, 8};
+  double snapshot[32];
+  int e, d;
+
+  /* take a copy of everything that moves (a[0] and a[1] are never touched) */
+  for (d = 2; d < 32; d++) {
+    snapshot[d] = a[d];
+  }
+  for (e = 1; e < 16; e++) {
+    a[2 * e] = snapshot[2 * origin[e]];
+    a[2 * e + 1] = snapshot[2 * origin[e] + 1];
+  }
+}
+
+/*
+ * Fixed reordering of 8 two-double entries stored in a[0..15].
+ * Entry e occupies a[2e] and a[2e + 1].  Entries 1 <-> 4 and 3 <-> 6 trade
+ * places; entries 0, 2, 5 and 7 are left where they are.
+ */
+void bitrv208(double *a) {
+  static const int exchange[2][2] = {{1, 4}, {3, 6}};
+  int p;
+
+  for (p = 0; p < 2; p++) {
+    double *lo = a + 2 * exchange[p][0];
+    double *hi = a + 2 * exchange[p][1];
+    double keep_r = lo[0];
+    double keep_i = lo[1];
+
+    lo[0] = hi[0];
+    lo[1] = hi[1];
+    hi[0] = keep_r;
+    hi[1] = keep_i;
+  }
+}
+
+/*
+ * Fixed reordering of 8 two-double entries stored in a[0..15].
+ * Entry e occupies a[2e] and a[2e + 1].  Entry 0 stays put; every other
+ * entry e receives the old contents of entry origin[e].
+ */
+void bitrv208neg(double *a) {
+  static const int origin[8] = {0, 7, 3, 5, 1, 6, 2, 4};
+  double snapshot[16];
+  int e, d;
+
+  /* take a copy of everything that moves (a[0] and a[1] are never touched) */
+  for (d = 2; d < 16; d++) {
+    snapshot[d] = a[d];
+  }
+  for (e = 1; e < 8; e++) {
+    a[2 * e] = snapshot[2 * origin[e]];
+    a[2 * e + 1] = snapshot[2 * origin[e] + 1];
+  }
+}
+
+void bitrv1(int n, double *a) { /*
+   * Reorders a[] in place by exchanging single doubles a[j] and a[k]; the
+   * partner indices j0 and k are advanced with a bit-reversed counter (see
+   * the empty-bodied for loops below).  Same index walk as bitrv2, but with
+   * unit stride instead of two-double entries.
+   *
+   * n: size parameter; the counter starts from its top bit n >> 1.
+   *    NOTE(review): looks like the number of doubles in a[] and a power
+   *    of two -- confirm against the callers.
+   * a: buffer that is permuted in place.
+   */
+  int j0, k0, j1, k1, l, m, i, j, k;
+  double x;
+
+  l = n >> 2;
+  m = 1;
+  while (m < l) { /* l halves while m doubles, so the product m * l is preserved */
+    l >>= 1;
+    m <<= 1;
+  }
+  if (m == l) { /* balanced split: each (j, k) pair is swapped at four offsets built from m */
+    j0 = 0;
+    for (k0 = 0; k0 < m; k0++) {
+      k = k0;
+      for (j = j0; j < j0 + k0; j++) {
+        x = a[j];
+        a[j] = a[k];
+        a[k] = x;
+        j1 = j + m;
+        k1 = k + 2 * m;
+        x = a[j1];
+        a[j1] = a[k1];
+        a[k1] = x;
+        j1 += m;
+        k1 -= m;
+        x = a[j1];
+        a[j1] = a[k1];
+        a[k1] = x;
+        j1 += m;
+        k1 += 2 * m;
+        x = a[j1];
+        a[j1] = a[k1];
+        a[k1] = x;
+        for (i = n >> 1; i > (k ^= i); i >>= 1); /* bit-reversed increment of k: toggle bits from the top down, stop after the first 0 -> 1 flip */
+      }
+      j1 = j0 + k0 + m; /* one extra swap per k0 that the inner loop does not cover */
+      k1 = j1 + m;
+      x = a[j1];
+      a[j1] = a[k1];
+      a[k1] = x;
+      for (i = n >> 1; i > (j0 ^= i); i >>= 1); /* same bit-reversed increment, applied to j0 */
+    }
+  } else { /* m != l: each (j, k) pair is swapped at two offsets (0 and m) */
+    j0 = 0;
+    for (k0 = 1; k0 < m; k0++) {
+      for (i = n >> 1; i > (j0 ^= i); i >>= 1); /* advance j0 before use; k0 starts at 1 here */
+      k = k0;
+      for (j = j0; j < j0 + k0; j++) {
+        x = a[j];
+        a[j] = a[k];
+        a[k] = x;
+        j1 = j + m;
+        k1 = k + m;
+        x = a[j1];
+        a[j1] = a[k1];
+        a[k1] = x;
+        for (i = n >> 1; i > (k ^= i); i >>= 1); /* bit-reversed increment of k */
+      }
+    }
+  }
+}
+
+void cftb1st(int n, double *a) { /*
+   * In-place four-way butterfly pass over a[], used by cftbsub as its first
+   * stage.  With m = 2 * (n >> 3), the entries at offsets j, j + m, j + 2m
+   * and j + 3m are combined; the third and fourth outputs are multiplied by
+   * twiddle factors whose angle grows as ew * j.
+   *
+   * The arithmetic matches cftmdl1 applied to input whose odd-index
+   * ("imaginary") values are negated on read.
+   *
+   * n: size parameter; indices up to 7 * (n >> 3) + 3 are accessed.
+   *    NOTE(review): presumably a power of two >= 32 -- confirm.
+   * a: interleaved (real, imaginary) data, updated in place.
+   */
+  int i, i0, j, j0, j1, j2, j3, m, mh;
+  double ew, w1r, w1i, wk1r, wk1i, wk3r, wk3i, wd1r, wd1i, wd3r, wd3i, ss1, ss3;
+  double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+
+  mh = n >> 3;
+  m = 2 * mh;
+  j1 = m;
+  j2 = j1 + m;
+  j3 = j2 + m;
+  x0r = a[0] + a[j2]; /* j = 0 butterfly: no twiddle multiplication is applied */
+  x0i = -a[1] - a[j2 + 1];
+  x1r = a[0] - a[j2];
+  x1i = -a[1] + a[j2 + 1];
+  x2r = a[j1] + a[j3];
+  x2i = a[j1 + 1] + a[j3 + 1];
+  x3r = a[j1] - a[j3];
+  x3i = a[j1 + 1] - a[j3 + 1];
+  a[0] = x0r + x2r;
+  a[1] = x0i - x2i;
+  a[j1] = x0r - x2r;
+  a[j1 + 1] = x0i + x2i;
+  a[j2] = x1r + x3i;
+  a[j2 + 1] = x1i + x3r;
+  a[j3] = x1r - x3i;
+  a[j3 + 1] = x1i - x3r;
+  wd1r = 1;
+  wd1i = 0;
+  wd3r = 1;
+  wd3i = 0;
+  ew = M_PI_2 / m; /* angle per double index: the twiddle used at offset j has angle ew * j */
+  w1r = cos(2 * ew);
+  w1i = sin(2 * ew);
+  wk1r = w1r;
+  wk1i = w1i;
+  ss1 = 2 * w1i;
+  wk3i = 2 * ss1 * wk1r; /* triple-angle identities: with t = 2 * ew the next two lines give wk3r = cos(3t), wk3i = -sin(3t) */
+  wk3r = wk1r - wk3i * wk1i;
+  wk3i = wk1i - wk3i * wk1r;
+  ss3 = 2 * wk3i;
+  i = 0;
+  for (;;) {
+    i0 = i + 4 * CDFT_LOOP_DIV; /* work in chunks; the twiddles are re-seeded from cos/sin after each chunk */
+    if (i0 > mh - 4) {
+      i0 = mh - 4;
+    }
+    for (j = i + 2; j < i0; j += 4) {
+      wd1r -= ss1 * wk1i; /* leapfrog recurrence: wd* jumps two angle steps, landing one step past wk* */
+      wd1i += ss1 * wk1r;
+      wd3r -= ss3 * wk3i;
+      wd3i += ss3 * wk3r;
+      j1 = j + m;
+      j2 = j1 + m;
+      j3 = j2 + m;
+      x0r = a[j] + a[j2]; /* butterfly at offset j, twiddles wk1 / wk3 */
+      x0i = -a[j + 1] - a[j2 + 1];
+      x1r = a[j] - a[j2];
+      x1i = -a[j + 1] + a[j2 + 1];
+      x2r = a[j1] + a[j3];
+      x2i = a[j1 + 1] + a[j3 + 1];
+      x3r = a[j1] - a[j3];
+      x3i = a[j1 + 1] - a[j3 + 1];
+      a[j] = x0r + x2r;
+      a[j + 1] = x0i - x2i;
+      a[j1] = x0r - x2r;
+      a[j1 + 1] = x0i + x2i;
+      x0r = x1r + x3i;
+      x0i = x1i + x3r;
+      a[j2] = wk1r * x0r - wk1i * x0i;
+      a[j2 + 1] = wk1r * x0i + wk1i * x0r;
+      x0r = x1r - x3i;
+      x0i = x1i - x3r;
+      a[j3] = wk3r * x0r + wk3i * x0i;
+      a[j3 + 1] = wk3r * x0i - wk3i * x0r;
+      x0r = a[j + 2] + a[j2 + 2]; /* butterfly at offset j + 2, twiddles wd1 / wd3 */
+      x0i = -a[j + 3] - a[j2 + 3];
+      x1r = a[j + 2] - a[j2 + 2];
+      x1i = -a[j + 3] + a[j2 + 3];
+      x2r = a[j1 + 2] + a[j3 + 2];
+      x2i = a[j1 + 3] + a[j3 + 3];
+      x3r = a[j1 + 2] - a[j3 + 2];
+      x3i = a[j1 + 3] - a[j3 + 3];
+      a[j + 2] = x0r + x2r;
+      a[j + 3] = x0i - x2i;
+      a[j1 + 2] = x0r - x2r;
+      a[j1 + 3] = x0i + x2i;
+      x0r = x1r + x3i;
+      x0i = x1i + x3r;
+      a[j2 + 2] = wd1r * x0r - wd1i * x0i;
+      a[j2 + 3] = wd1r * x0i + wd1i * x0r;
+      x0r = x1r - x3i;
+      x0i = x1i - x3r;
+      a[j3 + 2] = wd3r * x0r + wd3i * x0i;
+      a[j3 + 3] = wd3r * x0i - wd3i * x0r;
+      j0 = m - j; /* mirrored offset: the same twiddles are reused with real and imaginary parts exchanged */
+      j1 = j0 + m;
+      j2 = j1 + m;
+      j3 = j2 + m;
+      x0r = a[j0] + a[j2];
+      x0i = -a[j0 + 1] - a[j2 + 1];
+      x1r = a[j0] - a[j2];
+      x1i = -a[j0 + 1] + a[j2 + 1];
+      x2r = a[j1] + a[j3];
+      x2i = a[j1 + 1] + a[j3 + 1];
+      x3r = a[j1] - a[j3];
+      x3i = a[j1 + 1] - a[j3 + 1];
+      a[j0] = x0r + x2r;
+      a[j0 + 1] = x0i - x2i;
+      a[j1] = x0r - x2r;
+      a[j1 + 1] = x0i + x2i;
+      x0r = x1r + x3i;
+      x0i = x1i + x3r;
+      a[j2] = wk1i * x0r - wk1r * x0i;
+      a[j2 + 1] = wk1i * x0i + wk1r * x0r;
+      x0r = x1r - x3i;
+      x0i = x1i - x3r;
+      a[j3] = wk3i * x0r + wk3r * x0i;
+      a[j3 + 1] = wk3i * x0i - wk3r * x0r;
+      x0r = a[j0 - 2] + a[j2 - 2]; /* mirrored offset j0 - 2, twiddles wd1 / wd3 with parts exchanged */
+      x0i = -a[j0 - 1] - a[j2 - 1];
+      x1r = a[j0 - 2] - a[j2 - 2];
+      x1i = -a[j0 - 1] + a[j2 - 1];
+      x2r = a[j1 - 2] + a[j3 - 2];
+      x2i = a[j1 - 1] + a[j3 - 1];
+      x3r = a[j1 - 2] - a[j3 - 2];
+      x3i = a[j1 - 1] - a[j3 - 1];
+      a[j0 - 2] = x0r + x2r;
+      a[j0 - 1] = x0i - x2i;
+      a[j1 - 2] = x0r - x2r;
+      a[j1 - 1] = x0i + x2i;
+      x0r = x1r + x3i;
+      x0i = x1i + x3r;
+      a[j2 - 2] = wd1i * x0r - wd1r * x0i;
+      a[j2 - 1] = wd1i * x0i + wd1r * x0r;
+      x0r = x1r - x3i;
+      x0i = x1i - x3r;
+      a[j3 - 2] = wd3i * x0r + wd3r * x0i;
+      a[j3 - 1] = wd3i * x0i - wd3r * x0r;
+      wk1r -= ss1 * wd1i; /* advance wk* two angle steps for the next iteration */
+      wk1i += ss1 * wd1r;
+      wk3r -= ss3 * wd3i;
+      wk3i += ss3 * wd3r;
+    }
+    if (i0 == mh - 4) {
+      break;
+    }
+    wd1r = cos(ew * i0); /* re-seed the recurrence directly from cos/sin (presumably to bound accumulated rounding error) */
+    wd1i = sin(ew * i0);
+    wd3i = 4 * wd1i * wd1r;
+    wd3r = wd1r - wd3i * wd1i;
+    wd3i = wd1i - wd3i * wd1r;
+    wk1r = w1r * wd1r - w1i * wd1i; /* wk1 = w1 * wd1, i.e. one angle step (2 * ew) past wd1 */
+    wk1i = w1r * wd1i + w1i * wd1r;
+    wk3i = 4 * wk1i * wk1r;
+    wk3r = wk1r - wk3i * wk1i;
+    wk3i = wk1i - wk3i * wk1r;
+    i = i0;
+  }
+  wd1r -= ss1 * wk1i; /* one more recurrence step; only the real part is needed for the midpoint offset mh below */
+  j0 = mh;
+  j1 = j0 + m;
+  j2 = j1 + m;
+  j3 = j2 + m;
+  x0r = a[j0 - 2] + a[j2 - 2]; /* offset mh - 2, twiddles wk1 / wk3 */
+  x0i = -a[j0 - 1] - a[j2 - 1];
+  x1r = a[j0 - 2] - a[j2 - 2];
+  x1i = -a[j0 - 1] + a[j2 - 1];
+  x2r = a[j1 - 2] + a[j3 - 2];
+  x2i = a[j1 - 1] + a[j3 - 1];
+  x3r = a[j1 - 2] - a[j3 - 2];
+  x3i = a[j1 - 1] - a[j3 - 1];
+  a[j0 - 2] = x0r + x2r;
+  a[j0 - 1] = x0i - x2i;
+  a[j1 - 2] = x0r - x2r;
+  a[j1 - 1] = x0i + x2i;
+  x0r = x1r + x3i;
+  x0i = x1i + x3r;
+  a[j2 - 2] = wk1r * x0r - wk1i * x0i;
+  a[j2 - 1] = wk1r * x0i + wk1i * x0r;
+  x0r = x1r - x3i;
+  x0i = x1i - x3r;
+  a[j3 - 2] = wk3r * x0r + wk3i * x0i;
+  a[j3 - 1] = wk3r * x0i - wk3i * x0r;
+  x0r = a[j0] + a[j2]; /* midpoint offset mh: ew * mh = pi / 4, so one value wd1r serves as both twiddle parts */
+  x0i = -a[j0 + 1] - a[j2 + 1];
+  x1r = a[j0] - a[j2];
+  x1i = -a[j0 + 1] + a[j2 + 1];
+  x2r = a[j1] + a[j3];
+  x2i = a[j1 + 1] + a[j3 + 1];
+  x3r = a[j1] - a[j3];
+  x3i = a[j1 + 1] - a[j3 + 1];
+  a[j0] = x0r + x2r;
+  a[j0 + 1] = x0i - x2i;
+  a[j1] = x0r - x2r;
+  a[j1 + 1] = x0i + x2i;
+  x0r = x1r + x3i;
+  x0i = x1i + x3r;
+  a[j2] = wd1r * (x0r - x0i);
+  a[j2 + 1] = wd1r * (x0i + x0r);
+  x0r = x1r - x3i;
+  x0i = x1i - x3r;
+  a[j3] = -wd1r * (x0r + x0i);
+  a[j3 + 1] = -wd1r * (x0i - x0r);
+  x0r = a[j0 + 2] + a[j2 + 2]; /* offset mh + 2: mirror of mh - 2, wk1 / wk3 with parts exchanged */
+  x0i = -a[j0 + 3] - a[j2 + 3];
+  x1r = a[j0 + 2] - a[j2 + 2];
+  x1i = -a[j0 + 3] + a[j2 + 3];
+  x2r = a[j1 + 2] + a[j3 + 2];
+  x2i = a[j1 + 3] + a[j3 + 3];
+  x3r = a[j1 + 2] - a[j3 + 2];
+  x3i = a[j1 + 3] - a[j3 + 3];
+  a[j0 + 2] = x0r + x2r;
+  a[j0 + 3] = x0i - x2i;
+  a[j1 + 2] = x0r - x2r;
+  a[j1 + 3] = x0i + x2i;
+  x0r = x1r + x3i;
+  x0i = x1i + x3r;
+  a[j2 + 2] = wk1i * x0r - wk1r * x0i;
+  a[j2 + 3] = wk1i * x0i + wk1r * x0r;
+  x0r = x1r - x3i;
+  x0i = x1i - x3r;
+  a[j3 + 2] = wk3i * x0r + wk3r * x0i;
+  a[j3 + 3] = wk3i * x0i - wk3r * x0r;
+}
+
+/*
+ * Recursive step: run one cftmdl1 pass over n doubles, then either recurse
+ * into the four quarters (cftrec1, cftrec2, cftrec1, cftrec1) while
+ * n > CDFT_RECURSIVE_N, or hand the block to cftexp1.
+ */
+void cftrec1(int n, double *a) {
+  int sub = n >> 2;
+
+  cftmdl1(n, a);
+  if (n <= CDFT_RECURSIVE_N) {
+    cftexp1(n, a);
+    return;
+  }
+  cftrec1(sub, a);
+  cftrec2(sub, a + sub);
+  cftrec1(sub, a + 2 * sub);
+  cftrec1(sub, a + 3 * sub);
+}
+
+/*
+ * Recursive step: run one cftmdl2 pass over n doubles, then either recurse
+ * into the four quarters (cftrec1, cftrec2, cftrec1, cftrec2) while
+ * n > CDFT_RECURSIVE_N, or hand the block to cftexp2.
+ */
+void cftrec2(int n, double *a) {
+  int sub = n >> 2;
+
+  cftmdl2(n, a);
+  if (n <= CDFT_RECURSIVE_N) {
+    cftexp2(n, a);
+    return;
+  }
+  cftrec1(sub, a);
+  cftrec2(sub, a + sub);
+  cftrec1(sub, a + 2 * sub);
+  cftrec2(sub, a + 3 * sub);
+}
+
+/*
+ * Iterative (non-recursive) continuation of the transform over n doubles.
+ * Sub-block length starts at n / 4 and shrinks by a factor of four per
+ * round.  Rounds with length > 128 only apply cftmdl1 / cftmdl2; the final
+ * round additionally finishes each sub-block with cftfx41 / cftfx42.
+ */
+void cftexp1(int n, double *a) {
+  int len, stride, base;
+  double *blk;
+
+  for (len = n >> 2; len > 128; len >>= 2) {
+    for (stride = len; stride < n; stride <<= 2) {
+      for (base = stride - len; base < n; base += 4 * stride) {
+        cftmdl1(len, a + base);
+        cftmdl2(len, a + stride + base);
+        cftmdl1(len, a + 2 * stride + base);
+      }
+    }
+    cftmdl1(len, a + n - len);
+  }
+
+  /* last round: every sub-block is also completed by a fixed-size kernel */
+  for (stride = len; stride < n; stride <<= 2) {
+    for (base = stride - len; base < n; base += 4 * stride) {
+      blk = a + base;
+      cftmdl1(len, blk);
+      cftfx41(len, blk);
+      blk = a + stride + base;
+      cftmdl2(len, blk);
+      cftfx42(len, blk);
+      blk = a + 2 * stride + base;
+      cftmdl1(len, blk);
+      cftfx41(len, blk);
+    }
+  }
+  blk = a + n - len;
+  cftmdl1(len, blk);
+  cftfx41(len, blk);
+}
+
+/*
+ * Iterative (non-recursive) continuation of the transform over n doubles,
+ * working on the two halves a[0 .. n/2) and a[n/2 .. n) in lock-step.
+ * Sub-block length starts at n / 4 and shrinks by a factor of four per
+ * round.  Rounds with length > 128 only apply cftmdl1 / cftmdl2; the final
+ * round additionally finishes each sub-block with cftfx41 / cftfx42.
+ */
+void cftexp2(int n, double *a) {
+  int len, stride, base;
+  int half = n >> 1;
+  double *lower, *upper;
+
+  for (len = n >> 2; len > 128; len >>= 2) {
+    for (stride = len; stride < half; stride <<= 2) {
+      for (base = stride - len; base < half; base += 2 * stride) {
+        cftmdl1(len, a + base);
+        cftmdl1(len, a + half + base);
+      }
+      for (base = 2 * stride - len; base < half; base += 4 * stride) {
+        cftmdl2(len, a + base);
+        cftmdl2(len, a + half + base);
+      }
+    }
+  }
+
+  /* last round: every sub-block is also completed by a fixed-size kernel */
+  for (stride = len; stride < half; stride <<= 2) {
+    for (base = stride - len; base < half; base += 2 * stride) {
+      lower = a + base;
+      upper = a + half + base;
+      cftmdl1(len, lower);
+      cftfx41(len, lower);
+      cftmdl1(len, upper);
+      cftfx41(len, upper);
+    }
+    for (base = 2 * stride - len; base < half; base += 4 * stride) {
+      lower = a + base;
+      upper = a + half + base;
+      cftmdl2(len, lower);
+      cftfx42(len, lower);
+      cftmdl2(len, upper);
+      cftfx42(len, upper);
+    }
+  }
+}
+
+void cftmdl1(int n, double *a) { /*
+   * In-place four-way butterfly pass over a[].  With m = 2 * (n >> 3), the
+   * entries at offsets j, j + m, j + 2m and j + 3m are combined; the third
+   * and fourth outputs are multiplied by twiddle factors whose angle grows
+   * as ew * j.  Offsets j and m - j are processed together so one set of
+   * twiddles serves both.
+   *
+   * n: size parameter; indices up to 7 * (n >> 3) + 3 are accessed.
+   *    NOTE(review): presumably a power of two >= 32 -- confirm.
+   * a: interleaved (real, imaginary) data, updated in place.
+   */
+  int i, i0, j, j0, j1, j2, j3, m, mh;
+  double ew, w1r, w1i, wk1r, wk1i, wk3r, wk3i, wd1r, wd1i, wd3r, wd3i, ss1, ss3;
+  double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+
+  mh = n >> 3;
+  m = 2 * mh;
+  j1 = m;
+  j2 = j1 + m;
+  j3 = j2 + m;
+  x0r = a[0] + a[j2]; /* j = 0 butterfly: no twiddle multiplication is applied */
+  x0i = a[1] + a[j2 + 1];
+  x1r = a[0] - a[j2];
+  x1i = a[1] - a[j2 + 1];
+  x2r = a[j1] + a[j3];
+  x2i = a[j1 + 1] + a[j3 + 1];
+  x3r = a[j1] - a[j3];
+  x3i = a[j1 + 1] - a[j3 + 1];
+  a[0] = x0r + x2r;
+  a[1] = x0i + x2i;
+  a[j1] = x0r - x2r;
+  a[j1 + 1] = x0i - x2i;
+  a[j2] = x1r - x3i;
+  a[j2 + 1] = x1i + x3r;
+  a[j3] = x1r + x3i;
+  a[j3 + 1] = x1i - x3r;
+  wd1r = 1;
+  wd1i = 0;
+  wd3r = 1;
+  wd3i = 0;
+  ew = M_PI_2 / m; /* angle per double index: the twiddle used at offset j has angle ew * j */
+  w1r = cos(2 * ew);
+  w1i = sin(2 * ew);
+  wk1r = w1r;
+  wk1i = w1i;
+  ss1 = 2 * w1i;
+  wk3i = 2 * ss1 * wk1r; /* triple-angle identities: with t = 2 * ew the next two lines give wk3r = cos(3t), wk3i = -sin(3t) */
+  wk3r = wk1r - wk3i * wk1i;
+  wk3i = wk1i - wk3i * wk1r;
+  ss3 = 2 * wk3i;
+  i = 0;
+  for (;;) {
+    i0 = i + 4 * CDFT_LOOP_DIV; /* work in chunks; the twiddles are re-seeded from cos/sin after each chunk */
+    if (i0 > mh - 4) {
+      i0 = mh - 4;
+    }
+    for (j = i + 2; j < i0; j += 4) {
+      wd1r -= ss1 * wk1i; /* leapfrog recurrence: wd* jumps two angle steps, landing one step past wk* */
+      wd1i += ss1 * wk1r;
+      wd3r -= ss3 * wk3i;
+      wd3i += ss3 * wk3r;
+      j1 = j + m;
+      j2 = j1 + m;
+      j3 = j2 + m;
+      x0r = a[j] + a[j2]; /* butterfly at offset j, twiddles wk1 / wk3 */
+      x0i = a[j + 1] + a[j2 + 1];
+      x1r = a[j] - a[j2];
+      x1i = a[j + 1] - a[j2 + 1];
+      x2r = a[j1] + a[j3];
+      x2i = a[j1 + 1] + a[j3 + 1];
+      x3r = a[j1] - a[j3];
+      x3i = a[j1 + 1] - a[j3 + 1];
+      a[j] = x0r + x2r;
+      a[j + 1] = x0i + x2i;
+      a[j1] = x0r - x2r;
+      a[j1 + 1] = x0i - x2i;
+      x0r = x1r - x3i;
+      x0i = x1i + x3r;
+      a[j2] = wk1r * x0r - wk1i * x0i;
+      a[j2 + 1] = wk1r * x0i + wk1i * x0r;
+      x0r = x1r + x3i;
+      x0i = x1i - x3r;
+      a[j3] = wk3r * x0r + wk3i * x0i;
+      a[j3 + 1] = wk3r * x0i - wk3i * x0r;
+      x0r = a[j + 2] + a[j2 + 2]; /* butterfly at offset j + 2, twiddles wd1 / wd3 */
+      x0i = a[j + 3] + a[j2 + 3];
+      x1r = a[j + 2] - a[j2 + 2];
+      x1i = a[j + 3] - a[j2 + 3];
+      x2r = a[j1 + 2] + a[j3 + 2];
+      x2i = a[j1 + 3] + a[j3 + 3];
+      x3r = a[j1 + 2] - a[j3 + 2];
+      x3i = a[j1 + 3] - a[j3 + 3];
+      a[j + 2] = x0r + x2r;
+      a[j + 3] = x0i + x2i;
+      a[j1 + 2] = x0r - x2r;
+      a[j1 + 3] = x0i - x2i;
+      x0r = x1r - x3i;
+      x0i = x1i + x3r;
+      a[j2 + 2] = wd1r * x0r - wd1i * x0i;
+      a[j2 + 3] = wd1r * x0i + wd1i * x0r;
+      x0r = x1r + x3i;
+      x0i = x1i - x3r;
+      a[j3 + 2] = wd3r * x0r + wd3i * x0i;
+      a[j3 + 3] = wd3r * x0i - wd3i * x0r;
+      j0 = m - j; /* mirrored offset: the same twiddles are reused with real and imaginary parts exchanged */
+      j1 = j0 + m;
+      j2 = j1 + m;
+      j3 = j2 + m;
+      x0r = a[j0] + a[j2];
+      x0i = a[j0 + 1] + a[j2 + 1];
+      x1r = a[j0] - a[j2];
+      x1i = a[j0 + 1] - a[j2 + 1];
+      x2r = a[j1] + a[j3];
+      x2i = a[j1 + 1] + a[j3 + 1];
+      x3r = a[j1] - a[j3];
+      x3i = a[j1 + 1] - a[j3 + 1];
+      a[j0] = x0r + x2r;
+      a[j0 + 1] = x0i + x2i;
+      a[j1] = x0r - x2r;
+      a[j1 + 1] = x0i - x2i;
+      x0r = x1r - x3i;
+      x0i = x1i + x3r;
+      a[j2] = wk1i * x0r - wk1r * x0i;
+      a[j2 + 1] = wk1i * x0i + wk1r * x0r;
+      x0r = x1r + x3i;
+      x0i = x1i - x3r;
+      a[j3] = wk3i * x0r + wk3r * x0i;
+      a[j3 + 1] = wk3i * x0i - wk3r * x0r;
+      x0r = a[j0 - 2] + a[j2 - 2]; /* mirrored offset j0 - 2, twiddles wd1 / wd3 with parts exchanged */
+      x0i = a[j0 - 1] + a[j2 - 1];
+      x1r = a[j0 - 2] - a[j2 - 2];
+      x1i = a[j0 - 1] - a[j2 - 1];
+      x2r = a[j1 - 2] + a[j3 - 2];
+      x2i = a[j1 - 1] + a[j3 - 1];
+      x3r = a[j1 - 2] - a[j3 - 2];
+      x3i = a[j1 - 1] - a[j3 - 1];
+      a[j0 - 2] = x0r + x2r;
+      a[j0 - 1] = x0i + x2i;
+      a[j1 - 2] = x0r - x2r;
+      a[j1 - 1] = x0i - x2i;
+      x0r = x1r - x3i;
+      x0i = x1i + x3r;
+      a[j2 - 2] = wd1i * x0r - wd1r * x0i;
+      a[j2 - 1] = wd1i * x0i + wd1r * x0r;
+      x0r = x1r + x3i;
+      x0i = x1i - x3r;
+      a[j3 - 2] = wd3i * x0r + wd3r * x0i;
+      a[j3 - 1] = wd3i * x0i - wd3r * x0r;
+      wk1r -= ss1 * wd1i; /* advance wk* two angle steps for the next iteration */
+      wk1i += ss1 * wd1r;
+      wk3r -= ss3 * wd3i;
+      wk3i += ss3 * wd3r;
+    }
+    if (i0 == mh - 4) {
+      break;
+    }
+    wd1r = cos(ew * i0); /* re-seed the recurrence directly from cos/sin (presumably to bound accumulated rounding error) */
+    wd1i = sin(ew * i0);
+    wd3i = 4 * wd1i * wd1r;
+    wd3r = wd1r - wd3i * wd1i;
+    wd3i = wd1i - wd3i * wd1r;
+    wk1r = w1r * wd1r - w1i * wd1i; /* wk1 = w1 * wd1, i.e. one angle step (2 * ew) past wd1 */
+    wk1i = w1r * wd1i + w1i * wd1r;
+    wk3i = 4 * wk1i * wk1r;
+    wk3r = wk1r - wk3i * wk1i;
+    wk3i = wk1i - wk3i * wk1r;
+    i = i0;
+  }
+  wd1r -= ss1 * wk1i; /* one more recurrence step; only the real part is needed for the midpoint offset mh below */
+  j0 = mh;
+  j1 = j0 + m;
+  j2 = j1 + m;
+  j3 = j2 + m;
+  x0r = a[j0 - 2] + a[j2 - 2]; /* offset mh - 2, twiddles wk1 / wk3 */
+  x0i = a[j0 - 1] + a[j2 - 1];
+  x1r = a[j0 - 2] - a[j2 - 2];
+  x1i = a[j0 - 1] - a[j2 - 1];
+  x2r = a[j1 - 2] + a[j3 - 2];
+  x2i = a[j1 - 1] + a[j3 - 1];
+  x3r = a[j1 - 2] - a[j3 - 2];
+  x3i = a[j1 - 1] - a[j3 - 1];
+  a[j0 - 2] = x0r + x2r;
+  a[j0 - 1] = x0i + x2i;
+  a[j1 - 2] = x0r - x2r;
+  a[j1 - 1] = x0i - x2i;
+  x0r = x1r - x3i;
+  x0i = x1i + x3r;
+  a[j2 - 2] = wk1r * x0r - wk1i * x0i;
+  a[j2 - 1] = wk1r * x0i + wk1i * x0r;
+  x0r = x1r + x3i;
+  x0i = x1i - x3r;
+  a[j3 - 2] = wk3r * x0r + wk3i * x0i;
+  a[j3 - 1] = wk3r * x0i - wk3i * x0r;
+  x0r = a[j0] + a[j2]; /* midpoint offset mh: ew * mh = pi / 4, so one value wd1r serves as both twiddle parts */
+  x0i = a[j0 + 1] + a[j2 + 1];
+  x1r = a[j0] - a[j2];
+  x1i = a[j0 + 1] - a[j2 + 1];
+  x2r = a[j1] + a[j3];
+  x2i = a[j1 + 1] + a[j3 + 1];
+  x3r = a[j1] - a[j3];
+  x3i = a[j1 + 1] - a[j3 + 1];
+  a[j0] = x0r + x2r;
+  a[j0 + 1] = x0i + x2i;
+  a[j1] = x0r - x2r;
+  a[j1 + 1] = x0i - x2i;
+  x0r = x1r - x3i;
+  x0i = x1i + x3r;
+  a[j2] = wd1r * (x0r - x0i);
+  a[j2 + 1] = wd1r * (x0i + x0r);
+  x0r = x1r + x3i;
+  x0i = x1i - x3r;
+  a[j3] = -wd1r * (x0r + x0i);
+  a[j3 + 1] = -wd1r * (x0i - x0r);
+  x0r = a[j0 + 2] + a[j2 + 2]; /* offset mh + 2: mirror of mh - 2, wk1 / wk3 with parts exchanged */
+  x0i = a[j0 + 3] + a[j2 + 3];
+  x1r = a[j0 + 2] - a[j2 + 2];
+  x1i = a[j0 + 3] - a[j2 + 3];
+  x2r = a[j1 + 2] + a[j3 + 2];
+  x2i = a[j1 + 3] + a[j3 + 3];
+  x3r = a[j1 + 2] - a[j3 + 2];
+  x3i = a[j1 + 3] - a[j3 + 3];
+  a[j0 + 2] = x0r + x2r;
+  a[j0 + 3] = x0i + x2i;
+  a[j1 + 2] = x0r - x2r;
+  a[j1 + 3] = x0i - x2i;
+  x0r = x1r - x3i;
+  x0i = x1i + x3r;
+  a[j2 + 2] = wk1i * x0r - wk1r * x0i;
+  a[j2 + 3] = wk1i * x0i + wk1r * x0r;
+  x0r = x1r + x3i;
+  x0i = x1i - x3r;
+  a[j3 + 2] = wk3i * x0r + wk3r * x0i;
+  a[j3 + 3] = wk3i * x0i - wk3r * x0r;
+}
+
+void cftmdl2(int n, double *a) { /*
+   * In-place four-way butterfly pass over a[], companion of cftmdl1.  With
+   * m = 2 * (n >> 3), the entries at offsets j, j + m, j + 2m and j + 3m
+   * are combined.  Unlike cftmdl1, the inputs are first paired as
+   * a[j] +/- i * a[j + 2m] and a[j + m] +/- i * a[j + 3m], and all four
+   * outputs are twiddled.  The angle step ew here is M_PI_2 / (2 * m).
+   *
+   * Two twiddle families are tracked: wk* / wl* and wd* / we*.  The wd/we
+   * values are derived from wk/wl by multiplying with wn4r * (1 + i).
+   *
+   * n: size parameter; indices up to 7 * (n >> 3) + 3 are accessed.
+   *    NOTE(review): presumably a power of two >= 32 -- confirm.
+   * a: interleaved (real, imaginary) data, updated in place.
+   */
+  int i, i0, j, j0, j1, j2, j3, m, mh;
+  double ew, w1r, w1i, wn4r, wk1r, wk1i, wk3r, wk3i, wl1r, wl1i, wl3r, wl3i, wd1r, wd1i, wd3r, wd3i, we1r, we1i, we3r, we3i, ss1, ss3;
+  double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, y0r, y0i, y2r, y2i;
+
+  mh = n >> 3;
+  m = 2 * mh;
+  wn4r = WR5000; /* constant defined elsewhere in the file; presumably cos(pi / 4) -- TODO confirm */
+  j1 = m;
+  j2 = j1 + m;
+  j3 = j2 + m;
+  x0r = a[0] - a[j2 + 1]; /* x0 = a[0] + i * a[j2], x1 = a[0] - i * a[j2] (multiplying by i swaps the parts with a sign change) */
+  x0i = a[1] + a[j2];
+  x1r = a[0] + a[j2 + 1];
+  x1i = a[1] - a[j2];
+  x2r = a[j1] - a[j3 + 1];
+  x2i = a[j1 + 1] + a[j3];
+  x3r = a[j1] + a[j3 + 1];
+  x3i = a[j1 + 1] - a[j3];
+  y0r = wn4r * (x2r - x2i); /* y0 = wn4r * (1 + i) * x2 */
+  y0i = wn4r * (x2i + x2r);
+  a[0] = x0r + y0r;
+  a[1] = x0i + y0i;
+  a[j1] = x0r - y0r;
+  a[j1 + 1] = x0i - y0i;
+  y0r = wn4r * (x3r - x3i);
+  y0i = wn4r * (x3i + x3r);
+  a[j2] = x1r - y0i;
+  a[j2 + 1] = x1i + y0r;
+  a[j3] = x1r + y0i;
+  a[j3 + 1] = x1i - y0r;
+  wl1r = 1;
+  wl1i = 0;
+  wl3r = 1;
+  wl3i = 0;
+  we1r = wn4r;
+  we1i = wn4r;
+  we3r = -wn4r;
+  we3i = -wn4r;
+  ew = M_PI_2 / (2 * m); /* half the angle step that cftmdl1 uses for the same m */
+  w1r = cos(2 * ew);
+  w1i = sin(2 * ew);
+  wk1r = w1r;
+  wk1i = w1i;
+  wd1r = wn4r * (w1r - w1i); /* wd1 = wn4r * (1 + i) * wk1 */
+  wd1i = wn4r * (w1i + w1r);
+  ss1 = 2 * w1i;
+  wk3i = 2 * ss1 * wk1r; /* triple-angle identities: with t = 2 * ew the next two lines give wk3r = cos(3t), wk3i = -sin(3t) */
+  wk3r = wk1r - wk3i * wk1i;
+  wk3i = wk1i - wk3i * wk1r;
+  ss3 = 2 * wk3i;
+  wd3r = -wn4r * (wk3r - wk3i);
+  wd3i = -wn4r * (wk3i + wk3r);
+  i = 0;
+  for (;;) {
+    i0 = i + 4 * CDFT_LOOP_DIV; /* work in chunks; the twiddles are re-seeded from cos/sin after each chunk */
+    if (i0 > mh - 4) {
+      i0 = mh - 4;
+    }
+    for (j = i + 2; j < i0; j += 4) {
+      wl1r -= ss1 * wk1i; /* leapfrog recurrence: wl* / we* jump two angle steps, landing one step past wk* / wd* */
+      wl1i += ss1 * wk1r;
+      wl3r -= ss3 * wk3i;
+      wl3i += ss3 * wk3r;
+      we1r -= ss1 * wd1i;
+      we1i += ss1 * wd1r;
+      we3r -= ss3 * wd3i;
+      we3i += ss3 * wd3r;
+      j1 = j + m;
+      j2 = j1 + m;
+      j3 = j2 + m;
+      x0r = a[j] - a[j2 + 1]; /* butterfly at offset j, twiddles wk* / wd* */
+      x0i = a[j + 1] + a[j2];
+      x1r = a[j] + a[j2 + 1];
+      x1i = a[j + 1] - a[j2];
+      x2r = a[j1] - a[j3 + 1];
+      x2i = a[j1 + 1] + a[j3];
+      x3r = a[j1] + a[j3 + 1];
+      x3i = a[j1 + 1] - a[j3];
+      y0r = wk1r * x0r - wk1i * x0i;
+      y0i = wk1r * x0i + wk1i * x0r;
+      y2r = wd1r * x2r - wd1i * x2i;
+      y2i = wd1r * x2i + wd1i * x2r;
+      a[j] = y0r + y2r;
+      a[j + 1] = y0i + y2i;
+      a[j1] = y0r - y2r;
+      a[j1 + 1] = y0i - y2i;
+      y0r = wk3r * x1r + wk3i * x1i;
+      y0i = wk3r * x1i - wk3i * x1r;
+      y2r = wd3r * x3r + wd3i * x3i;
+      y2i = wd3r * x3i - wd3i * x3r;
+      a[j2] = y0r + y2r;
+      a[j2 + 1] = y0i + y2i;
+      a[j3] = y0r - y2r;
+      a[j3 + 1] = y0i - y2i;
+      x0r = a[j + 2] - a[j2 + 3]; /* butterfly at offset j + 2, twiddles wl* / we* */
+      x0i = a[j + 3] + a[j2 + 2];
+      x1r = a[j + 2] + a[j2 + 3];
+      x1i = a[j + 3] - a[j2 + 2];
+      x2r = a[j1 + 2] - a[j3 + 3];
+      x2i = a[j1 + 3] + a[j3 + 2];
+      x3r = a[j1 + 2] + a[j3 + 3];
+      x3i = a[j1 + 3] - a[j3 + 2];
+      y0r = wl1r * x0r - wl1i * x0i;
+      y0i = wl1r * x0i + wl1i * x0r;
+      y2r = we1r * x2r - we1i * x2i;
+      y2i = we1r * x2i + we1i * x2r;
+      a[j + 2] = y0r + y2r;
+      a[j + 3] = y0i + y2i;
+      a[j1 + 2] = y0r - y2r;
+      a[j1 + 3] = y0i - y2i;
+      y0r = wl3r * x1r + wl3i * x1i;
+      y0i = wl3r * x1i - wl3i * x1r;
+      y2r = we3r * x3r + we3i * x3i;
+      y2i = we3r * x3i - we3i * x3r;
+      a[j2 + 2] = y0r + y2r;
+      a[j2 + 3] = y0i + y2i;
+      a[j3 + 2] = y0r - y2r;
+      a[j3 + 3] = y0i - y2i;
+      j0 = m - j; /* mirrored offset: wd* and wk* trade roles and are used with real and imaginary parts exchanged */
+      j1 = j0 + m;
+      j2 = j1 + m;
+      j3 = j2 + m;
+      x0r = a[j0] - a[j2 + 1];
+      x0i = a[j0 + 1] + a[j2];
+      x1r = a[j0] + a[j2 + 1];
+      x1i = a[j0 + 1] - a[j2];
+      x2r = a[j1] - a[j3 + 1];
+      x2i = a[j1 + 1] + a[j3];
+      x3r = a[j1] + a[j3 + 1];
+      x3i = a[j1 + 1] - a[j3];
+      y0r = wd1i * x0r - wd1r * x0i;
+      y0i = wd1i * x0i + wd1r * x0r;
+      y2r = wk1i * x2r - wk1r * x2i;
+      y2i = wk1i * x2i + wk1r * x2r;
+      a[j0] = y0r + y2r;
+      a[j0 + 1] = y0i + y2i;
+      a[j1] = y0r - y2r;
+      a[j1 + 1] = y0i - y2i;
+      y0r = wd3i * x1r + wd3r * x1i;
+      y0i = wd3i * x1i - wd3r * x1r;
+      y2r = wk3i * x3r + wk3r * x3i;
+      y2i = wk3i * x3i - wk3r * x3r;
+      a[j2] = y0r + y2r;
+      a[j2 + 1] = y0i + y2i;
+      a[j3] = y0r - y2r;
+      a[j3 + 1] = y0i - y2i;
+      x0r = a[j0 - 2] - a[j2 - 1]; /* mirrored offset j0 - 2, twiddles we* / wl* with parts exchanged */
+      x0i = a[j0 - 1] + a[j2 - 2];
+      x1r = a[j0 - 2] + a[j2 - 1];
+      x1i = a[j0 - 1] - a[j2 - 2];
+      x2r = a[j1 - 2] - a[j3 - 1];
+      x2i = a[j1 - 1] + a[j3 - 2];
+      x3r = a[j1 - 2] + a[j3 - 1];
+      x3i = a[j1 - 1] - a[j3 - 2];
+      y0r = we1i * x0r - we1r * x0i;
+      y0i = we1i * x0i + we1r * x0r;
+      y2r = wl1i * x2r - wl1r * x2i;
+      y2i = wl1i * x2i + wl1r * x2r;
+      a[j0 - 2] = y0r + y2r;
+      a[j0 - 1] = y0i + y2i;
+      a[j1 - 2] = y0r - y2r;
+      a[j1 - 1] = y0i - y2i;
+      y0r = we3i * x1r + we3r * x1i;
+      y0i = we3i * x1i - we3r * x1r;
+      y2r = wl3i * x3r + wl3r * x3i;
+      y2i = wl3i * x3i - wl3r * x3r;
+      a[j2 - 2] = y0r + y2r;
+      a[j2 - 1] = y0i + y2i;
+      a[j3 - 2] = y0r - y2r;
+      a[j3 - 1] = y0i - y2i;
+      wk1r -= ss1 * wl1i; /* advance wk* / wd* two angle steps for the next iteration */
+      wk1i += ss1 * wl1r;
+      wk3r -= ss3 * wl3i;
+      wk3i += ss3 * wl3r;
+      wd1r -= ss1 * we1i;
+      wd1i += ss1 * we1r;
+      wd3r -= ss3 * we3i;
+      wd3i += ss3 * we3r;
+    }
+    if (i0 == mh - 4) {
+      break;
+    }
+    wl1r = cos(ew * i0); /* re-seed the recurrence directly from cos/sin (presumably to bound accumulated rounding error) */
+    wl1i = sin(ew * i0);
+    wl3i = 4 * wl1i * wl1r;
+    wl3r = wl1r - wl3i * wl1i;
+    wl3i = wl1i - wl3i * wl1r;
+    we1r = wn4r * (wl1r - wl1i);
+    we1i = wn4r * (wl1i + wl1r);
+    we3r = -wn4r * (wl3r - wl3i);
+    we3i = -wn4r * (wl3i + wl3r);
+    wk1r = w1r * wl1r - w1i * wl1i; /* wk1 = w1 * wl1, i.e. one angle step (2 * ew) past wl1 */
+    wk1i = w1r * wl1i + w1i * wl1r;
+    wk3i = 4 * wk1i * wk1r;
+    wk3r = wk1r - wk3i * wk1i;
+    wk3i = wk1i - wk3i * wk1r;
+    wd1r = wn4r * (wk1r - wk1i);
+    wd1i = wn4r * (wk1i + wk1r);
+    wd3r = -wn4r * (wk3r - wk3i);
+    wd3i = -wn4r * (wk3i + wk3r);
+    i = i0;
+  }
+  wl1r -= ss1 * wk1i; /* one more recurrence step: wl1 is the twiddle for the midpoint offset mh below */
+  wl1i += ss1 * wk1r;
+  j0 = mh;
+  j1 = j0 + m;
+  j2 = j1 + m;
+  j3 = j2 + m;
+  x0r = a[j0 - 2] - a[j2 - 1]; /* offset mh - 2, twiddles wk* / wd* */
+  x0i = a[j0 - 1] + a[j2 - 2];
+  x1r = a[j0 - 2] + a[j2 - 1];
+  x1i = a[j0 - 1] - a[j2 - 2];
+  x2r = a[j1 - 2] - a[j3 - 1];
+  x2i = a[j1 - 1] + a[j3 - 2];
+  x3r = a[j1 - 2] + a[j3 - 1];
+  x3i = a[j1 - 1] - a[j3 - 2];
+  y0r = wk1r * x0r - wk1i * x0i;
+  y0i = wk1r * x0i + wk1i * x0r;
+  y2r = wd1r * x2r - wd1i * x2i;
+  y2i = wd1r * x2i + wd1i * x2r;
+  a[j0 - 2] = y0r + y2r;
+  a[j0 - 1] = y0i + y2i;
+  a[j1 - 2] = y0r - y2r;
+  a[j1 - 1] = y0i - y2i;
+  y0r = wk3r * x1r + wk3i * x1i;
+  y0i = wk3r * x1i - wk3i * x1r;
+  y2r = wd3r * x3r + wd3i * x3i;
+  y2i = wd3r * x3i - wd3i * x3r;
+  a[j2 - 2] = y0r + y2r;
+  a[j2 - 1] = y0i + y2i;
+  a[j3 - 2] = y0r - y2r;
+  a[j3 - 1] = y0i - y2i;
+  x0r = a[j0] - a[j2 + 1]; /* midpoint offset mh: only wl1 is used, in direct and part-exchanged form */
+  x0i = a[j0 + 1] + a[j2];
+  x1r = a[j0] + a[j2 + 1];
+  x1i = a[j0 + 1] - a[j2];
+  x2r = a[j1] - a[j3 + 1];
+  x2i = a[j1 + 1] + a[j3];
+  x3r = a[j1] + a[j3 + 1];
+  x3i = a[j1 + 1] - a[j3];
+  y0r = wl1r * x0r - wl1i * x0i;
+  y0i = wl1r * x0i + wl1i * x0r;
+  y2r = wl1i * x2r - wl1r * x2i;
+  y2i = wl1i * x2i + wl1r * x2r;
+  a[j0] = y0r + y2r;
+  a[j0 + 1] = y0i + y2i;
+  a[j1] = y0r - y2r;
+  a[j1 + 1] = y0i - y2i;
+  y0r = wl1i * x1r - wl1r * x1i;
+  y0i = wl1i * x1i + wl1r * x1r;
+  y2r = wl1r * x3r - wl1i * x3i;
+  y2i = wl1r * x3i + wl1i * x3r;
+  a[j2] = y0r - y2r;
+  a[j2 + 1] = y0i - y2i;
+  a[j3] = y0r + y2r;
+  a[j3 + 1] = y0i + y2i;
+  x0r = a[j0 + 2] - a[j2 + 3]; /* offset mh + 2: mirror of mh - 2, wd* / wk* with parts exchanged */
+  x0i = a[j0 + 3] + a[j2 + 2];
+  x1r = a[j0 + 2] + a[j2 + 3];
+  x1i = a[j0 + 3] - a[j2 + 2];
+  x2r = a[j1 + 2] - a[j3 + 3];
+  x2i = a[j1 + 3] + a[j3 + 2];
+  x3r = a[j1 + 2] + a[j3 + 3];
+  x3i = a[j1 + 3] - a[j3 + 2];
+  y0r = wd1i * x0r - wd1r * x0i;
+  y0i = wd1i * x0i + wd1r * x0r;
+  y2r = wk1i * x2r - wk1r * x2i;
+  y2i = wk1i * x2i + wk1r * x2r;
+  a[j0 + 2] = y0r + y2r;
+  a[j0 + 3] = y0i + y2i;
+  a[j1 + 2] = y0r - y2r;
+  a[j1 + 3] = y0i - y2i;
+  y0r = wd3i * x1r + wd3r * x1i;
+  y0i = wd3i * x1i - wd3r * x1r;
+  y2r = wk3i * x3r + wk3r * x3i;
+  y2i = wk3i * x3i - wk3r * x3r;
+  a[j2 + 2] = y0r + y2r;
+  a[j2 + 3] = y0i + y2i;
+  a[j3 + 2] = y0r - y2r;
+  a[j3 + 3] = y0i - y2i;
+}
+
+/*
+ * Finish a block with four fixed-size kernels in the order 1, 2, 1, 1.
+ * n == 128 uses the 32-double kernels cftf161 / cftf162 on consecutive
+ * 32-double quarters; any other n uses cftf081 / cftf082 on consecutive
+ * 16-double quarters.
+ */
+void cftfx41(int n, double *a) {
+  if (n != 128) {
+    cftf081(a);
+    cftf082(a + 16);
+    cftf081(a + 32);
+    cftf081(a + 48);
+    return;
+  }
+  cftf161(a);
+  cftf162(a + 32);
+  cftf161(a + 64);
+  cftf161(a + 96);
+}
+
+/*
+ * Finish a block with four fixed-size kernels in the order 1, 2, 1, 2.
+ * n == 128 uses the 32-double kernels cftf161 / cftf162 on consecutive
+ * 32-double quarters; any other n uses cftf081 / cftf082 on consecutive
+ * 16-double quarters.
+ */
+void cftfx42(int n, double *a) {
+  if (n != 128) {
+    cftf081(a);
+    cftf082(a + 16);
+    cftf081(a + 32);
+    cftf082(a + 48);
+    return;
+  }
+  cftf161(a);
+  cftf162(a + 32);
+  cftf161(a + 64);
+  cftf162(a + 96);
+}
+
+void cftf161(double *a) {
+  double wn4r, wk1r, wk1i, x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i, y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i, y8r, y8i,
+    y9r, y9i, y10r, y10i, y11r, y11i, y12r, y12i, y13r, y13i, y14r, y14i, y15r, y15i;
+  /* In-place kernel over a[0..31], read as interleaved (re, im) pairs; every input is loaded into y0..y15 before any store. */
+  wn4r = WR5000;
+  wk1r = WR2500;
+  wk1i = WI2500;
+  x0r = a[0] + a[16]; /* first pass: sums/differences of a[k], a[k + 8], a[k + 16], a[k + 24] for k = 0, 2, 4, 6 */
+  x0i = a[1] + a[17];
+  x1r = a[0] - a[16];
+  x1i = a[1] - a[17];
+  x2r = a[8] + a[24];
+  x2i = a[9] + a[25];
+  x3r = a[8] - a[24];
+  x3i = a[9] - a[25];
+  y0r = x0r + x2r;
+  y0i = x0i + x2i;
+  y4r = x0r - x2r;
+  y4i = x0i - x2i;
+  y8r = x1r - x3i;
+  y8i = x1i + x3r;
+  y12r = x1r + x3i;
+  y12i = x1i - x3r;
+  x0r = a[2] + a[18];
+  x0i = a[3] + a[19];
+  x1r = a[2] - a[18];
+  x1i = a[3] - a[19];
+  x2r = a[10] + a[26];
+  x2i = a[11] + a[27];
+  x3r = a[10] - a[26];
+  x3i = a[11] - a[27];
+  y1r = x0r + x2r;
+  y1i = x0i + x2i;
+  y5r = x0r - x2r;
+  y5i = x0i - x2i;
+  x0r = x1r - x3i;
+  x0i = x1i + x3r;
+  y9r = wk1r * x0r - wk1i * x0i;
+  y9i = wk1r * x0i + wk1i * x0r;
+  x0r = x1r + x3i;
+  x0i = x1i - x3r;
+  y13r = wk1i * x0r - wk1r * x0i;
+  y13i = wk1i * x0i + wk1r * x0r;
+  x0r = a[4] + a[20];
+  x0i = a[5] + a[21];
+  x1r = a[4] - a[20];
+  x1i = a[5] - a[21];
+  x2r = a[12] + a[28];
+  x2i = a[13] + a[29];
+  x3r = a[12] - a[28];
+  x3i = a[13] - a[29];
+  y2r = x0r + x2r;
+  y2i = x0i + x2i;
+  y6r = x0r - x2r;
+  y6i = x0i - x2i;
+  x0r = x1r - x3i;
+  x0i = x1i + x3r;
+  y10r = wn4r * (x0r - x0i);
+  y10i = wn4r * (x0i + x0r);
+  x0r = x1r + x3i;
+  x0i = x1i - x3r;
+  y14r = wn4r * (x0r + x0i);
+  y14i = wn4r * (x0i - x0r);
+  x0r = a[6] + a[22];
+  x0i = a[7] + a[23];
+  x1r = a[6] - a[22];
+  x1i = a[7] - a[23];
+  x2r = a[14] + a[30];
+  x2i = a[15] + a[31];
+  x3r = a[14] - a[30];
+  x3i = a[15] - a[31];
+  y3r = x0r + x2r;
+  y3i = x0i + x2i;
+  y7r = x0r - x2r;
+  y7i = x0i - x2i;
+  x0r = x1r - x3i;
+  x0i = x1i + x3r;
+  y11r = wk1i * x0r - wk1r * x0i;
+  y11i = wk1i * x0i + wk1r * x0r;
+  x0r = x1r + x3i;
+  x0i = x1i - x3r;
+  y15r = wk1r * x0r - wk1i * x0i;
+  y15i = wk1r * x0i + wk1i * x0r;
+  x0r = y12r - y14r; /* second pass: combine the y values and store, from a[24..31] down to a[0..7] */
+  x0i = y12i - y14i;
+  x1r = y12r + y14r;
+  x1i = y12i + y14i;
+  x2r = y13r - y15r;
+  x2i = y13i - y15i;
+  x3r = y13r + y15r;
+  x3i = y13i + y15i;
+  a[24] = x0r + x2r;
+  a[25] = x0i + x2i;
+  a[26] = x0r - x2r;
+  a[27] = x0i - x2i;
+  a[28] = x1r - x3i;
+  a[29] = x1i + x3r;
+  a[30] = x1r + x3i;
+  a[31] = x1i - x3r;
+  x0r = y8r + y10r;
+  x0i = y8i + y10i;
+  x1r = y8r - y10r;
+  x1i = y8i - y10i;
+  x2r = y9r + y11r;
+  x2i = y9i + y11i;
+  x3r = y9r - y11r;
+  x3i = y9i - y11i;
+  a[16] = x0r + x2r;
+  a[17] = x0i + x2i;
+  a[18] = x0r - x2r;
+  a[19] = x0i - x2i;
+  a[20] = x1r - x3i;
+  a[21] = x1i + x3r;
+  a[22] = x1r + x3i;
+  a[23] = x1i - x3r;
+  x0r = y5r - y7i;
+  x0i = y5i + y7r;
+  x2r = wn4r * (x0r - x0i);
+  x2i = wn4r * (x0i + x0r);
+  x0r = y5r + y7i;
+  x0i = y5i - y7r;
+  x3r = wn4r * (x0r - x0i);
+  x3i = wn4r * (x0i + x0r);
+  x0r = y4r - y6i;
+  x0i = y4i + y6r;
+  x1r = y4r + y6i;
+  x1i = y4i - y6r;
+  a[8] = x0r + x2r;
+  a[9] = x0i + x2i;
+  a[10] = x0r - x2r;
+  a[11] = x0i - x2i;
+  a[12] = x1r - x3i;
+  a[13] = x1i + x3r;
+  a[14] = x1r + x3i;
+  a[15] = x1i - x3r;
+  x0r = y0r + y2r;
+  x0i = y0i + y2i;
+  x1r = y0r - y2r;
+  x1i = y0i - y2i;
+  x2r = y1r + y3r;
+  x2i = y1i + y3i;
+  x3r = y1r - y3r;
+  x3i = y1i - y3i;
+  a[0] = x0r + x2r;
+  a[1] = x0i + x2i;
+  a[2] = x0r - x2r;
+  a[3] = x0i - x2i;
+  a[4] = x1r - x3i;
+  a[5] = x1i + x3r;
+  a[6] = x1r + x3i;
+  a[7] = x1i - x3r;
+}
+
+void cftf162(double *a) {
+  double wn4r, wk1r, wk1i, wk2r, wk2i, wk3r, wk3i, x0r, x0i, x1r, x1i, x2r, x2i, y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i, y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i,
+    y8r, y8i, y9r, y9i, y10r, y10i, y11r, y11i, y12r, y12i, y13r, y13i, y14r, y14i, y15r, y15i;
+  /* In-place kernel over a[0..31], read as interleaved (re, im) pairs; every input is loaded into y0..y15 before any store. */
+  wn4r = WR5000;
+  wk1r = WR1250;
+  wk1i = WI1250;
+  wk2r = WR2500;
+  wk2i = WI2500;
+  wk3r = WR3750;
+  wk3i = WI3750;
+  x1r = a[0] - a[17]; /* first pass: unlike cftf161, real parts are paired with imaginary parts 16 doubles away */
+  x1i = a[1] + a[16];
+  x0r = a[8] - a[25];
+  x0i = a[9] + a[24];
+  x2r = wn4r * (x0r - x0i);
+  x2i = wn4r * (x0i + x0r);
+  y0r = x1r + x2r;
+  y0i = x1i + x2i;
+  y4r = x1r - x2r;
+  y4i = x1i - x2i;
+  x1r = a[0] + a[17];
+  x1i = a[1] - a[16];
+  x0r = a[8] + a[25];
+  x0i = a[9] - a[24];
+  x2r = wn4r * (x0r - x0i);
+  x2i = wn4r * (x0i + x0r);
+  y8r = x1r - x2i;
+  y8i = x1i + x2r;
+  y12r = x1r + x2i;
+  y12i = x1i - x2r;
+  x0r = a[2] - a[19];
+  x0i = a[3] + a[18];
+  x1r = wk1r * x0r - wk1i * x0i;
+  x1i = wk1r * x0i + wk1i * x0r;
+  x0r = a[10] - a[27];
+  x0i = a[11] + a[26];
+  x2r = wk3i * x0r - wk3r * x0i;
+  x2i = wk3i * x0i + wk3r * x0r;
+  y1r = x1r + x2r;
+  y1i = x1i + x2i;
+  y5r = x1r - x2r;
+  y5i = x1i - x2i;
+  x0r = a[2] + a[19];
+  x0i = a[3] - a[18];
+  x1r = wk3r * x0r - wk3i * x0i;
+  x1i = wk3r * x0i + wk3i * x0r;
+  x0r = a[10] + a[27];
+  x0i = a[11] - a[26];
+  x2r = wk1r * x0r + wk1i * x0i;
+  x2i = wk1r * x0i - wk1i * x0r;
+  y9r = x1r - x2r;
+  y9i = x1i - x2i;
+  y13r = x1r + x2r;
+  y13i = x1i + x2i;
+  x0r = a[4] - a[21];
+  x0i = a[5] + a[20];
+  x1r = wk2r * x0r - wk2i * x0i;
+  x1i = wk2r * x0i + wk2i * x0r;
+  x0r = a[12] - a[29];
+  x0i = a[13] + a[28];
+  x2r = wk2i * x0r - wk2r * x0i;
+  x2i = wk2i * x0i + wk2r * x0r;
+  y2r = x1r + x2r;
+  y2i = x1i + x2i;
+  y6r = x1r - x2r;
+  y6i = x1i - x2i;
+  x0r = a[4] + a[21];
+  x0i = a[5] - a[20];
+  x1r = wk2i * x0r - wk2r * x0i;
+  x1i = wk2i * x0i + wk2r * x0r;
+  x0r = a[12] + a[29];
+  x0i = a[13] - a[28];
+  x2r = wk2r * x0r - wk2i * x0i;
+  x2i = wk2r * x0i + wk2i * x0r;
+  y10r = x1r - x2r;
+  y10i = x1i - x2i;
+  y14r = x1r + x2r;
+  y14i = x1i + x2i;
+  x0r = a[6] - a[23];
+  x0i = a[7] + a[22];
+  x1r = wk3r * x0r - wk3i * x0i;
+  x1i = wk3r * x0i + wk3i * x0r;
+  x0r = a[14] - a[31];
+  x0i = a[15] + a[30];
+  x2r = wk1i * x0r - wk1r * x0i;
+  x2i = wk1i * x0i + wk1r * x0r;
+  y3r = x1r + x2r;
+  y3i = x1i + x2i;
+  y7r = x1r - x2r;
+  y7i = x1i - x2i;
+  x0r = a[6] + a[23];
+  x0i = a[7] - a[22];
+  x1r = wk1i * x0r + wk1r * x0i;
+  x1i = wk1i * x0i - wk1r * x0r;
+  x0r = a[14] + a[31];
+  x0i = a[15] - a[30];
+  x2r = wk3i * x0r - wk3r * x0i;
+  x2i = wk3i * x0i + wk3r * x0r;
+  y11r = x1r + x2r;
+  y11i = x1i + x2i;
+  y15r = x1r - x2r;
+  y15i = x1i - x2i;
+  x1r = y0r + y2r; /* second pass: combine the y values and store, from a[0..3] up to a[28..31] */
+  x1i = y0i + y2i;
+  x2r = y1r + y3r;
+  x2i = y1i + y3i;
+  a[0] = x1r + x2r;
+  a[1] = x1i + x2i;
+  a[2] = x1r - x2r;
+  a[3] = x1i - x2i;
+  x1r = y0r - y2r;
+  x1i = y0i - y2i;
+  x2r = y1r - y3r;
+  x2i = y1i - y3i;
+  a[4] = x1r - x2i;
+  a[5] = x1i + x2r;
+  a[6] = x1r + x2i;
+  a[7] = x1i - x2r;
+  x1r = y4r - y6i;
+  x1i = y4i + y6r;
+  x0r = y5r - y7i;
+  x0i = y5i + y7r;
+  x2r = wn4r * (x0r - x0i);
+  x2i = wn4r * (x0i + x0r);
+  a[8] = x1r + x2r;
+  a[9] = x1i + x2i;
+  a[10] = x1r - x2r;
+  a[11] = x1i - x2i;
+  x1r = y4r + y6i;
+  x1i = y4i - y6r;
+  x0r = y5r + y7i;
+  x0i = y5i - y7r;
+  x2r = wn4r * (x0r - x0i);
+  x2i = wn4r * (x0i + x0r);
+  a[12] = x1r - x2i;
+  a[13] = x1i + x2r;
+  a[14] = x1r + x2i;
+  a[15] = x1i - x2r;
+  x1r = y8r + y10r;
+  x1i = y8i + y10i;
+  x2r = y9r - y11r;
+  x2i = y9i - y11i;
+  a[16] = x1r + x2r;
+  a[17] = x1i + x2i;
+  a[18] = x1r - x2r;
+  a[19] = x1i - x2i;
+  x1r = y8r - y10r;
+  x1i = y8i - y10i;
+  x2r = y9r + y11r;
+  x2i = y9i + y11i;
+  a[20] = x1r - x2i;
+  a[21] = x1i + x2r;
+  a[22] = x1r + x2i;
+  a[23] = x1i - x2r;
+  x1r = y12r - y14i;
+  x1i = y12i + y14r;
+  x0r = y13r + y15i;
+  x0i = y13i - y15r;
+  x2r = wn4r * (x0r - x0i);
+  x2i = wn4r * (x0i + x0r);
+  a[24] = x1r + x2r;
+  a[25] = x1i + x2i;
+  a[26] = x1r - x2r;
+  a[27] = x1i - x2i;
+  x1r = y12r + y14i;
+  x1i = y12i - y14r;
+  x0r = y13r - y15i;
+  x0i = y13i + y15r;
+  x2r = wn4r * (x0r - x0i);
+  x2i = wn4r * (x0i + x0r);
+  a[28] = x1r - x2i;
+  a[29] = x1i + x2r;
+  a[30] = x1r + x2i;
+  a[31] = x1i - x2r;
+}
+
+void cftf081(double *a) {
+  double wn4r, x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i, y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i;
+  /* In-place kernel over a[0..15], read as interleaved (re, im) pairs; every input is loaded into y0..y7 before any store. */
+  wn4r = WR5000;
+  x0r = a[0] + a[8]; /* first pass: sums/differences of a[k], a[k + 4], a[k + 8], a[k + 12] for k = 0, 2 */
+  x0i = a[1] + a[9];
+  x1r = a[0] - a[8];
+  x1i = a[1] - a[9];
+  x2r = a[4] + a[12];
+  x2i = a[5] + a[13];
+  x3r = a[4] - a[12];
+  x3i = a[5] - a[13];
+  y0r = x0r + x2r;
+  y0i = x0i + x2i;
+  y2r = x0r - x2r;
+  y2i = x0i - x2i;
+  y1r = x1r - x3i;
+  y1i = x1i + x3r;
+  y3r = x1r + x3i;
+  y3i = x1i - x3r;
+  x0r = a[2] + a[10];
+  x0i = a[3] + a[11];
+  x1r = a[2] - a[10];
+  x1i = a[3] - a[11];
+  x2r = a[6] + a[14];
+  x2i = a[7] + a[15];
+  x3r = a[6] - a[14];
+  x3i = a[7] - a[15];
+  y4r = x0r + x2r;
+  y4i = x0i + x2i;
+  y6r = x0r - x2r;
+  y6i = x0i - x2i;
+  x0r = x1r - x3i;
+  x0i = x1i + x3r;
+  x2r = x1r + x3i;
+  x2i = x1i - x3r;
+  y5r = wn4r * (x0r - x0i);
+  y5i = wn4r * (x0r + x0i);
+  y7r = wn4r * (x2r - x2i);
+  y7i = wn4r * (x2r + x2i);
+  a[8] = y1r + y5r; /* second pass: combine the y values and store a[8..15], then a[0..7] */
+  a[9] = y1i + y5i;
+  a[10] = y1r - y5r;
+  a[11] = y1i - y5i;
+  a[12] = y3r - y7i;
+  a[13] = y3i + y7r;
+  a[14] = y3r + y7i;
+  a[15] = y3i - y7r;
+  a[0] = y0r + y4r;
+  a[1] = y0i + y4i;
+  a[2] = y0r - y4r;
+  a[3] = y0i - y4i;
+  a[4] = y2r - y6i;
+  a[5] = y2i + y6r;
+  a[6] = y2r + y6i;
+  a[7] = y2i - y6r;
+}
+
+void cftf082(double *a) {
+  double wn4r, wk1r, wk1i, x0r, x0i, x1r, x1i, y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i, y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i;
+  /* In-place kernel over a[0..15], read as interleaved (re, im) pairs; every input is loaded into y0..y7 before any store. */
+  wn4r = WR5000;
+  wk1r = WR2500;
+  wk1i = WI2500;
+  y0r = a[0] - a[9]; /* first pass: unlike cftf081, real parts are paired with imaginary parts 8 doubles away */
+  y0i = a[1] + a[8];
+  y1r = a[0] + a[9];
+  y1i = a[1] - a[8];
+  x0r = a[4] - a[13];
+  x0i = a[5] + a[12];
+  y2r = wn4r * (x0r - x0i);
+  y2i = wn4r * (x0i + x0r);
+  x0r = a[4] + a[13];
+  x0i = a[5] - a[12];
+  y3r = wn4r * (x0r - x0i);
+  y3i = wn4r * (x0i + x0r);
+  x0r = a[2] - a[11];
+  x0i = a[3] + a[10];
+  y4r = wk1r * x0r - wk1i * x0i;
+  y4i = wk1r * x0i + wk1i * x0r;
+  x0r = a[2] + a[11];
+  x0i = a[3] - a[10];
+  y5r = wk1i * x0r - wk1r * x0i;
+  y5i = wk1i * x0i + wk1r * x0r;
+  x0r = a[6] - a[15];
+  x0i = a[7] + a[14];
+  y6r = wk1i * x0r - wk1r * x0i;
+  y6i = wk1i * x0i + wk1r * x0r;
+  x0r = a[6] + a[15];
+  x0i = a[7] - a[14];
+  y7r = wk1r * x0r - wk1i * x0i;
+  y7i = wk1r * x0i + wk1i * x0r;
+  x0r = y0r + y2r; /* second pass: combine the y values and store, from a[0..3] up to a[12..15] */
+  x0i = y0i + y2i;
+  x1r = y4r + y6r;
+  x1i = y4i + y6i;
+  a[0] = x0r + x1r;
+  a[1] = x0i + x1i;
+  a[2] = x0r - x1r;
+  a[3] = x0i - x1i;
+  x0r = y0r - y2r;
+  x0i = y0i - y2i;
+  x1r = y4r - y6r;
+  x1i = y4i - y6i;
+  a[4] = x0r - x1i;
+  a[5] = x0i + x1r;
+  a[6] = x0r + x1i;
+  a[7] = x0i - x1r;
+  x0r = y1r - y3i;
+  x0i = y1i + y3r;
+  x1r = y5r - y7r;
+  x1i = y5i - y7i;
+  a[8] = x0r + x1r;
+  a[9] = x0i + x1i;
+  a[10] = x0r - x1r;
+  a[11] = x0i - x1i;
+  x0r = y1r + y3i;
+  x0i = y1i - y3r;
+  x1r = y5r + y7r;
+  x1i = y5i + y7i;
+  a[12] = x0r - x1i;
+  a[13] = x0i + x1r;
+  a[14] = x0r + x1i;
+  a[15] = x0i - x1r;
+}
+
+void cftf040(double *a) {
+  /* In-place 4-point sum/difference step on a[0..7], read as four interleaved (re, im) pairs. */
+  /* Differs from cftb040 only in the sign of the cross terms stored to a[2], a[3], a[6], a[7]. */
+  const double sum04r = a[0] + a[4];
+  const double sum04i = a[1] + a[5];
+  const double dif04r = a[0] - a[4];
+  const double dif04i = a[1] - a[5];
+  const double sum26r = a[2] + a[6];
+  const double sum26i = a[3] + a[7];
+  const double dif26r = a[2] - a[6];
+  const double dif26i = a[3] - a[7];
+  a[0] = sum04r + sum26r;
+  a[1] = sum04i + sum26i;
+  a[4] = sum04r - sum26r;
+  a[5] = sum04i - sum26i;
+  a[2] = dif04r - dif26i;
+  a[3] = dif04i + dif26r;
+  a[6] = dif04r + dif26i;
+  a[7] = dif04i - dif26r;
+}
+
+void cftb040(double *a) {
+  /* In-place 4-point sum/difference step on a[0..7], read as four interleaved (re, im) pairs. */
+  /* Differs from cftf040 only in the sign of the cross terms stored to a[2], a[3], a[6], a[7]. */
+  const double sum04r = a[0] + a[4];
+  const double sum04i = a[1] + a[5];
+  const double dif04r = a[0] - a[4];
+  const double dif04i = a[1] - a[5];
+  const double sum26r = a[2] + a[6];
+  const double sum26i = a[3] + a[7];
+  const double dif26r = a[2] - a[6];
+  const double dif26i = a[3] - a[7];
+  a[0] = sum04r + sum26r;
+  a[1] = sum04i + sum26i;
+  a[4] = sum04r - sum26r;
+  a[5] = sum04i - sum26i;
+  a[2] = dif04r + dif26i;
+  a[3] = dif04i - dif26r;
+  a[6] = dif04r - dif26i;
+  a[7] = dif04i + dif26r;
+}
+
+void cftx020(double *a) {
+  /* In-place 2-point step on a[0..3], read as two interleaved (re, im) pairs: */
+  /* the first pair becomes the sum of both pairs, the second pair their difference. */
+  const double dif_r = a[0] - a[2];
+  const double dif_i = a[1] - a[3];
+  a[0] = a[0] + a[2];
+  a[1] = a[1] + a[3];
+  a[2] = dif_r;
+  a[3] = dif_i;
+}
+
+void rftfsub(int n, double *a) {
+  int i, i0, j, k;
+  double ec, w1r, w1i, wkr, wki, wdr, wdi, ss, xr, xi, yr, yi;
+  /* In-place pass over a[2..n-1] that combines each group at index j with its mirror at k = n - j (rftbsub flips the wdi/wki term signs). */
+  ec = 2 * M_PI_2 / n; /* angle step: pi / n */
+  wkr = 0;
+  wki = 0;
+  wdi = cos(ec);
+  wdr = sin(ec);
+  wdi *= wdr;
+  wdr *= wdr;
+  w1r = 1 - 2 * wdr;
+  w1i = 2 * wdi;
+  ss = 2 * w1i;
+  i = n >> 1;
+  for (;;) { /* outer loop: one block of at most RDFT_LOOP_DIV inner iterations per pass */
+    i0 = i - 4 * RDFT_LOOP_DIV;
+    if (i0 < 4) {
+      i0 = 4;
+    }
+    for (j = i - 4; j >= i0; j -= 4) {
+      k = n - j;
+      xr = a[j + 2] - a[k - 2];
+      xi = a[j + 3] + a[k - 1];
+      yr = wdr * xr - wdi * xi;
+      yi = wdr * xi + wdi * xr;
+      a[j + 2] -= yr;
+      a[j + 3] -= yi;
+      a[k - 2] += yr;
+      a[k - 1] -= yi;
+      wkr += ss * wdi; /* advance (wkr, wki) by recurrence instead of calling sin()/cos() */
+      wki += ss * (0.5 - wdr);
+      xr = a[j] - a[k];
+      xi = a[j + 1] + a[k + 1];
+      yr = wkr * xr - wki * xi;
+      yi = wkr * xi + wki * xr;
+      a[j] -= yr;
+      a[j + 1] -= yi;
+      a[k] += yr;
+      a[k + 1] -= yi;
+      wdr += ss * wki; /* advance (wdr, wdi) by recurrence */
+      wdi += ss * (0.5 - wkr);
+    }
+    if (i0 == 4) {
+      break;
+    }
+    wkr = 0.5 * sin(ec * i0); /* block boundary: re-seed the factors from sin()/cos() - presumably to bound recurrence drift */
+    wki = 0.5 * cos(ec * i0);
+    wdr = 0.5 - (wkr * w1r - wki * w1i);
+    wdi = wkr * w1i + wki * w1r;
+    wkr = 0.5 - wkr;
+    i = i0;
+  }
+  xr = a[2] - a[n - 2]; /* last pair a[2..3] / a[n-2..n-1], which the loop above (j >= 4) does not reach */
+  xi = a[3] + a[n - 1];
+  yr = wdr * xr - wdi * xi;
+  yi = wdr * xi + wdi * xr;
+  a[2] -= yr;
+  a[3] -= yi;
+  a[n - 2] += yr;
+  a[n - 1] -= yi;
+}
+
+void rftbsub(int n, double *a) {
+  int i, i0, j, k;
+  double ec, w1r, w1i, wkr, wki, wdr, wdi, ss, xr, xi, yr, yi;
+  /* In-place pass over a[2..n-1] that combines each group at index j with its mirror at k = n - j (rftfsub with the wdi/wki term signs flipped). */
+  ec = 2 * M_PI_2 / n; /* angle step: pi / n */
+  wkr = 0;
+  wki = 0;
+  wdi = cos(ec);
+  wdr = sin(ec);
+  wdi *= wdr;
+  wdr *= wdr;
+  w1r = 1 - 2 * wdr;
+  w1i = 2 * wdi;
+  ss = 2 * w1i;
+  i = n >> 1;
+  for (;;) { /* outer loop: one block of at most RDFT_LOOP_DIV inner iterations per pass */
+    i0 = i - 4 * RDFT_LOOP_DIV;
+    if (i0 < 4) {
+      i0 = 4;
+    }
+    for (j = i - 4; j >= i0; j -= 4) {
+      k = n - j;
+      xr = a[j + 2] - a[k - 2];
+      xi = a[j + 3] + a[k - 1];
+      yr = wdr * xr + wdi * xi;
+      yi = wdr * xi - wdi * xr;
+      a[j + 2] -= yr;
+      a[j + 3] -= yi;
+      a[k - 2] += yr;
+      a[k - 1] -= yi;
+      wkr += ss * wdi; /* advance (wkr, wki) by recurrence instead of calling sin()/cos() */
+      wki += ss * (0.5 - wdr);
+      xr = a[j] - a[k];
+      xi = a[j + 1] + a[k + 1];
+      yr = wkr * xr + wki * xi;
+      yi = wkr * xi - wki * xr;
+      a[j] -= yr;
+      a[j + 1] -= yi;
+      a[k] += yr;
+      a[k + 1] -= yi;
+      wdr += ss * wki; /* advance (wdr, wdi) by recurrence */
+      wdi += ss * (0.5 - wkr);
+    }
+    if (i0 == 4) {
+      break;
+    }
+    wkr = 0.5 * sin(ec * i0); /* block boundary: re-seed the factors from sin()/cos() - presumably to bound recurrence drift */
+    wki = 0.5 * cos(ec * i0);
+    wdr = 0.5 - (wkr * w1r - wki * w1i);
+    wdi = wkr * w1i + wki * w1r;
+    wkr = 0.5 - wkr;
+    i = i0;
+  }
+  xr = a[2] - a[n - 2]; /* last pair a[2..3] / a[n-2..n-1], which the loop above (j >= 4) does not reach */
+  xi = a[3] + a[n - 1];
+  yr = wdr * xr + wdi * xi;
+  yi = wdr * xi - wdi * xr;
+  a[2] -= yr;
+  a[3] -= yi;
+  a[n - 2] += yr;
+  a[n - 1] -= yi;
+}
+
+void dctsub(int n, double *a) {
+  int i, i0, j, k, m;
+  double ec, w1r, w1i, wkr, wki, wdr, wdi, ss, xr, xi, yr, yi;
+  /* In-place pass that rotates each pair (a[j], a[n - j]) and (a[j - 1], a[n - j + 1]) for even j below n / 2, then handles the middle. */
+  ec = M_PI_2 / n; /* angle step: (pi / 2) / n */
+  wkr = 0.5;
+  wki = 0.5;
+  w1r = cos(ec);
+  w1i = sin(ec);
+  wdr = 0.5 * (w1r - w1i);
+  wdi = 0.5 * (w1r + w1i);
+  ss = 2 * w1i;
+  m = n >> 1;
+  i = 0;
+  for (;;) { /* outer loop: one block of at most DCST_LOOP_DIV inner iterations per pass */
+    i0 = i + 2 * DCST_LOOP_DIV;
+    if (i0 > m - 2) {
+      i0 = m - 2;
+    }
+    for (j = i + 2; j <= i0; j += 2) {
+      k = n - j;
+      xr = wdi * a[j - 1] - wdr * a[k + 1];
+      xi = wdr * a[j - 1] + wdi * a[k + 1];
+      wkr -= ss * wdi; /* advance (wkr, wki) by recurrence instead of calling sin()/cos() */
+      wki += ss * wdr;
+      yr = wki * a[j] - wkr * a[k];
+      yi = wkr * a[j] + wki * a[k];
+      wdr -= ss * wki; /* advance (wdr, wdi) by recurrence */
+      wdi += ss * wkr;
+      a[k + 1] = xr; /* stores are deferred until both pairs have been read */
+      a[k] = yr;
+      a[j - 1] = xi;
+      a[j] = yi;
+    }
+    if (i0 == m - 2) {
+      break;
+    }
+    wdr = cos(ec * i0); /* block boundary: re-seed the factors from sin()/cos() - presumably to bound recurrence drift */
+    wdi = sin(ec * i0);
+    wkr = 0.5 * (wdr - wdi);
+    wki = 0.5 * (wdr + wdi);
+    wdr = wkr * w1r - wki * w1i;
+    wdi = wkr * w1i + wki * w1r;
+    i = i0;
+  }
+  xr = wdi * a[m - 1] - wdr * a[m + 1]; /* middle: rotate (a[m - 1], a[m + 1]) and scale a[m] */
+  a[m - 1] = wdr * a[m - 1] + wdi * a[m + 1];
+  a[m + 1] = xr;
+  a[m] *= wki + ss * wdr;
+}
+
+void dstsub(int n, double *a) {
+  int i, i0, j, k, m;
+  double ec, w1r, w1i, wkr, wki, wdr, wdi, ss, xr, xi, yr, yi;
+  /* Same pass as dctsub with the roles of the j-side and k-side (k = n - j) elements exchanged in every rotation. */
+  ec = M_PI_2 / n; /* angle step: (pi / 2) / n */
+  wkr = 0.5;
+  wki = 0.5;
+  w1r = cos(ec);
+  w1i = sin(ec);
+  wdr = 0.5 * (w1r - w1i);
+  wdi = 0.5 * (w1r + w1i);
+  ss = 2 * w1i;
+  m = n >> 1;
+  i = 0;
+  for (;;) { /* outer loop: one block of at most DCST_LOOP_DIV inner iterations per pass */
+    i0 = i + 2 * DCST_LOOP_DIV;
+    if (i0 > m - 2) {
+      i0 = m - 2;
+    }
+    for (j = i + 2; j <= i0; j += 2) {
+      k = n - j;
+      xr = wdi * a[k + 1] - wdr * a[j - 1];
+      xi = wdr * a[k + 1] + wdi * a[j - 1];
+      wkr -= ss * wdi; /* advance (wkr, wki) by recurrence instead of calling sin()/cos() */
+      wki += ss * wdr;
+      yr = wki * a[k] - wkr * a[j];
+      yi = wkr * a[k] + wki * a[j];
+      wdr -= ss * wki; /* advance (wdr, wdi) by recurrence */
+      wdi += ss * wkr;
+      a[j - 1] = xr; /* stores are deferred until both pairs have been read */
+      a[j] = yr;
+      a[k + 1] = xi;
+      a[k] = yi;
+    }
+    if (i0 == m - 2) {
+      break;
+    }
+    wdr = cos(ec * i0); /* block boundary: re-seed the factors from sin()/cos() - presumably to bound recurrence drift */
+    wdi = sin(ec * i0);
+    wkr = 0.5 * (wdr - wdi);
+    wki = 0.5 * (wdr + wdi);
+    wdr = wkr * w1r - wki * w1i;
+    wdi = wkr * w1i + wki * w1r;
+    i = i0;
+  }
+  xr = wdi * a[m + 1] - wdr * a[m - 1]; /* middle: rotate (a[m + 1], a[m - 1]) and scale a[m] */
+  a[m + 1] = wdr * a[m + 1] + wdi * a[m - 1];
+  a[m - 1] = xr;
+  a[m] *= wki + ss * wdr;
+}
+
+void dctsub4(int n, double *a) {
+  /* Scales the middle element a[n >> 1] by WR5000 and, when n >> 1 == 2, first rotates */
+  /* the pair (a[1], a[3]) by the WR2500 / WI2500 factors pre-multiplied by WR5000. */
+  const double scale = WR5000;
+  const int mid = n >> 1;
+
+  if (mid == 2) {
+    const double rot_s = scale * WI2500;
+    const double rot_c = scale * WR2500;
+    const double new_a3 = rot_c * a[1] - rot_s * a[3];
+    a[1] = rot_s * a[1] + rot_c * a[3];
+    a[3] = new_a3;
+  }
+  a[mid] *= scale;
+}
+
+void dstsub4(int n, double *a) {
+  /* Scales the middle element a[n >> 1] by WR5000 and, when n >> 1 == 2, first rotates */
+  /* the pair (a[3], a[1]) by the WR2500 / WI2500 factors pre-multiplied by WR5000. */
+  const double scale = WR5000;
+  const int mid = n >> 1;
+
+  if (mid == 2) {
+    const double rot_s = scale * WI2500;
+    const double rot_c = scale * WR2500;
+    const double new_a1 = rot_c * a[3] - rot_s * a[1];
+    a[3] = rot_s * a[3] + rot_c * a[1];
+    a[1] = new_a1;
+  }
+  a[mid] *= scale;
+}
diff --git a/tests/performance/superpi/fftsg_h.h b/tests/performance/superpi/fftsg_h.h
new file mode 100644
index 000000000..3158ce80a
--- /dev/null
+++ b/tests/performance/superpi/fftsg_h.h
@@ -0,0 +1,88 @@
+/*
+  Based on "Calculation of PI(= 3.14159...) using FFT and AGM" by T.Ooura, Nov. 1999.
+  https://github.com/Fibonacci43/SuperPI
+  Modified for Arduino by Lucas Saavedra Vaz, 2024.
+*/
+
+#pragma once
+
+#include <math.h>
+/* Constants: M_PI_2 is pi / 2; WRxxxx = cos(M_PI_2 * 0.xxxx), WIxxxx = sin(M_PI_2 * 0.xxxx). Each is only defined if not already set. */
+#ifndef M_PI_2
+#define M_PI_2 1.570796326794896619231321691639751442098584699687
+#endif
+#ifndef WR5000 /* cos(M_PI_2*0.5000) */
+#define WR5000 0.707106781186547524400844362104849039284835937688
+#endif
+#ifndef WR2500 /* cos(M_PI_2*0.2500) */
+#define WR2500 0.923879532511286756128183189396788286822416625863
+#endif
+#ifndef WI2500 /* sin(M_PI_2*0.2500) */
+#define WI2500 0.382683432365089771728459984030398866761344562485
+#endif
+#ifndef WR1250 /* cos(M_PI_2*0.1250) */
+#define WR1250 0.980785280403230449126182236134239036973933730893
+#endif
+#ifndef WI1250 /* sin(M_PI_2*0.1250) */
+#define WI1250 0.195090322016128267848284868477022240927691617751
+#endif
+#ifndef WR3750 /* cos(M_PI_2*0.3750) */
+#define WR3750 0.831469612302545237078788377617905756738560811987
+#endif
+#ifndef WI3750 /* sin(M_PI_2*0.3750) */
+#define WI3750 0.555570233019602224742830813948532874374937190754
+#endif
+/* Tuning knobs; each can be overridden by defining it before this header is included. */
+#ifndef CDFT_RECURSIVE_N     /* length of the recursive FFT mode */
+#define CDFT_RECURSIVE_N 512 /* <= (L1 cache size) / 16 */
+#endif
+
+#ifndef CDFT_LOOP_DIV /* control of the CDFT's speed & tolerance */
+#define CDFT_LOOP_DIV 32
+#endif
+
+#ifndef RDFT_LOOP_DIV /* control of the RDFT's speed & tolerance */
+#define RDFT_LOOP_DIV 64
+#endif
+
+#ifndef DCST_LOOP_DIV /* control of the DCT,DST's speed & tolerance */
+#define DCST_LOOP_DIV 64
+#endif
+/* Prototypes of the FFT routines, in alphabetical order; `a` is the double array each routine works on. */
+void bitrv1(int n, double *a);
+void bitrv2(int n, double *a);
+void bitrv208(double *a);
+void bitrv208neg(double *a);
+void bitrv216(double *a);
+void bitrv216neg(double *a);
+void bitrv2conj(int n, double *a);
+void cdft(int n, int isgn, double *a);
+void cftb040(double *a);
+void cftb1st(int n, double *a);
+void cftbsub(int n, double *a);
+void cftexp1(int n, double *a);
+void cftexp2(int n, double *a);
+void cftf040(double *a);
+void cftf081(double *a);
+void cftf082(double *a);
+void cftf161(double *a);
+void cftf162(double *a);
+void cftfsub(int n, double *a);
+void cftfx41(int n, double *a);
+void cftfx42(int n, double *a);
+void cftmdl1(int n, double *a);
+void cftmdl2(int n, double *a);
+void cftrec1(int n, double *a);
+void cftrec2(int n, double *a);
+void cftx020(double *a);
+void dctsub(int n, double *a);
+void dctsub4(int n, double *a);
+void ddct(int n, int isgn, double *a);
+void ddst(int n, int isgn, double *a);
+void dfct(int n, double *a);
+void dfst(int n, double *a);
+void dstsub(int n, double *a);
+void dstsub4(int n, double *a);
+void rdft(int n, int isgn, double *a);
+void rftbsub(int n, double *a);
+void rftfsub(int n, double *a);
diff --git a/tests/performance/superpi/pi_fftcs.cpp b/tests/performance/superpi/pi_fftcs.cpp
new file mode 100644
index 000000000..bf83dd291
--- /dev/null
+++ b/tests/performance/superpi/pi_fftcs.cpp
@@ -0,0 +1,2214 @@
+/*
+  Based on "Calculation of PI(= 3.14159...) using FFT and AGM" by T.Ooura, Nov. 1999.
+  https://github.com/Fibonacci43/SuperPI
+  Modified for Arduino by Lucas Saavedra Vaz, 2024.
+*/
+
+#include <Arduino.h>
+#include <assert.h>
+#include <math.h>
+#include <limits.h>
+#include <float.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "fftsg_h.h"
+#include "pi_fftcs.h"
+
+void pi_calc(int nfft) {
+  /* Computes PI with the AGM formula described below and logs timings; nfft / 4, rounded up to a power of two, is the FFT length. */
+  int log2_nfft, radix, log10_radix, n, npow, nprc;
+#if PRINT_DIGITS
+  int j = 0, k = 0, l = 0;
+#endif
+  int *a, *b, *c, *e, *i1, *i2;
+  double *d1, *d2, *d3;
+  char *dgt;
+  uint32_t start_time;
+  double err, elap_time, loop_time; /* elap_time accumulates init + AGM loop + final phase, in seconds */
+  log_d("Calculation of PI using FFT and AGM, %s", PI_FFTC_VER);
+
+  // DGTINT is defined as short int, so it should be 2 bytes
+  assert(sizeof(DGTINT) == 2);
+
+  log_d("initializing...");
+  nfft /= 4;
+  start_time = millis();
+  for (log2_nfft = 1; (1 << log2_nfft) < nfft; log2_nfft++);
+  nfft = 1 << log2_nfft;
+  n = nfft + 2;
+  a = (int *)malloc(2 * sizeof(int) + n * sizeof(DGTINT));
+  b = (int *)malloc(2 * sizeof(int) + n * sizeof(DGTINT));
+  c = (int *)malloc(2 * sizeof(int) + n * sizeof(DGTINT));
+  e = (int *)malloc(2 * sizeof(int) + n * sizeof(DGTINT));
+  i1 = (int *)malloc(2 * sizeof(int) + n * sizeof(DGTINT));
+  i2 = (int *)malloc(2 * sizeof(int) + n * sizeof(DGTINT));
+  d1 = (double *)malloc((nfft + 2) * sizeof(double));
+  d2 = (double *)malloc((nfft + 2) * sizeof(double));
+  d3 = (double *)malloc((nfft + 2) * sizeof(double));
+  if (!a || !b || !c || !e || !i1 || !i2 || !d1 || !d2 || !d3) { /* every buffer is dereferenced below, so check all nine */
+    printf("Allocation Failure!\n");
+    exit(1);
+  }
+  /* ---- radix test ---- */
+  log10_radix = 1;
+  radix = 10;
+  err = mp_mul_radix_test(n, radix, nfft, d1);
+  err += DBL_EPSILON * (n * radix * radix / 4);
+  while (100 * err < DBL_ERROR_MARGIN && radix <= DGTINT_MAX / 20) {
+    err *= 100;
+    log10_radix++;
+    radix *= 10;
+  }
+  log_d("nfft= %d, radix= %d, error_margin= %g", nfft, radix, err);
+  log_d("calculating %d digits of PI...", log10_radix * (n - 2));
+  /*
+   * ---- a formula based on the AGM (Arithmetic-Geometric Mean) ----
+   *   c = sqrt(0.125);
+   *   a = 1 + 3 * c;
+   *   b = sqrt(a);
+   *   e = b - 0.625;
+   *   b = 2 * b;
+   *   c = e - c;
+   *   a = a + e;
+   *   npow = 4;
+   *   do {
+   *       npow = 2 * npow;
+   *       e = (a + b) / 2;
+   *       b = sqrt(a * b);
+   *       e = e - b;
+   *       b = 2 * b;
+   *       c = c - e;
+   *       a = e + b;
+   *   } while (e > SQRT_SQRT_EPSILON);
+   *   e = e * e / 4;
+   *   a = a + b;
+   *   pi = (a * a - e - e / 2) / (a * c - e) / npow;
+   * ---- modification ----
+   *   This is a modified version of Gauss-Legendre formula
+   *   (by T.Ooura). It is faster than original version.
+   * ---- reference ----
+   *   1. E.Salamin,
+   *      Computation of PI Using Arithmetic-Geometric Mean,
+   *      Mathematics of Computation, Vol.30 1976.
+   *   2. R.P.Brent,
+   *      Fast Multiple-Precision Evaluation of Elementary Functions,
+   *      J. ACM 23 1976.
+   *   3. D.Takahasi, Y.Kanada,
+   *      Calculation of PI to 51.5 Billion Decimal Digits on
+   *      Distributed Memory Parallel Processors,
+   *      Transactions of Information Processing Society of Japan,
+   *      Vol.39 No.7 1998.
+   *   4. T.Ooura,
+   *      Improvement of the PI Calculation Algorithm and
+   *      Implementation of Fast Multiple-Precision Computation,
+   *      Information Processing Society of Japan SIG Notes,
+   *      98-HPC-74, 1998.
+   */
+  /* ---- c = 1 / sqrt(8) ---- */
+  mp_invisqrt(n, radix, 8, c, i1, i2, nfft, d1, d2);
+  /* ---- a = 1 + 3 * c ---- */
+  mp_imul(n, radix, c, 3, e);
+  mp_sscanf(n, log10_radix, (char *)"1", a);
+  mp_add(n, radix, a, e, a);
+  /* ---- b = sqrt(a) ---- */
+  mp_sqrt(n, radix, a, b, i1, i2, nfft, d1, d2);
+  /* ---- e = b - 0.625 ---- */
+  mp_sscanf(n, log10_radix, (char *)"0.625", e);
+  mp_sub(n, radix, b, e, e);
+  /* ---- b = 2 * b ---- */
+  mp_add(n, radix, b, b, b);
+  /* ---- c = e - c ---- */
+  mp_sub(n, radix, e, c, c);
+  /* ---- a = a + e ---- */
+  mp_add(n, radix, a, e, a);
+  log_d("AGM iteration");
+  npow = 4;
+  elap_time = ((double)(millis() - start_time)) / 1000;
+
+  do {
+    uint32_t start_loop_time = millis();
+    npow *= 2;
+    /* ---- e = (a + b) / 2 ---- */
+    mp_add(n, radix, a, b, e);
+    mp_idiv_2(n, radix, e, e);
+    /* ---- b = sqrt(a * b) ---- */
+    mp_mul(n, radix, a, b, a, i1, nfft, d1, d2, d3);
+    mp_sqrt(n, radix, a, b, i1, i2, nfft, d1, d2);
+    /* ---- e = e - b ---- */
+    mp_sub(n, radix, e, b, e);
+    /* ---- b = 2 * b ---- */
+    mp_add(n, radix, b, b, b);
+    /* ---- c = c - e ---- */
+    mp_sub(n, radix, c, e, c);
+    /* ---- a = e + b ---- */
+    mp_add(n, radix, e, b, a);
+    /* ---- convergence check ---- */
+    nprc = -e[1];
+    if (e[0] == 0) {
+      nprc = n;
+    }
+    loop_time = ((double)(millis() - start_loop_time)) / 1000;
+    elap_time += loop_time;
+    log_d("precision= %d: %0.2f sec", 4 * nprc * log10_radix, loop_time);
+  } while (4 * nprc <= n);
+  start_time = millis();
+  /* ---- e = e * e / 4 (half precision) ---- */
+  mp_idiv_2(n, radix, e, e);
+  mp_squh(n, radix, e, e, nfft, d1);
+  /* ---- a = a + b ---- */
+  mp_add(n, radix, a, b, a);
+  /* ---- a = (a * a - e - e / 2) / (a * c - e) / npow ---- */
+  mp_mulhf(n, radix, a, c, c, i1, nfft, d1, d2);
+  mp_sub(n, radix, c, e, c);
+  mp_inv(n, radix, c, b, i1, i2, nfft, d2, d3);
+  mp_squhf_use_infft(n, radix, d1, a, a, i1, nfft, d2);
+  mp_sub(n, radix, a, e, a);
+  mp_idiv_2(n, radix, e, e);
+  mp_sub(n, radix, a, e, a);
+  mp_mul(n, radix, a, b, a, i1, nfft, d1, d2, d3);
+  mp_idiv(n, radix, a, npow, a);
+  /* ---- output ---- */
+  dgt = (char *)d1;
+  mp_sprintf(n - 1, log10_radix, a, dgt);
+  elap_time += ((double)(millis() - start_time)) / 1000; /* add the final phase instead of discarding the time accumulated so far */
+
+#if PRINT_DIGITS
+  do {
+    if (!isdigit(*dgt)) {
+      if (isalpha(*dgt) != 0) {
+        fputc('\n', stdout);
+        fputc('\n', stdout);
+      }
+      fputc(*dgt, stdout);
+      fputc('\n', stdout);
+      fputc('\n', stdout);
+      j = 0;
+      k = 0;
+      l = 0;
+      continue;
+    }
+    fputc(*dgt, stdout);
+    if (++j >= DGT_PACK) {
+      j = 0;
+      if (++k >= DGT_PACK_LINE) {
+        k = 0;
+        fputc('\n', stdout);
+        if (++l >= DGT_LINE_BLOCK) {
+          l = 0;
+          fputc('\n', stdout);
+        }
+      } else {
+        fputc(' ', stdout);
+      }
+    }
+  } while (*dgt++ && *dgt != 'e');
+  fputc('\n', stdout);
+  fprintf(stdout, "%s\n", dgt);
+#endif
+
+  free(d3);
+  free(d2);
+  free(d1);
+  free(i2);
+  free(i1);
+  free(e);
+  free(c);
+  free(b);
+  free(a);
+  /* ---- difftime ---- */
+  log_d("%0.2f sec. (real time)", elap_time);
+}
+
+/* -------- multiple precision routines -------- */
+
+/* -------- mp_load routines -------- */
+
+void mp_load_0(int n, int radix, int out[]) {
+  /* Set `out` to zero: sign word 0, exponent word 0 and all n digits 0 (radix is unused). */
+  DGTINT *digits = (DGTINT *)&out[2]; /* zero-based view of the digits packed after the two int words */
+  int idx;
+
+  out[0] = 0;
+  out[1] = 0;
+  for (idx = 0; idx < n; idx++) {
+    digits[idx] = 0;
+  }
+}
+
+void mp_load_1(int n, int radix, int out[]) {
+  /* Set `out` to one: sign word 1, exponent word 0, leading digit 1, remaining digits 0 (radix is unused). */
+  DGTINT *digits = (DGTINT *)&out[2]; /* zero-based view of the digits packed after the two int words */
+  int idx;
+
+  out[0] = 1;
+  out[1] = 0;
+  digits[0] = 1;
+  for (idx = 1; idx < n; idx++) {
+    digits[idx] = 0;
+  }
+}
+
+void mp_round(int n, int radix, int m, int inout[]) {
+  /* Keep the first m of n digits of `inout`; round up when twice the first dropped digit reaches radix. No-op when m >= n. */
+  DGTINT *digits = (DGTINT *)&inout[2]; /* zero-based view of the digits packed after the two int words */
+  int pos, val;
+
+  if (m >= n) {
+    return;
+  }
+  for (pos = n - 1; pos > m; pos--) {
+    digits[pos] = 0;
+  }
+  val = 2 * digits[m];
+  digits[m] = 0;
+  if (val < radix) {
+    return; /* first dropped digit is below radix / 2: truncation is enough */
+  }
+  for (pos = m - 1; pos >= 0; pos--) {
+    val = digits[pos] + 1;
+    if (val < radix) {
+      digits[pos] = (DGTINT)val;
+      return; /* carry absorbed */
+    }
+    digits[pos] = 0;
+  }
+  digits[0] = 1; /* carry ran past the leading digit: value becomes 1 with the exponent word bumped */
+  inout[1]++;
+}
+
+/* -------- mp_add routines -------- */
+
+int mp_cmp(int n, int radix, int in1[], int in2[]) {
+  int mp_unsgn_cmp(int n, int in1[], int in2[]);
+  /* Signed comparison of in1 and in2: positive, zero or negative result; word [0] of each number is its sign. */
+  const int sign1 = in1[0], sign2 = in2[0];
+  if (sign1 != sign2) {
+    return sign1 > sign2 ? 1 : -1;
+  }
+  /* Equal signs: compare exponent + digits, then flip the result for negative numbers (0 when the sign is 0). */
+  return sign1 * mp_unsgn_cmp(n, &in1[1], &in2[1]);
+}
+
+void mp_add(int n, int radix, int in1[], int in2[], int out[]) {
+  int mp_unsgn_cmp(int n, int in1[], int in2[]);
+  int mp_unexp_add(int n, int radix, int expdif, DGTINT in1[], DGTINT in2[], DGTINT out[]);
+  int mp_unexp_sub(int n, int radix, int expdif, DGTINT in1[], DGTINT in2[], DGTINT out[]);
+  int outsgn, outexp, expdif;
+  /* out = in1 + in2. Each number is stored as [0] sign, [1] exponent, then n DGTINT digits packed from word [2]. */
+  expdif = in1[1] - in2[1];
+  outexp = in1[1];
+  if (expdif < 0) {
+    outexp = in2[1];
+  }
+  outsgn = in1[0] * in2[0];
+  if (outsgn >= 0) { /* same sign, or at least one operand has sign 0: add the digit arrays */
+    if (outsgn > 0) {
+      outsgn = in1[0];
+    } else {
+      outsgn = in1[0] + in2[0]; /* sign product is 0: take sign and exponent from the sums (assumes a zero operand stores 0 in both words) */
+      outexp = in1[1] + in2[1];
+      expdif = 0;
+    }
+    if (expdif >= 0) {
+      outexp += mp_unexp_add(n, radix, expdif, (DGTINT *)&in1[2], (DGTINT *)&in2[2], (DGTINT *)&out[2]);
+    } else {
+      outexp += mp_unexp_add(n, radix, -expdif, (DGTINT *)&in2[2], (DGTINT *)&in1[2], (DGTINT *)&out[2]);
+    }
+  } else { /* opposite signs: subtract the smaller magnitude from the larger one */
+    outsgn = mp_unsgn_cmp(n, &in1[1], &in2[1]);
+    if (outsgn >= 0) {
+      expdif = mp_unexp_sub(n, radix, expdif, (DGTINT *)&in1[2], (DGTINT *)&in2[2], (DGTINT *)&out[2]);
+    } else {
+      expdif = mp_unexp_sub(n, radix, -expdif, (DGTINT *)&in2[2], (DGTINT *)&in1[2], (DGTINT *)&out[2]);
+    }
+    outexp -= expdif; /* expdif now holds the number of cancelled leading digits */
+    outsgn *= in1[0];
+    if (expdif == n) { /* every digit cancelled: the result is zero */
+      outsgn = 0;
+    }
+  }
+  if (outsgn == 0) {
+    outexp = 0;
+  }
+  out[0] = outsgn;
+  out[1] = outexp;
+}
+
+void mp_sub(int n, int radix, int in1[], int in2[], int out[]) {
+  int mp_unsgn_cmp(int n, int in1[], int in2[]);
+  int mp_unexp_add(int n, int radix, int expdif, DGTINT in1[], DGTINT in2[], DGTINT out[]);
+  int mp_unexp_sub(int n, int radix, int expdif, DGTINT in1[], DGTINT in2[], DGTINT out[]);
+  int outsgn, outexp, expdif;
+  /* out = in1 - in2. Each number is stored as [0] sign, [1] exponent, then n DGTINT digits packed from word [2]. */
+  expdif = in1[1] - in2[1];
+  outexp = in1[1];
+  if (expdif < 0) {
+    outexp = in2[1];
+  }
+  outsgn = in1[0] * in2[0];
+  if (outsgn <= 0) { /* opposite signs, or at least one operand has sign 0: add the digit arrays */
+    if (outsgn < 0) {
+      outsgn = in1[0];
+    } else {
+      outsgn = in1[0] - in2[0]; /* sign product is 0: in2 contributes with its sign negated (assumes a zero operand stores 0 in both words) */
+      outexp = in1[1] + in2[1];
+      expdif = 0;
+    }
+    if (expdif >= 0) {
+      outexp += mp_unexp_add(n, radix, expdif, (DGTINT *)&in1[2], (DGTINT *)&in2[2], (DGTINT *)&out[2]);
+    } else {
+      outexp += mp_unexp_add(n, radix, -expdif, (DGTINT *)&in2[2], (DGTINT *)&in1[2], (DGTINT *)&out[2]);
+    }
+  } else { /* same sign: subtract the smaller magnitude from the larger one */
+    outsgn = mp_unsgn_cmp(n, &in1[1], &in2[1]);
+    if (outsgn >= 0) {
+      expdif = mp_unexp_sub(n, radix, expdif, (DGTINT *)&in1[2], (DGTINT *)&in2[2], (DGTINT *)&out[2]);
+    } else {
+      expdif = mp_unexp_sub(n, radix, -expdif, (DGTINT *)&in2[2], (DGTINT *)&in1[2], (DGTINT *)&out[2]);
+    }
+    outexp -= expdif; /* expdif now holds the number of cancelled leading digits */
+    outsgn *= in1[0];
+    if (expdif == n) { /* every digit cancelled: the result is zero */
+      outsgn = 0;
+    }
+  }
+  if (outsgn == 0) {
+    outexp = 0;
+  }
+  out[0] = outsgn;
+  out[1] = outexp;
+}
+
+/* -------- mp_add child routines -------- */
+
+int mp_unsgn_cmp(int n, int in1[], int in2[]) {
+  /* Sign-less comparison: word [0] of each argument is compared first, then the n DGTINT */
+  /* digits packed from word [1], most significant first. Returns -1, 0 or 1. */
+  const DGTINT *digits1 = (const DGTINT *)&in1[1];
+  const DGTINT *digits2 = (const DGTINT *)&in2[1];
+  int idx, diff;
+
+  diff = in1[0] - in2[0];
+  for (idx = 0; idx < n && diff == 0; idx++) {
+    diff = digits1[idx] - digits2[idx];
+  }
+  /* Collapse the first non-zero difference to its sign. */
+  if (diff == 0) {
+    return 0;
+  }
+  return diff > 0 ? 1 : -1;
+}
+
+int mp_unexp_add(int n, int radix, int expdif, DGTINT in1[], DGTINT in2[], DGTINT out[]) {
+  int j, x, carry;
+  /* out = in1 + (in2 shifted right by expdif digits), n digits each; returns 1 if the sum was shifted right one digit for a leading carry, else 0. */
+  carry = 0;
+  if (expdif == 0 && in1[0] + in2[0] >= radix) { /* leading digits already overflow: write the sum one position to the right */
+    x = in1[n - 1] + in2[n - 1]; /* lowest digit pair only produces a carry; its sum digit is dropped */
+    carry = x >= radix ? -1 : 0; /* carry is 0 or -1 (all bits set), so radix & carry is 0 or radix */
+    for (j = n - 1; j > 0; j--) {
+      x = in1[j - 1] + in2[j - 1] - carry;
+      carry = x >= radix ? -1 : 0;
+      out[j] = (DGTINT)(x - (radix & carry));
+    }
+    out[0] = (DGTINT)-carry;
+  } else {
+    if (expdif > n) {
+      expdif = n;
+    }
+    for (j = n - 1; j >= expdif; j--) { /* digits where both operands overlap */
+      x = in1[j] + in2[j - expdif] - carry;
+      carry = x >= radix ? -1 : 0;
+      out[j] = (DGTINT)(x - (radix & carry));
+    }
+    for (j = expdif - 1; j >= 0; j--) { /* leading digits of in1 only: propagate the carry */
+      x = in1[j] - carry;
+      carry = x >= radix ? -1 : 0;
+      out[j] = (DGTINT)(x - (radix & carry));
+    }
+    if (carry != 0) { /* carry out of the top digit: shift right by one (lowest digit is lost) and store it in front */
+      for (j = n - 1; j > 0; j--) {
+        out[j] = out[j - 1];
+      }
+      out[0] = (DGTINT)-carry;
+    }
+  }
+  return -carry;
+}
+
+int mp_unexp_sub(int n, int radix, int expdif, DGTINT in1[], DGTINT in2[], DGTINT out[]) {
+  /* out = in1 - in2 on n-digit magnitudes, in2 lying expdif digits lower; returns the number of leading zero digits found in the difference. */
+  int pos, diff, lead, bw = 0; /* bw: borrow from the next higher digit, 0 or 1 */
+
+  if (expdif > n) {
+    expdif = n;
+  }
+  for (pos = n - 1; pos >= expdif; pos--) {
+    diff = in1[pos] - in2[pos - expdif] - bw;
+    bw = diff < 0;
+    out[pos] = (DGTINT)(bw ? diff + radix : diff);
+  }
+  for (pos = expdif - 1; pos >= 0; pos--) { /* above in2 only the borrow ripples on */
+    diff = in1[pos] - bw;
+    bw = diff < 0;
+    out[pos] = (DGTINT)(bw ? diff + radix : diff);
+  }
+  lead = 0;
+  while (lead < n && out[lead] == 0) {
+    lead++;
+  }
+  if (lead > 0 && lead < n) { /* normalise: slide the digits up, zero-fill the tail */
+    for (pos = lead; pos < n; pos++) {
+      out[pos - lead] = out[pos];
+    }
+    for (pos = n - lead; pos < n; pos++) {
+      out[pos] = 0;
+    }
+  }
+  return lead;
+}
+
+/* -------- mp_imul routines -------- */
+
+void mp_imul(int n, int radix, int in1[], int in2, int out[]) {
+  void mp_unsgn_imul(int n, double dradix, int in1[], double din2, int out[]);
+  int sgn = in1[0]; /* becomes sign(in1) * sign(in2) */
+
+  if (in2 == 0) {
+    sgn = 0;
+  } else if (in2 < 0) {
+    sgn = -sgn;
+    in2 = -in2; /* the child routine is handed the magnitude only */
+  }
+  out[0] = sgn;
+  mp_unsgn_imul(n, radix, &in1[1], in2, &out[1]);
+  if (out[0] == 0) {
+    out[1] = 0; /* a zero result carries exponent 0 */
+  }
+}
+
+int mp_idiv(int n, int radix, int in1[], int in2, int out[]) {
+  void mp_load_0(int n, int radix, int out[]);
+  void mp_unsgn_idiv(int n, double dradix, int in1[], double din2, int out[]);
+
+  if (in2 == 0) {
+    return -1; /* division by zero is rejected before out is written */
+  }
+  if (in2 < 0) {
+    in2 = -in2; /* divide by the magnitude, flip the sign of the result */
+    out[0] = -in1[0];
+  } else {
+    out[0] = in1[0];
+  }
+  if (in1[0] != 0) {
+    mp_unsgn_idiv(n, radix, &in1[1], in2, &out[1]);
+  } else {
+    mp_load_0(n, radix, out); /* zero dividend: result is the zero value */
+  }
+  return 0;
+}
+
+void mp_idiv_2(int n, int radix, int in[], int out[]) {
+  /* out = in / 2, worked through the n digits from index 0 upward. */
+  const DGTINT *src = (const DGTINT *)&in[2];
+  DGTINT *dst = (DGTINT *)&out[2];
+  int pos, val;
+  int skip, rem;
+
+  out[0] = in[0];
+  /* a leading digit of 1 halves to 0, so the result starts one digit further down */
+  skip = src[0] == 1 ? 1 : 0;
+  out[1] = in[1] - skip;
+  rem = skip; /* odd remainder (0 or 1) handed to the next digit as one radix */
+  for (pos = 0; pos < n - skip; pos++) {
+    val = src[pos + skip] + (rem ? radix : 0);
+    rem = val & 1;
+    dst[pos] = (DGTINT)(val >> 1);
+  }
+  if (skip > 0) {
+    /* the freed last position receives half of whatever remainder is left */
+    dst[n - 1] = (DGTINT)((rem ? radix : 0) >> 1);
+  }
+}
+
+/* -------- mp_imul child routines -------- */
+
+void mp_unsgn_imul(int n, double dradix, int in1[], double din2, int out[]) { /* magnitude times din2: in1[0]/out[0] hold the exponent, n DGTINT digits follow from &in1[1]/&out[1] */
+  int j, carry, shift;
+  double x, d1_radix;
+  DGTINT *in1r, *outr;
+
+  in1r = ((DGTINT *)&in1[1]) - 1; /* 1-based view: in1r[1] is the first digit */
+  outr = ((DGTINT *)&out[1]) - 1;
+  d1_radix = 1.0 / dradix;
+  carry = 0;
+  for (j = n; j >= 1; j--) { /* multiply from the last digit upward, carry kept as int */
+    x = din2 * in1r[j] + carry + 0.5; /* 0.5 bias ahead of the (int) truncations below */
+    carry = (int)(d1_radix * x);
+    outr[j] = (DGTINT)(x - dradix * carry);
+  }
+  shift = 0;
+  x = carry + 0.5;
+  while (x > 1) { /* count how many radix digits the leftover carry needs */
+    x *= d1_radix;
+    shift++;
+  }
+  out[0] = in1[0] + shift; /* exponent grows by the number of new leading digits */
+  if (shift > 0) {
+    while (shift > n) { /* more new digits than n: scale the carry down, with rounding */
+      carry = (int)(d1_radix * carry + 0.5);
+      shift--;
+    }
+    for (j = n; j >= shift + 1; j--) { /* move the product down by shift places */
+      outr[j] = outr[j - shift];
+    }
+    for (j = shift; j >= 1; j--) { /* split the carry into the freed leading places */
+      x = carry + 0.5;
+      carry = (int)(d1_radix * x);
+      outr[j] = (DGTINT)(x - dradix * carry);
+    }
+  }
+}
+
+void mp_unsgn_idiv(int n, double dradix, int in1[], double din2, int out[]) { /* magnitude divided by din2: in1[0]/out[0] hold the exponent, n DGTINT digits follow */
+  int j, ix, carry, shift;
+  double x, d1_in2;
+  DGTINT *in1r, *outr;
+
+  in1r = ((DGTINT *)&in1[1]) - 1; /* 1-based view: in1r[1] is the first digit */
+  outr = ((DGTINT *)&out[1]) - 1;
+  d1_in2 = 1.0 / din2;
+  shift = 0;
+  x = 0;
+  do { /* gather leading digits (zeros past digit n) until the value reaches din2 */
+    shift++;
+    x *= dradix;
+    if (shift <= n) {
+      x += in1r[shift];
+    }
+  } while (x < din2 - 0.5);
+  x += 0.5; /* 0.5 bias ahead of the (int) truncations below */
+  ix = (int)(d1_in2 * x); /* first quotient digit */
+  carry = (int)(x - din2 * ix); /* remainder carried into the next digit */
+  outr[1] = (DGTINT)ix;
+  shift--; /* shift is now the number of extra digits consumed for the first quotient digit */
+  out[0] = in1[0] - shift; /* exponent drops by that amount */
+  if (shift >= n) {
+    shift = n - 1;
+  }
+  for (j = 2; j <= n - shift; j++) { /* long division over the remaining input digits */
+    x = in1r[j + shift] + dradix * carry + 0.5;
+    ix = (int)(d1_in2 * x);
+    carry = (int)(x - din2 * ix);
+    outr[j] = (DGTINT)ix;
+  }
+  for (j = n - shift + 1; j <= n; j++) { /* input exhausted: keep dividing the remainder alone */
+    x = dradix * carry + 0.5;
+    ix = (int)(d1_in2 * x);
+    carry = (int)(x - din2 * ix);
+    outr[j] = (DGTINT)ix;
+  }
+}
+
+/* -------- mp_mul routines -------- */
+
+double mp_mul_radix_test(int n, int radix, int nfft, double tmpfft[]) { /* squares a synthetic operand built from radix and returns twice the error reported by mp_mul_d2i_test */
+  void mp_mul_csqu(int nfft, double d1[]);
+  double mp_mul_d2i_test(int radix, int nfft, double din[]);
+  int j, ndata, radix_2;
+
+  ndata = (nfft >> 1) + 1; /* number of populated slots, capped at n below */
+  if (ndata > n) {
+    ndata = n;
+  }
+  tmpfft[nfft + 1] = radix - 1;
+  for (j = nfft; j > ndata; j--) { /* slots above ndata are zeroed */
+    tmpfft[j] = 0;
+  }
+  radix_2 = (radix + 1) / 2;
+  for (j = ndata; j > 2; j--) { /* slots 3..ndata hold radix / 2 rounded up */
+    tmpfft[j] = radix_2;
+  }
+  tmpfft[2] = radix;
+  tmpfft[1] = radix - 1;
+  tmpfft[0] = 0;
+  mp_mul_csqu(nfft, tmpfft); /* square the pattern in place */
+  return 2 * mp_mul_d2i_test(radix, nfft, tmpfft);
+}
+
+void mp_mul(int n, int radix, int in1[], int in2[], int out[], int tmp[], int nfft, double tmp1fft[], double tmp2fft[], double tmp3fft[]) { /* out = in1 * in2; tmp and the three tmp?fft arrays are scratch space */
+  void mp_add(int n, int radix, int in1[], int in2[], int out[]);
+  void mp_mul_i2d(int n, int radix, int nfft, int shift, int in[], double dout[]);
+  void mp_mul_cmul_nt_out(int nfft, double d1[], double d2[]);
+  void mp_mul_cmul_nt_d2(int nfft, double d1[], double d2[]);
+  void mp_mul_cmul_nt_d1_add(int nfft, double d1[], double d2[], double d3[]);
+  void mp_mul_d2i(int n, int radix, int nfft, double din[], int out[]);
+  int n_h, shift;
+  DGTINT *in1r, *in2r;
+
+  in1r = ((DGTINT *)&in1[2]) - 2; /* in1r[2] is the first digit of in1 */
+  in2r = ((DGTINT *)&in2[2]) - 2;
+  shift = (nfft >> 1) + 1; /* digit offset where the lower part starts */
+  while (n > shift) { /* move the split further down while that digit is 0 in both inputs */
+    if (in1r[shift + 2] + in2r[shift + 2] != 0) {
+      break;
+    }
+    shift++;
+  }
+  n_h = n / 2 + 1; /* digits requested for the cross terms: max(n / 2 + 1, n - shift) */
+  if (n_h < n - shift) {
+    n_h = n - shift;
+  }
+  /* ---- tmp3fft = (upper) in1 * (lower) in2 ---- */
+  mp_mul_i2d(n, radix, nfft, 0, in1, tmp1fft);
+  mp_mul_i2d(n, radix, nfft, shift, in2, tmp3fft);
+  mp_mul_cmul_nt_out(nfft, tmp1fft, tmp3fft); /* no final cdft here: tmp3fft is accumulated into below */
+  /* ---- tmp = (upper) in1 * (upper) in2 ---- */
+  mp_mul_i2d(n, radix, nfft, 0, in2, tmp2fft);
+  mp_mul_cmul_nt_d2(nfft, tmp2fft, tmp1fft); /* tmp1fft is passed as left by the call above, no cdft is applied to it */
+  mp_mul_d2i(n, radix, nfft, tmp1fft, tmp);
+  /* ---- tmp3fft += (upper) in2 * (lower) in1 ---- */
+  mp_mul_i2d(n, radix, nfft, shift, in1, tmp1fft);
+  mp_mul_cmul_nt_d1_add(nfft, tmp2fft, tmp1fft, tmp3fft);
+  /* ---- out = tmp + tmp3fft ---- */
+  mp_mul_d2i(n_h, radix, nfft, tmp3fft, out);
+  mp_add(n, radix, out, tmp, out);
+}
+
+void mp_squ(int n, int radix, int in[], int out[], int tmp[], int nfft, double tmp1fft[], double tmp2fft[]) { /* out = in * in; tmp, tmp1fft and tmp2fft are scratch space */
+  void mp_add(int n, int radix, int in1[], int in2[], int out[]);
+  void mp_mul_i2d(int n, int radix, int nfft, int shift, int in[], double dout[]);
+  void mp_mul_cmul(int nfft, double d1[], double d2[]);
+  void mp_mul_csqu_nt_d1(int nfft, double d1[]);
+  void mp_mul_d2i(int n, int radix, int nfft, double din[], int out[]);
+  int n_h, shift;
+  DGTINT *inr;
+
+  inr = ((DGTINT *)&in[2]) - 2; /* inr[2] is the first digit of in */
+  shift = (nfft >> 1) + 1; /* digit offset where the lower part starts */
+  while (n > shift) { /* move the split further down while that digit is 0 */
+    if (inr[shift + 2] != 0) {
+      break;
+    }
+    shift++;
+  }
+  n_h = n / 2 + 1; /* digits requested for the cross term: max(n / 2 + 1, n - shift) */
+  if (n_h < n - shift) {
+    n_h = n - shift;
+  }
+  /* ---- tmp = 2 * (upper) in * (lower) in ---- */
+  mp_mul_i2d(n, radix, nfft, 0, in, tmp1fft);
+  mp_mul_i2d(n, radix, nfft, shift, in, tmp2fft);
+  mp_mul_cmul(nfft, tmp1fft, tmp2fft);
+  mp_mul_d2i(n_h, radix, nfft, tmp2fft, tmp);
+  mp_add(n_h, radix, tmp, tmp, tmp); /* doubling done as tmp + tmp */
+  /* ---- out = tmp + ((upper) in)^2 ---- */
+  mp_mul_csqu_nt_d1(nfft, tmp1fft); /* tmp1fft is reused as left by mp_mul_cmul, no cdft is applied to it first */
+  mp_mul_d2i(n, radix, nfft, tmp1fft, out);
+  mp_add(n, radix, out, tmp, out);
+}
+
+void mp_mulhf(int n, int radix, int in1[], int in2[], int out[], int tmp[], int nfft, double in1fft[], double tmpfft[]) { /* out = (upper) in1 * in2, with in2 split into upper and lower parts; in1fft is filled from in1 here */
+  void mp_add(int n, int radix, int in1[], int in2[], int out[]);
+  void mp_mul_i2d(int n, int radix, int nfft, int shift, int in[], double dout[]);
+  void mp_mul_cmul(int nfft, double d1[], double d2[]);
+  void mp_mul_cmul_nt_d1(int nfft, double d1[], double d2[]);
+  void mp_mul_d2i(int n, int radix, int nfft, double din[], int out[]);
+  int n_h, shift;
+  DGTINT *in2r;
+
+  in2r = ((DGTINT *)&in2[2]) - 2; /* in2r[2] is the first digit of in2 */
+  shift = (nfft >> 1) + 1; /* digit offset where the lower part of in2 starts */
+  while (n > shift) { /* move the split further down while that digit is 0 */
+    if (in2r[shift + 2] != 0) {
+      break;
+    }
+    shift++;
+  }
+  n_h = n / 2 + 1; /* digits requested for the lower-part product: max(n / 2 + 1, n - shift) */
+  if (n_h < n - shift) {
+    n_h = n - shift;
+  }
+  /* ---- tmp = (upper) in1 * (upper) in2 ---- */
+  mp_mul_i2d(n, radix, nfft, 0, in1, in1fft);
+  mp_mul_i2d(n, radix, nfft, 0, in2, tmpfft);
+  mp_mul_cmul(nfft, in1fft, tmpfft);
+  mp_mul_d2i(n, radix, nfft, tmpfft, tmp);
+  /* ---- out = tmp + (upper) in1 * (lower) in2 ---- */
+  mp_mul_i2d(n, radix, nfft, shift, in2, tmpfft);
+  mp_mul_cmul_nt_d1(nfft, in1fft, tmpfft); /* in1fft is reused as left by mp_mul_cmul, no cdft is applied to it */
+  mp_mul_d2i(n_h, radix, nfft, tmpfft, out);
+  mp_add(n, radix, out, tmp, out);
+}
+
+void mp_mulhf_use_in1fft(int n, int radix, double in1fft[], int in2[], int out[], int tmp[], int nfft, double tmpfft[]) { /* like mp_mulhf, but the first factor arrives as in1fft and is used without a cdft */
+  void mp_add(int n, int radix, int in1[], int in2[], int out[]);
+  void mp_mul_i2d(int n, int radix, int nfft, int shift, int in[], double dout[]);
+  void mp_mul_cmul_nt_d1(int nfft, double d1[], double d2[]);
+  void mp_mul_d2i(int n, int radix, int nfft, double din[], int out[]);
+  int n_h, shift;
+  DGTINT *in2r;
+
+  in2r = ((DGTINT *)&in2[2]) - 2; /* in2r[2] is the first digit of in2 */
+  shift = (nfft >> 1) + 1; /* digit offset where the lower part of in2 starts */
+  while (n > shift) { /* move the split further down while that digit is 0 */
+    if (in2r[shift + 2] != 0) {
+      break;
+    }
+    shift++;
+  }
+  n_h = n / 2 + 1; /* digits requested for the lower-part product: max(n / 2 + 1, n - shift) */
+  if (n_h < n - shift) {
+    n_h = n - shift;
+  }
+  /* ---- tmp = (upper) in1fft * (upper) in2 ---- */
+  mp_mul_i2d(n, radix, nfft, 0, in2, tmpfft);
+  mp_mul_cmul_nt_d1(nfft, in1fft, tmpfft);
+  mp_mul_d2i(n, radix, nfft, tmpfft, tmp);
+  /* ---- out = tmp + (upper) in1 * (lower) in2 ---- */
+  mp_mul_i2d(n, radix, nfft, shift, in2, tmpfft);
+  mp_mul_cmul_nt_d1(nfft, in1fft, tmpfft);
+  mp_mul_d2i(n_h, radix, nfft, tmpfft, out);
+  mp_add(n, radix, out, tmp, out);
+}
+
+void mp_squhf_use_infft(int n, int radix, double infft[], int in[], int out[], int tmp[], int nfft, double tmpfft[]) { /* out = infft^2 + infft * (lower) in; infft is used without a cdft and is overwritten by its square */
+  void mp_add(int n, int radix, int in1[], int in2[], int out[]);
+  void mp_mul_i2d(int n, int radix, int nfft, int shift, int in[], double dout[]);
+  void mp_mul_cmul_nt_d1(int nfft, double d1[], double d2[]);
+  void mp_mul_csqu_nt_d1(int nfft, double d1[]);
+  void mp_mul_d2i(int n, int radix, int nfft, double din[], int out[]);
+  int n_h, shift;
+  DGTINT *inr;
+
+  inr = ((DGTINT *)&in[2]) - 2; /* inr[2] is the first digit of in */
+  shift = (nfft >> 1) + 1; /* digit offset where the lower part starts */
+  while (n > shift) { /* move the split further down while that digit is 0 */
+    if (inr[shift + 2] != 0) {
+      break;
+    }
+    shift++;
+  }
+  n_h = n / 2 + 1; /* digits requested for the cross term: max(n / 2 + 1, n - shift) */
+  if (n_h < n - shift) {
+    n_h = n - shift;
+  }
+  /* ---- tmp = (upper) infft * (lower) in ---- */
+  mp_mul_i2d(n, radix, nfft, shift, in, tmpfft);
+  mp_mul_cmul_nt_d1(nfft, infft, tmpfft);
+  mp_mul_d2i(n_h, radix, nfft, tmpfft, tmp);
+  /* ---- out = tmp + ((upper) infft)^2 ---- */
+  mp_mul_csqu_nt_d1(nfft, infft);
+  mp_mul_d2i(n, radix, nfft, infft, out);
+  mp_add(n, radix, out, tmp, out);
+}
+
+void mp_mulh(int n, int radix, int in1[], int in2[], int out[], int nfft, double in1fft[], double outfft[]) { /* out = product of the upper (shift 0) parts of in1 and in2; in1fft and outfft are work arrays */
+  void mp_mul_i2d(int n, int radix, int nfft, int shift, int in[], double dout[]);
+  void mp_mul_cmul(int nfft, double d1[], double d2[]);
+  void mp_mul_d2i(int n, int radix, int nfft, double din[], int out[]);
+
+  mp_mul_i2d(n, radix, nfft, 0, in1, in1fft); /* digits -> doubles */
+  mp_mul_i2d(n, radix, nfft, 0, in2, outfft);
+  mp_mul_cmul(nfft, in1fft, outfft); /* product lands in outfft */
+  mp_mul_d2i(n, radix, nfft, outfft, out); /* doubles -> digits, sign and exponent */
+}
+
+void mp_mulh_use_in1fft(int n, int radix, double in1fft[], int shift, int in2[], int out[], int nfft, double outfft[]) { /* out = in1fft * (in2 taken from digit offset shift); in1fft is used without a cdft */
+  void mp_mul_i2d(int n, int radix, int nfft, int shift, int in[], double dout[]);
+  void mp_mul_cmul_nt_d1(int nfft, double d1[], double d2[]);
+  void mp_mul_d2i(int n, int radix, int nfft, double din[], int out[]);
+  DGTINT *in2r;
+
+  in2r = ((DGTINT *)&in2[2]) - 2; /* in2r[2] is the first digit of in2 */
+  while (n > shift) { /* advance the caller's offset while the digit there is 0 */
+    if (in2r[shift + 2] != 0) {
+      break;
+    }
+    shift++;
+  }
+  mp_mul_i2d(n, radix, nfft, shift, in2, outfft);
+  mp_mul_cmul_nt_d1(nfft, in1fft, outfft);
+  mp_mul_d2i(n, radix, nfft, outfft, out);
+}
+
+void mp_squh(int n, int radix, int in[], int out[], int nfft, double outfft[]) { /* out = square of the upper (shift 0) part of in; outfft is a work array */
+  void mp_mul_i2d(int n, int radix, int nfft, int shift, int in[], double dout[]);
+  void mp_mul_csqu(int nfft, double d1[]);
+  void mp_mul_d2i(int n, int radix, int nfft, double din[], int out[]);
+
+  mp_mul_i2d(n, radix, nfft, 0, in, outfft); /* digits -> doubles */
+  mp_mul_csqu(nfft, outfft); /* squared in place */
+  mp_mul_d2i(n, radix, nfft, outfft, out); /* doubles -> digits, sign and exponent */
+}
+
+void mp_squh_save_infft(int n, int radix, int in[], int out[], int nfft, double infft[], double outfft[]) { /* as mp_squh, but the square is built in outfft so infft keeps the converted input for later *_use_* calls */
+  void mp_mul_i2d(int n, int radix, int nfft, int shift, int in[], double dout[]);
+  void mp_mul_csqu_save_d1(int nfft, double d1[], double d2[]);
+  void mp_mul_d2i(int n, int radix, int nfft, double din[], int out[]);
+
+  mp_mul_i2d(n, radix, nfft, 0, in, infft); /* digits -> doubles */
+  mp_mul_csqu_save_d1(nfft, infft, outfft); /* square written to outfft */
+  mp_mul_d2i(n, radix, nfft, outfft, out); /* doubles -> digits, sign and exponent */
+}
+
+void mp_squh_use_in1fft(int n, int radix, double inoutfft[], int out[], int nfft) { /* out = inoutfft^2; inoutfft is used without a cdft and is overwritten */
+  void mp_mul_csqu_nt_d1(int nfft, double d1[]);
+  void mp_mul_d2i(int n, int radix, int nfft, double din[], int out[]);
+
+  mp_mul_csqu_nt_d1(nfft, inoutfft); /* squared in place */
+  mp_mul_d2i(n, radix, nfft, inoutfft, out); /* doubles -> digits, sign and exponent */
+}
+
+/* -------- mp_mul child routines -------- */
+
+void mp_mul_i2d(int n, int radix, int nfft, int shift, int in[], double dout[]) { /* loads the digits of in[] from digit offset shift into dout[0..nfft + 1] as doubles */
+  int j, x, carry, ndata, radix_2, topdgt;
+  DGTINT *inr;
+
+  inr = ((DGTINT *)&in[2]) - 2; /* inr[2] is the first digit of in */
+  ndata = 0;
+  topdgt = 0;
+  if (n > shift) { /* otherwise no digits remain and dout holds zeros apart from dout[0] */
+    topdgt = inr[shift + 2];
+    ndata = (nfft >> 1) + 1; /* at most nfft / 2 + 1 digits are taken */
+    if (ndata > n - shift) {
+      ndata = n - shift;
+    }
+  }
+  dout[nfft + 1] = in[0] * topdgt; /* last slot: sign times first digit */
+  for (j = nfft; j > ndata; j--) { /* zero the slots above the loaded digits */
+    dout[j] = 0;
+  }
+  /* ---- abs(dout[j]) <= radix/2 (to keep FFT precision) ---- */
+  if (ndata > 1) {
+    radix_2 = radix / 2;
+    carry = 0;
+    for (j = ndata + 1; j > 3; j--) { /* digits >= radix / 2 become digit - radix, with 1 carried to the digit above */
+      x = inr[j + shift] - carry;
+      carry = x >= radix_2 ? -1 : 0;
+      dout[j - 1] = x - (radix & carry);
+    }
+    dout[2] = inr[shift + 3] - carry; /* second digit absorbs the last carry without being re-centred */
+  }
+  dout[1] = topdgt;
+  dout[0] = in[1] - shift; /* slot 0: exponent, lowered by the digit offset */
+}
+
+void mp_mul_cmul(int nfft, double d1[], double d2[]) { /* d2 = d1 * d2 on mp_mul_i2d layouts: cdft(isgn 1) on both, multiply into d2, cdft(isgn -1) on d2 */
+  void cdft(int n, int isgn, double *a);
+  void mp_mul_rcmul(int n, double *a, double *b);
+  double xr, xi;
+
+  cdft(nfft, 1, &d1[1]);
+  cdft(nfft, 1, &d2[1]);
+  d2[0] += d1[0]; /* slot 0 is set by mp_mul_i2d from the exponent: exponents add */
+  xr = d1[1] * d2[1] + d1[2] * d2[2]; /* the first pair is combined separately from the rest */
+  xi = d1[1] * d2[2] + d1[2] * d2[1];
+  d2[1] = xr;
+  d2[2] = xi;
+  if (nfft > 2) {
+    mp_mul_rcmul(nfft, &d1[1], &d2[1]); /* remaining pairs */
+  }
+  d2[nfft + 1] *= d1[nfft + 1]; /* last slot is set by mp_mul_i2d to sign * first digit: multiplied directly */
+  cdft(nfft, -1, &d2[1]);
+}
+
+void mp_mul_cmul_nt_d1(int nfft, double d1[], double d2[]) { /* as mp_mul_cmul, but no cdft is applied to d1 (only read here; presumably already transformed by the caller) */
+  void cdft(int n, int isgn, double *a);
+  void mp_mul_rcmul_nt_in1(int n, double *a, double *b);
+  double xr, xi;
+
+  cdft(nfft, 1, &d2[1]);
+  d2[0] += d1[0]; /* slot 0: exponents add */
+  xr = d1[1] * d2[1] + d1[2] * d2[2]; /* the first pair is combined separately from the rest */
+  xi = d1[1] * d2[2] + d1[2] * d2[1];
+  d2[1] = xr;
+  d2[2] = xi;
+  if (nfft > 2) {
+    mp_mul_rcmul_nt_in1(nfft, &d1[1], &d2[1]); /* remaining pairs */
+  }
+  d2[nfft + 1] *= d1[nfft + 1]; /* last slot multiplied directly */
+  cdft(nfft, -1, &d2[1]);
+}
+
+void mp_mul_cmul_nt_d2(int nfft, double d1[], double d2[]) { /* as mp_mul_cmul, but only d1 gets the cdft(isgn 1); d2 is used as passed in and receives the product */
+  void cdft(int n, int isgn, double *a);
+  void mp_mul_rcmul_nt_in2(int n, double *a, double *b);
+  double xr, xi;
+
+  cdft(nfft, 1, &d1[1]);
+  d2[0] += d1[0]; /* slot 0: exponents add */
+  xr = d1[1] * d2[1] + d1[2] * d2[2]; /* the first pair is combined separately from the rest */
+  xi = d1[1] * d2[2] + d1[2] * d2[1];
+  d2[1] = xr;
+  d2[2] = xi;
+  if (nfft > 2) {
+    mp_mul_rcmul_nt_in2(nfft, &d1[1], &d2[1]); /* remaining pairs */
+  }
+  d2[nfft + 1] *= d1[nfft + 1]; /* last slot multiplied directly */
+  cdft(nfft, -1, &d2[1]);
+}
+
+void mp_mul_cmul_nt_out(int nfft, double d1[], double d2[]) { /* as mp_mul_cmul, but the final cdft(isgn -1) on d2 is omitted */
+  void cdft(int n, int isgn, double *a);
+  void mp_mul_rcmul_nt_out(int n, double *a, double *b);
+  double xr, xi;
+
+  cdft(nfft, 1, &d1[1]);
+  cdft(nfft, 1, &d2[1]);
+  d2[0] += d1[0]; /* slot 0: exponents add */
+  xr = d1[1] * d2[1] + d1[2] * d2[2]; /* the first pair is combined separately from the rest */
+  xi = d1[1] * d2[2] + d1[2] * d2[1];
+  d2[1] = xr;
+  d2[2] = xi;
+  if (nfft > 2) {
+    mp_mul_rcmul_nt_out(nfft, &d1[1], &d2[1]); /* remaining pairs */
+  }
+  d2[nfft + 1] *= d1[nfft + 1]; /* last slot multiplied directly */
+}
+
+void mp_mul_cmul_nt_d1_add(int nfft, double d1[], double d2[], double d3[]) { /* d3 += d1 * d2, then cdft(isgn -1) on d3; no cdft is applied to d1, and d3[0] is not touched */
+  void cdft(int n, int isgn, double *a);
+  void mp_mul_rcmul_nt_in1_add(int n, double *a, double *b, double *badd);
+  double xr, xi;
+
+  cdft(nfft, 1, &d2[1]);
+  xr = d1[1] * d2[1] + d1[2] * d2[2]; /* the first pair is combined separately from the rest */
+  xi = d1[1] * d2[2] + d1[2] * d2[1];
+  d3[1] += xr;
+  d3[2] += xi;
+  if (nfft > 2) {
+    mp_mul_rcmul_nt_in1_add(nfft, &d1[1], &d2[1], &d3[1]); /* remaining pairs, accumulated into d3 */
+  }
+  d3[nfft + 1] += d1[nfft + 1] * d2[nfft + 1]; /* last slot accumulated directly */
+  cdft(nfft, -1, &d3[1]);
+}
+
+void mp_mul_csqu(int nfft, double d1[]) { /* d1 = d1 * d1 in place: cdft(isgn 1), square, cdft(isgn -1) */
+  void cdft(int n, int isgn, double *a);
+  void mp_mul_rcsqu(int n, double *a);
+  double xr, xi;
+
+  cdft(nfft, 1, &d1[1]);
+  d1[0] *= 2; /* slot 0 holds the exponent: it doubles */
+  xr = d1[1] * d1[1] + d1[2] * d1[2]; /* the first pair is squared separately from the rest */
+  xi = 2 * d1[1] * d1[2];
+  d1[1] = xr;
+  d1[2] = xi;
+  if (nfft > 2) {
+    mp_mul_rcsqu(nfft, &d1[1]); /* remaining pairs */
+  }
+  d1[nfft + 1] *= d1[nfft + 1]; /* last slot squared directly */
+  cdft(nfft, -1, &d1[1]);
+}
+
+void mp_mul_csqu_save_d1(int nfft, double d1[], double d2[]) { /* d2 = d1 * d1; d1 receives the cdft(isgn 1) and its slots 0, 1, 2 and nfft + 1 are only read afterwards */
+  void cdft(int n, int isgn, double *a);
+  void mp_mul_rcsqu_save(int n, double *a, double *b);
+  double xr, xi;
+
+  cdft(nfft, 1, &d1[1]);
+  d2[0] = 2 * d1[0]; /* slot 0 holds the exponent: doubled into d2 */
+  xr = d1[1] * d1[1] + d1[2] * d1[2]; /* the first pair is squared separately from the rest */
+  xi = 2 * d1[1] * d1[2];
+  d2[1] = xr;
+  d2[2] = xi;
+  if (nfft > 2) {
+    mp_mul_rcsqu_save(nfft, &d1[1], &d2[1]); /* remaining pairs, squares written to d2 */
+  }
+  d2[nfft + 1] = d1[nfft + 1] * d1[nfft + 1]; /* last slot squared directly */
+  cdft(nfft, -1, &d2[1]);
+}
+
+void mp_mul_csqu_nt_d1(int nfft, double d1[]) { /* as mp_mul_csqu, but without the leading cdft(isgn 1): d1 is squared as passed in */
+  void cdft(int n, int isgn, double *a);
+  void mp_mul_rcsqu_nt_in(int n, double *a);
+  double xr, xi;
+
+  d1[0] *= 2; /* slot 0 holds the exponent: it doubles */
+  xr = d1[1] * d1[1] + d1[2] * d1[2]; /* the first pair is squared separately from the rest */
+  xi = 2 * d1[1] * d1[2];
+  d1[1] = xr;
+  d1[2] = xi;
+  if (nfft > 2) {
+    mp_mul_rcsqu_nt_in(nfft, &d1[1]); /* remaining pairs */
+  }
+  d1[nfft + 1] *= d1[nfft + 1]; /* last slot squared directly */
+  cdft(nfft, -1, &d1[1]);
+}
+
+void mp_mul_d2i(int n, int radix, int nfft, double din[], int out[]) { /* turns the product values in din[] back into an n-digit number in out[] (sign, exponent, DGTINT digits); din[1] and din[nfft + 1] are modified */
+  int j, carry, carry1, carry2, shift, ndata;
+  double x, scale, d1_radix, d1_radix2, pow_radix, topdgt;
+  DGTINT *outr;
+
+  outr = ((DGTINT *)&out[2]) - 2; /* outr[2] is the first output digit */
+  scale = 2.0 / nfft; /* factor applied to every din[] value read below */
+  d1_radix = 1.0 / radix;
+  d1_radix2 = d1_radix * d1_radix;
+  topdgt = din[nfft + 1]; /* signed slot; its sign decides out[0] at the end */
+  x = topdgt < 0 ? -topdgt : topdgt;
+  shift = x + 0.5 >= radix ? 1 : 0; /* 1 when |topdgt| does not fit in one digit */
+  /* ---- correction of cyclic convolution of din[1] ---- */
+  x *= nfft * 0.5;
+  din[nfft + 1] = din[1] - x;
+  din[1] = x;
+  /* ---- output of digits ---- */
+  ndata = n;
+  if (n > nfft + 1 + shift) { /* more digits requested than din[] can supply: zero the tail */
+    ndata = nfft + 1 + shift;
+    for (j = n + 1; j > ndata + 1; j--) {
+      outr[j] = 0;
+    }
+  }
+  x = 0;
+  pow_radix = 1;
+  for (j = ndata + 1 - shift; j <= nfft + 1; j++) { /* fold the values below the last kept digit into a starting carry */
+    x += pow_radix * din[j];
+    pow_radix *= d1_radix;
+    if (pow_radix < DBL_EPSILON) { /* further terms are weighted below DBL_EPSILON */
+      break;
+    }
+  }
+  x = d1_radix2 * (scale * x + 0.5);
+  carry2 = ((int)x) - 1; /* the -1 keeps x - carry2 positive for the split below */
+  carry = (int)(radix * (x - carry2) + 0.5);
+  for (j = ndata; j > 1; j--) { /* last digit first: split each value into digit, carry1 (one place up) and carry2 (two places up) */
+    x = d1_radix2 * (scale * din[j - shift] + carry + 0.5);
+    carry = carry2;
+    carry2 = ((int)x) - 1;
+    x = radix * (x - carry2);
+    carry1 = (int)x;
+    outr[j + 1] = (DGTINT)(radix * (x - carry1));
+    carry += carry1;
+  }
+  x = carry + ((double)radix) * carry2 + 0.5; /* pending carries that land on the first digit */
+  if (shift == 0) {
+    x += scale * din[1];
+  }
+  carry = (int)(d1_radix * x);
+  outr[2] = (DGTINT)(x - ((double)radix) * carry);
+  if (carry > 0) { /* first digit overflowed: move all digits down one place and prepend the carry */
+    for (j = n + 1; j > 2; j--) {
+      outr[j] = outr[j - 1];
+    }
+    outr[2] = (DGTINT)carry;
+    shift++;
+  }
+  /* ---- output of exp, sgn ---- */
+  x = din[0] + shift + 0.5;
+  shift = ((int)x) - 1; /* with the next line: rounds x - 0.5 to an integer via truncation of a shifted value */
+  out[1] = shift + ((int)(x - shift));
+  out[0] = topdgt > 0.5 ? 1 : -1;
+  if (outr[2] == 0) { /* a zero first digit marks the whole result as zero */
+    out[0] = 0;
+    out[1] = 0;
+  }
+}
+
+double mp_mul_d2i_test(int radix, int nfft, double din[]) { /* runs the digit/carry split of mp_mul_d2i over din[] without storing digits and returns the largest rounding residue seen */
+  int j, carry, carry1, carry2;
+  double x, scale, d1_radix, d1_radix2, err;
+
+  scale = 2.0 / nfft;
+  d1_radix = 1.0 / radix;
+  d1_radix2 = d1_radix * d1_radix;
+  /* ---- correction of cyclic convolution of din[1] ---- */
+  x = din[nfft + 1] * nfft * 0.5;
+  if (x < 0) {
+    x = -x;
+  }
+  din[nfft + 1] = din[1] - x;
+  /* ---- check of digits ---- */
+  err = 0;
+  carry = 0;
+  carry2 = 0;
+  for (j = nfft + 1; j > 1; j--) {
+    x = d1_radix2 * (scale * din[j] + carry + 0.5);
+    carry = carry2;
+    carry2 = ((int)x) - 1;
+    x = radix * (x - carry2);
+    carry1 = (int)x;
+    x = radix * (x - carry1); /* the digit value before truncation */
+    carry += carry1;
+    x = x - 0.5 - ((int)x); /* fractional part minus 0.5: zero when the value sits exactly on integer + 0.5 */
+    if (x > err) { /* err = largest absolute residue so far */
+      err = x;
+    } else if (-x > err) {
+      err = -x;
+    }
+  }
+  return err;
+}
+
+/* -------- mp_mul child^2 routines (mix RFFT routines) -------- */
+
+#ifndef M_PI_2 /* defined here only when no earlier definition is in effect */
+#define M_PI_2 1.570796326794896619231321691639751442098584699687 /* pi / 2 */
+#endif
+
+#ifndef RDFT_LOOP_DIV /* control of the RDFT's speed & tolerance */
+#define RDFT_LOOP_DIV 64 /* the routines below rebuild their twiddles from sin/cos every 4 * RDFT_LOOP_DIV indices */
+#endif
+
+void mp_mul_rcmul(int n, double *a, double *b) { /* for each mirror pair (j, n - j): converts a[] and b[], multiplies them as complex values and stores the converted-back product in b[]; a[] keeps its converted form */
+  int i, i0, j, k;
+  double ec, w1r, w1i, wkr, wki, wdr, wdi, ss;
+  double xr, xi, yr, yi, ajr, aji, akr, aki, bjr, bji, bkr, bki;
+
+  ec = 2 * M_PI_2 / n; /* pi / n */
+  wkr = 0;
+  wki = 0;
+  wdi = cos(ec);
+  wdr = sin(ec);
+  wdi *= wdr; /* sin(ec) * cos(ec) */
+  wdr *= wdr; /* sin(ec)^2 */
+  w1r = 1 - 2 * wdr; /* cos(2 * ec) */
+  w1i = 2 * wdi; /* sin(2 * ec) */
+  ss = 2 * w1i;
+  i = n >> 1;
+  xr = a[i]; /* the middle element (index n / 2) is a plain complex product */
+  xi = a[i + 1];
+  yr = b[i];
+  yi = b[i + 1];
+  b[i] = xr * yr - xi * yi;
+  b[i + 1] = xr * yi + xi * yr;
+  for (;;) {
+    i0 = i - 4 * RDFT_LOOP_DIV; /* handle at most 4 * RDFT_LOOP_DIV indices per chunk */
+    if (i0 < 2) {
+      i0 = 2;
+    }
+    for (j = i - 2; j >= i0; j -= 2) {
+      k = n - j; /* mirror partner of j */
+      xr = wkr + ss * wdi; /* advance the (wk, wd) twiddle pair by recurrence */
+      xi = wki + ss * (0.5 - wdr);
+      wkr = wdr;
+      wki = wdi;
+      wdr = xr;
+      wdi = xi;
+      /* ---- transform CFFT data a[] into RFFT data ---- */
+      xr = a[j] - a[k];
+      xi = a[j + 1] + a[k + 1];
+      yr = wkr * xr - wki * xi;
+      yi = wkr * xi + wki * xr;
+      ajr = a[j] - yr;
+      aji = a[j + 1] - yi;
+      akr = a[k] + yr;
+      aki = a[k + 1] - yi;
+      a[j] = ajr;
+      a[j + 1] = aji;
+      a[k] = akr;
+      a[k + 1] = aki;
+      /* ---- transform CFFT data b[] into RFFT data ---- */
+      xr = b[j] - b[k];
+      xi = b[j + 1] + b[k + 1];
+      yr = wkr * xr - wki * xi;
+      yi = wkr * xi + wki * xr;
+      xr = b[j] - yr;
+      xi = b[j + 1] - yi;
+      yr = b[k] + yr;
+      yi = b[k + 1] - yi;
+      /* ---- cmul ---- */
+      bjr = ajr * xr - aji * xi;
+      bji = ajr * xi + aji * xr;
+      bkr = akr * yr - aki * yi;
+      bki = akr * yi + aki * yr;
+      /* ---- transform RFFT data bxx into CFFT data ---- */
+      xr = bjr - bkr;
+      xi = bji + bki;
+      yr = wkr * xr + wki * xi;
+      yi = wkr * xi - wki * xr;
+      b[j] = bjr - yr;
+      b[j + 1] = bji - yi;
+      b[k] = bkr + yr;
+      b[k + 1] = bki - yi;
+    }
+    if (i0 == 2) {
+      break;
+    }
+    wkr = 0.5 * sin(ec * i0); /* chunk boundary: rebuild the twiddles directly from sin/cos */
+    wki = 0.5 * cos(ec * i0);
+    wdr = 0.5 - (wkr * w1r - wki * w1i);
+    wdi = wkr * w1i + wki * w1r;
+    wkr = 0.5 - wkr;
+    i = i0;
+  }
+}
+
+void mp_mul_rcmul_nt_in1(int n, double *a, double *b) { /* as mp_mul_rcmul, but a[] is only read and multiplied as it stands; only b[] is converted and written */
+  int i, i0, j, k;
+  double ec, w1r, w1i, wkr, wki, wdr, wdi, ss;
+  double xr, xi, yr, yi, bjr, bji, bkr, bki;
+
+  ec = 2 * M_PI_2 / n; /* pi / n */
+  wkr = 0;
+  wki = 0;
+  wdi = cos(ec);
+  wdr = sin(ec);
+  wdi *= wdr; /* sin(ec) * cos(ec) */
+  wdr *= wdr; /* sin(ec)^2 */
+  w1r = 1 - 2 * wdr; /* cos(2 * ec) */
+  w1i = 2 * wdi; /* sin(2 * ec) */
+  ss = 2 * w1i;
+  i = n >> 1;
+  xr = a[i]; /* the middle element (index n / 2) is a plain complex product */
+  xi = a[i + 1];
+  yr = b[i];
+  yi = b[i + 1];
+  b[i] = xr * yr - xi * yi;
+  b[i + 1] = xr * yi + xi * yr;
+  for (;;) {
+    i0 = i - 4 * RDFT_LOOP_DIV; /* handle at most 4 * RDFT_LOOP_DIV indices per chunk */
+    if (i0 < 2) {
+      i0 = 2;
+    }
+    for (j = i - 2; j >= i0; j -= 2) {
+      k = n - j; /* mirror partner of j */
+      xr = wkr + ss * wdi; /* advance the (wk, wd) twiddle pair by recurrence */
+      xi = wki + ss * (0.5 - wdr);
+      wkr = wdr;
+      wki = wdi;
+      wdr = xr;
+      wdi = xi;
+      /* ---- transform CFFT data b[] into RFFT data ---- */
+      xr = b[j] - b[k];
+      xi = b[j + 1] + b[k + 1];
+      yr = wkr * xr - wki * xi;
+      yi = wkr * xi + wki * xr;
+      xr = b[j] - yr;
+      xi = b[j + 1] - yi;
+      yr = b[k] + yr;
+      yi = b[k + 1] - yi;
+      /* ---- cmul ---- */
+      bjr = a[j] * xr - a[j + 1] * xi;
+      bji = a[j] * xi + a[j + 1] * xr;
+      bkr = a[k] * yr - a[k + 1] * yi;
+      bki = a[k] * yi + a[k + 1] * yr;
+      /* ---- transform RFFT data bxx into CFFT data ---- */
+      xr = bjr - bkr;
+      xi = bji + bki;
+      yr = wkr * xr + wki * xi;
+      yi = wkr * xi - wki * xr;
+      b[j] = bjr - yr;
+      b[j + 1] = bji - yi;
+      b[k] = bkr + yr;
+      b[k + 1] = bki - yi;
+    }
+    if (i0 == 2) {
+      break;
+    }
+    wkr = 0.5 * sin(ec * i0); /* chunk boundary: rebuild the twiddles directly from sin/cos */
+    wki = 0.5 * cos(ec * i0);
+    wdr = 0.5 - (wkr * w1r - wki * w1i);
+    wdi = wkr * w1i + wki * w1r;
+    wkr = 0.5 - wkr;
+    i = i0;
+  }
+}
+
+void mp_mul_rcmul_nt_in2(int n, double *a, double *b) { /* as mp_mul_rcmul, but only a[] is converted (and stored back); b[] is multiplied as it stands and receives the product */
+  int i, i0, j, k;
+  double ec, w1r, w1i, wkr, wki, wdr, wdi, ss;
+  double xr, xi, yr, yi, bjr, bji, bkr, bki;
+
+  ec = 2 * M_PI_2 / n; /* pi / n */
+  wkr = 0;
+  wki = 0;
+  wdi = cos(ec);
+  wdr = sin(ec);
+  wdi *= wdr; /* sin(ec) * cos(ec) */
+  wdr *= wdr; /* sin(ec)^2 */
+  w1r = 1 - 2 * wdr; /* cos(2 * ec) */
+  w1i = 2 * wdi; /* sin(2 * ec) */
+  ss = 2 * w1i;
+  i = n >> 1;
+  xr = a[i]; /* the middle element (index n / 2) is a plain complex product */
+  xi = a[i + 1];
+  yr = b[i];
+  yi = b[i + 1];
+  b[i] = xr * yr - xi * yi;
+  b[i + 1] = xr * yi + xi * yr;
+  for (;;) {
+    i0 = i - 4 * RDFT_LOOP_DIV; /* handle at most 4 * RDFT_LOOP_DIV indices per chunk */
+    if (i0 < 2) {
+      i0 = 2;
+    }
+    for (j = i - 2; j >= i0; j -= 2) {
+      k = n - j; /* mirror partner of j */
+      xr = wkr + ss * wdi; /* advance the (wk, wd) twiddle pair by recurrence */
+      xi = wki + ss * (0.5 - wdr);
+      wkr = wdr;
+      wki = wdi;
+      wdr = xr;
+      wdi = xi;
+      /* ---- transform CFFT data a[] into RFFT data ---- */
+      xr = a[j] - a[k];
+      xi = a[j + 1] + a[k + 1];
+      yr = wkr * xr - wki * xi;
+      yi = wkr * xi + wki * xr;
+      xr = a[j] - yr;
+      xi = a[j + 1] - yi;
+      yr = a[k] + yr;
+      yi = a[k + 1] - yi;
+      a[j] = xr;
+      a[j + 1] = xi;
+      a[k] = yr;
+      a[k + 1] = yi;
+      /* ---- cmul ---- */
+      bjr = b[j] * xr - b[j + 1] * xi;
+      bji = b[j] * xi + b[j + 1] * xr;
+      bkr = b[k] * yr - b[k + 1] * yi;
+      bki = b[k] * yi + b[k + 1] * yr;
+      /* ---- transform RFFT data bxx into CFFT data ---- */
+      xr = bjr - bkr;
+      xi = bji + bki;
+      yr = wkr * xr + wki * xi;
+      yi = wkr * xi - wki * xr;
+      b[j] = bjr - yr;
+      b[j + 1] = bji - yi;
+      b[k] = bkr + yr;
+      b[k + 1] = bki - yi;
+    }
+    if (i0 == 2) {
+      break;
+    }
+    wkr = 0.5 * sin(ec * i0); /* chunk boundary: rebuild the twiddles directly from sin/cos */
+    wki = 0.5 * cos(ec * i0);
+    wdr = 0.5 - (wkr * w1r - wki * w1i);
+    wdi = wkr * w1i + wki * w1r;
+    wkr = 0.5 - wkr;
+    i = i0;
+  }
+}
+
+void mp_mul_rcmul_nt_out(int n, double *a, double *b) { /* as mp_mul_rcmul, but the product is stored in b[] without the final conversion back; a[] keeps its converted form */
+  int i, i0, j, k;
+  double ec, w1r, w1i, wkr, wki, wdr, wdi, ss;
+  double xr, xi, yr, yi, ajr, aji, akr, aki;
+
+  ec = 2 * M_PI_2 / n; /* pi / n */
+  wkr = 0;
+  wki = 0;
+  wdi = cos(ec);
+  wdr = sin(ec);
+  wdi *= wdr; /* sin(ec) * cos(ec) */
+  wdr *= wdr; /* sin(ec)^2 */
+  w1r = 1 - 2 * wdr; /* cos(2 * ec) */
+  w1i = 2 * wdi; /* sin(2 * ec) */
+  ss = 2 * w1i;
+  i = n >> 1;
+  xr = a[i]; /* the middle element (index n / 2) is a plain complex product */
+  xi = a[i + 1];
+  yr = b[i];
+  yi = b[i + 1];
+  b[i] = xr * yr - xi * yi;
+  b[i + 1] = xr * yi + xi * yr;
+  for (;;) {
+    i0 = i - 4 * RDFT_LOOP_DIV; /* handle at most 4 * RDFT_LOOP_DIV indices per chunk */
+    if (i0 < 2) {
+      i0 = 2;
+    }
+    for (j = i - 2; j >= i0; j -= 2) {
+      k = n - j; /* mirror partner of j */
+      xr = wkr + ss * wdi; /* advance the (wk, wd) twiddle pair by recurrence */
+      xi = wki + ss * (0.5 - wdr);
+      wkr = wdr;
+      wki = wdi;
+      wdr = xr;
+      wdi = xi;
+      /* ---- transform CFFT data a[] into RFFT data ---- */
+      xr = a[j] - a[k];
+      xi = a[j + 1] + a[k + 1];
+      yr = wkr * xr - wki * xi;
+      yi = wkr * xi + wki * xr;
+      ajr = a[j] - yr;
+      aji = a[j + 1] - yi;
+      akr = a[k] + yr;
+      aki = a[k + 1] - yi;
+      a[j] = ajr;
+      a[j + 1] = aji;
+      a[k] = akr;
+      a[k + 1] = aki;
+      /* ---- transform CFFT data b[] into RFFT data ---- */
+      xr = b[j] - b[k];
+      xi = b[j + 1] + b[k + 1];
+      yr = wkr * xr - wki * xi;
+      yi = wkr * xi + wki * xr;
+      xr = b[j] - yr;
+      xi = b[j + 1] - yi;
+      yr = b[k] + yr;
+      yi = b[k + 1] - yi;
+      /* ---- cmul ---- */
+      b[j] = ajr * xr - aji * xi;
+      b[j + 1] = ajr * xi + aji * xr;
+      b[k] = akr * yr - aki * yi;
+      b[k + 1] = akr * yi + aki * yr;
+    }
+    if (i0 == 2) {
+      break;
+    }
+    wkr = 0.5 * sin(ec * i0); /* chunk boundary: rebuild the twiddles directly from sin/cos */
+    wki = 0.5 * cos(ec * i0);
+    wdr = 0.5 - (wkr * w1r - wki * w1i);
+    wdi = wkr * w1i + wki * w1r;
+    wkr = 0.5 - wkr;
+    i = i0;
+  }
+}
+
+void mp_mul_rcmul_nt_in1_add(int n, double *a, double *b, double *badd) { /* adds the product of a[] (used as it stands) and converted b[] to badd[], then converts the sum back in badd[]; a[] and b[] are only read */
+  int i, i0, j, k;
+  double ec, w1r, w1i, wkr, wki, wdr, wdi, ss;
+  double xr, xi, yr, yi, bjr, bji, bkr, bki;
+
+  ec = 2 * M_PI_2 / n; /* pi / n */
+  wkr = 0;
+  wki = 0;
+  wdi = cos(ec);
+  wdr = sin(ec);
+  wdi *= wdr; /* sin(ec) * cos(ec) */
+  wdr *= wdr; /* sin(ec)^2 */
+  w1r = 1 - 2 * wdr; /* cos(2 * ec) */
+  w1i = 2 * wdi; /* sin(2 * ec) */
+  ss = 2 * w1i;
+  i = n >> 1;
+  xr = a[i]; /* the middle element (index n / 2) is a plain complex product, accumulated */
+  xi = a[i + 1];
+  yr = b[i];
+  yi = b[i + 1];
+  badd[i] += xr * yr - xi * yi;
+  badd[i + 1] += xr * yi + xi * yr;
+  for (;;) {
+    i0 = i - 4 * RDFT_LOOP_DIV; /* handle at most 4 * RDFT_LOOP_DIV indices per chunk */
+    if (i0 < 2) {
+      i0 = 2;
+    }
+    for (j = i - 2; j >= i0; j -= 2) {
+      k = n - j; /* mirror partner of j */
+      xr = wkr + ss * wdi; /* advance the (wk, wd) twiddle pair by recurrence */
+      xi = wki + ss * (0.5 - wdr);
+      wkr = wdr;
+      wki = wdi;
+      wdr = xr;
+      wdi = xi;
+      /* ---- transform CFFT data b[] into RFFT data ---- */
+      xr = b[j] - b[k];
+      xi = b[j + 1] + b[k + 1];
+      yr = wkr * xr - wki * xi;
+      yi = wkr * xi + wki * xr;
+      xr = b[j] - yr;
+      xi = b[j + 1] - yi;
+      yr = b[k] + yr;
+      yi = b[k + 1] - yi;
+      /* ---- cmul + add ---- */
+      bjr = badd[j] + (a[j] * xr - a[j + 1] * xi);
+      bji = badd[j + 1] + (a[j] * xi + a[j + 1] * xr);
+      bkr = badd[k] + (a[k] * yr - a[k + 1] * yi);
+      bki = badd[k + 1] + (a[k] * yi + a[k + 1] * yr);
+      /* ---- transform RFFT data bxx into CFFT data ---- */
+      xr = bjr - bkr;
+      xi = bji + bki;
+      yr = wkr * xr + wki * xi;
+      yi = wkr * xi - wki * xr;
+      badd[j] = bjr - yr;
+      badd[j + 1] = bji - yi;
+      badd[k] = bkr + yr;
+      badd[k + 1] = bki - yi;
+    }
+    if (i0 == 2) {
+      break;
+    }
+    wkr = 0.5 * sin(ec * i0); /* chunk boundary: rebuild the twiddles directly from sin/cos */
+    wki = 0.5 * cos(ec * i0);
+    wdr = 0.5 - (wkr * w1r - wki * w1i);
+    wdi = wkr * w1i + wki * w1r;
+    wkr = 0.5 - wkr;
+    i = i0;
+  }
+}
+
+void mp_mul_rcsqu(int n, double *a) { /* for each mirror pair (j, n - j): converts a[], squares it as a complex value and stores the converted-back square in a[] */
+  int i, i0, j, k;
+  double ec, w1r, w1i, wkr, wki, wdr, wdi, ss;
+  double xr, xi, yr, yi, ajr, aji, akr, aki;
+
+  ec = 2 * M_PI_2 / n; /* pi / n */
+  wkr = 0;
+  wki = 0;
+  wdi = cos(ec);
+  wdr = sin(ec);
+  wdi *= wdr; /* sin(ec) * cos(ec) */
+  wdr *= wdr; /* sin(ec)^2 */
+  w1r = 1 - 2 * wdr; /* cos(2 * ec) */
+  w1i = 2 * wdi; /* sin(2 * ec) */
+  ss = 2 * w1i;
+  i = n >> 1;
+  xr = a[i]; /* the middle element (index n / 2) is a plain complex square */
+  xi = a[i + 1];
+  a[i] = xr * xr - xi * xi;
+  a[i + 1] = 2 * xr * xi;
+  for (;;) {
+    i0 = i - 4 * RDFT_LOOP_DIV; /* handle at most 4 * RDFT_LOOP_DIV indices per chunk */
+    if (i0 < 2) {
+      i0 = 2;
+    }
+    for (j = i - 2; j >= i0; j -= 2) {
+      k = n - j; /* mirror partner of j */
+      xr = wkr + ss * wdi; /* advance the (wk, wd) twiddle pair by recurrence */
+      xi = wki + ss * (0.5 - wdr);
+      wkr = wdr;
+      wki = wdi;
+      wdr = xr;
+      wdi = xi;
+      /* ---- transform CFFT data a[] into RFFT data ---- */
+      xr = a[j] - a[k];
+      xi = a[j + 1] + a[k + 1];
+      yr = wkr * xr - wki * xi;
+      yi = wkr * xi + wki * xr;
+      xr = a[j] - yr;
+      xi = a[j + 1] - yi;
+      yr = a[k] + yr;
+      yi = a[k + 1] - yi;
+      /* ---- csqu ---- */
+      ajr = xr * xr - xi * xi;
+      aji = 2 * xr * xi;
+      akr = yr * yr - yi * yi;
+      aki = 2 * yr * yi;
+      /* ---- transform RFFT data axx into CFFT data ---- */
+      xr = ajr - akr;
+      xi = aji + aki;
+      yr = wkr * xr + wki * xi;
+      yi = wkr * xi - wki * xr;
+      a[j] = ajr - yr;
+      a[j + 1] = aji - yi;
+      a[k] = akr + yr;
+      a[k + 1] = aki - yi;
+    }
+    if (i0 == 2) {
+      break;
+    }
+    wkr = 0.5 * sin(ec * i0); /* chunk boundary: rebuild the twiddles directly from sin/cos */
+    wki = 0.5 * cos(ec * i0);
+    wdr = 0.5 - (wkr * w1r - wki * w1i);
+    wdi = wkr * w1i + wki * w1r;
+    wkr = 0.5 - wkr;
+    i = i0;
+  }
+}
+
+void mp_mul_rcsqu_save(int n, double *a, double *b) { /* Like the in-place square, but a[] keeps its RFFT-form values and the squared, back-converted result goes to b[]. */
+  int i, i0, j, k;
+  double ec, w1r, w1i, wkr, wki, wdr, wdi, ss;
+  double xr, xi, yr, yi, ajr, aji, akr, aki;
+
+  ec = 2 * M_PI_2 / n; /* angular step, pi / n */
+  wkr = 0;
+  wki = 0;
+  wdi = cos(ec);
+  wdr = sin(ec);
+  wdi *= wdr; /* sin(ec) * cos(ec) */
+  wdr *= wdr; /* sin(ec)^2 */
+  w1r = 1 - 2 * wdr;
+  w1i = 2 * wdi;
+  ss = 2 * w1i;
+  i = n >> 1;
+  xr = a[i]; /* the pair at n / 2 is its own mirror: a[] is left as is, its square goes to b[] */
+  xi = a[i + 1];
+  b[i] = xr * xr - xi * xi;
+  b[i + 1] = 2 * xr * xi;
+  for (;;) { /* walk j down from n / 2 - 2 to 2 in chunks of at most 4 * RDFT_LOOP_DIV indices */
+    i0 = i - 4 * RDFT_LOOP_DIV;
+    if (i0 < 2) {
+      i0 = 2;
+    }
+    for (j = i - 2; j >= i0; j -= 2) {
+      k = n - j; /* mirrored partner of pair j */
+      xr = wkr + ss * wdi; /* advance the (wkr, wki) / (wdr, wdi) twiddle recurrence by one step */
+      xi = wki + ss * (0.5 - wdr);
+      wkr = wdr;
+      wki = wdi;
+      wdr = xr;
+      wdi = xi;
+      /* ---- transform CFFT data a[] into RFFT data ---- */
+      xr = a[j] - a[k];
+      xi = a[j + 1] + a[k + 1];
+      yr = wkr * xr - wki * xi;
+      yi = wkr * xi + wki * xr;
+      xr = a[j] - yr;
+      xi = a[j + 1] - yi;
+      yr = a[k] + yr;
+      yi = a[k + 1] - yi;
+      a[j] = xr; /* the "save": write the RFFT-form values back into a[] */
+      a[j + 1] = xi;
+      a[k] = yr;
+      a[k + 1] = yi;
+      /* ---- csqu ---- */
+      ajr = xr * xr - xi * xi; /* (xr + i*xi)^2 */
+      aji = 2 * xr * xi;
+      akr = yr * yr - yi * yi; /* (yr + i*yi)^2 */
+      aki = 2 * yr * yi;
+      /* ---- transform RFFT data axx into CFFT data ---- */
+      xr = ajr - akr;
+      xi = aji + aki;
+      yr = wkr * xr + wki * xi;
+      yi = wkr * xi - wki * xr;
+      b[j] = ajr - yr;
+      b[j + 1] = aji - yi;
+      b[k] = akr + yr;
+      b[k + 1] = aki - yi;
+    }
+    if (i0 == 2) { /* lowest pair processed: done (indices 0 and 1 of a[] and b[] are never touched here) */
+      break;
+    }
+    wkr = 0.5 * sin(ec * i0); /* re-seed the recurrence from sin/cos at the chunk boundary (presumably to limit rounding drift) */
+    wki = 0.5 * cos(ec * i0);
+    wdr = 0.5 - (wkr * w1r - wki * w1i);
+    wdi = wkr * w1i + wki * w1r;
+    wkr = 0.5 - wkr;
+    i = i0;
+  }
+}
+
+void mp_mul_rcsqu_nt_in(int n, double *a) { /* In-place square with no input conversion: a[] is squared as stored, then only the RFFT -> CFFT step is applied. */
+  int i, i0, j, k;
+  double ec, w1r, w1i, wkr, wki, wdr, wdi, ss;
+  double xr, xi, yr, yi, ajr, aji, akr, aki;
+
+  ec = 2 * M_PI_2 / n; /* angular step, pi / n */
+  wkr = 0;
+  wki = 0;
+  wdi = cos(ec);
+  wdr = sin(ec);
+  wdi *= wdr; /* sin(ec) * cos(ec) */
+  wdr *= wdr; /* sin(ec)^2 */
+  w1r = 1 - 2 * wdr;
+  w1i = 2 * wdi;
+  ss = 2 * w1i;
+  i = n >> 1;
+  xr = a[i]; /* the pair at n / 2 is its own mirror (k == j there), so it is squared directly */
+  xi = a[i + 1];
+  a[i] = xr * xr - xi * xi;
+  a[i + 1] = 2 * xr * xi;
+  for (;;) { /* walk j down from n / 2 - 2 to 2 in chunks of at most 4 * RDFT_LOOP_DIV indices */
+    i0 = i - 4 * RDFT_LOOP_DIV;
+    if (i0 < 2) {
+      i0 = 2;
+    }
+    for (j = i - 2; j >= i0; j -= 2) {
+      k = n - j; /* mirrored partner of pair j */
+      xr = wkr + ss * wdi; /* advance the (wkr, wki) / (wdr, wdi) twiddle recurrence by one step */
+      xi = wki + ss * (0.5 - wdr);
+      wkr = wdr;
+      wki = wdi;
+      wdr = xr;
+      wdi = xi;
+      /* ---- csqu ---- */
+      xr = a[j]; /* operands are read straight from a[]; there is no CFFT -> RFFT step in this variant */
+      xi = a[j + 1];
+      yr = a[k];
+      yi = a[k + 1];
+      ajr = xr * xr - xi * xi;
+      aji = 2 * xr * xi;
+      akr = yr * yr - yi * yi;
+      aki = 2 * yr * yi;
+      /* ---- transform RFFT data axx into CFFT data ---- */
+      xr = ajr - akr;
+      xi = aji + aki;
+      yr = wkr * xr + wki * xi;
+      yi = wkr * xi - wki * xr;
+      a[j] = ajr - yr;
+      a[j + 1] = aji - yi;
+      a[k] = akr + yr;
+      a[k + 1] = aki - yi;
+    }
+    if (i0 == 2) { /* lowest pair processed: done (a[0] and a[1] are never touched here) */
+      break;
+    }
+    wkr = 0.5 * sin(ec * i0); /* re-seed the recurrence from sin/cos at the chunk boundary (presumably to limit rounding drift) */
+    wki = 0.5 * cos(ec * i0);
+    wdr = 0.5 - (wkr * w1r - wki * w1i);
+    wdi = wkr * w1i + wki * w1r;
+    wkr = 0.5 - wkr;
+    i = i0;
+  }
+}
+
+/* -------- mp_inv routines -------- */
+
+int mp_inv(int n, int radix, int in[], int out[], int tmp1[], int tmp2[], int nfft, double tmp1fft[], double tmp2fft[]) {
+  /* out = 1 / in by Newton iteration; returns -1 when in[0] == 0, otherwise 0. */
+  int mp_get_nfft_init(int radix, int nfft_max);
+  void mp_inv_init(int n, int radix, int in[], int out[]);
+  int mp_inv_newton(int n, int radix, int in[], int inout[], int tmp1[], int tmp2[], int nfft, double tmp1fft[], double tmp2fft[]);
+  int len, fft_len, prc;
+  int thr = 8;
+
+  if (in[0] == 0) {
+    return -1;
+  }
+  /* Seed the result at the initial FFT length (working length capped at n). */
+  fft_len = mp_get_nfft_init(radix, nfft);
+  len = (fft_len + 2 > n) ? n : fft_len + 2;
+  mp_inv_init(len, radix, in, out);
+  do {
+    /* Working length for this pass, capped at n. */
+    len = (fft_len + 2 > n) ? n : fft_len + 2;
+    prc = mp_inv_newton(len, radix, in, out, tmp1, tmp2, fft_len, tmp1fft, tmp2fft);
+#ifdef DEBUG
+    printf("n=%d, nfft=%d, prc=%d\n", len, fft_len, prc);
+#endif
+    if (thr * fft_len < nfft) {
+      /* Halving here cancels the doubling below, so the same length is retried. */
+      if (3 * prc < len - 2) {
+        fft_len >>= 1;
+      }
+    } else {
+      /* Within thr of the target: clear thr and apply the 2 * prc test instead. */
+      thr = 0;
+      if (2 * prc <= len - 2) {
+        fft_len >>= 1;
+      }
+    }
+    /* Move on to the next FFT length. */
+    fft_len <<= 1;
+  } while (fft_len <= nfft);
+  return 0;
+}
+
+int mp_sqrt(int n, int radix, int in[], int out[], int tmp1[], int tmp2[], int nfft, double tmp1fft[], double tmp2fft[]) {
+  /* out = sqrt(in) by Newton iteration; returns -1 when in[0] < 0, otherwise 0 (a zero input yields zero). */
+  void mp_load_0(int n, int radix, int out[]);
+  int mp_get_nfft_init(int radix, int nfft_max);
+  void mp_sqrt_init(int n, int radix, int in[], int out[], int out_rev[]);
+  int mp_sqrt_newton(int n, int radix, int in[], int inout[], int inout_rev[], int tmp[], int nfft, double tmp1fft[], double tmp2fft[], int *n_tmp1fft);
+  int len, fft_len, prc;
+  int thr = 8, n_tmp1fft = 0;
+
+  if (in[0] < 0) {
+    return -1;
+  }
+  if (in[0] == 0) {
+    mp_load_0(n, radix, out);
+    return 0;
+  }
+  /* Seed out and tmp1 at the initial FFT length (working length capped at n). */
+  fft_len = mp_get_nfft_init(radix, nfft);
+  len = (fft_len + 2 > n) ? n : fft_len + 2;
+  mp_sqrt_init(len, radix, in, out, tmp1);
+  do {
+    /* Working length for this pass, capped at n. */
+    len = (fft_len + 2 > n) ? n : fft_len + 2;
+    prc = mp_sqrt_newton(len, radix, in, out, tmp1, tmp2, fft_len, tmp1fft, tmp2fft, &n_tmp1fft);
+#ifdef DEBUG
+    printf("n=%d, nfft=%d, prc=%d\n", len, fft_len, prc);
+#endif
+    if (thr * fft_len < nfft) {
+      /* Halving here cancels the doubling below, so the same length is retried. */
+      if (3 * prc < len - 2) {
+        fft_len >>= 1;
+      }
+    } else {
+      /* Within thr of the target: clear thr and apply the 2 * prc test instead. */
+      thr = 0;
+      if (2 * prc <= len - 2) {
+        fft_len >>= 1;
+      }
+    }
+    /* Move on to the next FFT length. */
+    fft_len <<= 1;
+  } while (fft_len <= nfft);
+  return 0;
+}
+
+int mp_invisqrt(int n, int radix, int in, int out[], int tmp1[], int tmp2[], int nfft, double tmp1fft[], double tmp2fft[]) {
+  /* out = 1 / sqrt(in) for a plain int input; returns -1 when in <= 0, otherwise 0. */
+  int mp_get_nfft_init(int radix, int nfft_max);
+  void mp_invisqrt_init(int n, int radix, int in, int out[]);
+  int mp_invisqrt_newton(int n, int radix, int in, int inout[], int tmp1[], int tmp2[], int nfft, double tmp1fft[], double tmp2fft[]);
+  int len, fft_len, prc;
+  int thr = 8;
+
+  if (in <= 0) {
+    return -1;
+  }
+  /* Seed the result at the initial FFT length (working length capped at n). */
+  fft_len = mp_get_nfft_init(radix, nfft);
+  len = (fft_len + 2 > n) ? n : fft_len + 2;
+  mp_invisqrt_init(len, radix, in, out);
+  do {
+    /* Working length for this pass, capped at n. */
+    len = (fft_len + 2 > n) ? n : fft_len + 2;
+    prc = mp_invisqrt_newton(len, radix, in, out, tmp1, tmp2, fft_len, tmp1fft, tmp2fft);
+#ifdef DEBUG
+    printf("n=%d, nfft=%d, prc=%d\n", len, fft_len, prc);
+#endif
+    if (thr * fft_len < nfft) {
+      /* Halving here cancels the doubling below, so the same length is retried. */
+      if (3 * prc < len - 2) {
+        fft_len >>= 1;
+      }
+    } else {
+      /* Within thr of the target: clear thr and apply the 2 * prc test instead. */
+      thr = 0;
+      if (2 * prc <= len - 2) {
+        fft_len >>= 1;
+      }
+    }
+    /* Move on to the next FFT length. */
+    fft_len <<= 1;
+  } while (fft_len <= nfft);
+  return 0;
+}
+
+/* -------- mp_inv child routines -------- */
+
+int mp_get_nfft_init(int radix, int nfft_max) {
+  /* Doubles the length while the repeatedly squared radix stays below 1 / DBL_EPSILON, up to nfft_max. */
+  double power = radix;
+  int len = 1;
+  for (;;) {
+    power *= power;
+    len <<= 1;
+    if (!(DBL_EPSILON * power < 1) || len >= nfft_max) {
+      return len;
+    }
+  }
+}
+
+void mp_inv_init(int n, int radix, int in[], int out[]) {
+  void mp_unexp_d2mp(int n, int radix, double din, DGTINT out[]);
+  double mp_unexp_mp2d(int n, int radix, DGTINT in[]);
+  /* Seed out[] with a double-precision estimate of 1 / in. */
+  double recip;
+  int e;
+
+  out[0] = in[0];
+  e = -in[1];
+  recip = 1.0 / mp_unexp_mp2d(n, radix, (DGTINT *)&in[2]);
+  for (; recip < 1; e--) { /* scale up to at least 1, lowering the exponent to match */
+    recip *= radix;
+  }
+  out[1] = e;
+  mp_unexp_d2mp(n, radix, recip, (DGTINT *)&out[2]);
+}
+
+void mp_sqrt_init(int n, int radix, int in[], int out[], int out_rev[]) { /* seeds out ~ sqrt(in) and out_rev ~ 1 / sqrt(in) from a double estimate */
+  void mp_unexp_d2mp(int n, int radix, double din, DGTINT out[]);
+  double mp_unexp_mp2d(int n, int radix, DGTINT in[]);
+  double root;
+  int e;
+
+  out[0] = 1;
+  out_rev[0] = 1;
+  e = in[1];
+  root = mp_unexp_mp2d(n, radix, (DGTINT *)&in[2]);
+  if (e % 2) { /* fold one radix factor into the mantissa so the exponent can be halved */
+    root *= radix;
+    e--;
+  }
+  e /= 2;
+  root = sqrt(root);
+  if (root < 1) {
+    root *= radix;
+    e--;
+  }
+  out[1] = e;
+  mp_unexp_d2mp(n, radix, root, (DGTINT *)&out[2]);
+  /* out_rev receives the reciprocal of the same estimate, rescaled to at least 1. */
+  e = -e;
+  root = 1.0 / root;
+  for (; root < 1; e--) {
+    root *= radix;
+  }
+  out_rev[1] = e;
+  mp_unexp_d2mp(n, radix, root, (DGTINT *)&out_rev[2]);
+}
+
+void mp_invisqrt_init(int n, int radix, int in, int out[]) {
+  void mp_unexp_d2mp(int n, int radix, double din, DGTINT out[]);
+  /* Seed out[] with a double-precision estimate of 1 / sqrt(in). */
+  double est;
+  int e = 0;
+
+  out[0] = 1;
+  est = sqrt(1.0 / in);
+  /* Scale the estimate up to at least 1, lowering the exponent to match. */
+  for (; est < 1; e--) {
+    est *= radix;
+  }
+  out[1] = e;
+  mp_unexp_d2mp(n, radix, est, (DGTINT *)&out[2]);
+}
+
+void mp_unexp_d2mp(int n, int radix, double din, DGTINT out[]) {
+  /* Peel n base-radix digits off din, integer part first. */
+  int pos;
+  for (pos = 0; pos < n; pos++) {
+    int digit = (int)din;
+    if (digit >= radix) { /* out of range: saturate this digit at radix - 1 */
+      digit = radix - 1;
+      din = radix;
+    }
+    out[pos] = (DGTINT)digit;
+    din = radix * (din - digit);
+  }
+}
+
+double mp_unexp_mp2d(int n, int radix, DGTINT in[]) {
+  /* Evaluates in[0] + in[1] / radix + in[2] / radix^2 + ... starting from the last digit. */
+  const double inv_radix = 1.0 / radix;
+  double acc = 0;
+  int pos = n;
+
+  while (pos-- > 0) {
+    acc = inv_radix * acc + in[pos];
+  }
+  return acc;
+}
+
+int mp_inv_newton(int n, int radix, int in[], int inout[], int tmp1[], int tmp2[], int nfft, double tmp1fft[], double tmp2fft[]) { /* One Newton step refining inout toward 1 / in; returns the precision estimate prc. */
+  void mp_load_1(int n, int radix, int out[]);
+  void mp_round(int n, int radix, int m, int inout[]);
+  void mp_add(int n, int radix, int in1[], int in2[], int out[]);
+  void mp_sub(int n, int radix, int in1[], int in2[], int out[]);
+  void mp_mulh(int n, int radix, int in1[], int in2[], int out[], int nfft, double in1fft[], double outfft[]);
+  void mp_mulh_use_in1fft(int n, int radix, double in1fft[], int shift, int in2[], int out[], int nfft, double outfft[]);
+  int n_h, shift, prc;
+
+  shift = (nfft >> 1) + 1;
+  n_h = n / 2 + 1; /* length for the half-precision steps: max(n / 2 + 1, n - shift) */
+  if (n_h < n - shift) {
+    n_h = n - shift;
+  }
+  /* ---- tmp1 = inout * (upper) in (half to normal precision) ---- */
+  mp_round(n, radix, shift, inout);
+  mp_mulh(n, radix, inout, in, tmp1, nfft, tmp1fft, tmp2fft);
+  /* ---- tmp2 = 1 - tmp1 ---- */
+  mp_load_1(n, radix, tmp2);
+  mp_sub(n, radix, tmp2, tmp1, tmp2);
+  /* ---- tmp2 -= inout * (lower) in (half precision) ---- */
+  mp_mulh_use_in1fft(n, radix, tmp1fft, shift, in, tmp1, nfft, tmp2fft); /* tmp1fft is passed as the in1fft operand again (presumably still holding inout's transform) */
+  mp_sub(n_h, radix, tmp2, tmp1, tmp2);
+  /* ---- get precision ---- */
+  prc = -tmp2[1]; /* negated exponent word of the residual 1 - inout * in */
+  if (tmp2[0] == 0) { /* sign word 0: the residual is zero, report nfft + 1 */
+    prc = nfft + 1;
+  }
+  /* ---- tmp2 *= inout (half precision) ---- */
+  mp_mulh_use_in1fft(n_h, radix, tmp1fft, 0, tmp2, tmp2, nfft, tmp2fft);
+  /* ---- inout += tmp2 ---- */
+  mp_add(n, radix, inout, tmp2, inout);
+  return prc;
+}
+
+int mp_sqrt_newton(int n, int radix, int in[], int inout[], int inout_rev[], int tmp[], int nfft, double tmp1fft[], double tmp2fft[], int *n_tmp1fft) { /* One coupled Newton step: refines inout_rev, then inout toward sqrt(in); returns prc. */
+  void mp_round(int n, int radix, int m, int inout[]);
+  void mp_add(int n, int radix, int in1[], int in2[], int out[]);
+  void mp_sub(int n, int radix, int in1[], int in2[], int out[]);
+  void mp_idiv_2(int n, int radix, int in[], int out[]);
+  void mp_mulh(int n, int radix, int in1[], int in2[], int out[], int nfft, double in1fft[], double outfft[]);
+  void mp_squh(int n, int radix, int in[], int out[], int nfft, double outfft[]);
+  void mp_squh_use_in1fft(int n, int radix, double inoutfft[], int out[], int nfft);
+  int n_h, nfft_h, shift, prc;
+
+  nfft_h = nfft >> 1;
+  shift = nfft_h + 1; /* taken before nfft_h is clamped to 2 below */
+  if (nfft_h < 2) {
+    nfft_h = 2;
+  }
+  n_h = n / 2 + 1; /* length for the half-precision steps: max(n / 2 + 1, n - shift) */
+  if (n_h < n - shift) {
+    n_h = n - shift;
+  }
+  /* ---- tmp = inout_rev^2 (1/4 to half precision) ---- */
+  mp_round(n_h, radix, (nfft_h >> 1) + 1, inout_rev);
+  if (*n_tmp1fft != nfft_h) { /* *n_tmp1fft records the FFT length tmp1fft was last filled at (set to nfft below) */
+    mp_squh(n_h, radix, inout_rev, tmp, nfft_h, tmp1fft);
+  } else { /* lengths match: square from the transform already held in tmp1fft */
+    mp_squh_use_in1fft(n_h, radix, tmp1fft, tmp, nfft_h);
+  }
+  /* ---- tmp = inout_rev - inout * tmp (half precision) ---- */
+  mp_round(n, radix, shift, inout);
+  mp_mulh(n_h, radix, inout, tmp, tmp, nfft, tmp1fft, tmp2fft);
+  mp_sub(n_h, radix, inout_rev, tmp, tmp);
+  /* ---- inout_rev += tmp ---- */
+  mp_add(n_h, radix, inout_rev, tmp, inout_rev);
+  /* ---- tmp = in - inout^2 (half to normal precision) ---- */
+  mp_squh_use_in1fft(n, radix, tmp1fft, tmp, nfft);
+  mp_sub(n, radix, in, tmp, tmp);
+  /* ---- get precision ---- */
+  prc = in[1] - tmp[1]; /* exponent of in minus exponent of the residual in - inout^2 */
+  if (((DGTINT *)&in[2])[0] > ((DGTINT *)&tmp[2])[0]) { /* leading digit of in exceeds that of the residual */
+    prc++;
+  }
+  if (tmp[0] == 0) { /* sign word 0: the residual is zero, report nfft + 1 */
+    prc = nfft + 1;
+  }
+  /* ---- tmp = tmp * inout_rev / 2 (half precision) ---- */
+  mp_round(n_h, radix, shift, inout_rev);
+  mp_mulh(n_h, radix, inout_rev, tmp, tmp, nfft, tmp1fft, tmp2fft);
+  *n_tmp1fft = nfft; /* remembered so the next call can take the reuse branch above */
+  mp_idiv_2(n_h, radix, tmp, tmp);
+  /* ---- inout += tmp ---- */
+  mp_add(n, radix, inout, tmp, inout);
+  return prc;
+}
+
+int mp_invisqrt_newton(int n, int radix, int in, int inout[], int tmp1[], int tmp2[], int nfft, double tmp1fft[], double tmp2fft[]) { /* One Newton step refining inout toward 1 / sqrt(in); returns the precision estimate prc. */
+  void mp_load_1(int n, int radix, int out[]);
+  void mp_round(int n, int radix, int m, int inout[]);
+  void mp_add(int n, int radix, int in1[], int in2[], int out[]);
+  void mp_sub(int n, int radix, int in1[], int in2[], int out[]);
+  void mp_imul(int n, int radix, int in1[], int in2, int out[]);
+  void mp_idiv_2(int n, int radix, int in[], int out[]);
+  void mp_squh_save_infft(int n, int radix, int in[], int out[], int nfft, double infft[], double outfft[]);
+  void mp_mulh_use_in1fft(int n, int radix, double in1fft[], int shift, int in2[], int out[], int nfft, double outfft[]);
+  int n_h, shift, prc;
+
+  shift = (nfft >> 1) + 1;
+  n_h = n / 2 + 1; /* length for the half-precision steps: max(n / 2 + 1, n - shift) */
+  if (n_h < n - shift) {
+    n_h = n - shift;
+  }
+  /* ---- tmp1 = in * inout^2 (half to normal precision) ---- */
+  mp_round(n, radix, shift, inout);
+  mp_squh_save_infft(n, radix, inout, tmp1, nfft, tmp1fft, tmp2fft); /* tmp1fft is the infft argument here and is reused as in1fft below */
+  mp_imul(n, radix, tmp1, in, tmp1);
+  /* ---- tmp2 = 1 - tmp1 ---- */
+  mp_load_1(n, radix, tmp2);
+  mp_sub(n, radix, tmp2, tmp1, tmp2);
+  /* ---- get precision ---- */
+  prc = -tmp2[1]; /* negated exponent word of the residual 1 - in * inout^2 */
+  if (tmp2[0] == 0) { /* sign word 0: the residual is zero, report nfft + 1 */
+    prc = nfft + 1;
+  }
+  /* ---- tmp2 *= inout / 2 (half precision) ---- */
+  mp_mulh_use_in1fft(n_h, radix, tmp1fft, 0, tmp2, tmp2, nfft, tmp2fft);
+  mp_idiv_2(n_h, radix, tmp2, tmp2);
+  /* ---- inout += tmp2 ---- */
+  mp_add(n, radix, inout, tmp2, inout);
+  return prc;
+}
+
+/* -------- mp_io routines -------- */
+
+void mp_sprintf(int n, int log10_radix, int in[], char out[]) { /* Formats in[] as "[-]d.ddd...e<exp>" into out[]; the caller must supply a large enough buffer (no bounds are checked here). */
+  int j, k, x, y, outexp, shift;
+  DGTINT *inr;
+
+  inr = ((DGTINT *)&in[2]) - 2; /* biased so that inr[2] is the first digit word */
+  if (in[0] < 0) {
+    *out++ = '-';
+  }
+  x = inr[2];
+  shift = log10_radix;
+  for (k = log10_radix; k > 0; k--) { /* split the first word into decimal digits at out[1 .. log10_radix] */
+    y = x % 10;
+    x /= 10;
+    out[k] = '0' + y;
+    if (y != 0) {
+      shift = k; /* ends as the position of the most significant non-zero digit */
+    }
+  }
+  out[0] = out[shift]; /* leading digit, read before out[1] is overwritten by the point */
+  out[1] = '.';
+  for (k = 1; k <= log10_radix - shift; k++) { /* move the rest of the first word to just after the point */
+    out[k + 1] = out[k + shift];
+  }
+  outexp = log10_radix - shift; /* decimal digits of the first word that follow the leading digit */
+  out += outexp + 2;
+  for (j = 3; j <= n + 1; j++) { /* every further word is written as exactly log10_radix digits */
+    x = inr[j];
+    for (k = log10_radix - 1; k >= 0; k--) {
+      y = x % 10;
+      x /= 10;
+      out[k] = '0' + y;
+    }
+    out += log10_radix;
+  }
+  *out++ = 'e';
+  outexp += log10_radix * in[1]; /* in[1] counts radix-sized steps; convert it to a decimal exponent */
+  sprintf(out, "%d", outexp); /* also writes the terminating NUL */
+}
+
+void mp_sscanf(int n, int log10_radix, char in[], int out[]) { /* Parses a decimal string (sign, digits, '.', e/E/d/D exponent) into out[]: out[0] sign, out[1] exponent, then n digit words. */
+  char *s;
+  int j, x, outexp, outexp_mod;
+  DGTINT *outr;
+
+  outr = ((DGTINT *)&out[2]) - 2; /* biased so that outr[2] is the first digit word */
+  while (*in == ' ') {
+    in++;
+  }
+  out[0] = 1;
+  if (*in == '-') {
+    out[0] = -1;
+    in++;
+  } else if (*in == '+') {
+    in++;
+  }
+  while (*in == ' ' || *in == '0') { /* skip blanks and leading zeros */
+    in++;
+  }
+  outexp = 0;
+  for (s = in; *s != '\0'; s++) { /* look ahead for an exponent marker and read the integer after it */
+    if (*s == 'e' || *s == 'E' || *s == 'd' || *s == 'D') {
+      if (sscanf(++s, "%d", &outexp) != 1) {
+        outexp = 0; /* marker without a readable integer: treat the exponent as 0 */
+      }
+      break;
+    }
+  }
+  if (*in == '.') { /* no integer part: lower the exponent once for the point and once per zero that follows */
+    do {
+      outexp--;
+      while (*++in == ' ');
+    } while (*in == '0' && *in != '\0');
+  } else if (*in != '\0') { /* integer part: raise the exponent once per digit after the first */
+    s = in;
+    while (*++s == ' ');
+    while (*s >= '0' && *s <= '9' && *s != '\0') {
+      outexp++;
+      while (*++s == ' ');
+    }
+  }
+  x = outexp / log10_radix; /* split outexp into a floored quotient and a remainder in [0, log10_radix) */
+  outexp_mod = outexp - log10_radix * x;
+  if (outexp_mod < 0) {
+    x--;
+    outexp_mod += log10_radix;
+  }
+  out[1] = x;
+  x = 0;
+  j = 2;
+  for (s = in; *s != '\0'; s++) { /* pack digits into words: outexp_mod + 1 digits in the first, log10_radix in each later one */
+    if (*s == '.' || *s == ' ') {
+      continue;
+    }
+    if (*s < '0' || *s > '9') { /* any other character (e.g. the exponent marker) ends the mantissa */
+      break;
+    }
+    x = 10 * x + (*s - '0');
+    if (--outexp_mod < 0) {
+      if (j > n + 1) { /* all n words are already filled: drop the remaining digits */
+        break;
+      }
+      outr[j++] = (DGTINT)x;
+      x = 0;
+      outexp_mod = log10_radix - 1;
+    }
+  }
+  while (outexp_mod-- >= 0) { /* pad the unfinished word with trailing decimal zeros */
+    x *= 10;
+  }
+  while (j <= n + 1) { /* store the padded word, then zero-fill the remaining words */
+    outr[j++] = (DGTINT)x;
+    x = 0;
+  }
+  if (outr[2] == 0) { /* first word is 0: the value is zero, so clear sign and exponent */
+    out[0] = 0;
+    out[1] = 0;
+  }
+}
diff --git a/tests/performance/superpi/pi_fftcs.h b/tests/performance/superpi/pi_fftcs.h
new file mode 100644
index 000000000..419b15613
--- /dev/null
+++ b/tests/performance/superpi/pi_fftcs.h
@@ -0,0 +1,47 @@
+/*
+  Based on "Calculation of PI(= 3.14159...) using FFT and AGM" by T.Ooura, Nov. 1999.
+  https://github.com/Fibonacci43/SuperPI
+  Modified for Arduino by Lucas Saavedra Vaz, 2024.
+*/
+
+#pragma once
+
+#include <ctype.h>
+
+#define PI_FFTC_VER "ver. LG1.1.2-MP1.5.2a.memsave"
+
+/* Please check the following macros before compiling */
+#ifndef DBL_ERROR_MARGIN
+#define DBL_ERROR_MARGIN 0.4 /* must be < 0.5 */
+#endif
+
+#define DGTINT     short int /* sizeof(DGTINT) == 2 */
+#define DGTINT_MAX SHRT_MAX
+
+#define DGT_PACK       10
+#define DGT_PACK_LINE  5
+#define DGT_LINE_BLOCK 20
+
+void pi_calc(int nfft);
+void mp_load_0(int n, int radix, int out[]);
+void mp_load_1(int n, int radix, int out[]);
+void mp_round(int n, int radix, int m, int inout[]);
+int mp_cmp(int n, int radix, int in1[], int in2[]);
+void mp_add(int n, int radix, int in1[], int in2[], int out[]);
+void mp_sub(int n, int radix, int in1[], int in2[], int out[]);
+void mp_imul(int n, int radix, int in1[], int in2, int out[]);
+int mp_idiv(int n, int radix, int in1[], int in2, int out[]);
+void mp_idiv_2(int n, int radix, int in[], int out[]);
+double mp_mul_radix_test(int n, int radix, int nfft, double tmpfft[]);
+void mp_mul(int n, int radix, int in1[], int in2[], int out[], int tmp[], int nfft, double tmp1fft[], double tmp2fft[], double tmp3fft[]);
+void mp_squ(int n, int radix, int in[], int out[], int tmp[], int nfft, double tmp1fft[], double tmp2fft[]);
+void mp_mulhf(int n, int radix, int in1[], int in2[], int out[], int tmp[], int nfft, double in1fft[], double tmpfft[]);
+void mp_mulhf_use_in1fft(int n, int radix, double in1fft[], int in2[], int out[], int tmp[], int nfft, double tmpfft[]);
+void mp_squhf_use_infft(int n, int radix, double infft[], int in[], int out[], int tmp[], int nfft, double tmpfft[]);
+void mp_mulh(int n, int radix, int in1[], int in2[], int out[], int nfft, double in1fft[], double outfft[]);
+void mp_squh(int n, int radix, int in[], int out[], int nfft, double outfft[]);
+int mp_inv(int n, int radix, int in[], int out[], int tmp1[], int tmp2[], int nfft, double tmp1fft[], double tmp2fft[]); /* returns -1 if in[0] == 0, else 0 */
+int mp_sqrt(int n, int radix, int in[], int out[], int tmp1[], int tmp2[], int nfft, double tmp1fft[], double tmp2fft[]); /* returns -1 if in[0] < 0, else 0 */
+int mp_invisqrt(int n, int radix, int in, int out[], int tmp1[], int tmp2[], int nfft, double tmp1fft[], double tmp2fft[]); /* in is a plain int; returns -1 if in <= 0, else 0 */
+void mp_sprintf(int n, int log10_radix, int in[], char out[]); /* writes "[-]d.ddd...e<exp>"; out[] is not bounds-checked */
+void mp_sscanf(int n, int log10_radix, char in[], int out[]); /* parses a decimal string into out[] */
diff --git a/tests/performance/superpi/superpi.ino b/tests/performance/superpi/superpi.ino
new file mode 100644
index 000000000..ffa6c932b
--- /dev/null
+++ b/tests/performance/superpi/superpi.ino
@@ -0,0 +1,41 @@
+/*
+  Based on "Calculation of PI(= 3.14159...) using FFT and AGM" by T.Ooura, Nov. 1999.
+  https://github.com/Fibonacci43/SuperPI
+  Modified for Arduino by Lucas Saavedra Vaz, 2024.
+*/
+
+#include <Arduino.h>
+
+#include "pi_fftcs.h"
+
+// Number of runs to average
+#define N_RUNS 3
+
+// Number of decimal digits to calculate
+#define DIGITS (1 << 14)
+
+void setup() {  // Runs the PI benchmark N_RUNS times and prints the markers that test_superpi.py parses.
+  Serial.begin(115200);
+  while (!Serial) {  // wait until the serial port reports ready
+    delay(10);
+  }
+
+  log_d("Starting PI calculation");
+  Serial.printf("Runs: %d\n", N_RUNS);
+  Serial.printf("Digits: %d\n", DIGITS);
+  Serial.flush();
+  for (int i = 0; i < N_RUNS; i++) {
+    Serial.printf("Run %d\n", i);  // newline keeps the run marker apart from whatever is printed next
+    unsigned long start = millis();
+    pi_calc(DIGITS);
+    unsigned long elapsed = millis() - start;  // milliseconds spent in pi_calc
+    Serial.printf("Time: %lu.%03lu s\n", elapsed / 1000, elapsed % 1000);
+    Serial.flush();
+  }
+
+  log_d("PI calculation test done");
+}
+
+void loop() {  // all work is done in setup()
+  vTaskDelete(NULL);  // NULL selects the calling task, so this task deletes itself
+}
diff --git a/tests/performance/superpi/test_superpi.py b/tests/performance/superpi/test_superpi.py
new file mode 100644
index 000000000..0bd7a3477
--- /dev/null
+++ b/tests/performance/superpi/test_superpi.py
@@ -0,0 +1,53 @@
+import json
+import logging
+import os
+
+
+def test_superpi(dut, request):
+    LOGGER = logging.getLogger(__name__)
+
+    # Match "Runs: %d"
+    res = dut.expect(r"Runs: (\d+)", timeout=60)
+    runs = int(res.group(0).decode("utf-8").split(" ")[1])
+    LOGGER.info("Number of runs: {}".format(runs))
+
+    # Match "Digits: %d"
+    res = dut.expect(r"Digits: (\d+)", timeout=60)
+    digits = int(res.group(0).decode("utf-8").split(" ")[1])
+    LOGGER.info("Number of decimal digits: {}".format(digits))
+
+    list_time = []
+
+    for i in range(runs):
+        # Match "Run %d"
+        res = dut.expect(r"Run (\d+)", timeout=120)
+        run = int(res.group(0).decode("utf-8").split(" ")[1])
+        LOGGER.info("Run {}".format(run))
+        assert run == i, "Invalid run number"
+
+        # Match "Time: %lu.%03lu s"
+        res = dut.expect(r"Time: (\d+)\.(\d+) s", timeout=300)
+        time = float(res.group(0).decode("utf-8").split(" ")[1])
+        LOGGER.info("Time on run {}: {} s".format(i, time))
+        assert time > 0 and time < 1000, "Invalid time"
+        list_time.append(time)
+
+    avg_time = round(sum(list_time) / len(list_time), 3)
+
+    # Create JSON with results and write it to file
+    # Always create a JSON with this format (so it can be merged later on):
+    # { TEST_NAME_STR: TEST_RESULTS_DICT }
+    results = {"superpi": {"runs": runs, "digits": digits, "avg_time": avg_time}}
+
+    current_folder = os.path.dirname(request.path)
+    file_index = 0
+    report_file = os.path.join(current_folder, "result_superpi" + str(file_index) + ".json")
+    while os.path.exists(report_file):
+        report_file = report_file.replace(str(file_index) + ".json", str(file_index + 1) + ".json")
+        file_index += 1
+
+    with open(report_file, "w") as f:
+        try:
+            f.write(json.dumps(results))
+        except Exception as e:
+            LOGGER.warning("Failed to write results to file: {}".format(e))
diff --git a/tests/requirements.txt b/tests/requirements.txt
index 896699b57..289166dfe 100644
--- a/tests/requirements.txt
+++ b/tests/requirements.txt
@@ -1,5 +1,5 @@
 cryptography>=2.1.4
 --only-binary cryptography
 pytest-cov
-pytest-embedded-serial-esp>=1.3.4
-pytest-embedded-arduino>=1.3.4
+pytest-embedded-serial-esp>=1.10.0
+pytest-embedded-arduino>=1.10.0
diff --git a/tests/democfg/cfg.json b/tests/validation/democfg/cfg.json
similarity index 100%
rename from tests/democfg/cfg.json
rename to tests/validation/democfg/cfg.json
diff --git a/tests/democfg/democfg.ino b/tests/validation/democfg/democfg.ino
similarity index 100%
rename from tests/democfg/democfg.ino
rename to tests/validation/democfg/democfg.ino
diff --git a/tests/democfg/test_democfg.py b/tests/validation/democfg/test_democfg.py
similarity index 100%
rename from tests/democfg/test_democfg.py
rename to tests/validation/democfg/test_democfg.py
diff --git a/tests/hello_world/hello_world.ino b/tests/validation/hello_world/hello_world.ino
similarity index 100%
rename from tests/hello_world/hello_world.ino
rename to tests/validation/hello_world/hello_world.ino
diff --git a/tests/hello_world/test_hello_world.py b/tests/validation/hello_world/test_hello_world.py
similarity index 100%
rename from tests/hello_world/test_hello_world.py
rename to tests/validation/hello_world/test_hello_world.py
diff --git a/tests/nvs/cfg.json b/tests/validation/nvs/cfg.json
similarity index 100%
rename from tests/nvs/cfg.json
rename to tests/validation/nvs/cfg.json
diff --git a/tests/nvs/nvs.ino b/tests/validation/nvs/nvs.ino
similarity index 100%
rename from tests/nvs/nvs.ino
rename to tests/validation/nvs/nvs.ino
diff --git a/tests/nvs/test_nvs.py b/tests/validation/nvs/test_nvs.py
similarity index 100%
rename from tests/nvs/test_nvs.py
rename to tests/validation/nvs/test_nvs.py
diff --git a/tests/periman/periman.ino b/tests/validation/periman/periman.ino
similarity index 100%
rename from tests/periman/periman.ino
rename to tests/validation/periman/periman.ino
diff --git a/tests/periman/test_periman.py b/tests/validation/periman/test_periman.py
similarity index 100%
rename from tests/periman/test_periman.py
rename to tests/validation/periman/test_periman.py
diff --git a/tests/timer/test_timer.py b/tests/validation/timer/test_timer.py
similarity index 100%
rename from tests/timer/test_timer.py
rename to tests/validation/timer/test_timer.py
diff --git a/tests/timer/timer.ino b/tests/validation/timer/timer.ino
similarity index 100%
rename from tests/timer/timer.ino
rename to tests/validation/timer/timer.ino
diff --git a/tests/touch/test_touch.py b/tests/validation/touch/test_touch.py
similarity index 100%
rename from tests/touch/test_touch.py
rename to tests/validation/touch/test_touch.py
diff --git a/tests/touch/touch.ino b/tests/validation/touch/touch.ino
similarity index 100%
rename from tests/touch/touch.ino
rename to tests/validation/touch/touch.ino
diff --git a/tests/uart/test_uart.py b/tests/validation/uart/test_uart.py
similarity index 100%
rename from tests/uart/test_uart.py
rename to tests/validation/uart/test_uart.py
diff --git a/tests/uart/uart.ino b/tests/validation/uart/uart.ino
similarity index 100%
rename from tests/uart/uart.ino
rename to tests/validation/uart/uart.ino
diff --git a/tests/unity/test_unity.py b/tests/validation/unity/test_unity.py
similarity index 100%
rename from tests/unity/test_unity.py
rename to tests/validation/unity/test_unity.py
diff --git a/tests/unity/unity.ino b/tests/validation/unity/unity.ino
similarity index 100%
rename from tests/unity/unity.ino
rename to tests/validation/unity/unity.ino