From 12995820478df9d2d388f1c633202f6a9ef8dcbd Mon Sep 17 00:00:00 2001 From: Lucas Saavedra Vaz <32426024+lucasssvaz@users.noreply.github.com> Date: Tue, 7 May 2024 08:16:13 -0300 Subject: [PATCH] ci(performance): Add performance tests to CI (#9560) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * ci(performance): Add performance tests to CI * ci(req): Fix requirements * ci(pre-commit): Apply automatic fixes * ci(pre-commit): Increase maximum allowed complexity for python --------- Co-authored-by: pre-commit-ci-lite[bot] <117423508+pre-commit-ci-lite[bot]@users.noreply.github.com> Co-authored-by: Jan Procházka <90197375+P-R-O-C-H-Y@users.noreply.github.com> --- .flake8 | 4 +- .github/scripts/sketch_utils.sh | 17 +- .github/scripts/tests_build.sh | 32 +- .github/scripts/tests_run.sh | 41 +- .github/workflows/hil.yml | 88 +- .pre-commit-config.yaml | 6 +- tests/.gitignore | 1 + tests/performance/coremark/core_list_join.c | 495 ++++ tests/performance/coremark/core_main.c | 356 +++ tests/performance/coremark/core_matrix.c | 308 +++ tests/performance/coremark/core_portme.c | 168 ++ tests/performance/coremark/core_portme.h | 217 ++ tests/performance/coremark/core_state.c | 277 ++ tests/performance/coremark/core_util.c | 210 ++ tests/performance/coremark/coremark.h | 174 ++ tests/performance/coremark/coremark.ino | 118 + tests/performance/coremark/test_coremark.py | 58 + tests/performance/fibonacci/fibonacci.ino | 48 + tests/performance/fibonacci/test_fibonacci.py | 78 + tests/performance/psramspeed/.skip.esp32c3 | 0 tests/performance/psramspeed/.skip.esp32c6 | 0 tests/performance/psramspeed/.skip.esp32h2 | 0 tests/performance/psramspeed/psramspeed.ino | 266 ++ .../performance/psramspeed/test_psramspeed.py | 105 + tests/performance/ramspeed/cfg.json | 40 + tests/performance/ramspeed/ramspeed.ino | 262 ++ tests/performance/ramspeed/test_ramspeed.py | 105 + tests/performance/superpi/fftsg_h.cpp | 2329 +++++++++++++++++ 
tests/performance/superpi/fftsg_h.h | 88 + tests/performance/superpi/pi_fftcs.cpp | 2214 ++++++++++++++++ tests/performance/superpi/pi_fftcs.h | 47 + tests/performance/superpi/superpi.ino | 41 + tests/performance/superpi/test_superpi.py | 53 + tests/requirements.txt | 4 +- tests/{ => validation}/democfg/cfg.json | 0 tests/{ => validation}/democfg/democfg.ino | 0 .../{ => validation}/democfg/test_democfg.py | 0 .../hello_world/hello_world.ino | 0 .../hello_world/test_hello_world.py | 0 tests/{ => validation}/nvs/cfg.json | 0 tests/{ => validation}/nvs/nvs.ino | 0 tests/{ => validation}/nvs/test_nvs.py | 0 tests/{ => validation}/periman/periman.ino | 0 .../{ => validation}/periman/test_periman.py | 0 tests/{ => validation}/timer/test_timer.py | 0 tests/{ => validation}/timer/timer.ino | 0 tests/{ => validation}/touch/test_touch.py | 0 tests/{ => validation}/touch/touch.ino | 0 tests/{ => validation}/uart/test_uart.py | 0 tests/{ => validation}/uart/uart.ino | 0 tests/{ => validation}/unity/test_unity.py | 0 tests/{ => validation}/unity/unity.ino | 0 52 files changed, 8193 insertions(+), 57 deletions(-) create mode 100644 tests/performance/coremark/core_list_join.c create mode 100644 tests/performance/coremark/core_main.c create mode 100644 tests/performance/coremark/core_matrix.c create mode 100644 tests/performance/coremark/core_portme.c create mode 100644 tests/performance/coremark/core_portme.h create mode 100644 tests/performance/coremark/core_state.c create mode 100644 tests/performance/coremark/core_util.c create mode 100644 tests/performance/coremark/coremark.h create mode 100644 tests/performance/coremark/coremark.ino create mode 100644 tests/performance/coremark/test_coremark.py create mode 100644 tests/performance/fibonacci/fibonacci.ino create mode 100644 tests/performance/fibonacci/test_fibonacci.py create mode 100644 tests/performance/psramspeed/.skip.esp32c3 create mode 100644 tests/performance/psramspeed/.skip.esp32c6 create mode 100644 
tests/performance/psramspeed/.skip.esp32h2 create mode 100644 tests/performance/psramspeed/psramspeed.ino create mode 100644 tests/performance/psramspeed/test_psramspeed.py create mode 100644 tests/performance/ramspeed/cfg.json create mode 100644 tests/performance/ramspeed/ramspeed.ino create mode 100644 tests/performance/ramspeed/test_ramspeed.py create mode 100644 tests/performance/superpi/fftsg_h.cpp create mode 100644 tests/performance/superpi/fftsg_h.h create mode 100644 tests/performance/superpi/pi_fftcs.cpp create mode 100644 tests/performance/superpi/pi_fftcs.h create mode 100644 tests/performance/superpi/superpi.ino create mode 100644 tests/performance/superpi/test_superpi.py rename tests/{ => validation}/democfg/cfg.json (100%) rename tests/{ => validation}/democfg/democfg.ino (100%) rename tests/{ => validation}/democfg/test_democfg.py (100%) rename tests/{ => validation}/hello_world/hello_world.ino (100%) rename tests/{ => validation}/hello_world/test_hello_world.py (100%) rename tests/{ => validation}/nvs/cfg.json (100%) rename tests/{ => validation}/nvs/nvs.ino (100%) rename tests/{ => validation}/nvs/test_nvs.py (100%) rename tests/{ => validation}/periman/periman.ino (100%) rename tests/{ => validation}/periman/test_periman.py (100%) rename tests/{ => validation}/timer/test_timer.py (100%) rename tests/{ => validation}/timer/timer.ino (100%) rename tests/{ => validation}/touch/test_touch.py (100%) rename tests/{ => validation}/touch/touch.ino (100%) rename tests/{ => validation}/uart/test_uart.py (100%) rename tests/{ => validation}/uart/uart.ino (100%) rename tests/{ => validation}/unity/test_unity.py (100%) rename tests/{ => validation}/unity/unity.ino (100%) diff --git a/.flake8 b/.flake8 index 881c4c629..5a2ed0b5b 100644 --- a/.flake8 +++ b/.flake8 @@ -1,12 +1,10 @@ # Source: https://github.com/arduino/tooling-project-assets/blob/main/workflow-templates/assets/check-python/.flake8 # See: https://flake8.pycqa.org/en/latest/user/configuration.html 
-# The code style defined in this file is the official standardized style to be used in all Arduino tooling projects and -# should not be modified. [flake8] doctests = True # W503 and W504 are mutually exclusive. PEP 8 recommends line break before. ignore = W503,E203 -max-complexity = 10 +max-complexity = 20 max-line-length = 120 select = E,W,F,C,N diff --git a/.github/scripts/sketch_utils.sh b/.github/scripts/sketch_utils.sh index e8da865a0..73a9ef8a7 100755 --- a/.github/scripts/sketch_utils.sh +++ b/.github/scripts/sketch_utils.sh @@ -121,7 +121,7 @@ function build_sketch(){ # build_sketch [ex fi if [ -z "$fqbn" ]; then - echo "No FQBN passed or unvalid chip: $target" + echo "No FQBN passed or invalid chip: $target" exit 1 fi @@ -139,7 +139,7 @@ function build_sketch(){ # build_sketch [ex echo "Skipping $sketchname for target $target" exit 0 fi - + ARDUINO_CACHE_DIR="$HOME/.arduino/cache.tmp" if [ -n "$ARDUINO_BUILD_DIR" ]; then build_dir="$ARDUINO_BUILD_DIR" @@ -177,7 +177,7 @@ function build_sketch(){ # build_sketch [ex --build-path "$build_dir" \ $xtra_opts "${sketchdir}" \ > $output_file - + exit_status=$? 
if [ $exit_status -ne 0 ]; then echo ""ERROR: Compilation failed with error code $exit_status"" @@ -198,11 +198,11 @@ function build_sketch(){ # build_sketch [ex # Extract the desired substring using sed lib_sketch_name=$(echo "$directory_path" | sed "s|$constant_part||") #append json file where key is fqbn, sketch name, sizes -> extracted values - echo "{\"name\": \"$lib_sketch_name\", + echo "{\"name\": \"$lib_sketch_name\", \"sizes\": [{ - \"flash_bytes\": $flash_bytes, - \"flash_percentage\": $flash_percentage, - \"ram_bytes\": $ram_bytes, + \"flash_bytes\": $flash_bytes, + \"flash_percentage\": $flash_percentage, + \"ram_bytes\": $ram_bytes, \"ram_percentage\": $ram_percentage }] }," >> "$sizes_file" @@ -365,6 +365,7 @@ function build_sketches(){ # build_sketches > "$sizes_file" fi diff --git a/.github/scripts/tests_build.sh b/.github/scripts/tests_build.sh index 724e2171b..54778fab2 100755 --- a/.github/scripts/tests_build.sh +++ b/.github/scripts/tests_build.sh @@ -2,8 +2,8 @@ USAGE=" USAGE: - ${0} -c - Example: ${0} -c -t esp32 -i 0 -m 15 + ${0} -c -type + Example: ${0} -c -type validation -t esp32 -i 0 -m 15 ${0} -s sketch_name Example: ${0} -s hello_world -t esp32 ${0} -clean @@ -11,10 +11,11 @@ USAGE: " function clean(){ - rm -rf tests/*/build*/ rm -rf tests/.pytest_cache - rm -rf tests/*/__pycache__/ - rm -rf tests/*/*.xml + find tests/ -type d -name 'build*' -exec rm -rf "{}" \+ + find tests/ -type d -name '__pycache__' -exec rm -rf "{}" \+ + find tests/ -name '*.xml' -exec rm -rf "{}" \+ + find tests/ -name 'result_*.json' -exec rm -rf "{}" \+ } SCRIPTS_DIR="./.github/scripts" @@ -35,6 +36,10 @@ while [ ! 
-z "$1" ]; do echo "$USAGE" exit 0 ;; + -type ) + shift + test_type=$1 + ;; -clean ) clean exit 0 @@ -52,12 +57,25 @@ source ${SCRIPTS_DIR}/install-arduino-core-esp32.sh args="-ai $ARDUINO_IDE_PATH -au $ARDUINO_USR_PATH" +if [[ $test_type == "all" ]] || [[ -z $test_type ]]; then + if [ -n "$sketch" ]; then + tmp_sketch_path=$(find tests -name $sketch.ino) + test_type=$(basename $(dirname $(dirname "$tmp_sketch_path"))) + echo "Sketch $sketch test type: $test_type" + test_folder="$PWD/tests/$test_type" + else + test_folder="$PWD/tests" + fi +else + test_folder="$PWD/tests/$test_type" +fi + if [ $chunk_build -eq 1 ]; then BUILD_CMD="${SCRIPTS_DIR}/sketch_utils.sh chunk_build" - args+=" -p $PWD/tests" + args+=" -p $test_folder" else BUILD_CMD="${SCRIPTS_DIR}/sketch_utils.sh build" - args+=" -s $PWD/tests/$sketch" + args+=" -s $test_folder/$sketch" fi ${BUILD_CMD} ${args} $* diff --git a/.github/scripts/tests_run.sh b/.github/scripts/tests_run.sh index ef56fcf2d..0e2d8b01f 100755 --- a/.github/scripts/tests_run.sh +++ b/.github/scripts/tests_run.sh @@ -15,9 +15,9 @@ function run_test() { fi if [ $len -eq 1 ]; then - # build_dir="tests/$sketchname/build" + # build_dir="$sketchdir/build" build_dir="$HOME/.arduino/tests/$sketchname/build.tmp" - report_file="tests/$sketchname/$sketchname.xml" + report_file="$sketchdir/$sketchname.xml" fi for i in `seq 0 $(($len - 1))` @@ -28,9 +28,9 @@ function run_test() { fi if [ $len -ne 1 ]; then - # build_dir="tests/$sketchname/build$i" + # build_dir="$sketchdir/build$i" build_dir="$HOME/.arduino/tests/$sketchname/build$i.tmp" - report_file="tests/$sketchname/$sketchname$i.xml" + report_file="$sketchdir/$sketchname$i.xml" fi pytest tests --build-dir $build_dir -k test_$sketchname --junit-xml=$report_file @@ -79,6 +79,10 @@ while [ ! 
-z "$1" ]; do echo "$USAGE" exit 0 ;; + -type ) + shift + test_type=$1 + ;; * ) break ;; @@ -88,21 +92,39 @@ done source ${SCRIPTS_DIR}/install-arduino-ide.sh +# If sketch is provided and test type is not, test type is inferred from the sketch path +if [[ $test_type == "all" ]] || [[ -z $test_type ]]; then + if [ -n "$sketch" ]; then + tmp_sketch_path=$(find tests -name $sketch.ino) + test_type=$(basename $(dirname $(dirname "$tmp_sketch_path"))) + echo "Sketch $sketch test type: $test_type" + test_folder="$PWD/tests/$test_type" + else + test_folder="$PWD/tests" + fi +else + test_folder="$PWD/tests/$test_type" +fi + if [ $chunk_run -eq 0 ]; then - run_test $target $PWD/tests/$sketch/$sketch.ino $options $erase + if [ -z $sketch ]; then + echo "ERROR: Sketch name is required for single test run" + exit 1 + fi + run_test $target $test_folder/$sketch/$sketch.ino $options $erase else if [ "$chunk_max" -le 0 ]; then echo "ERROR: Chunks count must be positive number" - return 1 + exit 1 fi if [ "$chunk_index" -ge "$chunk_max" ] && [ "$chunk_max" -ge 2 ]; then echo "ERROR: Chunk index must be less than chunks count" - return 1 + exit 1 fi set +e - ${COUNT_SKETCHES} $PWD/tests $target + ${COUNT_SKETCHES} $test_folder $target sketchcount=$? 
set -e sketches=$(cat sketches.txt) @@ -123,7 +145,8 @@ else start_index=$(( $chunk_index * $chunk_size )) if [ "$sketchcount" -le "$start_index" ]; then echo "Skipping job" - return 0 + touch ~/.test_skipped + exit 0 fi end_index=$(( $(( $chunk_index + 1 )) * $chunk_size )) diff --git a/.github/workflows/hil.yml b/.github/workflows/hil.yml index bc3afe419..f86bf8017 100644 --- a/.github/workflows/hil.yml +++ b/.github/workflows/hil.yml @@ -18,11 +18,14 @@ jobs: gen_chunks: if: | contains(github.event.pull_request.labels.*.name, 'hil_test') || + contains(github.event.pull_request.labels.*.name, 'perf_test') || (github.event_name == 'schedule' && github.repository == 'espressif/arduino-esp32') name: Generate Chunks matrix runs-on: ubuntu-latest outputs: chunks: ${{ steps.gen-chunks.outputs.chunks }} + test_folder: ${{ steps.gen-chunks.outputs.test_folder }} + test_type: ${{ steps.gen-chunks.outputs.test_type }} steps: - name: Checkout Repository uses: actions/checkout@v4 @@ -31,7 +34,19 @@ jobs: id: gen-chunks run: | set +e - .github/scripts/sketch_utils.sh count tests + if [ "${{contains(github.event.pull_request.labels.*.name, 'hil_test')}}" == "true" ] && \ + [ "${{contains(github.event.pull_request.labels.*.name, 'perf_test')}}" == "false" ]; then + test_folder="tests/validation" + test_type="validation" + elif [ "${{contains(github.event.pull_request.labels.*.name, 'hil_test')}}" == "false" ] && \ + [ "${{contains(github.event.pull_request.labels.*.name, 'perf_test')}}" == "true" ]; then + test_folder="tests/performance" + test_type="performance" + else + test_folder="tests" + test_type="all" + fi + .github/scripts/sketch_utils.sh count $test_folder sketches=$? 
if [[ $sketches -ge ${{env.MAX_CHUNKS}} ]]; then $sketches=${{env.MAX_CHUNKS}} @@ -39,7 +54,9 @@ jobs: set -e rm sketches.txt CHUNKS=$(jq -c -n '$ARGS.positional' --args `seq 0 1 $((sketches - 1))`) - echo "chunks=${CHUNKS}" >>$GITHUB_OUTPUT + echo "chunks=${CHUNKS}" >> $GITHUB_OUTPUT + echo "test_folder=${test_folder}" >> $GITHUB_OUTPUT + echo "test_type=${test_type}" >> $GITHUB_OUTPUT Build: needs: gen_chunks @@ -52,17 +69,21 @@ jobs: steps: - name: Checkout Repository uses: actions/checkout@v4 + - name: Build sketches run: | - bash .github/scripts/tests_build.sh -c -t ${{matrix.chip}} -i ${{matrix.chunks}} -m ${{env.MAX_CHUNKS}} + bash .github/scripts/tests_build.sh -c -type ${{ needs.gen_chunks.outputs.test_type }} -t ${{matrix.chip}} -i ${{matrix.chunks}} -m ${{env.MAX_CHUNKS}} + - name: Upload ${{matrix.chip}}-${{matrix.chunks}} artifacts uses: actions/upload-artifact@v4 with: name: ${{matrix.chip}}-${{matrix.chunks}}.artifacts - path: | - ~/.arduino/tests/*/build*.tmp/*.bin - ~/.arduino/tests/*/build*.tmp/*.json if-no-files-found: error + path: | + ~/.build_skipped + ~/.arduino/tests/**/build*.tmp/*.bin + ~/.arduino/tests/**/build*.tmp/*.json + Test: needs: [gen_chunks, Build] name: ${{matrix.chip}}-Test#${{matrix.chunks}} @@ -77,36 +98,49 @@ jobs: options: --privileged steps: - - name: Checkout repository - uses: actions/checkout@v4 + - name: Checkout repository + uses: actions/checkout@v4 - - name: Download ${{matrix.chip}}-${{matrix.chunks}} artifacts - uses: actions/download-artifact@v4 - with: - name: ${{matrix.chip}}-${{matrix.chunks}}.artifacts - path: ~/.arduino/tests/ + - name: Download ${{matrix.chip}}-${{matrix.chunks}} artifacts + uses: actions/download-artifact@v4 + with: + name: ${{matrix.chip}}-${{matrix.chunks}}.artifacts + path: ~/ - - name: Install dependencies - run: | - pip install -U pip - pip install -r tests/requirements.txt --extra-index-url https://dl.espressif.com/pypi - apt update && apt install -y -qq jq + - name: Install 
dependencies + run: | + pip install -U pip + pip install -r tests/requirements.txt --extra-index-url https://dl.espressif.com/pypi + apt update && apt install -y -qq jq - - name: Run Tests - run: | - bash .github/scripts/tests_run.sh -c -t ${{matrix.chip}} -i ${{matrix.chunks}} -m ${{env.MAX_CHUNKS}} -e + - name: Run Tests + run: | + bash .github/scripts/tests_run.sh -c -type ${{ needs.gen_chunks.outputs.test_type }} -t ${{matrix.chip}} -i ${{matrix.chunks}} -m ${{env.MAX_CHUNKS}} -e - - name: Upload test result artifacts - uses: actions/upload-artifact@v4 - if: always() - with: - name: test_results-${{matrix.chip}}-${{matrix.chunks}} - path: tests/*/*.xml + - name: Check if tests were skipped + id: check-test-skipped + run: | + if [ -f ~/.test_skipped ]; then + echo "skipped=true" >> $GITHUB_OUTPUT + else + echo "skipped=false" >> $GITHUB_OUTPUT + fi + + - name: Upload test result artifacts + uses: actions/upload-artifact@v4 + if: ${{ always() && steps.check-test-skipped.outputs.skipped == 'false' }} + with: + name: test_results-${{matrix.chip}}-${{matrix.chunks}} + if-no-files-found: error + path: | + tests/**/*.xml + tests/**/result_*.json event_file: name: "Event File" if: | contains(github.event.pull_request.labels.*.name, 'hil_test') || + contains(github.event.pull_request.labels.*.name, 'perf_test') || github.event_name == 'schedule' needs: Test runs-on: ubuntu-latest diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3fa63413a..b06629896 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,4 +1,8 @@ -exclude: ".github/.*" +exclude: | + (?x)( + ^\.github\/| + ^tests\/performance\/coremark\/.*\.[ch]$ + ) default_language_version: # force all unspecified python hooks to run python3 diff --git a/tests/.gitignore b/tests/.gitignore index d9333804a..4b548d270 100644 --- a/tests/.gitignore +++ b/tests/.gitignore @@ -1,3 +1,4 @@ build*/ __pycache__/ *.xml +result_*.json diff --git a/tests/performance/coremark/core_list_join.c 
b/tests/performance/coremark/core_list_join.c new file mode 100644 index 000000000..a5154284a --- /dev/null +++ b/tests/performance/coremark/core_list_join.c @@ -0,0 +1,495 @@ +/* +Copyright 2018 Embedded Microprocessor Benchmark Consortium (EEMBC) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +Original Author: Shay Gal-on +*/ + +#include "coremark.h" +/* +Topic: Description + Benchmark using a linked list. + + Linked list is a common data structure used in many applications. + + For our purposes, this will exercise the memory units of the processor. + In particular, usage of the list pointers to find and alter data. + + We are not using Malloc since some platforms do not support this library. + + Instead, the memory block being passed in is used to create a list, + and the benchmark takes care not to add more items than can be + accommodated by the memory block. The porting layer will make sure + that we have a valid memory block. + + All operations are done in place, without using any extra memory. + + The list itself contains list pointers and pointers to data items. + Data items contain the following: + + idx - An index that captures the initial order of the list. + data - Variable data initialized based on the input parameters. The 16b are divided as follows: + o Upper 8b are backup of original data. + o Bit 7 indicates if the lower 7 bits are to be used as is or calculated. + o Bits 0-2 indicate type of operation to perform to get a 7b value. 
+ o Bits 3-6 provide input for the operation. + +*/ + +/* local functions */ + +list_head *core_list_find(list_head *list,list_data *info); +list_head *core_list_reverse(list_head *list); +list_head *core_list_remove(list_head *item); +list_head *core_list_undo_remove(list_head *item_removed, list_head *item_modified); +list_head *core_list_insert_new(list_head *insert_point + , list_data *info, list_head **memblock, list_data **datablock + , list_head *memblock_end, list_data *datablock_end); +typedef ee_s32(*list_cmp)(list_data *a, list_data *b, core_results *res); +list_head *core_list_mergesort(list_head *list, list_cmp cmp, core_results *res); + +ee_s16 calc_func(ee_s16 *pdata, core_results *res) { + ee_s16 data=*pdata; + ee_s16 retval; + ee_u8 optype=(data>>7) & 1; /* bit 7 indicates if the function result has been cached */ + if (optype) /* if cached, use cache */ + return (data & 0x007f); + else { /* otherwise calculate and cache the result */ + ee_s16 flag=data & 0x7; /* bits 0-2 is type of function to perform */ + ee_s16 dtype=((data>>3) & 0xf); /* bits 3-6 is specific data for the operation */ + dtype |= dtype << 4; /* replicate the lower 4 bits to get an 8b value */ + switch (flag) { + case 0: + if (dtype<0x22) /* set min period for bit corruption */ + dtype=0x22; + retval=core_bench_state(res->size,res->memblock[3],res->seed1,res->seed2,dtype,res->crc); + if (res->crcstate==0) + res->crcstate=retval; + break; + case 1: + retval=core_bench_matrix(&(res->mat),dtype,res->crc); + if (res->crcmatrix==0) + res->crcmatrix=retval; + break; + default: + retval=data; + break; + } + res->crc=crcu16(retval,res->crc); + retval &= 0x007f; + *pdata = (data & 0xff00) | 0x0080 | retval; /* cache the result */ + return retval; + } +} +/* Function: cmp_complex + Compare the data item in a list cell. + + Can be used by mergesort. 
+*/ +ee_s32 cmp_complex(list_data *a, list_data *b, core_results *res) { + ee_s16 val1=calc_func(&(a->data16),res); + ee_s16 val2=calc_func(&(b->data16),res); + return val1 - val2; +} + +/* Function: cmp_idx + Compare the idx item in a list cell, and regen the data. + + Can be used by mergesort. +*/ +ee_s32 cmp_idx(list_data *a, list_data *b, core_results *res) { + if (res==NULL) { + a->data16 = (a->data16 & 0xff00) | (0x00ff & (a->data16>>8)); + b->data16 = (b->data16 & 0xff00) | (0x00ff & (b->data16>>8)); + } + return a->idx - b->idx; +} + +void copy_info(list_data *to,list_data *from) { + to->data16=from->data16; + to->idx=from->idx; +} + +/* Benchmark for linked list: + - Try to find multiple data items. + - List sort + - Operate on data from list (crc) + - Single remove/reinsert + * At the end of this function, the list is back to original state +*/ +ee_u16 core_bench_list(core_results *res, ee_s16 finder_idx) { + ee_u16 retval=0; + ee_u16 found=0,missed=0; + list_head *list=res->list; + ee_s16 find_num=res->seed3; + list_head *this_find; + list_head *finder, *remover; + list_data info; + ee_s16 i; + + info.idx=finder_idx; + /* find values in the list, and change the list each time (reverse and cache if value found) */ + for (i=0; i<find_num; i++) { + info.data16= (i & 0xff) ; + this_find=core_list_find(list,&info); + list=core_list_reverse(list); + if (this_find==NULL) { + missed++; + retval+=(list->next->info->data16 >> 8) & 1; + } + else { + found++; + if (this_find->info->data16 & 0x1) /* use found value */ + retval+=(this_find->info->data16 >> 9) & 1; + /* and cache next item at the head of the list (if any) */ + if (this_find->next != NULL) { + finder = this_find->next; + this_find->next = finder->next; + finder->next=list->next; + list->next=finder; + } + } + if (info.idx>=0) + info.idx++; +#if CORE_DEBUG + ee_printf("List find %d: [%d,%d,%d]\n",i,retval,missed,found); +#endif + } + retval+=found*4-missed; + /* sort the list by data content and remove one item*/ + if (finder_idx>0) + list=core_list_mergesort(list,cmp_complex,res); + remover=core_list_remove(list->next); + /* CRC data content of list from location of index 
N forward, and then undo remove */ + finder=core_list_find(list,&info); + if (!finder) + finder=list->next; + while (finder) { + retval=crc16(list->info->data16,retval); + finder=finder->next; + } +#if CORE_DEBUG + ee_printf("List sort 1: %04x\n",retval); +#endif + remover=core_list_undo_remove(remover,list->next); + /* sort the list by index, in effect returning the list to original state */ + list=core_list_mergesort(list,cmp_idx,NULL); + /* CRC data content of list */ + finder=list->next; + while (finder) { + retval=crc16(list->info->data16,retval); + finder=finder->next; + } +#if CORE_DEBUG + ee_printf("List sort 2: %04x\n",retval); +#endif + return retval; +} +/* Function: core_list_init + Initialize list with data. + + Parameters: + blksize - Size of memory to be initialized. + memblock - Pointer to memory block. + seed - Actual values chosen depend on the seed parameter. + The seed parameter MUST be supplied from a source that cannot be determined at compile time + + Returns: + Pointer to the head of the list. 
+ +*/ +list_head *core_list_init(ee_u32 blksize, list_head *memblock, ee_s16 seed) { + /* calculated pointers for the list */ + ee_u32 per_item=16+sizeof(struct list_data_s); + ee_u32 size=(blksize/per_item)-2; /* to accommodate systems with 64b pointers, and make sure same code is executed, set max list elements */ + list_head *memblock_end=memblock+size; + list_data *datablock=(list_data *)(memblock_end); + list_data *datablock_end=datablock+size; + /* some useful variables */ + ee_u32 i; + list_head *finder,*list=memblock; + list_data info; + + /* create fake items for the list head and tail */ + list->next=NULL; + list->info=datablock; + list->info->idx=0x0000; + list->info->data16=(ee_s16)0x8080; + memblock++; + datablock++; + info.idx=0x7fff; + info.data16=(ee_s16)0xffff; + core_list_insert_new(list,&info,&memblock,&datablock,memblock_end,datablock_end); + + /* then insert size items */ + for (i=0; i<size; i++) { + ee_u16 datpat=((ee_u16)(seed^i) & 0xf); + ee_u16 dat=(datpat<<3) | (i&0x7); /* alternate between algorithms */ + info.data16=(dat<<8) | dat; /* fill the data with actual data and upper bits with rebuild value */ + core_list_insert_new(list,&info,&memblock,&datablock,memblock_end,datablock_end); + } + /* and now index the list so we know initial seed order of the list */ + finder=list->next; + i=1; + while (finder->next!=NULL) { + if (i<size/5) /* first 20% of the list in order */ + finder->info->idx=i++; + else { + ee_u16 pat=(ee_u16)(i++ ^ seed); /* get a pseudo random number */ + finder->info->idx=0x3fff & (((i & 0x07) << 8) | pat); /* make sure the mixed items end up after the ones in sequence */ + } + finder=finder->next; + } + list = core_list_mergesort(list,cmp_idx,NULL); +#if CORE_DEBUG + ee_printf("Initialized list:\n"); + finder=list; + while (finder) { + ee_printf("[%04x,%04x]",finder->info->idx,(ee_u16)finder->info->data16); + finder=finder->next; + } + ee_printf("\n"); +#endif + return list; +} + +/* Function: core_list_insert + Insert an item to the list + + Parameters: + insert_point - where to insert the item. + info - data for the cell. + memblock - pointer for the list header + datablock - pointer for the list data + memblock_end - end of region for list headers + datablock_end - end of region for list data + + Returns: + Pointer to new item. 
+*/ +list_head *core_list_insert_new(list_head *insert_point, list_data *info, list_head **memblock, list_data **datablock + , list_head *memblock_end, list_data *datablock_end) { + list_head *newitem; + + if ((*memblock+1) >= memblock_end) + return NULL; + if ((*datablock+1) >= datablock_end) + return NULL; + + newitem=*memblock; + (*memblock)++; + newitem->next=insert_point->next; + insert_point->next=newitem; + + newitem->info=*datablock; + (*datablock)++; + copy_info(newitem->info,info); + + return newitem; +} + +/* Function: core_list_remove + Remove an item from the list. + + Operation: + For a singly linked list, remove by copying the data from the next item + over to the current cell, and unlinking the next item. + + Note: + since there is always a fake item at the end of the list, no need to check for NULL. + + Returns: + Removed item. +*/ +list_head *core_list_remove(list_head *item) { + list_data *tmp; + list_head *ret=item->next; + /* swap data pointers */ + tmp=item->info; + item->info=ret->info; + ret->info=tmp; + /* and eliminate item */ + item->next=item->next->next; + ret->next=NULL; + return ret; +} + +/* Function: core_list_undo_remove + Undo a remove operation. + + Operation: + Since we want each iteration of the benchmark to be exactly the same, + we need to be able to undo a remove. + Link the removed item back into the list, and switch the info items. + + Parameters: + item_removed - Return value from the <core_list_remove> + item_modified - List item that was modified during <core_list_remove> + + Returns: + The item that was linked back to the list. 
+ +*/ +list_head *core_list_undo_remove(list_head *item_removed, list_head *item_modified) { + list_data *tmp; + /* swap data pointers */ + tmp=item_removed->info; + item_removed->info=item_modified->info; + item_modified->info=tmp; + /* and insert item */ + item_removed->next=item_modified->next; + item_modified->next=item_removed; + return item_removed; +} + +/* Function: core_list_find + Find an item in the list + + Operation: + Find an item by idx (if non-negative) or specific data value + + Parameters: + list - list head + info - idx or data to find + + Returns: + Found item, or NULL if not found. +*/ +list_head *core_list_find(list_head *list,list_data *info) { + if (info->idx>=0) { + while (list && (list->info->idx != info->idx)) + list=list->next; + return list; + } else { + while (list && ((list->info->data16 & 0xff) != info->data16)) + list=list->next; + return list; + } +} +/* Function: core_list_reverse + Reverse a list + + Operation: + Rearrange the pointers so the list is reversed. + + Parameters: + list - list head + info - idx or data to find + + Returns: + New head of the reversed list. +*/ + +list_head *core_list_reverse(list_head *list) { + list_head *next=NULL, *tmp; + while (list) { + tmp=list->next; + list->next=next; + next=list; + list=tmp; + } + return next; +} +/* Function: core_list_mergesort + Sort the list in place without recursion. + + Description: + Use mergesort, as for linked list this is a realistic solution. + Also, since this is aimed at embedded, care was taken to use iterative rather than recursive algorithm. + The sort can either return the list to original order (by idx) , + or use the data item to invoke other algorithms and change the order of the list. + + Parameters: + list - list to be sorted. + cmp - cmp function to use + + Returns: + New head of the list. + + Note: + We have a special header for the list that will always be first, + but the algorithm could theoretically modify where the list starts. 
+ + */ +list_head *core_list_mergesort(list_head *list, list_cmp cmp, core_results *res) { + list_head *p, *q, *e, *tail; + ee_s32 insize, nmerges, psize, qsize, i; + + insize = 1; + + while (1) { + p = list; + list = NULL; + tail = NULL; + + nmerges = 0; /* count number of merges we do in this pass */ + + while (p) { + nmerges++; /* there exists a merge to be done */ + /* step `insize' places along from p */ + q = p; + psize = 0; + for (i = 0; i < insize; i++) { + psize++; + q = q->next; + if (!q) break; + } + + /* if q hasn't fallen off end, we have two lists to merge */ + qsize = insize; + + /* now we have two lists; merge them */ + while (psize > 0 || (qsize > 0 && q)) { + + /* decide whether next element of merge comes from p or q */ + if (psize == 0) { + /* p is empty; e must come from q. */ + e = q; q = q->next; qsize--; + } else if (qsize == 0 || !q) { + /* q is empty; e must come from p. */ + e = p; p = p->next; psize--; + } else if (cmp(p->info,q->info,res) <= 0) { + /* First element of p is lower (or same); e must come from p. */ + e = p; p = p->next; psize--; + } else { + /* First element of q is lower; e must come from q. */ + e = q; q = q->next; qsize--; + } + + /* add the next element to the merged list */ + if (tail) { + tail->next = e; + } else { + list = e; + } + tail = e; + } + + /* now p has stepped `insize' places along, and q has too */ + p = q; + } + + tail->next = NULL; + + /* If we have done only one merge, we're finished. 
*/ + if (nmerges <= 1) /* allow for nmerges==0, the empty list case */ + return list; + + /* Otherwise repeat, merging lists twice the size */ + insize *= 2; + } +#if COMPILER_REQUIRES_SORT_RETURN + return list; +#endif +} diff --git a/tests/performance/coremark/core_main.c b/tests/performance/coremark/core_main.c new file mode 100644 index 000000000..61619744e --- /dev/null +++ b/tests/performance/coremark/core_main.c @@ -0,0 +1,356 @@ +/* +Copyright 2018 Embedded Microprocessor Benchmark Consortium (EEMBC) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +Original Author: Shay Gal-on +*/ + +/* File: core_main.c + This file contains the framework to acquire a block of memory, seed initial parameters, run the benchmark and report the results. +*/ +#include "coremark.h" + +/* Function: iterate + Run the benchmark for a specified number of iterations. + + Operation: + For each type of benchmarked algorithm: + a - Initialize the data block for the algorithm. + b - Execute the algorithm N times. + + Returns: + NULL. 
+*/ +static ee_u16 list_known_crc[] = {(ee_u16)0xd4b0,(ee_u16)0x3340,(ee_u16)0x6a79,(ee_u16)0xe714,(ee_u16)0xe3c1}; +static ee_u16 matrix_known_crc[] = {(ee_u16)0xbe52,(ee_u16)0x1199,(ee_u16)0x5608,(ee_u16)0x1fd7,(ee_u16)0x0747}; +static ee_u16 state_known_crc[] = {(ee_u16)0x5e47,(ee_u16)0x39bf,(ee_u16)0xe5a4,(ee_u16)0x8e3a,(ee_u16)0x8d84}; +void *iterate(void *pres) { + ee_u32 i; + ee_u16 crc; + core_results *res=(core_results *)pres; + ee_u32 iterations=res->iterations; + res->crc=0; + res->crclist=0; + res->crcmatrix=0; + res->crcstate=0; + + for (i=0; icrc=crcu16(crc,res->crc); + crc=core_bench_list(res,-1); + res->crc=crcu16(crc,res->crc); + if (i==0) res->crclist=res->crc; + } + return NULL; +} + +#if (SEED_METHOD==SEED_ARG) +ee_s32 get_seed_args(int i, int argc, char *argv[]); +#define get_seed(x) (ee_s16)get_seed_args(x,argc,argv) +#define get_seed_32(x) get_seed_args(x,argc,argv) +#else /* via function or volatile */ +ee_s32 get_seed_32(int i); +#define get_seed(x) (ee_s16)get_seed_32(x) +#endif + +#if (MEM_METHOD==MEM_STATIC) +ee_u8 static_memblk[TOTAL_DATA_SIZE]; +#endif +char *mem_name[3] = {"Static","Heap","Stack"}; +/* Function: main + Main entry routine for the benchmark. + This function is responsible for the following steps: + + 1 - Initialize input seeds from a source that cannot be determined at compile time. + 2 - Initialize memory block for use. + 3 - Run and time the benchmark. + 4 - Report results, testing the validity of the output if the seeds are known. + + Arguments: + 1 - first seed : Any value + 2 - second seed : Must be identical to first for iterations to be identical + 3 - third seed : Any value, should be at least an order of magnitude less then the input size, but bigger then 32. 
+ 4 - Iterations : Special, if set to 0, iterations will be automatically determined such that the benchmark will run between 10 to 100 secs + +*/ + +#if MAIN_HAS_NOARGC +MAIN_RETURN_TYPE main(void) { + int argc=0; + char *argv[1]; +#else +MAIN_RETURN_TYPE main(int argc, char *argv[]) { +#endif + ee_u16 i,j=0,num_algorithms=0; + ee_s16 known_id=-1,total_errors=0; + ee_u16 seedcrc=0; + CORE_TICKS total_time; + core_results results[MULTITHREAD]; +#if (MEM_METHOD==MEM_STACK) + ee_u8 stack_memblock[TOTAL_DATA_SIZE*MULTITHREAD]; +#endif + /* first call any initializations needed */ + portable_init(&(results[0].port), &argc, argv); + /* First some checks to make sure benchmark will run ok */ + if (sizeof(struct list_head_s)>128) { + ee_printf("list_head structure too big for comparable data!\n"); + return MAIN_RETURN_VAL; + } + results[0].seed1=get_seed(1); + results[0].seed2=get_seed(2); + results[0].seed3=get_seed(3); + results[0].iterations=get_seed_32(4); +#if CORE_DEBUG + results[0].iterations=1; +#endif + results[0].execs=get_seed_32(5); + if (results[0].execs==0) { /* if not supplied, execute all algorithms */ + results[0].execs=ALL_ALGORITHMS_MASK; + } + /* put in some default values based on one seed only for easy testing */ + if ((results[0].seed1==0) && (results[0].seed2==0) && (results[0].seed3==0)) { /* validation run */ + results[0].seed1=0; + results[0].seed2=0; + results[0].seed3=0x66; + } + if ((results[0].seed1==1) && (results[0].seed2==0) && (results[0].seed3==0)) { /* perfromance run */ + results[0].seed1=0x3415; + results[0].seed2=0x3415; + results[0].seed3=0x66; + } +#if (MEM_METHOD==MEM_STATIC) + results[0].memblock[0]=(void *)static_memblk; + results[0].size=TOTAL_DATA_SIZE; + results[0].err=0; + #if (MULTITHREAD>1) + #error "Cannot use a static data area with multiple contexts!" 
+ #endif +#elif (MEM_METHOD==MEM_MALLOC) + for (i=0 ; i1) + if (default_num_contexts>MULTITHREAD) { + default_num_contexts=MULTITHREAD; + } + for (i=0 ; i=0) { + for (i=0 ; i 0) + ee_printf("Iterations/Sec : %f\n",default_num_contexts*results[0].iterations/time_in_secs(total_time)); +#else + ee_printf("Total time (secs): %d\n",time_in_secs(total_time)); + if (time_in_secs(total_time) > 0) + ee_printf("Iterations/Sec : %d\n",default_num_contexts*results[0].iterations/time_in_secs(total_time)); +#endif + if (time_in_secs(total_time) < 10) { + ee_printf("ERROR! Must execute for at least 10 secs for a valid result!\n"); + total_errors++; + } + + ee_printf("Iterations : %lu\n", (long unsigned) default_num_contexts*results[0].iterations); + ee_printf("Compiler version : %s\n",COMPILER_VERSION); + ee_printf("Compiler flags : %s\n",COMPILER_FLAGS); +#if (MULTITHREAD>1) + ee_printf("Parallel %s : %d\n",PARALLEL_METHOD,default_num_contexts); +#endif + ee_printf("Memory location : %s\n",MEM_LOCATION); + /* output for verification */ + ee_printf("seedcrc : 0x%04x\n",seedcrc); + if (results[0].execs & ID_LIST) + for (i=0 ; i1) + ee_printf(" / %d:%s",default_num_contexts,PARALLEL_METHOD); +#endif + ee_printf("\n"); + } +#endif + } + if (total_errors>0) + ee_printf("Errors detected\n"); + if (total_errors<0) + ee_printf("Cannot validate operation for these seed values, please compare with results on a known platform.\n"); + +#if (MEM_METHOD==MEM_MALLOC) + for (i=0 ; i>(from)) & (~(0xffffffff << (to)))) + +#if CORE_DEBUG +void printmat(MATDAT *A, ee_u32 N, char *name) { + ee_u32 i,j; + ee_printf("Matrix %s [%dx%d]:\n",name,N,N); + for (i=0; i N times, + changing the matrix values slightly by a constant amount each time. 
+*/
+ee_u16 core_bench_matrix(mat_params *p, ee_s16 seed, ee_u16 crc) { /* runs matrix_test on p's matrices and folds its result into crc */
+    ee_u32 N=p->N;
+    MATRES *C=p->C;
+    MATDAT *A=p->A;
+    MATDAT *B=p->B;
+    MATDAT val=(MATDAT)seed;
+
+    crc=crc16(matrix_test(N,C,A,B,val),crc);
+
+    return crc;
+}
+
+/* Function: matrix_test
+    Perform matrix manipulation.
+
+    Parameters:
+    N - Dimensions of the matrix.
+    C - memory for result matrix.
+    A - input matrix
+    B - operator matrix (not changed during operations)
+
+    Returns:
+    A CRC value that captures all results calculated in the function.
+    In particular, crc of the value calculated on the result matrix
+    after each step by matrix_sum.
+
+    Operation:
+
+    1 - Add a constant value to all elements of a matrix.
+    2 - Multiply a matrix by a constant.
+    3 - Multiply a matrix by a vector.
+    4 - Multiply a matrix by a matrix (matrix_mul_matrix, then matrix_mul_matrix_bitextract).
+    5 - Add a constant value to all elements of a matrix.
+
+    After the last step, matrix A is back to original contents.
+*/
+ee_s16 matrix_test(ee_u32 N, MATRES *C, MATDAT *A, MATDAT *B, MATDAT val) { /* note: crc is accumulated as ee_u16 but returned as ee_s16 */
+    ee_u16 crc=0;
+    MATDAT clipval=matrix_big(val);
+
+    matrix_add_const(N,A,val); /* make sure data changes  */
+#if CORE_DEBUG
+    printmat(A,N,"matrix_add_const");
+#endif
+    matrix_mul_const(N,C,A,val);
+    crc=crc16(matrix_sum(N,C,clipval),crc);
+#if CORE_DEBUG
+    printmatC(C,N,"matrix_mul_const");
+#endif
+    matrix_mul_vect(N,C,A,B);
+    crc=crc16(matrix_sum(N,C,clipval),crc);
+#if CORE_DEBUG
+    printmatC(C,N,"matrix_mul_vect");
+#endif
+    matrix_mul_matrix(N,C,A,B);
+    crc=crc16(matrix_sum(N,C,clipval),crc);
+#if CORE_DEBUG
+    printmatC(C,N,"matrix_mul_matrix");
+#endif
+    matrix_mul_matrix_bitextract(N,C,A,B);
+    crc=crc16(matrix_sum(N,C,clipval),crc);
+#if CORE_DEBUG
+    printmatC(C,N,"matrix_mul_matrix_bitextract");
+#endif
+
+    matrix_add_const(N,A,-val); /* return matrix to initial value */
+    return crc;
+}
+
+/* Function : core_init_matrix
+    Initialize the memory block for matrix benchmarking.
+
+    Parameters:
+    blksize - Size of memory to be initialized.
+    memblk - Pointer to memory block.
+ seed - Actual values chosen depend on the seed parameter. + p - pointers to containing initialized matrixes. + + Returns: + Matrix dimensions. + + Note: + The seed parameter MUST be supplied from a source that cannot be determined at compile time +*/ +ee_u32 core_init_matrix(ee_u32 blksize, void *memblk, ee_s32 seed, mat_params *p) { + ee_u32 N=0; + MATDAT *A; + MATDAT *B; + ee_s32 order=1; + MATDAT val; + ee_u32 i=0,j=0; + if (seed==0) + seed=1; + while (jA=A; + p->B=B; + p->C=(MATRES *)align_mem(B+N*N); + p->N=N; +#if CORE_DEBUG + printmat(A,N,"A"); + printmat(B,N,"B"); +#endif + return N; +} + +/* Function: matrix_sum + Calculate a function that depends on the values of elements in the matrix. + + For each element, accumulate into a temporary variable. + + As long as this value is under the parameter clipval, + add 1 to the result if the element is bigger then the previous. + + Otherwise, reset the accumulator and add 10 to the result. +*/ +ee_s16 matrix_sum(ee_u32 N, MATRES *C, MATDAT clipval) { + MATRES tmp=0,prev=0,cur=0; + ee_s16 ret=0; + ee_u32 i,j; + for (i=0; iclipval) { + ret+=10; + tmp=0; + } else { + ret += (cur>prev) ? 1 : 0; + } + prev=cur; + } + } + return ret; +} + +/* Function: matrix_mul_const + Multiply a matrix by a constant. + This could be used as a scaler for instance. 
+*/ +void matrix_mul_const(ee_u32 N, MATRES *C, MATDAT *A, MATDAT val) { + ee_u32 i,j; + for (i=0; i 1) +static uint8_t next_core = 0; +#endif + +#if VALIDATION_RUN + volatile ee_s32 seed1_volatile=0x3415; + volatile ee_s32 seed2_volatile=0x3415; + volatile ee_s32 seed3_volatile=0x66; +#endif +#if PERFORMANCE_RUN + volatile ee_s32 seed1_volatile=0x0; + volatile ee_s32 seed2_volatile=0x0; + volatile ee_s32 seed3_volatile=0x66; +#endif +#if PROFILE_RUN + volatile ee_s32 seed1_volatile=0x8; + volatile ee_s32 seed2_volatile=0x8; + volatile ee_s32 seed3_volatile=0x8; +#endif + volatile ee_s32 seed4_volatile=ITERATIONS; + volatile ee_s32 seed5_volatile=0; +/* Porting : Timing functions + How to capture time and convert to seconds must be ported to whatever is supported by the platform. + e.g. Read value from on board RTC, read value from cpu clock cycles performance counter etc. + Sample implementation for standard time.h and windows.h definitions included. +*/ +CORETIMETYPE barebones_clock() { + return Arduino_millis(); +} +/* Define : TIMER_RES_DIVIDER + Divider to trade off timer resolution and total time that can be measured. + + Use lower values to increase resolution, but make sure that overflow does not occur. + If there are issues with the return value overflowing, increase this value. + */ +#define CLOCKS_PER_SEC 1000.0 +#define TIMER_RES_DIVIDER 1 + +#define GETMYTIME(_t) (*_t=barebones_clock()) +#define MYTIMEDIFF(fin,ini) ((fin)-(ini)) +#define TIMER_RES_DIVIDER 1 +#define SAMPLE_TIME_IMPLEMENTATION 1 +#define EE_TICKS_PER_SEC (CLOCKS_PER_SEC / TIMER_RES_DIVIDER) + +/** Define Host specific (POSIX), or target specific global time variables. */ +static CORETIMETYPE start_time_val, stop_time_val; + +/* Function : start_time + This function will be called right before starting the timed portion of the benchmark. + + Implementation may be capturing a system timer (as implemented in the example code) + or zeroing some system parameters - e.g. 
setting the cpu clocks cycles to 0.
+*/
+void start_time(void) {
+    GETMYTIME(&start_time_val );
+}
+/* Function : stop_time
+    This function will be called right after ending the timed portion of the benchmark.
+
+    Implementation may be capturing a system timer (as implemented in the example code)
+    or other system parameters - e.g. reading the current value of cpu cycles counter.
+*/
+void stop_time(void) {
+    GETMYTIME(&stop_time_val );
+}
+/* Function : get_time
+    Return an abstract "ticks" number that signifies time on the system.
+
+    Actual value returned may be cpu cycles, milliseconds or any other value,
+    as long as it can be converted to seconds by time_in_secs.
+    This methodology is taken to accommodate any hardware or simulated platform.
+    The sample implementation returns millisecs by default,
+    and the resolution is controlled by TIMER_RES_DIVIDER
+*/
+CORE_TICKS get_time(void) {
+    CORE_TICKS elapsed=(CORE_TICKS)(MYTIMEDIFF(stop_time_val, start_time_val));
+    return elapsed;
+}
+/* Function : time_in_secs
+    Convert the value returned by get_time to seconds.
+
+    The secs_ret type is used to accommodate systems with no support for floating point.
+    Default implementation implemented by the EE_TICKS_PER_SEC macro above.
+*/
+secs_ret time_in_secs(CORE_TICKS ticks) {
+    secs_ret retval=((secs_ret)ticks) / (secs_ret)EE_TICKS_PER_SEC;
+    return retval;
+}
+
+ee_u32 default_num_contexts = MULTITHREAD;
+
+/* Function : portable_init
+    Target specific initialization code
+    Test for some common mistakes (type sizes); argc and argv are not used by this port.
+*/
+void portable_init(core_portable *p, int *argc, char *argv[])
+{
+    // Serial.begin(9600);
+    // #error "Call board initialization routines in portable init (if needed), in particular initialize UART!\n"
+    if (sizeof(ee_ptr_int) != sizeof(ee_u8 *)) {
+        ee_printf("ERROR! Please define ee_ptr_int to a type that holds a pointer!\n");
+    }
+    if (sizeof(ee_u32) != 4) {
+        ee_printf("ERROR! Please define ee_u32 to a 32b unsigned type!\n");
+    }
+    p->portable_id=1;
+}
+/* Function : portable_fini
+    Target specific final code
+*/
+void portable_fini(core_portable *p)
+{
+    p->portable_id=0;
+}
+
+void iterate_task(void *arg) /* FreeRTOS task entry: runs iterate() once, then deletes the calling task */
+{
+    iterate(arg);
+    vTaskDelete(NULL);
+}
+
+#if (MULTITHREAD > 1)
+ee_u8 core_start_parallel(core_results *res) /* spawns one iterate_task pinned to next_core and returns the xTaskCreatePinnedToCore result */
+{
+    int ret;
+    res->port.task = NULL;
+    ret = xTaskCreatePinnedToCore(iterate_task,      /* Function to implement the task */
+                                  "CoreMarkTask",    /* Name of the task */
+                                  10000,             /* Stack size in bytes (ESP-IDF FreeRTOS counts bytes, not words) */
+                                  (void *)res,       /* Task input parameter */
+                                  20,                /* Priority of the task */
+                                  &(res->port.task), /* Task handle */
+                                  next_core);        /* Core where the task should run */
+
+    next_core = (next_core + 1) % MULTITHREAD; /* round-robin over the available cores */
+    return (ee_u8) ret;
+}
+
+ee_u8 core_stop_parallel(core_results *res) /* blocks until the task started for res has deleted itself; always returns 0 */
+{
+    while (eTaskGetState(res->port.task) != eDeleted); /* NOTE(review): spins without yielding and polls a handle of a self-deleted task - confirm the handle stays valid */
+    res->port.task = NULL;
+    return 0;
+}
+#endif
+
diff --git a/tests/performance/coremark/core_portme.h b/tests/performance/coremark/core_portme.h
new file mode 100644
index 000000000..9511aafba
--- /dev/null
+++ b/tests/performance/coremark/core_portme.h
@@ -0,0 +1,217 @@
+#include "Arduino.h"
+#include
+#include
+
+// a minor hack to rename the main function, so we can call it from C++
+#define main(ignore) coremark_main(void)
+
+#define FLAGS_STR "(flags unknown)"
+
+#define PERFORMANCE_RUN 1
+
+// 0 means auto-detect number of iterations for 10 second test
+#define ITERATIONS 0
+
+/*
+Copyright 2018 Embedded Microprocessor Benchmark Consortium (EEMBC)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and +limitations under the License. + +Original Author: Shay Gal-on +*/ +/* Topic : Description + This file contains configuration constants required to execute on different platforms +*/ +#ifndef CORE_PORTME_H +#define CORE_PORTME_H +/************************/ +/* Data types and settings */ +/************************/ +/* Configuration : HAS_FLOAT + Define to 1 if the platform supports floating point. +*/ +#ifndef HAS_FLOAT +#define HAS_FLOAT 1 +#endif +/* Configuration : HAS_TIME_H + Define to 1 if platform has the time.h header file, + and implementation of functions thereof. +*/ +#ifndef HAS_TIME_H +#define HAS_TIME_H 0 +#endif +/* Configuration : USE_CLOCK + Define to 1 if platform has the time.h header file, + and implementation of functions thereof. +*/ +#ifndef USE_CLOCK +#define USE_CLOCK 0 +#endif +/* Configuration : HAS_STDIO + Define to 1 if the platform has stdio.h. +*/ +#ifndef HAS_STDIO +#define HAS_STDIO 1 +#endif +/* Configuration : HAS_PRINTF + Define to 1 if the platform has stdio.h and implements the printf function. +*/ +#ifndef HAS_PRINTF +#define HAS_PRINTF 0 +#endif + + +/* Definitions : COMPILER_VERSION, COMPILER_FLAGS, MEM_LOCATION + Initialize these strings per platform +*/ +#ifndef COMPILER_VERSION + #ifdef __GNUC__ + #define COMPILER_VERSION "GCC"__VERSION__ + #else + #define COMPILER_VERSION "Please put compiler version here (e.g. gcc 4.1)" + #endif +#endif +#ifndef COMPILER_FLAGS + #define COMPILER_FLAGS FLAGS_STR /* "Please put compiler flags here (e.g. -o3)" */ +#endif +#ifndef MEM_LOCATION + #define MEM_LOCATION "STACK" +#endif + +/* Data Types : + To avoid compiler issues, define the data types that need ot be used for 8b, 16b and 32b in . + + *Imprtant* : + ee_ptr_int needs to be the data type used to hold pointers, otherwise coremark may fail!!! 
+*/ +typedef int16_t ee_s16; +typedef uint16_t ee_u16; +typedef int32_t ee_s32; +typedef double ee_f32; +typedef uint8_t ee_u8; +typedef uint32_t ee_u32; +typedef uintptr_t ee_ptr_int; +typedef size_t ee_size_t; +#define NULL ((void *)0) +/* align_mem : + This macro is used to align an offset to point to a 32b value. It is used in the Matrix algorithm to initialize the input memory blocks. +*/ +#define align_mem(x) (void *)(4 + (((ee_ptr_int)(x) - 1) & ~3)) + +/* Configuration : CORE_TICKS + Define type of return from the timing functions. + */ +#define CORETIMETYPE ee_u32 +typedef ee_u32 CORE_TICKS; + +/* Configuration : SEED_METHOD + Defines method to get seed values that cannot be computed at compile time. + + Valid values : + SEED_ARG - from command line. + SEED_FUNC - from a system function. + SEED_VOLATILE - from volatile variables. +*/ +#ifndef SEED_METHOD +#define SEED_METHOD SEED_VOLATILE +#endif + +/* Configuration : MEM_METHOD + Defines method to get a block of memry. + + Valid values : + MEM_MALLOC - for platforms that implement malloc and have malloc.h. + MEM_STATIC - to use a static memory array. + MEM_STACK - to allocate the data block on the stack (NYI). +*/ +#ifndef MEM_METHOD +#define MEM_METHOD MEM_STACK +#endif + +/* Configuration : MULTITHREAD + Define for parallel execution + + Valid values : + 1 - only one context (default). + N>1 - will execute N copies in parallel. + + Note : + If this flag is defined to more then 1, an implementation for launching parallel contexts must be defined. + + Two sample implementations are provided. Use or to enable them. + + It is valid to have a different implementation of and in , + to fit a particular architecture. +*/ +#ifndef MULTITHREAD +#define MULTITHREAD CONFIG_SOC_CPU_CORES_NUM +#define PARALLEL_METHOD "FreeRTOS" +#define USE_PTHREAD 0 +#define USE_FORK 0 +#define USE_SOCKET 0 +#endif + +/* Configuration : MAIN_HAS_NOARGC + Needed if platform does not support getting arguments to main. 
+ + Valid values : + 0 - argc/argv to main is supported + 1 - argc/argv to main is not supported + + Note : + This flag only matters if MULTITHREAD has been defined to a value greater then 1. +*/ +#ifndef MAIN_HAS_NOARGC +#define MAIN_HAS_NOARGC 1 +#endif + +/* Configuration : MAIN_HAS_NORETURN + Needed if platform does not support returning a value from main. + + Valid values : + 0 - main returns an int, and return value will be 0. + 1 - platform does not support returning a value from main +*/ +#ifndef MAIN_HAS_NORETURN +#define MAIN_HAS_NORETURN 0 +#endif + +/* Variable : default_num_contexts + Not used for this simple port, must cintain the value 1. +*/ +extern ee_u32 default_num_contexts; + +typedef struct CORE_PORTABLE_S { +#if (MULTITHREAD > 1) + TaskHandle_t task; +#endif + ee_u8 portable_id; +} core_portable; + +/* target specific init/fini */ +void portable_init(core_portable *p, int *argc, char *argv[]); +void portable_fini(core_portable *p); + +#if !defined(PROFILE_RUN) && !defined(PERFORMANCE_RUN) && !defined(VALIDATION_RUN) +#if (TOTAL_DATA_SIZE==1200) +#define PROFILE_RUN 1 +#elif (TOTAL_DATA_SIZE==2000) +#define PERFORMANCE_RUN 1 +#else +#define VALIDATION_RUN 1 +#endif +#endif + +int ee_printf(const char *fmt, ...); + +#endif /* CORE_PORTME_H */ diff --git a/tests/performance/coremark/core_state.c b/tests/performance/coremark/core_state.c new file mode 100644 index 000000000..bb3193308 --- /dev/null +++ b/tests/performance/coremark/core_state.c @@ -0,0 +1,277 @@ +/* +Copyright 2018 Embedded Microprocessor Benchmark Consortium (EEMBC) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +Original Author: Shay Gal-on +*/ + +#include "coremark.h" +/* local functions */ +enum CORE_STATE core_state_transition( ee_u8 **instr , ee_u32 *transition_count); + +/* +Topic: Description + Simple state machines like this one are used in many embedded products. + + For more complex state machines, sometimes a state transition table implementation is used instead, + trading speed of direct coding for ease of maintenance. + + Since the main goal of using a state machine in CoreMark is to excercise the switch/if behaviour, + we are using a small moore machine. + + In particular, this machine tests type of string input, + trying to determine whether the input is a number or something else. + (see core_state.png). +*/ + +/* Function: core_bench_state + Benchmark function + + Go over the input twice, once direct, and once after introducing some corruption. 
+*/ +ee_u16 core_bench_state(ee_u32 blksize, ee_u8 *memblock, + ee_s16 seed1, ee_s16 seed2, ee_s16 step, ee_u16 crc) +{ + ee_u32 final_counts[NUM_CORE_STATES]; + ee_u32 track_counts[NUM_CORE_STATES]; + ee_u8 *p=memblock; + ee_u32 i; + + +#if CORE_DEBUG + ee_printf("State Bench: %d,%d,%d,%04x\n",seed1,seed2,step,crc); +#endif + for (i=0; i0) { + for(i=0;i>3) & 0x3]; + next=4; + break; + case 3: /* float */ + case 4: /* float */ + buf=floatpat[(seed>>3) & 0x3]; + next=8; + break; + case 5: /* scientific */ + case 6: /* scientific */ + buf=scipat[(seed>>3) & 0x3]; + next=8; + break; + case 7: /* invalid */ + buf=errpat[(seed>>3) & 0x3]; + next=8; + break; + default: /* Never happen, just to make some compilers happy */ + break; + } + } + size++; + while (total='0') & (c<='9')) ? 1 : 0; + return retval; +} + +/* Function: core_state_transition + Actual state machine. + + The state machine will continue scanning until either: + 1 - an invalid input is detcted. + 2 - a valid number has been detected. + + The input pointer is updated to point to the end of the token, and the end state is returned (either specific format determined or invalid). +*/ + +enum CORE_STATE core_state_transition( ee_u8 **instr , ee_u32 *transition_count) { + ee_u8 *str=*instr; + ee_u8 NEXT_SYMBOL; + enum CORE_STATE state=CORE_START; + for( ; *str && state != CORE_INVALID; str++ ) { + NEXT_SYMBOL = *str; + if (NEXT_SYMBOL==',') /* end of this input */ { + str++; + break; + } + switch(state) { + case CORE_START: + if(ee_isdigit(NEXT_SYMBOL)) { + state = CORE_INT; + } + else if( NEXT_SYMBOL == '+' || NEXT_SYMBOL == '-' ) { + state = CORE_S1; + } + else if( NEXT_SYMBOL == '.' ) { + state = CORE_FLOAT; + } + else { + state = CORE_INVALID; + transition_count[CORE_INVALID]++; + } + transition_count[CORE_START]++; + break; + case CORE_S1: + if(ee_isdigit(NEXT_SYMBOL)) { + state = CORE_INT; + transition_count[CORE_S1]++; + } + else if( NEXT_SYMBOL == '.' 
) { + state = CORE_FLOAT; + transition_count[CORE_S1]++; + } + else { + state = CORE_INVALID; + transition_count[CORE_S1]++; + } + break; + case CORE_INT: + if( NEXT_SYMBOL == '.' ) { + state = CORE_FLOAT; + transition_count[CORE_INT]++; + } + else if(!ee_isdigit(NEXT_SYMBOL)) { + state = CORE_INVALID; + transition_count[CORE_INT]++; + } + break; + case CORE_FLOAT: + if( NEXT_SYMBOL == 'E' || NEXT_SYMBOL == 'e' ) { + state = CORE_S2; + transition_count[CORE_FLOAT]++; + } + else if(!ee_isdigit(NEXT_SYMBOL)) { + state = CORE_INVALID; + transition_count[CORE_FLOAT]++; + } + break; + case CORE_S2: + if( NEXT_SYMBOL == '+' || NEXT_SYMBOL == '-' ) { + state = CORE_EXPONENT; + transition_count[CORE_S2]++; + } + else { + state = CORE_INVALID; + transition_count[CORE_S2]++; + } + break; + case CORE_EXPONENT: + if(ee_isdigit(NEXT_SYMBOL)) { + state = CORE_SCIENTIFIC; + transition_count[CORE_EXPONENT]++; + } + else { + state = CORE_INVALID; + transition_count[CORE_EXPONENT]++; + } + break; + case CORE_SCIENTIFIC: + if(!ee_isdigit(NEXT_SYMBOL)) { + state = CORE_INVALID; + transition_count[CORE_INVALID]++; + } + break; + default: + break; + } + } + *instr=str; + return state; +} diff --git a/tests/performance/coremark/core_util.c b/tests/performance/coremark/core_util.c new file mode 100644 index 000000000..581adcc24 --- /dev/null +++ b/tests/performance/coremark/core_util.c @@ -0,0 +1,210 @@ +/* +Copyright 2018 Embedded Microprocessor Benchmark Consortium (EEMBC) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and
+limitations under the License.
+
+Original Author: Shay Gal-on
+*/
+
+#include "coremark.h"
+/* Function: get_seed
+    Get values that cannot be determined at compile time.
+
+    Since different embedded systems and compilers are used, 3 different methods are provided:
+    1 - Using a volatile variable. This method is only valid if the compiler is forced to generate code that
+    reads the value of a volatile variable from memory at run time.
+    Please note, if using this method, you would need to modify core_portme.c to generate training profile.
+    2 - Command line arguments. This is the preferred method if command line arguments are supported.
+    3 - System function. If none of the first 2 methods is available on the platform,
+    a system function which is not a stub can be used.
+
+    e.g. read the value on GPIO pins connected to switches, or invoke special simulator functions.
+*/
+#if (SEED_METHOD==SEED_VOLATILE)
+    extern volatile ee_s32 seed1_volatile;
+    extern volatile ee_s32 seed2_volatile;
+    extern volatile ee_s32 seed3_volatile;
+    extern volatile ee_s32 seed4_volatile;
+    extern volatile ee_s32 seed5_volatile;
+    ee_s32 get_seed_32(int i) { /* returns seed<i>_volatile for i in 1..5, 0 for any other index */
+        ee_s32 retval;
+        switch (i) {
+            case 1:
+                retval=seed1_volatile;
+                break;
+            case 2:
+                retval=seed2_volatile;
+                break;
+            case 3:
+                retval=seed3_volatile;
+                break;
+            case 4:
+                retval=seed4_volatile;
+                break;
+            case 5:
+                retval=seed5_volatile;
+                break;
+            default:
+                retval=0;
+                break;
+        }
+        return retval;
+    }
+#elif (SEED_METHOD==SEED_ARG)
+ee_s32 parseval(char *valstring) { /* parses [-][0x]digits with an optional trailing K (x1024) or M (x1024*1024) */
+    ee_s32 retval=0;
+    ee_s32 neg=1;
+    int hexmode=0;
+    if (*valstring == '-') {
+        neg=-1;
+        valstring++;
+    }
+    if ((valstring[0] == '0') && (valstring[1] == 'x')) {
+        hexmode=1;
+        valstring+=2;
+    }
+    /* first look for digits (hex mode accepts lowercase a-f only) */
+    if (hexmode) {
+        while (((*valstring >= '0') && (*valstring <= '9')) || ((*valstring >= 'a') && (*valstring <= 'f'))) {
+            ee_s32 digit=*valstring-'0';
+            if (digit>9)
+                digit=10+*valstring-'a';
+            retval*=16;
+            retval+=digit;
+            valstring++;
+        }
+    } else {
+        while ((*valstring >= '0') && (*valstring <= '9')) {
+            ee_s32 digit=*valstring-'0';
+            retval*=10;
+            retval+=digit;
+            valstring++;
+        }
+    }
+    /* now add qualifiers */
+    if (*valstring=='K')
+        retval*=1024;
+    if (*valstring=='M')
+        retval*=1024*1024;
+
+    retval*=neg;
+    return retval;
+}
+
+ee_s32 get_seed_args(int i, int argc, char *argv[]) { /* parseval(argv[i]) when argc>i, otherwise 0 */
+    if (argc>i)
+        return parseval(argv[i]);
+    return 0;
+}
+
+#elif (SEED_METHOD==SEED_FUNC)
+/* If using OS based function, you must define and implement the functions below in core_portme.h and core_portme.c ! */
+ee_s32 get_seed_32(int i) { /* returns portme_sys<i>() for i in 1..5, 0 for any other index */
+    ee_s32 retval;
+    switch (i) {
+        case 1:
+            retval=portme_sys1();
+            break;
+        case 2:
+            retval=portme_sys2();
+            break;
+        case 3:
+            retval=portme_sys3();
+            break;
+        case 4:
+            retval=portme_sys4();
+            break;
+        case 5:
+            retval=portme_sys5();
+            break;
+        default:
+            retval=0;
+            break;
+    }
+    return retval;
+}
+#endif
+
+/* Function: crc*
+    Service functions to calculate 16b CRC code.
+
+*/
+ee_u16 crcu8(ee_u8 data, ee_u16 crc ) /* folds the 8 bits of data into crc, least significant bit first, and returns the updated crc */
+{
+    ee_u8 i=0,x16=0,carry=0;
+
+    for (i = 0; i < 8; i++)
+    {
+        x16 = (ee_u8)((data & 1) ^ ((ee_u8)crc & 1)); /* feedback bit: next data bit XOR low bit of crc */
+        data >>= 1;
+
+        if (x16 == 1)
+        {
+            crc ^= 0x4002; /* with the shift and the 0x8000 below this amounts to (crc >> 1) ^ 0xA001 */
+            carry = 1;
+        }
+        else
+            carry = 0;
+        crc >>= 1;
+        if (carry)
+            crc |= 0x8000;
+        else
+            crc &= 0x7fff;
+    }
+    return crc;
+}
+ee_u16 crcu16(ee_u16 newval, ee_u16 crc) { /* feeds the low byte, then the high byte, through crcu8 */
+    crc=crcu8( (ee_u8) (newval) ,crc);
+    crc=crcu8( (ee_u8) ((newval)>>8) ,crc);
+    return crc;
+}
+ee_u16 crcu32(ee_u32 newval, ee_u16 crc) { /* feeds the low 16 bits, then the high 16 bits, through crc16 */
+    crc=crc16((ee_s16) newval ,crc);
+    crc=crc16((ee_s16) (newval>>16) ,crc);
+    return crc;
+}
+ee_u16 crc16(ee_s16 newval, ee_u16 crc) { /* signed-input wrapper around crcu16 */
+    return crcu16((ee_u16)newval, crc);
+}
+
+ee_u8 check_data_types() { /* prints one error per ee_* type whose size is wrong and returns the number of mismatches */
+    ee_u8 retval=0;
+    if (sizeof(ee_u8) != 1) {
+        ee_printf("ERROR: ee_u8 is not an 8b datatype!\n");
+        retval++;
+    }
+    if (sizeof(ee_u16) != 2) {
+        ee_printf("ERROR: ee_u16 is not a 16b datatype!\n");
+        retval++;
+    }
+    if (sizeof(ee_s16) != 2) {
+        ee_printf("ERROR: ee_s16 is not a 16b datatype!\n");
+        retval++;
+    }
+    if (sizeof(ee_s32) != 4) {
+        ee_printf("ERROR: ee_s32 is not a 32b datatype!\n");
+        retval++;
+    }
+    if (sizeof(ee_u32) != 4) {
+        ee_printf("ERROR: ee_u32 is not a 32b datatype!\n");
+        retval++;
+    }
+    if (sizeof(ee_ptr_int) != sizeof(int *)) {
+        ee_printf("ERROR: ee_ptr_int is not a datatype that holds an int pointer!\n");
+        retval++;
+    }
+    if (retval>0) {
+        ee_printf("ERROR: Please modify the datatypes in core_portme.h!\n");
+    }
+    return retval;
+}
diff --git a/tests/performance/coremark/coremark.h b/tests/performance/coremark/coremark.h
new file mode 100644
index 000000000..dc9f8c7ae
--- /dev/null
+++ b/tests/performance/coremark/coremark.h
@@ -0,0 +1,174 @@
+/*
+Copyright 2018 Embedded Microprocessor Benchmark Consortium (EEMBC)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +Original Author: Shay Gal-on +*/ + +/* Topic: Description + This file contains declarations of the various benchmark functions. +*/ + +/* Configuration: TOTAL_DATA_SIZE + Define total size for data algorithms will operate on +*/ +#ifndef TOTAL_DATA_SIZE +#define TOTAL_DATA_SIZE 2*1000 +#endif + +#define SEED_ARG 0 +#define SEED_FUNC 1 +#define SEED_VOLATILE 2 + +#define MEM_STATIC 0 +#define MEM_MALLOC 1 +#define MEM_STACK 2 + +#include "core_portme.h" + +#if HAS_STDIO +#include +#endif +#if HAS_PRINTF +#define ee_printf printf +#endif + +/* Actual benchmark execution in iterate */ +void *iterate(void *pres); + +/* Typedef: secs_ret + For machines that have floating point support, get number of seconds as a double. + Otherwise an unsigned int. 
+*/ +#if HAS_FLOAT +typedef double secs_ret; +#else +typedef ee_u32 secs_ret; +#endif + +#if MAIN_HAS_NORETURN +#define MAIN_RETURN_VAL +#define MAIN_RETURN_TYPE void +#else +#define MAIN_RETURN_VAL 0 +#define MAIN_RETURN_TYPE int +#endif + +void start_time(void); +void stop_time(void); +CORE_TICKS get_time(void); +secs_ret time_in_secs(CORE_TICKS ticks); + +/* Misc useful functions */ +ee_u16 crcu8(ee_u8 data, ee_u16 crc); +ee_u16 crc16(ee_s16 newval, ee_u16 crc); +ee_u16 crcu16(ee_u16 newval, ee_u16 crc); +ee_u16 crcu32(ee_u32 newval, ee_u16 crc); +ee_u8 check_data_types(); +void *portable_malloc(ee_size_t size); +void portable_free(void *p); +ee_s32 parseval(char *valstring); + +/* Algorithm IDS */ +#define ID_LIST (1<<0) +#define ID_MATRIX (1<<1) +#define ID_STATE (1<<2) +#define ALL_ALGORITHMS_MASK (ID_LIST|ID_MATRIX|ID_STATE) +#define NUM_ALGORITHMS 3 + +/* list data structures */ +typedef struct list_data_s { + ee_s16 data16; + ee_s16 idx; +} list_data; + +typedef struct list_head_s { + struct list_head_s *next; + struct list_data_s *info; +} list_head; + + +/*matrix benchmark related stuff */ +#define MATDAT_INT 1 +#if MATDAT_INT +typedef ee_s16 MATDAT; +typedef ee_s32 MATRES; +#else +typedef ee_f16 MATDAT; +typedef ee_f32 MATRES; +#endif + +typedef struct MAT_PARAMS_S { + int N; + MATDAT *A; + MATDAT *B; + MATRES *C; +} mat_params; + +/* state machine related stuff */ +/* List of all the possible states for the FSM */ +typedef enum CORE_STATE { + CORE_START=0, + CORE_INVALID, + CORE_S1, + CORE_S2, + CORE_INT, + CORE_FLOAT, + CORE_EXPONENT, + CORE_SCIENTIFIC, + NUM_CORE_STATES +} core_state_e ; + + +/* Helper structure to hold results */ +typedef struct RESULTS_S { + /* inputs */ + ee_s16 seed1; /* Initializing seed */ + ee_s16 seed2; /* Initializing seed */ + ee_s16 seed3; /* Initializing seed */ + void *memblock[4]; /* Pointer to safe memory location */ + ee_u32 size; /* Size of the data */ + ee_u32 iterations; /* Number of iterations to execute */ + 
ee_u32 execs; /* Bitmask of operations to execute */ + struct list_head_s *list; + mat_params mat; + /* outputs */ + ee_u16 crc; + ee_u16 crclist; + ee_u16 crcmatrix; + ee_u16 crcstate; + ee_s16 err; + /* Multithread specific */ + core_portable port; +} core_results; + +/* Multicore execution handling */ +#if (MULTITHREAD>1) +ee_u8 core_start_parallel(core_results *res); +ee_u8 core_stop_parallel(core_results *res); +#endif + +/* list benchmark functions */ +list_head *core_list_init(ee_u32 blksize, list_head *memblock, ee_s16 seed); +ee_u16 core_bench_list(core_results *res, ee_s16 finder_idx); + +/* state benchmark functions */ +void core_init_state(ee_u32 size, ee_s16 seed, ee_u8 *p); +ee_u16 core_bench_state(ee_u32 blksize, ee_u8 *memblock, + ee_s16 seed1, ee_s16 seed2, ee_s16 step, ee_u16 crc); + +/* matrix benchmark functions */ +ee_u32 core_init_matrix(ee_u32 blksize, void *memblk, ee_s32 seed, mat_params *p); +ee_u16 core_bench_matrix(mat_params *p, ee_s16 seed, ee_u16 crc); + diff --git a/tests/performance/coremark/coremark.ino b/tests/performance/coremark/coremark.ino new file mode 100644 index 000000000..776db7874 --- /dev/null +++ b/tests/performance/coremark/coremark.ino @@ -0,0 +1,118 @@ +/* + CoreMark benchmark for ESP32 using Arduino's C++ environment with multithreading support. + + Based on https://github.com/PaulStoffregen/CoreMark/tree/master + Modified to run on ESP32 by Lucas Saavedra Vaz, 2024. +*/ + +#include +#include + +#include + +// Timeout for the task watchdog timer +#define TWDT_TIMEOUT_S 20 + +// Number of runs to average +#define N_RUNS 3 + +// A way to call the C-only coremark function from Arduino's C++ environment +extern "C" int coremark_main(void); + +void setup() { + Serial.begin(115200); + while (!Serial) { + delay(10); + } + + // To prevent the watchdog timer from resetting the ESP32 while running CoreMark we + // need to reconfigure it to have a longer timeout. 
+ esp_task_wdt_config_t config = { + .timeout_ms = TWDT_TIMEOUT_S * 1000, + .idle_core_mask = 0, + .trigger_panic = false, + }; + + esp_task_wdt_reconfigure(&config); + + log_d("Starting CoreMark test"); + Serial.printf("Runs: %d\n", N_RUNS); + Serial.printf("Cores: %d\n", CONFIG_SOC_CPU_CORES_NUM); + Serial.flush(); + for (int i = 0; i < N_RUNS; i++) { + Serial.printf("Run %d", i); + coremark_main(); + Serial.flush(); + } + log_d("CoreMark test finished"); +} + +void loop() { + vTaskDelete(NULL); +} + +// CoreMark calls this function to print results. Minimal printf: handles %%, %s, %c, %f and %d/%u/%x (optionally prefixed by 'l'); one leading '-' and any width digits are skipped; always returns 1. +extern "C" int ee_printf(const char *format, ...) { + va_list args; + va_start(args, format); + for (; *format; format++) { + if (*format == '%') { + bool islong = false; + format++; + if (*format == '%') { + Serial.print(*format); + continue; + } + if (*format == '-') { + format++; // ignore size + } + while (*format >= '0' && *format <= '9') { + format++; // ignore size + } + if (*format == 'l') { + islong = true; + format++; + } + if (*format == '\0') { + break; + } + if (*format == 's') { + Serial.print(va_arg(args, char *)); // fetch the pointer as a pointer; reading it as int is undefined behaviour + } else if (*format == 'f') { + Serial.print(va_arg(args, double)); + } else if (*format == 'd') { + if (islong) { + Serial.print(va_arg(args, long)); + } else { + Serial.print(va_arg(args, int)); + } + } else if (*format == 'u') { + if (islong) { + Serial.print(va_arg(args, unsigned long)); + } else { + Serial.print(va_arg(args, unsigned int)); + } + } else if (*format == 'x') { + if (islong) { + Serial.print(va_arg(args, unsigned long), HEX); + } else { + Serial.print(va_arg(args, unsigned int), HEX); + } + } else if (*format == 'c') { + Serial.print((char)va_arg(args, int)); // char arrives promoted to int; cast back so the character, not its code, is printed + } + } else { + if (*format == '\n') { + Serial.print('\r'); + } + Serial.print(*format); + } + } + va_end(args); + return 1; +} + +// CoreMark calls this function to measure elapsed time +extern "C" uint32_t Arduino_millis(void) { + return millis(); +} diff --git a/tests/performance/coremark/test_coremark.py 
b/tests/performance/coremark/test_coremark.py new file mode 100644 index 000000000..befd7c3a1 --- /dev/null +++ b/tests/performance/coremark/test_coremark.py @@ -0,0 +1,58 @@ +import json +import logging +import os + + +def test_coremark(dut, request): + LOGGER = logging.getLogger(__name__) + + # Match "Runs: %d" + res = dut.expect(r"Runs: (\d+)", timeout=60) + runs = int(res.group(0).decode("utf-8").split(" ")[1]) + LOGGER.info("Number of runs: {}".format(runs)) + assert runs > 0, "Invalid number of runs" + + # Match "Cores: %d" + res = dut.expect(r"Cores: (\d+)", timeout=60) + cores = int(res.group(0).decode("utf-8").split(" ")[1]) + LOGGER.info("Number of cores: {}".format(cores)) + assert cores > 0, "Invalid number of cores" + + total_score = 0 + + for i in range(runs): + # Match "Run %d" + res = dut.expect(r"Run (\d+)", timeout=120) + run = int(res.group(0).decode("utf-8").split(" ")[1]) + LOGGER.info("Run {}".format(run)) + assert run == i, "Invalid run number" + + score = 0 + # Match "CoreMark 1.0 : %d" + res = dut.expect(r"CoreMark 1.0 : (\d+)\.(\d+)", timeout=120) + score = float(res.group(0).decode("utf-8").split(" ")[3]) + LOGGER.info("CoreMark score: {}".format(score)) + assert score > 0 and score < 10000, "Impossible CoreMark score" + total_score += score + + avg_score = round(total_score / runs, 2) + LOGGER.info("Average CoreMark score: {}".format(avg_score)) + assert avg_score > 0 and avg_score < 10000, "Impossible CoreMark score" + + # Create JSON with results and write it to file + # Always create a JSON with this format (so it can be merged later on): + # { TEST_NAME_STR: TEST_RESULTS_DICT } + results = {"coremark": {"runs": runs, "cores": cores, "avg_score": avg_score}} + + current_folder = os.path.dirname(request.path) + file_index = 0 + report_file = os.path.join(current_folder, "result_coremark" + str(file_index) + ".json") + while os.path.exists(report_file): + report_file = report_file.replace(str(file_index) + ".json", str(file_index + 1) + 
".json") + file_index += 1 + + with open(report_file, "w") as f: + try: + f.write(json.dumps(results)) + except Exception as e: + LOGGER.warning("Failed to write results to file: {}".format(e)) diff --git a/tests/performance/fibonacci/fibonacci.ino b/tests/performance/fibonacci/fibonacci.ino new file mode 100644 index 000000000..01fd6f7be --- /dev/null +++ b/tests/performance/fibonacci/fibonacci.ino @@ -0,0 +1,48 @@ +/* + Fibonacci calculation test for Arduino and ESP32. + Created by Lucas Saavedra Vaz, 2024 +*/ + +#include + +// Number of runs to average +#define N_RUNS 3 + +// Fibonacci number to calculate. Keep between 35 and 45. +#define FIB_N 40 + +uint64_t fib(uint32_t n) { + if (n < 2) { + return n; + } + return fib(n - 1) + fib(n - 2); +} + +void setup() { + uint64_t fibonacci; + + Serial.begin(115200); + while (!Serial) { + delay(10); + } + + log_d("Starting fibonacci calculation"); + Serial.printf("Runs: %d\n", N_RUNS); + Serial.printf("N: %d\n", FIB_N); + Serial.flush(); + for (int i = 0; i < N_RUNS; i++) { + Serial.printf("Run %d", i); + unsigned long start = millis(); + fibonacci = fib(FIB_N); + unsigned long elapsed = millis() - start; + Serial.printf("Fibonacci(N): %llu\n", fibonacci); + Serial.printf("Time: %lu.%03lu s\n", elapsed / 1000, elapsed % 1000); + Serial.flush(); + } + + log_d("Fibonacci calculation test done"); +} + +void loop() { + vTaskDelete(NULL); +} diff --git a/tests/performance/fibonacci/test_fibonacci.py b/tests/performance/fibonacci/test_fibonacci.py new file mode 100644 index 000000000..622ea77ee --- /dev/null +++ b/tests/performance/fibonacci/test_fibonacci.py @@ -0,0 +1,78 @@ +import json +import logging +import os + + +def test_fibonacci(dut, request): + LOGGER = logging.getLogger(__name__) + + # Fibonacci results starting from fib(35) to fib(45) + fib_results = [ + 9227465, + 14930352, + 24157817, + 39088169, + 63245986, + 102334155, + 165580141, + 267914296, + 433494437, + 701408733, + ] + + # Match "Runs: %d" + res = 
dut.expect(r"Runs: (\d+)", timeout=60) + runs = int(res.group(0).decode("utf-8").split(" ")[1]) + LOGGER.info("Number of runs: {}".format(runs)) + assert runs > 0, "Invalid number of runs" + + # Match "N: %d" + res = dut.expect(r"N: (\d+)", timeout=300) + fib_n = int(res.group(0).decode("utf-8").split(" ")[1]) + LOGGER.info("Calculating Fibonacci({})".format(fib_n)) + assert 35 <= fib_n < 35 + len(fib_results), "Invalid Fibonacci number"  # fib_results[0] is fib(35) + + list_time = [] + + for i in range(runs): + # Match "Run %d" + res = dut.expect(r"Run (\d+)", timeout=120) + run = int(res.group(0).decode("utf-8").split(" ")[1]) + LOGGER.info("Run {}".format(run)) + assert run == i, "Invalid run number" + + # Match "Fibonacci(N): %llu" + res = dut.expect(r"Fibonacci\(N\): (\d+)", timeout=300) + fib_result = int(res.group(0).decode("utf-8").split(" ")[1]) + LOGGER.info("Fibonacci({}) = {}".format(fib_n, fib_result)) + assert fib_result > 0, "Invalid Fibonacci result" + + # Check if the result is correct + assert fib_result == fib_results[fib_n - 35] + + # Match "Time: %lu.%03lu s" + res = dut.expect(r"Time: (\d+)\.(\d+) s", timeout=300) + time = float(res.group(0).decode("utf-8").split(" ")[1]) + LOGGER.info("Time on run {}: {} s".format(i, time)) + assert time > 0 and time < 1000, "Invalid time" + list_time.append(time) + + avg_time = round(sum(list_time) / len(list_time), 3) + + # Create JSON with results and write it to file + # Always create a JSON with this format (so it can be merged later on): + # { TEST_NAME_STR: TEST_RESULTS_DICT } + results = {"fibonacci": {"runs": runs, "fib_n": fib_n, "avg_time": avg_time}} + + current_folder = os.path.dirname(request.path) + file_index = 0 + report_file = os.path.join(current_folder, "result_fibonacci" + str(file_index) + ".json") + while os.path.exists(report_file): + report_file = report_file.replace(str(file_index) + ".json", str(file_index + 1) + ".json") + file_index += 1 + + with open(report_file, "w") as f: + try: + f.write(json.dumps(results)) + 
except Exception as e: + LOGGER.warning("Failed to write results to file: {}".format(e)) diff --git a/tests/performance/psramspeed/.skip.esp32c3 b/tests/performance/psramspeed/.skip.esp32c3 new file mode 100644 index 000000000..e69de29bb diff --git a/tests/performance/psramspeed/.skip.esp32c6 b/tests/performance/psramspeed/.skip.esp32c6 new file mode 100644 index 000000000..e69de29bb diff --git a/tests/performance/psramspeed/.skip.esp32h2 b/tests/performance/psramspeed/.skip.esp32h2 new file mode 100644 index 000000000..e69de29bb diff --git a/tests/performance/psramspeed/psramspeed.ino b/tests/performance/psramspeed/psramspeed.ino new file mode 100644 index 000000000..81175e6b3 --- /dev/null +++ b/tests/performance/psramspeed/psramspeed.ino @@ -0,0 +1,266 @@ +/* + Based on the ramspeed test from NuttX. + https://github.com/apache/nuttx-apps/blob/master/benchmarks/ramspeed/ramspeed_main.c + Modified for Arduino and ESP32 by Lucas Saavedra Vaz, 2024 +*/ + +#include + +// Test settings + +// Number of runs to average +#define N_RUNS 3 + +// Value to fill the memory with +#define FILL_VALUE 0x00 + +// Number of copies to be performed in each test +#define N_COPIES 400 + +// Start size for the tests. Value must be a power of 2. +// Values lower or equal than 32 KB may cause the operations to use the cache instead of the PSRAM. +#define START_SIZE 65536 + +// Max size to be copied. 
Must be bigger than 32 and it will be floored to the nearest power of 2 +#define MAX_TEST_SIZE 512 * 1024 // 512KB + +// Implementation macros + +#if defined(UINTPTR_MAX) && UINTPTR_MAX > 0xFFFFFFFF +#define MEM_UNIT uint64_t +#define ALIGN_MASK 0x7 +#else +#define MEM_UNIT uint32_t +#define ALIGN_MASK 0x3 +#endif + +#define COPY32 \ + *d32 = *s32; \ + d32++; \ + s32++; +#define COPY8 \ + *d8 = *s8; \ + d8++; \ + s8++; +#define SET32(x) \ + *d32 = x; \ + d32++; +#define SET8(x) \ + *d8 = x; \ + d8++; +#define REPEAT8(expr) expr expr expr expr expr expr expr expr + +/* Functions */ + +static void *mock_memcpy(void *dst, const void *src, size_t len) { + uint8_t *d8 = (uint8_t *)dst; + const uint8_t *s8 = (uint8_t *)src; + + uintptr_t d_align = (uintptr_t)d8 & ALIGN_MASK; + uintptr_t s_align = (uintptr_t)s8 & ALIGN_MASK; + uint32_t *d32; + const uint32_t *s32; + + /* Byte copy for unaligned memories */ + + if (s_align != d_align) { + while (len > 32) { + REPEAT8(COPY8); + REPEAT8(COPY8); + REPEAT8(COPY8); + REPEAT8(COPY8); + len -= 32; + } + + while (len) { + COPY8; + len--; + } + + return dst; + } + + /* Make the memories aligned */ + + if (d_align) { + d_align = ALIGN_MASK + 1 - d_align; + while (d_align && len) { + COPY8; + d_align--; + len--; + } + } + + d32 = (uint32_t *)d8; + s32 = (uint32_t *)s8; + while (len > 32) { + REPEAT8(COPY32); + len -= 32; + } + + while (len > 4) { + COPY32; + len -= 4; + } + + d8 = (uint8_t *)d32; + s8 = (const uint8_t *)s32; + while (len) { + COPY8; + len--; + } + + return dst; +} + +static void mock_memset(void *dst, uint8_t v, size_t len) { + uint8_t *d8 = (uint8_t *)dst; + uintptr_t d_align = (uintptr_t)d8 & ALIGN_MASK; + uint32_t v32; + uint32_t *d32; + + /* Make the address aligned */ + + if (d_align) { + d_align = ALIGN_MASK + 1 - d_align; + while (d_align && len) { + SET8(v); + len--; + d_align--; + } + } + + v32 = (uint32_t)v + ((uint32_t)v << 8) + ((uint32_t)v << 16) + ((uint32_t)v << 24); + + d32 = (uint32_t *)d8; + + while 
(len > 32) { + REPEAT8(SET32(v32)); + len -= 32; + } + + while (len > 4) { + SET32(v32); + len -= 4; + } + + d8 = (uint8_t *)d32; + while (len) { + SET8(v); + len--; + } +} + +static void print_rate(const char *name, uint64_t bytes, uint32_t cost_time) { + uint32_t rate; + if (cost_time == 0) { + Serial.println("Error: Too little time taken, please increase N_COPIES"); + return; + } + + rate = bytes * 1000 / cost_time / 1024; + Serial.printf("%s Rate = %" PRIu32 " KB/s Time: %" PRIu32 " ms\n", name, rate, cost_time); +} + +static void memcpy_speed_test(void *dest, const void *src, size_t size, uint32_t repeat_cnt) { + uint32_t start_time; + uint32_t cost_time_system; + uint32_t cost_time_mock; + uint32_t cnt; + uint32_t step; + uint64_t total_size; + + for (step = START_SIZE; step <= size; step <<= 1) { + total_size = (uint64_t)step * (uint64_t)repeat_cnt; + + Serial.printf("Memcpy %" PRIu32 " Bytes test\n", step); + + start_time = millis(); + + for (cnt = 0; cnt < repeat_cnt; cnt++) { + memcpy(dest, src, step); + } + + cost_time_system = millis() - start_time; + + start_time = millis(); + + for (cnt = 0; cnt < repeat_cnt; cnt++) { + mock_memcpy(dest, src, step); + } + + cost_time_mock = millis() - start_time; + + print_rate("System memcpy():", total_size, cost_time_system); + print_rate("Mock memcpy():", total_size, cost_time_mock); + } +} + +static void memset_speed_test(void *dest, uint8_t value, size_t size, uint32_t repeat_num) { + uint32_t start_time; + uint32_t cost_time_system; + uint32_t cost_time_mock; + uint32_t cnt; + uint32_t step; + uint64_t total_size; + + for (step = START_SIZE; step <= size; step <<= 1) { + total_size = (uint64_t)step * (uint64_t)repeat_num; + + Serial.printf("Memset %" PRIu32 " Bytes test\n", step); + + start_time = millis(); + + for (cnt = 0; cnt < repeat_num; cnt++) { + memset(dest, value, step); + } + + cost_time_system = millis() - start_time; + + start_time = millis(); + + for (cnt = 0; cnt < repeat_num; cnt++) { + 
mock_memset(dest, value, step); + } + + cost_time_mock = millis() - start_time; + + print_rate("System memset():", total_size, cost_time_system); + print_rate("Mock memset():", total_size, cost_time_mock); + } +} + +/* Main */ + +void setup() { + Serial.begin(115200); + while (!Serial) { + delay(10); + } + + void *dest = ps_malloc(MAX_TEST_SIZE); + const void *src = ps_malloc(MAX_TEST_SIZE); + + if (!dest || !src) { + Serial.println("Memory allocation failed"); + return; + } + + log_d("Starting PSRAM speed test"); + Serial.printf("Runs: %d\n", N_RUNS); + Serial.printf("Copies: %d\n", N_COPIES); + Serial.printf("Max test size: %d\n", MAX_TEST_SIZE); + Serial.flush(); + for (int i = 0; i < N_RUNS; i++) { + Serial.printf("Run %d", i); + memcpy_speed_test(dest, src, MAX_TEST_SIZE, N_COPIES); + Serial.flush(); + memset_speed_test(dest, FILL_VALUE, MAX_TEST_SIZE, N_COPIES); + Serial.flush(); + } + log_d("PSRAM speed test done"); +} + +void loop() { + vTaskDelete(NULL); +} diff --git a/tests/performance/psramspeed/test_psramspeed.py b/tests/performance/psramspeed/test_psramspeed.py new file mode 100644 index 000000000..8d0515807 --- /dev/null +++ b/tests/performance/psramspeed/test_psramspeed.py @@ -0,0 +1,105 @@ +import json +import logging +import os + +from collections import defaultdict + + +def test_psramspeed(dut, request): + LOGGER = logging.getLogger(__name__) + + runs_results = [] + + # Match "Runs: %d" + res = dut.expect(r"Runs: (\d+)", timeout=60) + runs = int(res.group(0).decode("utf-8").split(" ")[1]) + LOGGER.info("Number of runs: {}".format(runs)) + assert runs > 0, "Invalid number of runs" + + # Match "Copies: %d" + res = dut.expect(r"Copies: (\d+)", timeout=60) + copies = int(res.group(0).decode("utf-8").split(" ")[1]) + LOGGER.info("Number of copies in each test: {}".format(copies)) + assert copies > 0, "Invalid number of copies" + + # Match "Max test size: %lu" + res = dut.expect(r"Max test size: (\d+)", timeout=60) + max_test_size = 
int(res.group(0).decode("utf-8").split(" ")[3]) + LOGGER.info("Max test size: {}".format(max_test_size)) + assert max_test_size > 0, "Invalid max test size" + + for i in range(runs): + # Match "Run %d" + res = dut.expect(r"Run (\d+)", timeout=120) + run = int(res.group(0).decode("utf-8").split(" ")[1]) + LOGGER.info("Run {}".format(run)) + assert run == i, "Invalid run number" + + for j in range(2): + while True: + # Match "Memcpy/Memset %d Bytes test" + res = dut.expect(r"(Memcpy|Memset) (\d+) Bytes test", timeout=60) + current_test = res.group(0).decode("utf-8").split(" ")[0].lower() + current_test_size = int(res.group(0).decode("utf-8").split(" ")[1]) + LOGGER.info("Current {} test size: {}".format(current_test, current_test_size)) + assert current_test_size > 0, "Invalid test size" + + for k in range(2): + # Match "System/Mock memcpy/memset(): Rate = %d KB/s Time: %d ms" or "Error: %s" + res = dut.expect( + r"((System|Mock) (memcpy|memset)\(\): Rate = (\d+) KB/s Time: (\d+) ms|^Error)", timeout=90 + ) + implementation = res.group(0).decode("utf-8").split(" ")[0].lower() + assert not implementation.startswith("error"), "Error detected in test output"  # pattern matches only "Error" + test_type = res.group(0).decode("utf-8").split(" ")[1].lower()[:-3] + rate = int(res.group(0).decode("utf-8").split(" ")[4]) + time = int(res.group(0).decode("utf-8").split(" ")[7]) + assert rate > 0, "Invalid rate" + assert time > 0, "Invalid time" + assert test_type == current_test, "Missing test output" + LOGGER.info("{} {}: Rate = {} KB/s. 
Time = {} ms".format(implementation, test_type, rate, time)) + + runs_results.append(((current_test, str(current_test_size), implementation), (rate, time))) + + if current_test_size == max_test_size: + break + + LOGGER.info("=============================================================") + + # Calculate average rate and time for each test size + sums = defaultdict(lambda: {"rate_sum": 0, "time_sum": 0}) + + for (test, size, impl), (rate, time) in runs_results: + sums[(test, size, impl)]["rate_sum"] += rate + sums[(test, size, impl)]["time_sum"] += time + + avg_results = {} + for (test, size, impl) in sums: + rate_avg = round(sums[(test, size, impl)]["rate_sum"] / runs, 2) + time_avg = round(sums[(test, size, impl)]["time_sum"] / runs, 2) + LOGGER.info( + "Test: {}-{}-{}: Average rate = {} KB/s. Average time = {} ms".format(test, size, impl, rate_avg, time_avg) + ) + if test not in avg_results: + avg_results[test] = {} + if size not in avg_results[test]: + avg_results[test][size] = {} + avg_results[test][size][impl] = {"avg_rate": rate_avg, "avg_time": time_avg} + + # Create JSON with results and write it to file + # Always create a JSON with this format (so it can be merged later on): + # { TEST_NAME_STR: TEST_RESULTS_DICT } + results = {"psramspeed": {"runs": runs, "copies": copies, "max_test_size": max_test_size, "results": avg_results}} + + current_folder = os.path.dirname(request.path) + file_index = 0 + report_file = os.path.join(current_folder, "result_psramspeed" + str(file_index) + ".json") + while os.path.exists(report_file): + report_file = report_file.replace(str(file_index) + ".json", str(file_index + 1) + ".json") + file_index += 1 + + with open(report_file, "w") as f: + try: + f.write(json.dumps(results)) + except Exception as e: + LOGGER.warning("Failed to write results to file: {}".format(e)) diff --git a/tests/performance/ramspeed/cfg.json b/tests/performance/ramspeed/cfg.json new file mode 100644 index 000000000..dc67d4016 --- /dev/null +++ 
b/tests/performance/ramspeed/cfg.json @@ -0,0 +1,40 @@ +{ + "targets": [ + { + "name": "esp32", + "fqbn":[ + "espressif:esp32:esp32:PSRAM=disabled,PartitionScheme=huge_app" + ] + }, + { + "name": "esp32s2", + "fqbn": [ + "espressif:esp32:esp32s2:PSRAM=disabled,PartitionScheme=huge_app" + ] + }, + { + "name": "esp32c3", + "fqbn": [ + "espressif:esp32:esp32c3:PartitionScheme=huge_app" + ] + }, + { + "name": "esp32s3", + "fqbn": [ + "espressif:esp32:esp32s3:PSRAM=disabled,USBMode=default,PartitionScheme=huge_app" + ] + }, + { + "name": "esp32c6", + "fqbn": [ + "espressif:esp32:esp32c6:PartitionScheme=huge_app" + ] + }, + { + "name": "esp32h2", + "fqbn": [ + "espressif:esp32:esp32h2:PartitionScheme=huge_app" + ] + } + ] +} diff --git a/tests/performance/ramspeed/ramspeed.ino b/tests/performance/ramspeed/ramspeed.ino new file mode 100644 index 000000000..e0ab0db4c --- /dev/null +++ b/tests/performance/ramspeed/ramspeed.ino @@ -0,0 +1,262 @@ +/* + Based on the ramspeed test from NuttX. + https://github.com/apache/nuttx-apps/blob/master/benchmarks/ramspeed/ramspeed_main.c + Modified for Arduino and ESP32 by Lucas Saavedra Vaz, 2024 +*/ + +#include + +// Test settings + +// Number of runs to average +#define N_RUNS 3 + +// Value to fill the memory with +#define FILL_VALUE 0x00 + +// Number of copies to be performed in each test +#define N_COPIES 50000 + +// Max size to be copied. 
Must be bigger than 32 and it will be floored to the nearest power of 2 +#define MAX_TEST_SIZE 64 * 1024 // 64KB + +// Implementation macros + +#if defined(UINTPTR_MAX) && UINTPTR_MAX > 0xFFFFFFFF +#define MEM_UNIT uint64_t +#define ALIGN_MASK 0x7 +#else +#define MEM_UNIT uint32_t +#define ALIGN_MASK 0x3 +#endif + +#define COPY32 \ + *d32 = *s32; \ + d32++; \ + s32++; +#define COPY8 \ + *d8 = *s8; \ + d8++; \ + s8++; +#define SET32(x) \ + *d32 = x; \ + d32++; +#define SET8(x) \ + *d8 = x; \ + d8++; +#define REPEAT8(expr) expr expr expr expr expr expr expr expr + +/* Functions */ + +static void *mock_memcpy(void *dst, const void *src, size_t len) { + uint8_t *d8 = (uint8_t *)dst; + const uint8_t *s8 = (uint8_t *)src; + + uintptr_t d_align = (uintptr_t)d8 & ALIGN_MASK; + uintptr_t s_align = (uintptr_t)s8 & ALIGN_MASK; + uint32_t *d32; + const uint32_t *s32; + + /* Byte copy for unaligned memories */ + + if (s_align != d_align) { + while (len > 32) { + REPEAT8(COPY8); + REPEAT8(COPY8); + REPEAT8(COPY8); + REPEAT8(COPY8); + len -= 32; + } + + while (len) { + COPY8; + len--; + } + + return dst; + } + + /* Make the memories aligned */ + + if (d_align) { + d_align = ALIGN_MASK + 1 - d_align; + while (d_align && len) { + COPY8; + d_align--; + len--; + } + } + + d32 = (uint32_t *)d8; + s32 = (uint32_t *)s8; + while (len > 32) { + REPEAT8(COPY32); + len -= 32; + } + + while (len > 4) { + COPY32; + len -= 4; + } + + d8 = (uint8_t *)d32; + s8 = (const uint8_t *)s32; + while (len) { + COPY8; + len--; + } + + return dst; +} + +static void mock_memset(void *dst, uint8_t v, size_t len) { + uint8_t *d8 = (uint8_t *)dst; + uintptr_t d_align = (uintptr_t)d8 & ALIGN_MASK; + uint32_t v32; + uint32_t *d32; + + /* Make the address aligned */ + + if (d_align) { + d_align = ALIGN_MASK + 1 - d_align; + while (d_align && len) { + SET8(v); + len--; + d_align--; + } + } + + v32 = (uint32_t)v + ((uint32_t)v << 8) + ((uint32_t)v << 16) + ((uint32_t)v << 24); + + d32 = (uint32_t *)d8; + + while 
(len > 32) { + REPEAT8(SET32(v32)); + len -= 32; + } + + while (len > 4) { + SET32(v32); + len -= 4; + } + + d8 = (uint8_t *)d32; + while (len) { + SET8(v); + len--; + } +} + +static void print_rate(const char *name, uint64_t bytes, uint32_t cost_time) { + uint32_t rate; + if (cost_time == 0) { + Serial.println("Error: Too little time taken, please increase N_COPIES"); + return; + } + + rate = bytes * 1000 / cost_time / 1024; + Serial.printf("%s Rate = %" PRIu32 " KB/s Time: %" PRIu32 " ms\n", name, rate, cost_time); +} + +static void memcpy_speed_test(void *dest, const void *src, size_t size, uint32_t repeat_cnt) { + uint32_t start_time; + uint32_t cost_time_system; + uint32_t cost_time_mock; + uint32_t cnt; + uint32_t step; + uint64_t total_size; + + for (step = 32; step <= size; step <<= 1) { + total_size = (uint64_t)step * (uint64_t)repeat_cnt; + + Serial.printf("Memcpy %" PRIu32 " Bytes test\n", step); + + start_time = millis(); + + for (cnt = 0; cnt < repeat_cnt; cnt++) { + memcpy(dest, src, step); + } + + cost_time_system = millis() - start_time; + + start_time = millis(); + + for (cnt = 0; cnt < repeat_cnt; cnt++) { + mock_memcpy(dest, src, step); + } + + cost_time_mock = millis() - start_time; + + print_rate("System memcpy():", total_size, cost_time_system); + print_rate("Mock memcpy():", total_size, cost_time_mock); + } +} + +static void memset_speed_test(void *dest, uint8_t value, size_t size, uint32_t repeat_num) { + uint32_t start_time; + uint32_t cost_time_system; + uint32_t cost_time_mock; + uint32_t cnt; + uint32_t step; + uint64_t total_size; + + for (step = 32; step <= size; step <<= 1) { + total_size = (uint64_t)step * (uint64_t)repeat_num; + + Serial.printf("Memset %" PRIu32 " Bytes test\n", step); + + start_time = millis(); + + for (cnt = 0; cnt < repeat_num; cnt++) { + memset(dest, value, step); + } + + cost_time_system = millis() - start_time; + + start_time = millis(); + + for (cnt = 0; cnt < repeat_num; cnt++) { + mock_memset(dest, value, 
step); + } + + cost_time_mock = millis() - start_time; + + print_rate("System memset():", total_size, cost_time_system); + print_rate("Mock memset():", total_size, cost_time_mock); + } +} + +/* Main */ + +void setup() { + Serial.begin(115200); + while (!Serial) { + delay(10); + } + + void *dest = malloc(MAX_TEST_SIZE); + const void *src = malloc(MAX_TEST_SIZE); + + if (!dest || !src) { + Serial.println("Memory allocation failed"); + return; + } + + log_d("Starting RAM speed test"); + Serial.printf("Runs: %d\n", N_RUNS); + Serial.printf("Copies: %d\n", N_COPIES); + Serial.printf("Max test size: %d\n", MAX_TEST_SIZE); + Serial.flush(); + for (int i = 0; i < N_RUNS; i++) { + Serial.printf("Run %d", i); + memcpy_speed_test(dest, src, MAX_TEST_SIZE, N_COPIES); + Serial.flush(); + memset_speed_test(dest, FILL_VALUE, MAX_TEST_SIZE, N_COPIES); + Serial.flush(); + } + log_d("RAM speed test done"); +} + +void loop() { + vTaskDelete(NULL); +} diff --git a/tests/performance/ramspeed/test_ramspeed.py b/tests/performance/ramspeed/test_ramspeed.py new file mode 100644 index 000000000..b4c3cee7f --- /dev/null +++ b/tests/performance/ramspeed/test_ramspeed.py @@ -0,0 +1,105 @@ +import json +import logging +import os + +from collections import defaultdict + + +def test_ramspeed(dut, request): + LOGGER = logging.getLogger(__name__) + + runs_results = [] + + # Match "Runs: %d" + res = dut.expect(r"Runs: (\d+)", timeout=60) + runs = int(res.group(0).decode("utf-8").split(" ")[1]) + LOGGER.info("Number of runs: {}".format(runs)) + assert runs > 0, "Invalid number of runs" + + # Match "Copies: %d" + res = dut.expect(r"Copies: (\d+)", timeout=60) + copies = int(res.group(0).decode("utf-8").split(" ")[1]) + LOGGER.info("Number of copies in each test: {}".format(copies)) + assert copies > 0, "Invalid number of copies" + + # Match "Max test size: %lu" + res = dut.expect(r"Max test size: (\d+)", timeout=60) + max_test_size = int(res.group(0).decode("utf-8").split(" ")[3]) + LOGGER.info("Max 
test size: {}".format(max_test_size)) + assert max_test_size > 0, "Invalid max test size" + + for i in range(runs): + # Match "Run %d" + res = dut.expect(r"Run (\d+)", timeout=120) + run = int(res.group(0).decode("utf-8").split(" ")[1]) + LOGGER.info("Run {}".format(run)) + assert run == i, "Invalid run number" + + for j in range(2): + while True: + # Match "Memcpy/Memset %d Bytes test" + res = dut.expect(r"(Memcpy|Memset) (\d+) Bytes test", timeout=60) + current_test = res.group(0).decode("utf-8").split(" ")[0].lower() + current_test_size = int(res.group(0).decode("utf-8").split(" ")[1]) + LOGGER.info("Current {} test size: {}".format(current_test, current_test_size)) + assert current_test_size > 0, "Invalid test size" + + for k in range(2): + # Match "System/Mock memcpy/memset(): Rate = %d KB/s Time: %d ms" or "Error: %s" + res = dut.expect( + r"((System|Mock) (memcpy|memset)\(\): Rate = (\d+) KB/s Time: (\d+) ms|^Error)", timeout=90 + ) + implementation = res.group(0).decode("utf-8").split(" ")[0].lower() + assert not implementation.startswith("error"), "Error detected in test output"  # pattern matches only "Error" + test_type = res.group(0).decode("utf-8").split(" ")[1].lower()[:-3] + rate = int(res.group(0).decode("utf-8").split(" ")[4]) + time = int(res.group(0).decode("utf-8").split(" ")[7]) + assert rate > 0, "Invalid rate" + assert time > 0, "Invalid time" + assert test_type == current_test, "Missing test output" + LOGGER.info("{} {}: Rate = {} KB/s. 
Time = {} ms".format(implementation, test_type, rate, time)) + + runs_results.append(((current_test, str(current_test_size), implementation), (rate, time))) + + if current_test_size == max_test_size: + break + + LOGGER.info("=============================================================") + + # Calculate average rate and time for each test size + sums = defaultdict(lambda: {"rate_sum": 0, "time_sum": 0}) + + for (test, size, impl), (rate, time) in runs_results: + sums[(test, size, impl)]["rate_sum"] += rate + sums[(test, size, impl)]["time_sum"] += time + + avg_results = {} + for (test, size, impl) in sums: + rate_avg = round(sums[(test, size, impl)]["rate_sum"] / runs, 2) + time_avg = round(sums[(test, size, impl)]["time_sum"] / runs, 2) + LOGGER.info( + "Test: {}-{}-{}: Average rate = {} KB/s. Average time = {} ms".format(test, size, impl, rate_avg, time_avg) + ) + if test not in avg_results: + avg_results[test] = {} + if size not in avg_results[test]: + avg_results[test][size] = {} + avg_results[test][size][impl] = {"avg_rate": rate_avg, "avg_time": time_avg} + + # Create JSON with results and write it to file + # Always create a JSON with this format (so it can be merged later on): + # { TEST_NAME_STR: TEST_RESULTS_DICT } + results = {"ramspeed": {"runs": runs, "copies": copies, "max_test_size": max_test_size, "results": avg_results}} + + current_folder = os.path.dirname(request.path) + file_index = 0 + report_file = os.path.join(current_folder, "result_ramspeed" + str(file_index) + ".json") + while os.path.exists(report_file): + report_file = report_file.replace(str(file_index) + ".json", str(file_index + 1) + ".json") + file_index += 1 + + with open(report_file, "w") as f: + try: + f.write(json.dumps(results)) + except Exception as e: + LOGGER.warning("Failed to write results to file: {}".format(e)) diff --git a/tests/performance/superpi/fftsg_h.cpp b/tests/performance/superpi/fftsg_h.cpp new file mode 100644 index 000000000..8361b5a57 --- /dev/null +++ 
b/tests/performance/superpi/fftsg_h.cpp @@ -0,0 +1,2329 @@ +/* + Based on "Calculation of PI(= 3.14159...) using FFT and AGM" by T.Ooura, Nov. 1999. + https://github.com/Fibonacci43/SuperPI + Modified for Arduino by Lucas Saavedra Vaz, 2024. +*/ + +#include + +void cdft(int n, int isgn, double *a) { + if (isgn >= 0) { + cftfsub(n, a); + } else { + cftbsub(n, a); + } +} + +void rdft(int n, int isgn, double *a) { + double xi; + + if (isgn >= 0) { + if (n > 4) { + cftfsub(n, a); + rftfsub(n, a); + } else if (n == 4) { + cftfsub(n, a); + } + xi = a[0] - a[1]; + a[0] += a[1]; + a[1] = xi; + } else { + a[1] = 0.5 * (a[0] - a[1]); + a[0] -= a[1]; + if (n > 4) { + rftbsub(n, a); + cftbsub(n, a); + } else if (n == 4) { + cftbsub(n, a); + } + } +} + +void ddct(int n, int isgn, double *a) { + int j; + double xr; + + if (isgn < 0) { + xr = a[n - 1]; + for (j = n - 2; j >= 2; j -= 2) { + a[j + 1] = a[j] - a[j - 1]; + a[j] += a[j - 1]; + } + a[1] = a[0] - xr; + a[0] += xr; + if (n > 4) { + rftbsub(n, a); + cftbsub(n, a); + } else if (n == 4) { + cftbsub(n, a); + } + } + if (n > 4) { + dctsub(n, a); + } else { + dctsub4(n, a); + } + if (isgn >= 0) { + if (n > 4) { + cftfsub(n, a); + rftfsub(n, a); + } else if (n == 4) { + cftfsub(n, a); + } + xr = a[0] - a[1]; + a[0] += a[1]; + for (j = 2; j < n; j += 2) { + a[j - 1] = a[j] - a[j + 1]; + a[j] += a[j + 1]; + } + a[n - 1] = xr; + } +} + +void ddst(int n, int isgn, double *a) { + int j; + double xr; + + if (isgn < 0) { + xr = a[n - 1]; + for (j = n - 2; j >= 2; j -= 2) { + a[j + 1] = -a[j] - a[j - 1]; + a[j] -= a[j - 1]; + } + a[1] = a[0] + xr; + a[0] -= xr; + if (n > 4) { + rftbsub(n, a); + cftbsub(n, a); + } else if (n == 4) { + cftbsub(n, a); + } + } + if (n > 4) { + dstsub(n, a); + } else { + dstsub4(n, a); + } + if (isgn >= 0) { + if (n > 4) { + cftfsub(n, a); + rftfsub(n, a); + } else if (n == 4) { + cftfsub(n, a); + } + xr = a[0] - a[1]; + a[0] += a[1]; + for (j = 2; j < n; j += 2) { + a[j - 1] = -a[j] - a[j + 1]; + a[j] -= 
a[j + 1]; + } + a[n - 1] = -xr; + } +} + +void dfct(int n, double *a) { + int j, k, m, mh; + double xr, xi, yr, yi, an; + + m = n >> 1; + for (j = 0; j < m; j++) { + k = n - j; + xr = a[j] + a[k]; + a[j] -= a[k]; + a[k] = xr; + } + an = a[n]; + while (m >= 2) { + ddct(m, 1, a); + bitrv1(m, a); + mh = m >> 1; + xi = a[m]; + a[m] = a[0]; + a[0] = an - xi; + an += xi; + for (j = 1; j < mh; j++) { + k = m - j; + xr = a[m + k]; + xi = a[m + j]; + yr = a[j]; + yi = a[k]; + a[m + j] = yr; + a[m + k] = yi; + a[j] = xr - xi; + a[k] = xr + xi; + } + xr = a[mh]; + a[mh] = a[m + mh]; + a[m + mh] = xr; + m = mh; + } + xi = a[1]; + a[1] = a[0]; + a[0] = an + xi; + a[n] = an - xi; + bitrv1(n, a); +} + +void dfst(int n, double *a) { + int j, k, m, mh; + double xr, xi, yr, yi; + + m = n >> 1; + for (j = 1; j < m; j++) { + k = n - j; + xr = a[j] - a[k]; + a[j] += a[k]; + a[k] = xr; + } + a[0] = a[m]; + while (m >= 2) { + ddst(m, 1, a); + bitrv1(m, a); + mh = m >> 1; + for (j = 1; j < mh; j++) { + k = m - j; + xr = a[m + k]; + xi = a[m + j]; + yr = a[j]; + yi = a[k]; + a[m + j] = yr; + a[m + k] = yi; + a[j] = xr + xi; + a[k] = xr - xi; + } + a[m] = a[0]; + a[0] = a[m + mh]; + a[m + mh] = a[mh]; + m = mh; + } + a[1] = a[0]; + a[0] = 0; + bitrv1(n, a); +} + +/* -------- child routines -------- */ + +void cftfsub(int n, double *a) { + int m; + + if (n > 32) { + m = n >> 2; + cftmdl1(n, a); + if (n > CDFT_RECURSIVE_N) { + cftrec1(m, a); + cftrec2(m, &a[m]); + cftrec1(m, &a[2 * m]); + cftrec1(m, &a[3 * m]); + } else if (m > 32) { + cftexp1(n, a); + } else { + cftfx41(n, a); + } + bitrv2(n, a); + } else if (n > 8) { + if (n == 32) { + cftf161(a); + bitrv216(a); + } else { + cftf081(a); + bitrv208(a); + } + } else if (n == 8) { + cftf040(a); + } else if (n == 4) { + cftx020(a); + } +} + +void cftbsub(int n, double *a) { + int m; + + if (n > 32) { + m = n >> 2; + cftb1st(n, a); + if (n > CDFT_RECURSIVE_N) { + cftrec1(m, a); + cftrec2(m, &a[m]); + cftrec1(m, &a[2 * m]); + cftrec1(m, &a[3 * 
m]); + } else if (m > 32) { + cftexp1(n, a); + } else { + cftfx41(n, a); + } + bitrv2conj(n, a); + } else if (n > 8) { + if (n == 32) { + cftf161(a); + bitrv216neg(a); + } else { + cftf081(a); + bitrv208neg(a); + } + } else if (n == 8) { + cftb040(a); + } else if (n == 4) { + cftx020(a); + } +} + +void bitrv2(int n, double *a) { + int j0, k0, j1, k1, l, m, i, j, k; + double xr, xi, yr, yi; + + l = n >> 2; + m = 2; + while (m < l) { + l >>= 1; + m <<= 1; + } + if (m == l) { + j0 = 0; + for (k0 = 0; k0 < m; k0 += 2) { + k = k0; + for (j = j0; j < j0 + k0; j += 2) { + xr = a[j]; + xi = a[j + 1]; + yr = a[k]; + yi = a[k + 1]; + a[j] = yr; + a[j + 1] = yi; + a[k] = xr; + a[k + 1] = xi; + j1 = j + m; + k1 = k + 2 * m; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += m; + k1 -= m; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += m; + k1 += 2 * m; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + for (i = n >> 1; i > (k ^= i); i >>= 1); + } + j1 = j0 + k0 + m; + k1 = j1 + m; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + for (i = n >> 1; i > (j0 ^= i); i >>= 1); + } + } else { + j0 = 0; + for (k0 = 2; k0 < m; k0 += 2) { + for (i = n >> 1; i > (j0 ^= i); i >>= 1); + k = k0; + for (j = j0; j < j0 + k0; j += 2) { + xr = a[j]; + xi = a[j + 1]; + yr = a[k]; + yi = a[k + 1]; + a[j] = yr; + a[j + 1] = yi; + a[k] = xr; + a[k + 1] = xi; + j1 = j + m; + k1 = k + m; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + for (i = n >> 1; i > (k ^= i); i >>= 1); + } + } + } +} + +void bitrv2conj(int n, double *a) { + int j0, k0, j1, k1, l, m, i, j, k; + double xr, xi, yr, yi; + + l 
= n >> 2; + m = 2; + while (m < l) { + l >>= 1; + m <<= 1; + } + if (m == l) { + j0 = 0; + for (k0 = 0; k0 < m; k0 += 2) { + k = k0; + for (j = j0; j < j0 + k0; j += 2) { + xr = a[j]; + xi = -a[j + 1]; + yr = a[k]; + yi = -a[k + 1]; + a[j] = yr; + a[j + 1] = yi; + a[k] = xr; + a[k + 1] = xi; + j1 = j + m; + k1 = k + 2 * m; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += m; + k1 -= m; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += m; + k1 += 2 * m; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + for (i = n >> 1; i > (k ^= i); i >>= 1); + } + k1 = j0 + k0; + a[k1 + 1] = -a[k1 + 1]; + j1 = k1 + m; + k1 = j1 + m; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + k1 += m; + a[k1 + 1] = -a[k1 + 1]; + for (i = n >> 1; i > (j0 ^= i); i >>= 1); + } + } else { + a[1] = -a[1]; + a[m + 1] = -a[m + 1]; + j0 = 0; + for (k0 = 2; k0 < m; k0 += 2) { + for (i = n >> 1; i > (j0 ^= i); i >>= 1); + k = k0; + for (j = j0; j < j0 + k0; j += 2) { + xr = a[j]; + xi = -a[j + 1]; + yr = a[k]; + yi = -a[k + 1]; + a[j] = yr; + a[j + 1] = yi; + a[k] = xr; + a[k + 1] = xi; + j1 = j + m; + k1 = k + m; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + for (i = n >> 1; i > (k ^= i); i >>= 1); + } + k1 = j0 + k0; + a[k1 + 1] = -a[k1 + 1]; + a[k1 + m + 1] = -a[k1 + m + 1]; + } + } +} + +void bitrv216(double *a) { + double x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i, x5r, x5i, x7r, x7i, x8r, x8i, x10r, x10i, x11r, x11i, x12r, x12i, x13r, x13i, x14r, x14i; + + x1r = a[2]; + x1i = a[3]; + x2r = a[4]; + x2i = a[5]; + x3r = a[6]; + x3i = a[7]; + x4r = a[8]; + x4i = a[9]; + x5r = a[10]; + 
x5i = a[11]; + x7r = a[14]; + x7i = a[15]; + x8r = a[16]; + x8i = a[17]; + x10r = a[20]; + x10i = a[21]; + x11r = a[22]; + x11i = a[23]; + x12r = a[24]; + x12i = a[25]; + x13r = a[26]; + x13i = a[27]; + x14r = a[28]; + x14i = a[29]; + a[2] = x8r; + a[3] = x8i; + a[4] = x4r; + a[5] = x4i; + a[6] = x12r; + a[7] = x12i; + a[8] = x2r; + a[9] = x2i; + a[10] = x10r; + a[11] = x10i; + a[14] = x14r; + a[15] = x14i; + a[16] = x1r; + a[17] = x1i; + a[20] = x5r; + a[21] = x5i; + a[22] = x13r; + a[23] = x13i; + a[24] = x3r; + a[25] = x3i; + a[26] = x11r; + a[27] = x11i; + a[28] = x7r; + a[29] = x7i; +} + +void bitrv216neg(double *a) { + double x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i, x5r, x5i, x6r, x6i, x7r, x7i, x8r, x8i, x9r, x9i, x10r, x10i, x11r, x11i, x12r, x12i, x13r, x13i, x14r, x14i, + x15r, x15i; + + x1r = a[2]; + x1i = a[3]; + x2r = a[4]; + x2i = a[5]; + x3r = a[6]; + x3i = a[7]; + x4r = a[8]; + x4i = a[9]; + x5r = a[10]; + x5i = a[11]; + x6r = a[12]; + x6i = a[13]; + x7r = a[14]; + x7i = a[15]; + x8r = a[16]; + x8i = a[17]; + x9r = a[18]; + x9i = a[19]; + x10r = a[20]; + x10i = a[21]; + x11r = a[22]; + x11i = a[23]; + x12r = a[24]; + x12i = a[25]; + x13r = a[26]; + x13i = a[27]; + x14r = a[28]; + x14i = a[29]; + x15r = a[30]; + x15i = a[31]; + a[2] = x15r; + a[3] = x15i; + a[4] = x7r; + a[5] = x7i; + a[6] = x11r; + a[7] = x11i; + a[8] = x3r; + a[9] = x3i; + a[10] = x13r; + a[11] = x13i; + a[12] = x5r; + a[13] = x5i; + a[14] = x9r; + a[15] = x9i; + a[16] = x1r; + a[17] = x1i; + a[18] = x14r; + a[19] = x14i; + a[20] = x6r; + a[21] = x6i; + a[22] = x10r; + a[23] = x10i; + a[24] = x2r; + a[25] = x2i; + a[26] = x12r; + a[27] = x12i; + a[28] = x4r; + a[29] = x4i; + a[30] = x8r; + a[31] = x8i; +} + +void bitrv208(double *a) { + double x1r, x1i, x3r, x3i, x4r, x4i, x6r, x6i; + + x1r = a[2]; + x1i = a[3]; + x3r = a[6]; + x3i = a[7]; + x4r = a[8]; + x4i = a[9]; + x6r = a[12]; + x6i = a[13]; + a[2] = x4r; + a[3] = x4i; + a[6] = x6r; + a[7] = x6i; + a[8] = x1r; + a[9] = x1i; + 
a[12] = x3r; + a[13] = x3i; +} + +void bitrv208neg(double *a) { + double x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i, x5r, x5i, x6r, x6i, x7r, x7i; + + x1r = a[2]; + x1i = a[3]; + x2r = a[4]; + x2i = a[5]; + x3r = a[6]; + x3i = a[7]; + x4r = a[8]; + x4i = a[9]; + x5r = a[10]; + x5i = a[11]; + x6r = a[12]; + x6i = a[13]; + x7r = a[14]; + x7i = a[15]; + a[2] = x7r; + a[3] = x7i; + a[4] = x3r; + a[5] = x3i; + a[6] = x5r; + a[7] = x5i; + a[8] = x1r; + a[9] = x1i; + a[10] = x6r; + a[11] = x6i; + a[12] = x2r; + a[13] = x2i; + a[14] = x4r; + a[15] = x4i; +} + +void bitrv1(int n, double *a) { + int j0, k0, j1, k1, l, m, i, j, k; + double x; + + l = n >> 2; + m = 1; + while (m < l) { + l >>= 1; + m <<= 1; + } + if (m == l) { + j0 = 0; + for (k0 = 0; k0 < m; k0++) { + k = k0; + for (j = j0; j < j0 + k0; j++) { + x = a[j]; + a[j] = a[k]; + a[k] = x; + j1 = j + m; + k1 = k + 2 * m; + x = a[j1]; + a[j1] = a[k1]; + a[k1] = x; + j1 += m; + k1 -= m; + x = a[j1]; + a[j1] = a[k1]; + a[k1] = x; + j1 += m; + k1 += 2 * m; + x = a[j1]; + a[j1] = a[k1]; + a[k1] = x; + for (i = n >> 1; i > (k ^= i); i >>= 1); + } + j1 = j0 + k0 + m; + k1 = j1 + m; + x = a[j1]; + a[j1] = a[k1]; + a[k1] = x; + for (i = n >> 1; i > (j0 ^= i); i >>= 1); + } + } else { + j0 = 0; + for (k0 = 1; k0 < m; k0++) { + for (i = n >> 1; i > (j0 ^= i); i >>= 1); + k = k0; + for (j = j0; j < j0 + k0; j++) { + x = a[j]; + a[j] = a[k]; + a[k] = x; + j1 = j + m; + k1 = k + m; + x = a[j1]; + a[j1] = a[k1]; + a[k1] = x; + for (i = n >> 1; i > (k ^= i); i >>= 1); + } + } + } +} + +void cftb1st(int n, double *a) { + int i, i0, j, j0, j1, j2, j3, m, mh; + double ew, w1r, w1i, wk1r, wk1i, wk3r, wk3i, wd1r, wd1i, wd3r, wd3i, ss1, ss3; + double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; + + mh = n >> 3; + m = 2 * mh; + j1 = m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[0] + a[j2]; + x0i = -a[1] - a[j2 + 1]; + x1r = a[0] - a[j2]; + x1i = -a[1] + a[j2 + 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = 
a[j1 + 1] - a[j3 + 1]; + a[0] = x0r + x2r; + a[1] = x0i - x2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i + x2i; + a[j2] = x1r + x3i; + a[j2 + 1] = x1i + x3r; + a[j3] = x1r - x3i; + a[j3 + 1] = x1i - x3r; + wd1r = 1; + wd1i = 0; + wd3r = 1; + wd3i = 0; + ew = M_PI_2 / m; + w1r = cos(2 * ew); + w1i = sin(2 * ew); + wk1r = w1r; + wk1i = w1i; + ss1 = 2 * w1i; + wk3i = 2 * ss1 * wk1r; + wk3r = wk1r - wk3i * wk1i; + wk3i = wk1i - wk3i * wk1r; + ss3 = 2 * wk3i; + i = 0; + for (;;) { + i0 = i + 4 * CDFT_LOOP_DIV; + if (i0 > mh - 4) { + i0 = mh - 4; + } + for (j = i + 2; j < i0; j += 4) { + wd1r -= ss1 * wk1i; + wd1i += ss1 * wk1r; + wd3r -= ss3 * wk3i; + wd3i += ss3 * wk3r; + j1 = j + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j] + a[j2]; + x0i = -a[j + 1] - a[j2 + 1]; + x1r = a[j] - a[j2]; + x1i = -a[j + 1] + a[j2 + 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + a[j] = x0r + x2r; + a[j + 1] = x0i - x2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i + x2i; + x0r = x1r + x3i; + x0i = x1i + x3r; + a[j2] = wk1r * x0r - wk1i * x0i; + a[j2 + 1] = wk1r * x0i + wk1i * x0r; + x0r = x1r - x3i; + x0i = x1i - x3r; + a[j3] = wk3r * x0r + wk3i * x0i; + a[j3 + 1] = wk3r * x0i - wk3i * x0r; + x0r = a[j + 2] + a[j2 + 2]; + x0i = -a[j + 3] - a[j2 + 3]; + x1r = a[j + 2] - a[j2 + 2]; + x1i = -a[j + 3] + a[j2 + 3]; + x2r = a[j1 + 2] + a[j3 + 2]; + x2i = a[j1 + 3] + a[j3 + 3]; + x3r = a[j1 + 2] - a[j3 + 2]; + x3i = a[j1 + 3] - a[j3 + 3]; + a[j + 2] = x0r + x2r; + a[j + 3] = x0i - x2i; + a[j1 + 2] = x0r - x2r; + a[j1 + 3] = x0i + x2i; + x0r = x1r + x3i; + x0i = x1i + x3r; + a[j2 + 2] = wd1r * x0r - wd1i * x0i; + a[j2 + 3] = wd1r * x0i + wd1i * x0r; + x0r = x1r - x3i; + x0i = x1i - x3r; + a[j3 + 2] = wd3r * x0r + wd3i * x0i; + a[j3 + 3] = wd3r * x0i - wd3i * x0r; + j0 = m - j; + j1 = j0 + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j0] + a[j2]; + x0i = -a[j0 + 1] - a[j2 + 1]; + x1r = a[j0] - a[j2]; + x1i = -a[j0 + 1] + a[j2 + 1]; + x2r = 
a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + a[j0] = x0r + x2r; + a[j0 + 1] = x0i - x2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i + x2i; + x0r = x1r + x3i; + x0i = x1i + x3r; + a[j2] = wk1i * x0r - wk1r * x0i; + a[j2 + 1] = wk1i * x0i + wk1r * x0r; + x0r = x1r - x3i; + x0i = x1i - x3r; + a[j3] = wk3i * x0r + wk3r * x0i; + a[j3 + 1] = wk3i * x0i - wk3r * x0r; + x0r = a[j0 - 2] + a[j2 - 2]; + x0i = -a[j0 - 1] - a[j2 - 1]; + x1r = a[j0 - 2] - a[j2 - 2]; + x1i = -a[j0 - 1] + a[j2 - 1]; + x2r = a[j1 - 2] + a[j3 - 2]; + x2i = a[j1 - 1] + a[j3 - 1]; + x3r = a[j1 - 2] - a[j3 - 2]; + x3i = a[j1 - 1] - a[j3 - 1]; + a[j0 - 2] = x0r + x2r; + a[j0 - 1] = x0i - x2i; + a[j1 - 2] = x0r - x2r; + a[j1 - 1] = x0i + x2i; + x0r = x1r + x3i; + x0i = x1i + x3r; + a[j2 - 2] = wd1i * x0r - wd1r * x0i; + a[j2 - 1] = wd1i * x0i + wd1r * x0r; + x0r = x1r - x3i; + x0i = x1i - x3r; + a[j3 - 2] = wd3i * x0r + wd3r * x0i; + a[j3 - 1] = wd3i * x0i - wd3r * x0r; + wk1r -= ss1 * wd1i; + wk1i += ss1 * wd1r; + wk3r -= ss3 * wd3i; + wk3i += ss3 * wd3r; + } + if (i0 == mh - 4) { + break; + } + wd1r = cos(ew * i0); + wd1i = sin(ew * i0); + wd3i = 4 * wd1i * wd1r; + wd3r = wd1r - wd3i * wd1i; + wd3i = wd1i - wd3i * wd1r; + wk1r = w1r * wd1r - w1i * wd1i; + wk1i = w1r * wd1i + w1i * wd1r; + wk3i = 4 * wk1i * wk1r; + wk3r = wk1r - wk3i * wk1i; + wk3i = wk1i - wk3i * wk1r; + i = i0; + } + wd1r -= ss1 * wk1i; + j0 = mh; + j1 = j0 + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j0 - 2] + a[j2 - 2]; + x0i = -a[j0 - 1] - a[j2 - 1]; + x1r = a[j0 - 2] - a[j2 - 2]; + x1i = -a[j0 - 1] + a[j2 - 1]; + x2r = a[j1 - 2] + a[j3 - 2]; + x2i = a[j1 - 1] + a[j3 - 1]; + x3r = a[j1 - 2] - a[j3 - 2]; + x3i = a[j1 - 1] - a[j3 - 1]; + a[j0 - 2] = x0r + x2r; + a[j0 - 1] = x0i - x2i; + a[j1 - 2] = x0r - x2r; + a[j1 - 1] = x0i + x2i; + x0r = x1r + x3i; + x0i = x1i + x3r; + a[j2 - 2] = wk1r * x0r - wk1i * x0i; + a[j2 - 1] = wk1r * x0i + wk1i * x0r; + x0r = x1r - x3i; + x0i = x1i - 
x3r; + a[j3 - 2] = wk3r * x0r + wk3i * x0i; + a[j3 - 1] = wk3r * x0i - wk3i * x0r; + x0r = a[j0] + a[j2]; + x0i = -a[j0 + 1] - a[j2 + 1]; + x1r = a[j0] - a[j2]; + x1i = -a[j0 + 1] + a[j2 + 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + a[j0] = x0r + x2r; + a[j0 + 1] = x0i - x2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i + x2i; + x0r = x1r + x3i; + x0i = x1i + x3r; + a[j2] = wd1r * (x0r - x0i); + a[j2 + 1] = wd1r * (x0i + x0r); + x0r = x1r - x3i; + x0i = x1i - x3r; + a[j3] = -wd1r * (x0r + x0i); + a[j3 + 1] = -wd1r * (x0i - x0r); + x0r = a[j0 + 2] + a[j2 + 2]; + x0i = -a[j0 + 3] - a[j2 + 3]; + x1r = a[j0 + 2] - a[j2 + 2]; + x1i = -a[j0 + 3] + a[j2 + 3]; + x2r = a[j1 + 2] + a[j3 + 2]; + x2i = a[j1 + 3] + a[j3 + 3]; + x3r = a[j1 + 2] - a[j3 + 2]; + x3i = a[j1 + 3] - a[j3 + 3]; + a[j0 + 2] = x0r + x2r; + a[j0 + 3] = x0i - x2i; + a[j1 + 2] = x0r - x2r; + a[j1 + 3] = x0i + x2i; + x0r = x1r + x3i; + x0i = x1i + x3r; + a[j2 + 2] = wk1i * x0r - wk1r * x0i; + a[j2 + 3] = wk1i * x0i + wk1r * x0r; + x0r = x1r - x3i; + x0i = x1i - x3r; + a[j3 + 2] = wk3i * x0r + wk3r * x0i; + a[j3 + 3] = wk3i * x0i - wk3r * x0r; +} + +void cftrec1(int n, double *a) { + int m; + + m = n >> 2; + cftmdl1(n, a); + if (n > CDFT_RECURSIVE_N) { + cftrec1(m, a); + cftrec2(m, &a[m]); + cftrec1(m, &a[2 * m]); + cftrec1(m, &a[3 * m]); + } else { + cftexp1(n, a); + } +} + +void cftrec2(int n, double *a) { + int m; + + m = n >> 2; + cftmdl2(n, a); + if (n > CDFT_RECURSIVE_N) { + cftrec1(m, a); + cftrec2(m, &a[m]); + cftrec1(m, &a[2 * m]); + cftrec2(m, &a[3 * m]); + } else { + cftexp2(n, a); + } +} + +void cftexp1(int n, double *a) { + int j, k, l; + + l = n >> 2; + while (l > 128) { + for (k = l; k < n; k <<= 2) { + for (j = k - l; j < n; j += 4 * k) { + cftmdl1(l, &a[j]); + cftmdl2(l, &a[k + j]); + cftmdl1(l, &a[2 * k + j]); + } + } + cftmdl1(l, &a[n - l]); + l >>= 2; + } + for (k = l; k < n; k <<= 2) { + for (j = k - l; j < n; j += 4 * k) { + 
cftmdl1(l, &a[j]); + cftfx41(l, &a[j]); + cftmdl2(l, &a[k + j]); + cftfx42(l, &a[k + j]); + cftmdl1(l, &a[2 * k + j]); + cftfx41(l, &a[2 * k + j]); + } + } + cftmdl1(l, &a[n - l]); + cftfx41(l, &a[n - l]); +} + +void cftexp2(int n, double *a) { + int j, k, l, m; + + m = n >> 1; + l = n >> 2; + while (l > 128) { + for (k = l; k < m; k <<= 2) { + for (j = k - l; j < m; j += 2 * k) { + cftmdl1(l, &a[j]); + cftmdl1(l, &a[m + j]); + } + for (j = 2 * k - l; j < m; j += 4 * k) { + cftmdl2(l, &a[j]); + cftmdl2(l, &a[m + j]); + } + } + l >>= 2; + } + for (k = l; k < m; k <<= 2) { + for (j = k - l; j < m; j += 2 * k) { + cftmdl1(l, &a[j]); + cftfx41(l, &a[j]); + cftmdl1(l, &a[m + j]); + cftfx41(l, &a[m + j]); + } + for (j = 2 * k - l; j < m; j += 4 * k) { + cftmdl2(l, &a[j]); + cftfx42(l, &a[j]); + cftmdl2(l, &a[m + j]); + cftfx42(l, &a[m + j]); + } + } +} + +void cftmdl1(int n, double *a) { + int i, i0, j, j0, j1, j2, j3, m, mh; + double ew, w1r, w1i, wk1r, wk1i, wk3r, wk3i, wd1r, wd1i, wd3r, wd3i, ss1, ss3; + double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; + + mh = n >> 3; + m = 2 * mh; + j1 = m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[0] + a[j2]; + x0i = a[1] + a[j2 + 1]; + x1r = a[0] - a[j2]; + x1i = a[1] - a[j2 + 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + a[0] = x0r + x2r; + a[1] = x0i + x2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i - x2i; + a[j2] = x1r - x3i; + a[j2 + 1] = x1i + x3r; + a[j3] = x1r + x3i; + a[j3 + 1] = x1i - x3r; + wd1r = 1; + wd1i = 0; + wd3r = 1; + wd3i = 0; + ew = M_PI_2 / m; + w1r = cos(2 * ew); + w1i = sin(2 * ew); + wk1r = w1r; + wk1i = w1i; + ss1 = 2 * w1i; + wk3i = 2 * ss1 * wk1r; + wk3r = wk1r - wk3i * wk1i; + wk3i = wk1i - wk3i * wk1r; + ss3 = 2 * wk3i; + i = 0; + for (;;) { + i0 = i + 4 * CDFT_LOOP_DIV; + if (i0 > mh - 4) { + i0 = mh - 4; + } + for (j = i + 2; j < i0; j += 4) { + wd1r -= ss1 * wk1i; + wd1i += ss1 * wk1r; + wd3r -= ss3 * wk3i; + wd3i += ss3 * wk3r; + j1 = j + 
m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j] + a[j2]; + x0i = a[j + 1] + a[j2 + 1]; + x1r = a[j] - a[j2]; + x1i = a[j + 1] - a[j2 + 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + a[j] = x0r + x2r; + a[j + 1] = x0i + x2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j2] = wk1r * x0r - wk1i * x0i; + a[j2 + 1] = wk1r * x0i + wk1i * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j3] = wk3r * x0r + wk3i * x0i; + a[j3 + 1] = wk3r * x0i - wk3i * x0r; + x0r = a[j + 2] + a[j2 + 2]; + x0i = a[j + 3] + a[j2 + 3]; + x1r = a[j + 2] - a[j2 + 2]; + x1i = a[j + 3] - a[j2 + 3]; + x2r = a[j1 + 2] + a[j3 + 2]; + x2i = a[j1 + 3] + a[j3 + 3]; + x3r = a[j1 + 2] - a[j3 + 2]; + x3i = a[j1 + 3] - a[j3 + 3]; + a[j + 2] = x0r + x2r; + a[j + 3] = x0i + x2i; + a[j1 + 2] = x0r - x2r; + a[j1 + 3] = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j2 + 2] = wd1r * x0r - wd1i * x0i; + a[j2 + 3] = wd1r * x0i + wd1i * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j3 + 2] = wd3r * x0r + wd3i * x0i; + a[j3 + 3] = wd3r * x0i - wd3i * x0r; + j0 = m - j; + j1 = j0 + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j0] + a[j2]; + x0i = a[j0 + 1] + a[j2 + 1]; + x1r = a[j0] - a[j2]; + x1i = a[j0 + 1] - a[j2 + 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + a[j0] = x0r + x2r; + a[j0 + 1] = x0i + x2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j2] = wk1i * x0r - wk1r * x0i; + a[j2 + 1] = wk1i * x0i + wk1r * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j3] = wk3i * x0r + wk3r * x0i; + a[j3 + 1] = wk3i * x0i - wk3r * x0r; + x0r = a[j0 - 2] + a[j2 - 2]; + x0i = a[j0 - 1] + a[j2 - 1]; + x1r = a[j0 - 2] - a[j2 - 2]; + x1i = a[j0 - 1] - a[j2 - 1]; + x2r = a[j1 - 2] + a[j3 - 2]; + x2i = a[j1 - 1] + a[j3 - 1]; + x3r = a[j1 - 2] - a[j3 - 2]; + x3i = a[j1 - 1] - a[j3 - 1]; + a[j0 - 2] = x0r + x2r; + 
a[j0 - 1] = x0i + x2i; + a[j1 - 2] = x0r - x2r; + a[j1 - 1] = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j2 - 2] = wd1i * x0r - wd1r * x0i; + a[j2 - 1] = wd1i * x0i + wd1r * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j3 - 2] = wd3i * x0r + wd3r * x0i; + a[j3 - 1] = wd3i * x0i - wd3r * x0r; + wk1r -= ss1 * wd1i; + wk1i += ss1 * wd1r; + wk3r -= ss3 * wd3i; + wk3i += ss3 * wd3r; + } + if (i0 == mh - 4) { + break; + } + wd1r = cos(ew * i0); + wd1i = sin(ew * i0); + wd3i = 4 * wd1i * wd1r; + wd3r = wd1r - wd3i * wd1i; + wd3i = wd1i - wd3i * wd1r; + wk1r = w1r * wd1r - w1i * wd1i; + wk1i = w1r * wd1i + w1i * wd1r; + wk3i = 4 * wk1i * wk1r; + wk3r = wk1r - wk3i * wk1i; + wk3i = wk1i - wk3i * wk1r; + i = i0; + } + wd1r -= ss1 * wk1i; + j0 = mh; + j1 = j0 + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j0 - 2] + a[j2 - 2]; + x0i = a[j0 - 1] + a[j2 - 1]; + x1r = a[j0 - 2] - a[j2 - 2]; + x1i = a[j0 - 1] - a[j2 - 1]; + x2r = a[j1 - 2] + a[j3 - 2]; + x2i = a[j1 - 1] + a[j3 - 1]; + x3r = a[j1 - 2] - a[j3 - 2]; + x3i = a[j1 - 1] - a[j3 - 1]; + a[j0 - 2] = x0r + x2r; + a[j0 - 1] = x0i + x2i; + a[j1 - 2] = x0r - x2r; + a[j1 - 1] = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j2 - 2] = wk1r * x0r - wk1i * x0i; + a[j2 - 1] = wk1r * x0i + wk1i * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j3 - 2] = wk3r * x0r + wk3i * x0i; + a[j3 - 1] = wk3r * x0i - wk3i * x0r; + x0r = a[j0] + a[j2]; + x0i = a[j0 + 1] + a[j2 + 1]; + x1r = a[j0] - a[j2]; + x1i = a[j0 + 1] - a[j2 + 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + a[j0] = x0r + x2r; + a[j0 + 1] = x0i + x2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j2] = wd1r * (x0r - x0i); + a[j2 + 1] = wd1r * (x0i + x0r); + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j3] = -wd1r * (x0r + x0i); + a[j3 + 1] = -wd1r * (x0i - x0r); + x0r = a[j0 + 2] + a[j2 + 2]; + x0i = a[j0 + 3] + a[j2 + 3]; + x1r = a[j0 + 2] - a[j2 + 2]; + x1i 
= a[j0 + 3] - a[j2 + 3]; + x2r = a[j1 + 2] + a[j3 + 2]; + x2i = a[j1 + 3] + a[j3 + 3]; + x3r = a[j1 + 2] - a[j3 + 2]; + x3i = a[j1 + 3] - a[j3 + 3]; + a[j0 + 2] = x0r + x2r; + a[j0 + 3] = x0i + x2i; + a[j1 + 2] = x0r - x2r; + a[j1 + 3] = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j2 + 2] = wk1i * x0r - wk1r * x0i; + a[j2 + 3] = wk1i * x0i + wk1r * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j3 + 2] = wk3i * x0r + wk3r * x0i; + a[j3 + 3] = wk3i * x0i - wk3r * x0r; +} + +void cftmdl2(int n, double *a) { + int i, i0, j, j0, j1, j2, j3, m, mh; + double ew, w1r, w1i, wn4r, wk1r, wk1i, wk3r, wk3i, wl1r, wl1i, wl3r, wl3i, wd1r, wd1i, wd3r, wd3i, we1r, we1i, we3r, we3i, ss1, ss3; + double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, y0r, y0i, y2r, y2i; + + mh = n >> 3; + m = 2 * mh; + wn4r = WR5000; + j1 = m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[0] - a[j2 + 1]; + x0i = a[1] + a[j2]; + x1r = a[0] + a[j2 + 1]; + x1i = a[1] - a[j2]; + x2r = a[j1] - a[j3 + 1]; + x2i = a[j1 + 1] + a[j3]; + x3r = a[j1] + a[j3 + 1]; + x3i = a[j1 + 1] - a[j3]; + y0r = wn4r * (x2r - x2i); + y0i = wn4r * (x2i + x2r); + a[0] = x0r + y0r; + a[1] = x0i + y0i; + a[j1] = x0r - y0r; + a[j1 + 1] = x0i - y0i; + y0r = wn4r * (x3r - x3i); + y0i = wn4r * (x3i + x3r); + a[j2] = x1r - y0i; + a[j2 + 1] = x1i + y0r; + a[j3] = x1r + y0i; + a[j3 + 1] = x1i - y0r; + wl1r = 1; + wl1i = 0; + wl3r = 1; + wl3i = 0; + we1r = wn4r; + we1i = wn4r; + we3r = -wn4r; + we3i = -wn4r; + ew = M_PI_2 / (2 * m); + w1r = cos(2 * ew); + w1i = sin(2 * ew); + wk1r = w1r; + wk1i = w1i; + wd1r = wn4r * (w1r - w1i); + wd1i = wn4r * (w1i + w1r); + ss1 = 2 * w1i; + wk3i = 2 * ss1 * wk1r; + wk3r = wk1r - wk3i * wk1i; + wk3i = wk1i - wk3i * wk1r; + ss3 = 2 * wk3i; + wd3r = -wn4r * (wk3r - wk3i); + wd3i = -wn4r * (wk3i + wk3r); + i = 0; + for (;;) { + i0 = i + 4 * CDFT_LOOP_DIV; + if (i0 > mh - 4) { + i0 = mh - 4; + } + for (j = i + 2; j < i0; j += 4) { + wl1r -= ss1 * wk1i; + wl1i += ss1 * wk1r; + wl3r -= ss3 * wk3i; + wl3i += ss3 * 
wk3r; + we1r -= ss1 * wd1i; + we1i += ss1 * wd1r; + we3r -= ss3 * wd3i; + we3i += ss3 * wd3r; + j1 = j + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j] - a[j2 + 1]; + x0i = a[j + 1] + a[j2]; + x1r = a[j] + a[j2 + 1]; + x1i = a[j + 1] - a[j2]; + x2r = a[j1] - a[j3 + 1]; + x2i = a[j1 + 1] + a[j3]; + x3r = a[j1] + a[j3 + 1]; + x3i = a[j1 + 1] - a[j3]; + y0r = wk1r * x0r - wk1i * x0i; + y0i = wk1r * x0i + wk1i * x0r; + y2r = wd1r * x2r - wd1i * x2i; + y2i = wd1r * x2i + wd1i * x2r; + a[j] = y0r + y2r; + a[j + 1] = y0i + y2i; + a[j1] = y0r - y2r; + a[j1 + 1] = y0i - y2i; + y0r = wk3r * x1r + wk3i * x1i; + y0i = wk3r * x1i - wk3i * x1r; + y2r = wd3r * x3r + wd3i * x3i; + y2i = wd3r * x3i - wd3i * x3r; + a[j2] = y0r + y2r; + a[j2 + 1] = y0i + y2i; + a[j3] = y0r - y2r; + a[j3 + 1] = y0i - y2i; + x0r = a[j + 2] - a[j2 + 3]; + x0i = a[j + 3] + a[j2 + 2]; + x1r = a[j + 2] + a[j2 + 3]; + x1i = a[j + 3] - a[j2 + 2]; + x2r = a[j1 + 2] - a[j3 + 3]; + x2i = a[j1 + 3] + a[j3 + 2]; + x3r = a[j1 + 2] + a[j3 + 3]; + x3i = a[j1 + 3] - a[j3 + 2]; + y0r = wl1r * x0r - wl1i * x0i; + y0i = wl1r * x0i + wl1i * x0r; + y2r = we1r * x2r - we1i * x2i; + y2i = we1r * x2i + we1i * x2r; + a[j + 2] = y0r + y2r; + a[j + 3] = y0i + y2i; + a[j1 + 2] = y0r - y2r; + a[j1 + 3] = y0i - y2i; + y0r = wl3r * x1r + wl3i * x1i; + y0i = wl3r * x1i - wl3i * x1r; + y2r = we3r * x3r + we3i * x3i; + y2i = we3r * x3i - we3i * x3r; + a[j2 + 2] = y0r + y2r; + a[j2 + 3] = y0i + y2i; + a[j3 + 2] = y0r - y2r; + a[j3 + 3] = y0i - y2i; + j0 = m - j; + j1 = j0 + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j0] - a[j2 + 1]; + x0i = a[j0 + 1] + a[j2]; + x1r = a[j0] + a[j2 + 1]; + x1i = a[j0 + 1] - a[j2]; + x2r = a[j1] - a[j3 + 1]; + x2i = a[j1 + 1] + a[j3]; + x3r = a[j1] + a[j3 + 1]; + x3i = a[j1 + 1] - a[j3]; + y0r = wd1i * x0r - wd1r * x0i; + y0i = wd1i * x0i + wd1r * x0r; + y2r = wk1i * x2r - wk1r * x2i; + y2i = wk1i * x2i + wk1r * x2r; + a[j0] = y0r + y2r; + a[j0 + 1] = y0i + y2i; + a[j1] = y0r - y2r; + a[j1 + 1] = y0i - 
y2i; + y0r = wd3i * x1r + wd3r * x1i; + y0i = wd3i * x1i - wd3r * x1r; + y2r = wk3i * x3r + wk3r * x3i; + y2i = wk3i * x3i - wk3r * x3r; + a[j2] = y0r + y2r; + a[j2 + 1] = y0i + y2i; + a[j3] = y0r - y2r; + a[j3 + 1] = y0i - y2i; + x0r = a[j0 - 2] - a[j2 - 1]; + x0i = a[j0 - 1] + a[j2 - 2]; + x1r = a[j0 - 2] + a[j2 - 1]; + x1i = a[j0 - 1] - a[j2 - 2]; + x2r = a[j1 - 2] - a[j3 - 1]; + x2i = a[j1 - 1] + a[j3 - 2]; + x3r = a[j1 - 2] + a[j3 - 1]; + x3i = a[j1 - 1] - a[j3 - 2]; + y0r = we1i * x0r - we1r * x0i; + y0i = we1i * x0i + we1r * x0r; + y2r = wl1i * x2r - wl1r * x2i; + y2i = wl1i * x2i + wl1r * x2r; + a[j0 - 2] = y0r + y2r; + a[j0 - 1] = y0i + y2i; + a[j1 - 2] = y0r - y2r; + a[j1 - 1] = y0i - y2i; + y0r = we3i * x1r + we3r * x1i; + y0i = we3i * x1i - we3r * x1r; + y2r = wl3i * x3r + wl3r * x3i; + y2i = wl3i * x3i - wl3r * x3r; + a[j2 - 2] = y0r + y2r; + a[j2 - 1] = y0i + y2i; + a[j3 - 2] = y0r - y2r; + a[j3 - 1] = y0i - y2i; + wk1r -= ss1 * wl1i; + wk1i += ss1 * wl1r; + wk3r -= ss3 * wl3i; + wk3i += ss3 * wl3r; + wd1r -= ss1 * we1i; + wd1i += ss1 * we1r; + wd3r -= ss3 * we3i; + wd3i += ss3 * we3r; + } + if (i0 == mh - 4) { + break; + } + wl1r = cos(ew * i0); + wl1i = sin(ew * i0); + wl3i = 4 * wl1i * wl1r; + wl3r = wl1r - wl3i * wl1i; + wl3i = wl1i - wl3i * wl1r; + we1r = wn4r * (wl1r - wl1i); + we1i = wn4r * (wl1i + wl1r); + we3r = -wn4r * (wl3r - wl3i); + we3i = -wn4r * (wl3i + wl3r); + wk1r = w1r * wl1r - w1i * wl1i; + wk1i = w1r * wl1i + w1i * wl1r; + wk3i = 4 * wk1i * wk1r; + wk3r = wk1r - wk3i * wk1i; + wk3i = wk1i - wk3i * wk1r; + wd1r = wn4r * (wk1r - wk1i); + wd1i = wn4r * (wk1i + wk1r); + wd3r = -wn4r * (wk3r - wk3i); + wd3i = -wn4r * (wk3i + wk3r); + i = i0; + } + wl1r -= ss1 * wk1i; + wl1i += ss1 * wk1r; + j0 = mh; + j1 = j0 + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j0 - 2] - a[j2 - 1]; + x0i = a[j0 - 1] + a[j2 - 2]; + x1r = a[j0 - 2] + a[j2 - 1]; + x1i = a[j0 - 1] - a[j2 - 2]; + x2r = a[j1 - 2] - a[j3 - 1]; + x2i = a[j1 - 1] + a[j3 - 2]; + x3r = 
a[j1 - 2] + a[j3 - 1]; + x3i = a[j1 - 1] - a[j3 - 2]; + y0r = wk1r * x0r - wk1i * x0i; + y0i = wk1r * x0i + wk1i * x0r; + y2r = wd1r * x2r - wd1i * x2i; + y2i = wd1r * x2i + wd1i * x2r; + a[j0 - 2] = y0r + y2r; + a[j0 - 1] = y0i + y2i; + a[j1 - 2] = y0r - y2r; + a[j1 - 1] = y0i - y2i; + y0r = wk3r * x1r + wk3i * x1i; + y0i = wk3r * x1i - wk3i * x1r; + y2r = wd3r * x3r + wd3i * x3i; + y2i = wd3r * x3i - wd3i * x3r; + a[j2 - 2] = y0r + y2r; + a[j2 - 1] = y0i + y2i; + a[j3 - 2] = y0r - y2r; + a[j3 - 1] = y0i - y2i; + x0r = a[j0] - a[j2 + 1]; + x0i = a[j0 + 1] + a[j2]; + x1r = a[j0] + a[j2 + 1]; + x1i = a[j0 + 1] - a[j2]; + x2r = a[j1] - a[j3 + 1]; + x2i = a[j1 + 1] + a[j3]; + x3r = a[j1] + a[j3 + 1]; + x3i = a[j1 + 1] - a[j3]; + y0r = wl1r * x0r - wl1i * x0i; + y0i = wl1r * x0i + wl1i * x0r; + y2r = wl1i * x2r - wl1r * x2i; + y2i = wl1i * x2i + wl1r * x2r; + a[j0] = y0r + y2r; + a[j0 + 1] = y0i + y2i; + a[j1] = y0r - y2r; + a[j1 + 1] = y0i - y2i; + y0r = wl1i * x1r - wl1r * x1i; + y0i = wl1i * x1i + wl1r * x1r; + y2r = wl1r * x3r - wl1i * x3i; + y2i = wl1r * x3i + wl1i * x3r; + a[j2] = y0r - y2r; + a[j2 + 1] = y0i - y2i; + a[j3] = y0r + y2r; + a[j3 + 1] = y0i + y2i; + x0r = a[j0 + 2] - a[j2 + 3]; + x0i = a[j0 + 3] + a[j2 + 2]; + x1r = a[j0 + 2] + a[j2 + 3]; + x1i = a[j0 + 3] - a[j2 + 2]; + x2r = a[j1 + 2] - a[j3 + 3]; + x2i = a[j1 + 3] + a[j3 + 2]; + x3r = a[j1 + 2] + a[j3 + 3]; + x3i = a[j1 + 3] - a[j3 + 2]; + y0r = wd1i * x0r - wd1r * x0i; + y0i = wd1i * x0i + wd1r * x0r; + y2r = wk1i * x2r - wk1r * x2i; + y2i = wk1i * x2i + wk1r * x2r; + a[j0 + 2] = y0r + y2r; + a[j0 + 3] = y0i + y2i; + a[j1 + 2] = y0r - y2r; + a[j1 + 3] = y0i - y2i; + y0r = wd3i * x1r + wd3r * x1i; + y0i = wd3i * x1i - wd3r * x1r; + y2r = wk3i * x3r + wk3r * x3i; + y2i = wk3i * x3i - wk3r * x3r; + a[j2 + 2] = y0r + y2r; + a[j2 + 3] = y0i + y2i; + a[j3 + 2] = y0r - y2r; + a[j3 + 3] = y0i - y2i; +} + +void cftfx41(int n, double *a) { + if (n == 128) { + cftf161(a); + cftf162(&a[32]); + 
cftf161(&a[64]); + cftf161(&a[96]); + } else { + cftf081(a); + cftf082(&a[16]); + cftf081(&a[32]); + cftf081(&a[48]); + } +} + +void cftfx42(int n, double *a) { + if (n == 128) { + cftf161(a); + cftf162(&a[32]); + cftf161(&a[64]); + cftf162(&a[96]); + } else { + cftf081(a); + cftf082(&a[16]); + cftf081(&a[32]); + cftf082(&a[48]); + } +} + +void cftf161(double *a) { + double wn4r, wk1r, wk1i, x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i, y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i, y8r, y8i, + y9r, y9i, y10r, y10i, y11r, y11i, y12r, y12i, y13r, y13i, y14r, y14i, y15r, y15i; + + wn4r = WR5000; + wk1r = WR2500; + wk1i = WI2500; + x0r = a[0] + a[16]; + x0i = a[1] + a[17]; + x1r = a[0] - a[16]; + x1i = a[1] - a[17]; + x2r = a[8] + a[24]; + x2i = a[9] + a[25]; + x3r = a[8] - a[24]; + x3i = a[9] - a[25]; + y0r = x0r + x2r; + y0i = x0i + x2i; + y4r = x0r - x2r; + y4i = x0i - x2i; + y8r = x1r - x3i; + y8i = x1i + x3r; + y12r = x1r + x3i; + y12i = x1i - x3r; + x0r = a[2] + a[18]; + x0i = a[3] + a[19]; + x1r = a[2] - a[18]; + x1i = a[3] - a[19]; + x2r = a[10] + a[26]; + x2i = a[11] + a[27]; + x3r = a[10] - a[26]; + x3i = a[11] - a[27]; + y1r = x0r + x2r; + y1i = x0i + x2i; + y5r = x0r - x2r; + y5i = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + y9r = wk1r * x0r - wk1i * x0i; + y9i = wk1r * x0i + wk1i * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + y13r = wk1i * x0r - wk1r * x0i; + y13i = wk1i * x0i + wk1r * x0r; + x0r = a[4] + a[20]; + x0i = a[5] + a[21]; + x1r = a[4] - a[20]; + x1i = a[5] - a[21]; + x2r = a[12] + a[28]; + x2i = a[13] + a[29]; + x3r = a[12] - a[28]; + x3i = a[13] - a[29]; + y2r = x0r + x2r; + y2i = x0i + x2i; + y6r = x0r - x2r; + y6i = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + y10r = wn4r * (x0r - x0i); + y10i = wn4r * (x0i + x0r); + x0r = x1r + x3i; + x0i = x1i - x3r; + y14r = wn4r * (x0r + x0i); + y14i = wn4r * (x0i - x0r); + x0r = a[6] + a[22]; + x0i = a[7] + a[23]; + x1r = a[6] - a[22]; + x1i = a[7] - a[23]; + x2r 
= a[14] + a[30]; + x2i = a[15] + a[31]; + x3r = a[14] - a[30]; + x3i = a[15] - a[31]; + y3r = x0r + x2r; + y3i = x0i + x2i; + y7r = x0r - x2r; + y7i = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + y11r = wk1i * x0r - wk1r * x0i; + y11i = wk1i * x0i + wk1r * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + y15r = wk1r * x0r - wk1i * x0i; + y15i = wk1r * x0i + wk1i * x0r; + x0r = y12r - y14r; + x0i = y12i - y14i; + x1r = y12r + y14r; + x1i = y12i + y14i; + x2r = y13r - y15r; + x2i = y13i - y15i; + x3r = y13r + y15r; + x3i = y13i + y15i; + a[24] = x0r + x2r; + a[25] = x0i + x2i; + a[26] = x0r - x2r; + a[27] = x0i - x2i; + a[28] = x1r - x3i; + a[29] = x1i + x3r; + a[30] = x1r + x3i; + a[31] = x1i - x3r; + x0r = y8r + y10r; + x0i = y8i + y10i; + x1r = y8r - y10r; + x1i = y8i - y10i; + x2r = y9r + y11r; + x2i = y9i + y11i; + x3r = y9r - y11r; + x3i = y9i - y11i; + a[16] = x0r + x2r; + a[17] = x0i + x2i; + a[18] = x0r - x2r; + a[19] = x0i - x2i; + a[20] = x1r - x3i; + a[21] = x1i + x3r; + a[22] = x1r + x3i; + a[23] = x1i - x3r; + x0r = y5r - y7i; + x0i = y5i + y7r; + x2r = wn4r * (x0r - x0i); + x2i = wn4r * (x0i + x0r); + x0r = y5r + y7i; + x0i = y5i - y7r; + x3r = wn4r * (x0r - x0i); + x3i = wn4r * (x0i + x0r); + x0r = y4r - y6i; + x0i = y4i + y6r; + x1r = y4r + y6i; + x1i = y4i - y6r; + a[8] = x0r + x2r; + a[9] = x0i + x2i; + a[10] = x0r - x2r; + a[11] = x0i - x2i; + a[12] = x1r - x3i; + a[13] = x1i + x3r; + a[14] = x1r + x3i; + a[15] = x1i - x3r; + x0r = y0r + y2r; + x0i = y0i + y2i; + x1r = y0r - y2r; + x1i = y0i - y2i; + x2r = y1r + y3r; + x2i = y1i + y3i; + x3r = y1r - y3r; + x3i = y1i - y3i; + a[0] = x0r + x2r; + a[1] = x0i + x2i; + a[2] = x0r - x2r; + a[3] = x0i - x2i; + a[4] = x1r - x3i; + a[5] = x1i + x3r; + a[6] = x1r + x3i; + a[7] = x1i - x3r; +} + +void cftf162(double *a) { + double wn4r, wk1r, wk1i, wk2r, wk2i, wk3r, wk3i, x0r, x0i, x1r, x1i, x2r, x2i, y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i, y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i, + y8r, y8i, y9r, y9i, 
y10r, y10i, y11r, y11i, y12r, y12i, y13r, y13i, y14r, y14i, y15r, y15i; + + wn4r = WR5000; + wk1r = WR1250; + wk1i = WI1250; + wk2r = WR2500; + wk2i = WI2500; + wk3r = WR3750; + wk3i = WI3750; + x1r = a[0] - a[17]; + x1i = a[1] + a[16]; + x0r = a[8] - a[25]; + x0i = a[9] + a[24]; + x2r = wn4r * (x0r - x0i); + x2i = wn4r * (x0i + x0r); + y0r = x1r + x2r; + y0i = x1i + x2i; + y4r = x1r - x2r; + y4i = x1i - x2i; + x1r = a[0] + a[17]; + x1i = a[1] - a[16]; + x0r = a[8] + a[25]; + x0i = a[9] - a[24]; + x2r = wn4r * (x0r - x0i); + x2i = wn4r * (x0i + x0r); + y8r = x1r - x2i; + y8i = x1i + x2r; + y12r = x1r + x2i; + y12i = x1i - x2r; + x0r = a[2] - a[19]; + x0i = a[3] + a[18]; + x1r = wk1r * x0r - wk1i * x0i; + x1i = wk1r * x0i + wk1i * x0r; + x0r = a[10] - a[27]; + x0i = a[11] + a[26]; + x2r = wk3i * x0r - wk3r * x0i; + x2i = wk3i * x0i + wk3r * x0r; + y1r = x1r + x2r; + y1i = x1i + x2i; + y5r = x1r - x2r; + y5i = x1i - x2i; + x0r = a[2] + a[19]; + x0i = a[3] - a[18]; + x1r = wk3r * x0r - wk3i * x0i; + x1i = wk3r * x0i + wk3i * x0r; + x0r = a[10] + a[27]; + x0i = a[11] - a[26]; + x2r = wk1r * x0r + wk1i * x0i; + x2i = wk1r * x0i - wk1i * x0r; + y9r = x1r - x2r; + y9i = x1i - x2i; + y13r = x1r + x2r; + y13i = x1i + x2i; + x0r = a[4] - a[21]; + x0i = a[5] + a[20]; + x1r = wk2r * x0r - wk2i * x0i; + x1i = wk2r * x0i + wk2i * x0r; + x0r = a[12] - a[29]; + x0i = a[13] + a[28]; + x2r = wk2i * x0r - wk2r * x0i; + x2i = wk2i * x0i + wk2r * x0r; + y2r = x1r + x2r; + y2i = x1i + x2i; + y6r = x1r - x2r; + y6i = x1i - x2i; + x0r = a[4] + a[21]; + x0i = a[5] - a[20]; + x1r = wk2i * x0r - wk2r * x0i; + x1i = wk2i * x0i + wk2r * x0r; + x0r = a[12] + a[29]; + x0i = a[13] - a[28]; + x2r = wk2r * x0r - wk2i * x0i; + x2i = wk2r * x0i + wk2i * x0r; + y10r = x1r - x2r; + y10i = x1i - x2i; + y14r = x1r + x2r; + y14i = x1i + x2i; + x0r = a[6] - a[23]; + x0i = a[7] + a[22]; + x1r = wk3r * x0r - wk3i * x0i; + x1i = wk3r * x0i + wk3i * x0r; + x0r = a[14] - a[31]; + x0i = a[15] + a[30]; + x2r = 
wk1i * x0r - wk1r * x0i; + x2i = wk1i * x0i + wk1r * x0r; + y3r = x1r + x2r; + y3i = x1i + x2i; + y7r = x1r - x2r; + y7i = x1i - x2i; + x0r = a[6] + a[23]; + x0i = a[7] - a[22]; + x1r = wk1i * x0r + wk1r * x0i; + x1i = wk1i * x0i - wk1r * x0r; + x0r = a[14] + a[31]; + x0i = a[15] - a[30]; + x2r = wk3i * x0r - wk3r * x0i; + x2i = wk3i * x0i + wk3r * x0r; + y11r = x1r + x2r; + y11i = x1i + x2i; + y15r = x1r - x2r; + y15i = x1i - x2i; + x1r = y0r + y2r; + x1i = y0i + y2i; + x2r = y1r + y3r; + x2i = y1i + y3i; + a[0] = x1r + x2r; + a[1] = x1i + x2i; + a[2] = x1r - x2r; + a[3] = x1i - x2i; + x1r = y0r - y2r; + x1i = y0i - y2i; + x2r = y1r - y3r; + x2i = y1i - y3i; + a[4] = x1r - x2i; + a[5] = x1i + x2r; + a[6] = x1r + x2i; + a[7] = x1i - x2r; + x1r = y4r - y6i; + x1i = y4i + y6r; + x0r = y5r - y7i; + x0i = y5i + y7r; + x2r = wn4r * (x0r - x0i); + x2i = wn4r * (x0i + x0r); + a[8] = x1r + x2r; + a[9] = x1i + x2i; + a[10] = x1r - x2r; + a[11] = x1i - x2i; + x1r = y4r + y6i; + x1i = y4i - y6r; + x0r = y5r + y7i; + x0i = y5i - y7r; + x2r = wn4r * (x0r - x0i); + x2i = wn4r * (x0i + x0r); + a[12] = x1r - x2i; + a[13] = x1i + x2r; + a[14] = x1r + x2i; + a[15] = x1i - x2r; + x1r = y8r + y10r; + x1i = y8i + y10i; + x2r = y9r - y11r; + x2i = y9i - y11i; + a[16] = x1r + x2r; + a[17] = x1i + x2i; + a[18] = x1r - x2r; + a[19] = x1i - x2i; + x1r = y8r - y10r; + x1i = y8i - y10i; + x2r = y9r + y11r; + x2i = y9i + y11i; + a[20] = x1r - x2i; + a[21] = x1i + x2r; + a[22] = x1r + x2i; + a[23] = x1i - x2r; + x1r = y12r - y14i; + x1i = y12i + y14r; + x0r = y13r + y15i; + x0i = y13i - y15r; + x2r = wn4r * (x0r - x0i); + x2i = wn4r * (x0i + x0r); + a[24] = x1r + x2r; + a[25] = x1i + x2i; + a[26] = x1r - x2r; + a[27] = x1i - x2i; + x1r = y12r + y14i; + x1i = y12i - y14r; + x0r = y13r - y15i; + x0i = y13i + y15r; + x2r = wn4r * (x0r - x0i); + x2i = wn4r * (x0i + x0r); + a[28] = x1r - x2i; + a[29] = x1i + x2r; + a[30] = x1r + x2i; + a[31] = x1i - x2r; +} + +void cftf081(double *a) { + double 
wn4r, x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i, y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i; + + wn4r = WR5000; + x0r = a[0] + a[8]; + x0i = a[1] + a[9]; + x1r = a[0] - a[8]; + x1i = a[1] - a[9]; + x2r = a[4] + a[12]; + x2i = a[5] + a[13]; + x3r = a[4] - a[12]; + x3i = a[5] - a[13]; + y0r = x0r + x2r; + y0i = x0i + x2i; + y2r = x0r - x2r; + y2i = x0i - x2i; + y1r = x1r - x3i; + y1i = x1i + x3r; + y3r = x1r + x3i; + y3i = x1i - x3r; + x0r = a[2] + a[10]; + x0i = a[3] + a[11]; + x1r = a[2] - a[10]; + x1i = a[3] - a[11]; + x2r = a[6] + a[14]; + x2i = a[7] + a[15]; + x3r = a[6] - a[14]; + x3i = a[7] - a[15]; + y4r = x0r + x2r; + y4i = x0i + x2i; + y6r = x0r - x2r; + y6i = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + x2r = x1r + x3i; + x2i = x1i - x3r; + y5r = wn4r * (x0r - x0i); + y5i = wn4r * (x0r + x0i); + y7r = wn4r * (x2r - x2i); + y7i = wn4r * (x2r + x2i); + a[8] = y1r + y5r; + a[9] = y1i + y5i; + a[10] = y1r - y5r; + a[11] = y1i - y5i; + a[12] = y3r - y7i; + a[13] = y3i + y7r; + a[14] = y3r + y7i; + a[15] = y3i - y7r; + a[0] = y0r + y4r; + a[1] = y0i + y4i; + a[2] = y0r - y4r; + a[3] = y0i - y4i; + a[4] = y2r - y6i; + a[5] = y2i + y6r; + a[6] = y2r + y6i; + a[7] = y2i - y6r; +} + +void cftf082(double *a) { + double wn4r, wk1r, wk1i, x0r, x0i, x1r, x1i, y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i, y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i; + + wn4r = WR5000; + wk1r = WR2500; + wk1i = WI2500; + y0r = a[0] - a[9]; + y0i = a[1] + a[8]; + y1r = a[0] + a[9]; + y1i = a[1] - a[8]; + x0r = a[4] - a[13]; + x0i = a[5] + a[12]; + y2r = wn4r * (x0r - x0i); + y2i = wn4r * (x0i + x0r); + x0r = a[4] + a[13]; + x0i = a[5] - a[12]; + y3r = wn4r * (x0r - x0i); + y3i = wn4r * (x0i + x0r); + x0r = a[2] - a[11]; + x0i = a[3] + a[10]; + y4r = wk1r * x0r - wk1i * x0i; + y4i = wk1r * x0i + wk1i * x0r; + x0r = a[2] + a[11]; + x0i = a[3] - a[10]; + y5r = wk1i * x0r - wk1r * x0i; + y5i = wk1i * x0i + wk1r * x0r; + x0r = a[6] - a[15]; + x0i = a[7] + a[14]; + 
y6r = wk1i * x0r - wk1r * x0i; + y6i = wk1i * x0i + wk1r * x0r; + x0r = a[6] + a[15]; + x0i = a[7] - a[14]; + y7r = wk1r * x0r - wk1i * x0i; + y7i = wk1r * x0i + wk1i * x0r; + x0r = y0r + y2r; + x0i = y0i + y2i; + x1r = y4r + y6r; + x1i = y4i + y6i; + a[0] = x0r + x1r; + a[1] = x0i + x1i; + a[2] = x0r - x1r; + a[3] = x0i - x1i; + x0r = y0r - y2r; + x0i = y0i - y2i; + x1r = y4r - y6r; + x1i = y4i - y6i; + a[4] = x0r - x1i; + a[5] = x0i + x1r; + a[6] = x0r + x1i; + a[7] = x0i - x1r; + x0r = y1r - y3i; + x0i = y1i + y3r; + x1r = y5r - y7r; + x1i = y5i - y7i; + a[8] = x0r + x1r; + a[9] = x0i + x1i; + a[10] = x0r - x1r; + a[11] = x0i - x1i; + x0r = y1r + y3i; + x0i = y1i - y3r; + x1r = y5r + y7r; + x1i = y5i + y7i; + a[12] = x0r - x1i; + a[13] = x0i + x1r; + a[14] = x0r + x1i; + a[15] = x0i - x1r; +} + +void cftf040(double *a) { + double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; + + x0r = a[0] + a[4]; + x0i = a[1] + a[5]; + x1r = a[0] - a[4]; + x1i = a[1] - a[5]; + x2r = a[2] + a[6]; + x2i = a[3] + a[7]; + x3r = a[2] - a[6]; + x3i = a[3] - a[7]; + a[0] = x0r + x2r; + a[1] = x0i + x2i; + a[4] = x0r - x2r; + a[5] = x0i - x2i; + a[2] = x1r - x3i; + a[3] = x1i + x3r; + a[6] = x1r + x3i; + a[7] = x1i - x3r; +} + +void cftb040(double *a) { + double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; + + x0r = a[0] + a[4]; + x0i = a[1] + a[5]; + x1r = a[0] - a[4]; + x1i = a[1] - a[5]; + x2r = a[2] + a[6]; + x2i = a[3] + a[7]; + x3r = a[2] - a[6]; + x3i = a[3] - a[7]; + a[0] = x0r + x2r; + a[1] = x0i + x2i; + a[4] = x0r - x2r; + a[5] = x0i - x2i; + a[2] = x1r + x3i; + a[3] = x1i - x3r; + a[6] = x1r - x3i; + a[7] = x1i + x3r; +} + +void cftx020(double *a) { + double x0r, x0i; + + x0r = a[0] - a[2]; + x0i = a[1] - a[3]; + a[0] += a[2]; + a[1] += a[3]; + a[2] = x0r; + a[3] = x0i; +} + +void rftfsub(int n, double *a) { + int i, i0, j, k; + double ec, w1r, w1i, wkr, wki, wdr, wdi, ss, xr, xi, yr, yi; + + ec = 2 * M_PI_2 / n; + wkr = 0; + wki = 0; + wdi = cos(ec); + wdr = sin(ec); + wdi *= wdr; + 
wdr *= wdr; + w1r = 1 - 2 * wdr; + w1i = 2 * wdi; + ss = 2 * w1i; + i = n >> 1; + for (;;) { + i0 = i - 4 * RDFT_LOOP_DIV; + if (i0 < 4) { + i0 = 4; + } + for (j = i - 4; j >= i0; j -= 4) { + k = n - j; + xr = a[j + 2] - a[k - 2]; + xi = a[j + 3] + a[k - 1]; + yr = wdr * xr - wdi * xi; + yi = wdr * xi + wdi * xr; + a[j + 2] -= yr; + a[j + 3] -= yi; + a[k - 2] += yr; + a[k - 1] -= yi; + wkr += ss * wdi; + wki += ss * (0.5 - wdr); + xr = a[j] - a[k]; + xi = a[j + 1] + a[k + 1]; + yr = wkr * xr - wki * xi; + yi = wkr * xi + wki * xr; + a[j] -= yr; + a[j + 1] -= yi; + a[k] += yr; + a[k + 1] -= yi; + wdr += ss * wki; + wdi += ss * (0.5 - wkr); + } + if (i0 == 4) { + break; + } + wkr = 0.5 * sin(ec * i0); + wki = 0.5 * cos(ec * i0); + wdr = 0.5 - (wkr * w1r - wki * w1i); + wdi = wkr * w1i + wki * w1r; + wkr = 0.5 - wkr; + i = i0; + } + xr = a[2] - a[n - 2]; + xi = a[3] + a[n - 1]; + yr = wdr * xr - wdi * xi; + yi = wdr * xi + wdi * xr; + a[2] -= yr; + a[3] -= yi; + a[n - 2] += yr; + a[n - 1] -= yi; +} + +void rftbsub(int n, double *a) { + int i, i0, j, k; + double ec, w1r, w1i, wkr, wki, wdr, wdi, ss, xr, xi, yr, yi; + + ec = 2 * M_PI_2 / n; + wkr = 0; + wki = 0; + wdi = cos(ec); + wdr = sin(ec); + wdi *= wdr; + wdr *= wdr; + w1r = 1 - 2 * wdr; + w1i = 2 * wdi; + ss = 2 * w1i; + i = n >> 1; + for (;;) { + i0 = i - 4 * RDFT_LOOP_DIV; + if (i0 < 4) { + i0 = 4; + } + for (j = i - 4; j >= i0; j -= 4) { + k = n - j; + xr = a[j + 2] - a[k - 2]; + xi = a[j + 3] + a[k - 1]; + yr = wdr * xr + wdi * xi; + yi = wdr * xi - wdi * xr; + a[j + 2] -= yr; + a[j + 3] -= yi; + a[k - 2] += yr; + a[k - 1] -= yi; + wkr += ss * wdi; + wki += ss * (0.5 - wdr); + xr = a[j] - a[k]; + xi = a[j + 1] + a[k + 1]; + yr = wkr * xr + wki * xi; + yi = wkr * xi - wki * xr; + a[j] -= yr; + a[j + 1] -= yi; + a[k] += yr; + a[k + 1] -= yi; + wdr += ss * wki; + wdi += ss * (0.5 - wkr); + } + if (i0 == 4) { + break; + } + wkr = 0.5 * sin(ec * i0); + wki = 0.5 * cos(ec * i0); + wdr = 0.5 - (wkr * w1r - wki * 
w1i); + wdi = wkr * w1i + wki * w1r; + wkr = 0.5 - wkr; + i = i0; + } + xr = a[2] - a[n - 2]; + xi = a[3] + a[n - 1]; + yr = wdr * xr + wdi * xi; + yi = wdr * xi - wdi * xr; + a[2] -= yr; + a[3] -= yi; + a[n - 2] += yr; + a[n - 1] -= yi; +} + +void dctsub(int n, double *a) { + int i, i0, j, k, m; + double ec, w1r, w1i, wkr, wki, wdr, wdi, ss, xr, xi, yr, yi; + + ec = M_PI_2 / n; + wkr = 0.5; + wki = 0.5; + w1r = cos(ec); + w1i = sin(ec); + wdr = 0.5 * (w1r - w1i); + wdi = 0.5 * (w1r + w1i); + ss = 2 * w1i; + m = n >> 1; + i = 0; + for (;;) { + i0 = i + 2 * DCST_LOOP_DIV; + if (i0 > m - 2) { + i0 = m - 2; + } + for (j = i + 2; j <= i0; j += 2) { + k = n - j; + xr = wdi * a[j - 1] - wdr * a[k + 1]; + xi = wdr * a[j - 1] + wdi * a[k + 1]; + wkr -= ss * wdi; + wki += ss * wdr; + yr = wki * a[j] - wkr * a[k]; + yi = wkr * a[j] + wki * a[k]; + wdr -= ss * wki; + wdi += ss * wkr; + a[k + 1] = xr; + a[k] = yr; + a[j - 1] = xi; + a[j] = yi; + } + if (i0 == m - 2) { + break; + } + wdr = cos(ec * i0); + wdi = sin(ec * i0); + wkr = 0.5 * (wdr - wdi); + wki = 0.5 * (wdr + wdi); + wdr = wkr * w1r - wki * w1i; + wdi = wkr * w1i + wki * w1r; + i = i0; + } + xr = wdi * a[m - 1] - wdr * a[m + 1]; + a[m - 1] = wdr * a[m - 1] + wdi * a[m + 1]; + a[m + 1] = xr; + a[m] *= wki + ss * wdr; +} + +void dstsub(int n, double *a) { + int i, i0, j, k, m; + double ec, w1r, w1i, wkr, wki, wdr, wdi, ss, xr, xi, yr, yi; + + ec = M_PI_2 / n; + wkr = 0.5; + wki = 0.5; + w1r = cos(ec); + w1i = sin(ec); + wdr = 0.5 * (w1r - w1i); + wdi = 0.5 * (w1r + w1i); + ss = 2 * w1i; + m = n >> 1; + i = 0; + for (;;) { + i0 = i + 2 * DCST_LOOP_DIV; + if (i0 > m - 2) { + i0 = m - 2; + } + for (j = i + 2; j <= i0; j += 2) { + k = n - j; + xr = wdi * a[k + 1] - wdr * a[j - 1]; + xi = wdr * a[k + 1] + wdi * a[j - 1]; + wkr -= ss * wdi; + wki += ss * wdr; + yr = wki * a[k] - wkr * a[j]; + yi = wkr * a[k] + wki * a[j]; + wdr -= ss * wki; + wdi += ss * wkr; + a[j - 1] = xr; + a[j] = yr; + a[k + 1] = xi; + a[k] = yi; + } 
+ if (i0 == m - 2) { + break; + } + wdr = cos(ec * i0); + wdi = sin(ec * i0); + wkr = 0.5 * (wdr - wdi); + wki = 0.5 * (wdr + wdi); + wdr = wkr * w1r - wki * w1i; + wdi = wkr * w1i + wki * w1r; + i = i0; + } + xr = wdi * a[m + 1] - wdr * a[m - 1]; + a[m + 1] = wdr * a[m + 1] + wdi * a[m - 1]; + a[m - 1] = xr; + a[m] *= wki + ss * wdr; +} + +void dctsub4(int n, double *a) { + int m; + double wki, wdr, wdi, xr; + + wki = WR5000; + m = n >> 1; + if (m == 2) { + wdr = wki * WI2500; + wdi = wki * WR2500; + xr = wdi * a[1] - wdr * a[3]; + a[1] = wdr * a[1] + wdi * a[3]; + a[3] = xr; + } + a[m] *= wki; +} + +void dstsub4(int n, double *a) { + int m; + double wki, wdr, wdi, xr; + + wki = WR5000; + m = n >> 1; + if (m == 2) { + wdr = wki * WI2500; + wdi = wki * WR2500; + xr = wdi * a[3] - wdr * a[1]; + a[3] = wdr * a[3] + wdi * a[1]; + a[1] = xr; + } + a[m] *= wki; +} diff --git a/tests/performance/superpi/fftsg_h.h b/tests/performance/superpi/fftsg_h.h new file mode 100644 index 000000000..3158ce80a --- /dev/null +++ b/tests/performance/superpi/fftsg_h.h @@ -0,0 +1,88 @@ +/* + Based on "Calculation of PI(= 3.14159...) using FFT and AGM" by T.Ooura, Nov. 1999. + https://github.com/Fibonacci43/SuperPI + Modified for Arduino by Lucas Saavedra Vaz, 2024. 
+*/ + +#pragma once + +#include + +#ifndef M_PI_2 +#define M_PI_2 1.570796326794896619231321691639751442098584699687 +#endif +#ifndef WR5000 /* cos(M_PI_2*0.5000) */ +#define WR5000 0.707106781186547524400844362104849039284835937688 +#endif +#ifndef WR2500 /* cos(M_PI_2*0.2500) */ +#define WR2500 0.923879532511286756128183189396788286822416625863 +#endif +#ifndef WI2500 /* sin(M_PI_2*0.2500) */ +#define WI2500 0.382683432365089771728459984030398866761344562485 +#endif +#ifndef WR1250 /* cos(M_PI_2*0.1250) */ +#define WR1250 0.980785280403230449126182236134239036973933730893 +#endif +#ifndef WI1250 /* sin(M_PI_2*0.1250) */ +#define WI1250 0.195090322016128267848284868477022240927691617751 +#endif +#ifndef WR3750 /* cos(M_PI_2*0.3750) */ +#define WR3750 0.831469612302545237078788377617905756738560811987 +#endif +#ifndef WI3750 /* sin(M_PI_2*0.3750) */ +#define WI3750 0.555570233019602224742830813948532874374937190754 +#endif + +#ifndef CDFT_RECURSIVE_N /* length of the recursive FFT mode */ +#define CDFT_RECURSIVE_N 512 /* <= (L1 cache size) / 16 */ +#endif + +#ifndef CDFT_LOOP_DIV /* control of the CDFT's speed & tolerance */ +#define CDFT_LOOP_DIV 32 +#endif + +#ifndef RDFT_LOOP_DIV /* control of the RDFT's speed & tolerance */ +#define RDFT_LOOP_DIV 64 +#endif + +#ifndef DCST_LOOP_DIV /* control of the DCT,DST's speed & tolerance */ +#define DCST_LOOP_DIV 64 +#endif + +void bitrv1(int n, double *a); +void bitrv2(int n, double *a); +void bitrv208(double *a); +void bitrv208neg(double *a); +void bitrv216(double *a); +void bitrv216neg(double *a); +void bitrv2conj(int n, double *a); +void cdft(int n, int isgn, double *a); +void cftb040(double *a); +void cftb1st(int n, double *a); +void cftbsub(int n, double *a); +void cftexp1(int n, double *a); +void cftexp2(int n, double *a); +void cftf040(double *a); +void cftf081(double *a); +void cftf082(double *a); +void cftf161(double *a); +void cftf162(double *a); +void cftfsub(int n, double *a); +void cftfx41(int n, double *a); 
+void cftfx42(int n, double *a); +void cftmdl1(int n, double *a); +void cftmdl2(int n, double *a); +void cftrec1(int n, double *a); +void cftrec2(int n, double *a); +void cftx020(double *a); +void dctsub(int n, double *a); +void dctsub4(int n, double *a); +void ddct(int n, int isgn, double *a); +void ddst(int n, int isgn, double *a); +void dfct(int n, double *a); +void dfst(int n, double *a); +void dstsub(int n, double *a); +void dstsub4(int n, double *a); +void rdft(int n, int isgn, double *a); +void rftbsub(int n, double *a); +void rftfsub(int n, double *a); diff --git a/tests/performance/superpi/pi_fftcs.cpp b/tests/performance/superpi/pi_fftcs.cpp new file mode 100644 index 000000000..bf83dd291 --- /dev/null +++ b/tests/performance/superpi/pi_fftcs.cpp @@ -0,0 +1,2214 @@ +/* + Based on "Calculation of PI(= 3.14159...) using FFT and AGM" by T.Ooura, Nov. 1999. + https://github.com/Fibonacci43/SuperPI + Modified for Arduino by Lucas Saavedra Vaz, 2024. +*/ + +#include +#include +#include +#include +#include +#include +#include + +#include "fftsg_h.h" +#include "pi_fftcs.h" + +void pi_calc(int nfft) { + int log2_nfft, radix, log10_radix, n, npow, nprc; +#if PRINT_DIGITS + int j = 0, k = 0, l = 0; +#endif + double err; + int *a, *b, *c, *e, *i1, *i2; + double *d1, *d2, *d3; + char *dgt; + uint32_t start_time; + double elap_time, loop_time; + log_d("Calculation of PI using FFT and AGM, %s", PI_FFTC_VER); + + // DGTINT is defined as short int, so it should be 2 bytes + assert(sizeof(DGTINT) == 2); + + log_d("initializing..."); + nfft /= 4; + start_time = millis(); + for (log2_nfft = 1; (1 << log2_nfft) < nfft; log2_nfft++); + nfft = 1 << log2_nfft; + n = nfft + 2; + a = (int *)malloc(2 * sizeof(int) + n * sizeof(DGTINT)); + b = (int *)malloc(2 * sizeof(int) + n * sizeof(DGTINT)); + c = (int *)malloc(2 * sizeof(int) + n * sizeof(DGTINT)); + e = (int *)malloc(2 * sizeof(int) + n * sizeof(DGTINT)); + i1 = (int *)malloc(2 * sizeof(int) + n * sizeof(DGTINT)); + i2 = (int 
*)malloc(2 * sizeof(int) + n * sizeof(DGTINT)); + d1 = (double *)malloc((nfft + 2) * sizeof(double)); + d2 = (double *)malloc((nfft + 2) * sizeof(double)); + d3 = (double *)malloc((nfft + 2) * sizeof(double)); + if (d3 == NULL) { + printf("Allocation Failure!\n"); + exit(1); + } + /* ---- radix test ---- */ + log10_radix = 1; + radix = 10; + err = mp_mul_radix_test(n, radix, nfft, d1); + err += DBL_EPSILON * (n * radix * radix / 4); + while (100 * err < DBL_ERROR_MARGIN && radix <= DGTINT_MAX / 20) { + err *= 100; + log10_radix++; + radix *= 10; + } + log_d("nfft= %d, radix= %d, error_margin= %g", nfft, radix, err); + log_d("calculating %d digits of PI...", log10_radix * (n - 2)); + /* + * ---- a formula based on the AGM (Arithmetic-Geometric Mean) ---- + * c = sqrt(0.125); + * a = 1 + 3 * c; + * b = sqrt(a); + * e = b - 0.625; + * b = 2 * b; + * c = e - c; + * a = a + e; + * npow = 4; + * do { + * npow = 2 * npow; + * e = (a + b) / 2; + * b = sqrt(a * b); + * e = e - b; + * b = 2 * b; + * c = c - e; + * a = e + b; + * } while (e > SQRT_SQRT_EPSILON); + * e = e * e / 4; + * a = a + b; + * pi = (a * a - e - e / 2) / (a * c - e) / npow; + * ---- modification ---- + * This is a modified version of the Gauss-Legendre formula + * (by T.Ooura). It is faster than the original version. + * ---- reference ---- + * 1. E.Salamin, + * Computation of PI Using Arithmetic-Geometric Mean, + * Mathematics of Computation, Vol.30 1976. + * 2. R.P.Brent, + * Fast Multiple-Precision Evaluation of Elementary Functions, + * J. ACM 23 1976. + * 3. D.Takahasi, Y.Kanada, + * Calculation of PI to 51.5 Billion Decimal Digits on + * Distributed Memory Parallel Processors, + * Transactions of Information Processing Society of Japan, + * Vol.39 No.7 1998. + * 4. T.Ooura, + * Improvement of the PI Calculation Algorithm and + * Implementation of Fast Multiple-Precision Computation, + * Information Processing Society of Japan SIG Notes, + * 98-HPC-74, 1998. 
+ */ + /* ---- c = 1 / sqrt(8) ---- */ + mp_invisqrt(n, radix, 8, c, i1, i2, nfft, d1, d2); + /* ---- a = 1 + 3 * c ---- */ + mp_imul(n, radix, c, 3, e); + mp_sscanf(n, log10_radix, (char *)"1", a); + mp_add(n, radix, a, e, a); + /* ---- b = sqrt(a) ---- */ + mp_sqrt(n, radix, a, b, i1, i2, nfft, d1, d2); + /* ---- e = b - 0.625 ---- */ + mp_sscanf(n, log10_radix, (char *)"0.625", e); + mp_sub(n, radix, b, e, e); + /* ---- b = 2 * b ---- */ + mp_add(n, radix, b, b, b); + /* ---- c = e - c ---- */ + mp_sub(n, radix, e, c, c); + /* ---- a = a + e ---- */ + mp_add(n, radix, a, e, a); + log_d("AGM iteration"); + npow = 4; + elap_time = ((double)(millis() - start_time)) / 1000; + + do { + uint32_t start_loop_time = millis(); + npow *= 2; + /* ---- e = (a + b) / 2 ---- */ + mp_add(n, radix, a, b, e); + mp_idiv_2(n, radix, e, e); + /* ---- b = sqrt(a * b) ---- */ + mp_mul(n, radix, a, b, a, i1, nfft, d1, d2, d3); + mp_sqrt(n, radix, a, b, i1, i2, nfft, d1, d2); + /* ---- e = e - b ---- */ + mp_sub(n, radix, e, b, e); + /* ---- b = 2 * b ---- */ + mp_add(n, radix, b, b, b); + /* ---- c = c - e ---- */ + mp_sub(n, radix, c, e, c); + /* ---- a = e + b ---- */ + mp_add(n, radix, e, b, a); + /* ---- convergence check ---- */ + nprc = -e[1]; + if (e[0] == 0) { + nprc = n; + } + loop_time = ((double)(millis() - start_loop_time)) / 1000; + elap_time += loop_time; + log_d("precision= %d: %0.2f sec", 4 * nprc * log10_radix, loop_time); + } while (4 * nprc <= n); + start_time = millis(); + /* ---- e = e * e / 4 (half precision) ---- */ + mp_idiv_2(n, radix, e, e); + mp_squh(n, radix, e, e, nfft, d1); + /* ---- a = a + b ---- */ + mp_add(n, radix, a, b, a); + /* ---- a = (a * a - e - e / 2) / (a * c - e) / npow ---- */ + mp_mulhf(n, radix, a, c, c, i1, nfft, d1, d2); + mp_sub(n, radix, c, e, c); + mp_inv(n, radix, c, b, i1, i2, nfft, d2, d3); + mp_squhf_use_infft(n, radix, d1, a, a, i1, nfft, d2); + mp_sub(n, radix, a, e, a); + mp_idiv_2(n, radix, e, e); + mp_sub(n, radix, a, e, a); 
+ mp_mul(n, radix, a, b, a, i1, nfft, d1, d2, d3); + mp_idiv(n, radix, a, npow, a); + /* ---- output ---- */ + dgt = (char *)d1; + mp_sprintf(n - 1, log10_radix, a, dgt); + elap_time = ((double)(millis() - start_time)) / 1000; + +#if PRINT_DIGITS + do { + if (!isdigit(*dgt)) { + if (isalpha(*dgt) != 0) { + fputc('\n', stdout); + fputc('\n', stdout); + } + fputc(*dgt, stdout); + fputc('\n', stdout); + fputc('\n', stdout); + j = 0; + k = 0; + l = 0; + continue; + } + fputc(*dgt, stdout); + if (++j >= DGT_PACK) { + j = 0; + if (++k >= DGT_PACK_LINE) { + k = 0; + fputc('\n', stdout); + if (++l >= DGT_LINE_BLOCK) { + l = 0; + fputc('\n', stdout); + } + } else { + fputc(' ', stdout); + } + } + } while (*dgt++ && *dgt != 'e'); + fputc('\n', stdout); + fprintf(stdout, "%s\n", dgt); +#endif + + free(d3); + free(d2); + free(d1); + free(i2); + free(i1); + free(e); + free(c); + free(b); + free(a); + /* ---- difftime ---- */ + log_d("%0.2f sec. (real time)", elap_time); +} + +/* -------- multiple precision routines -------- */ + +/* -------- mp_load routines -------- */ + +void mp_load_0(int n, int radix, int out[]) { + int j; + DGTINT *outr; + + outr = ((DGTINT *)&out[2]) - 2; + out[0] = 0; + out[1] = 0; + for (j = 2; j <= n + 1; j++) { + outr[j] = 0; + } +} + +void mp_load_1(int n, int radix, int out[]) { + int j; + DGTINT *outr; + + outr = ((DGTINT *)&out[2]) - 2; + out[0] = 1; + out[1] = 0; + outr[2] = 1; + for (j = 3; j <= n + 1; j++) { + outr[j] = 0; + } +} + +void mp_round(int n, int radix, int m, int inout[]) { + int j, x; + DGTINT *inoutr; + + inoutr = ((DGTINT *)&inout[2]) - 2; + if (m < n) { + for (j = n + 1; j > m + 2; j--) { + inoutr[j] = 0; + } + x = 2 * inoutr[m + 2]; + inoutr[m + 2] = 0; + if (x >= radix) { + for (j = m + 1; j >= 2; j--) { + x = inoutr[j] + 1; + if (x < radix) { + inoutr[j] = (DGTINT)x; + break; + } + inoutr[j] = 0; + } + if (x >= radix) { + inoutr[2] = 1; + inout[1]++; + } + } + } +} + +/* -------- mp_add routines -------- */ + +int mp_cmp(int 
n, int radix, int in1[], int in2[]) { + int mp_unsgn_cmp(int n, int in1[], int in2[]); + + if (in1[0] > in2[0]) { + return 1; + } else if (in1[0] < in2[0]) { + return -1; + } + return in1[0] * mp_unsgn_cmp(n, &in1[1], &in2[1]); +} + +void mp_add(int n, int radix, int in1[], int in2[], int out[]) { + int mp_unsgn_cmp(int n, int in1[], int in2[]); + int mp_unexp_add(int n, int radix, int expdif, DGTINT in1[], DGTINT in2[], DGTINT out[]); + int mp_unexp_sub(int n, int radix, int expdif, DGTINT in1[], DGTINT in2[], DGTINT out[]); + int outsgn, outexp, expdif; + + expdif = in1[1] - in2[1]; + outexp = in1[1]; + if (expdif < 0) { + outexp = in2[1]; + } + outsgn = in1[0] * in2[0]; + if (outsgn >= 0) { + if (outsgn > 0) { + outsgn = in1[0]; + } else { + outsgn = in1[0] + in2[0]; + outexp = in1[1] + in2[1]; + expdif = 0; + } + if (expdif >= 0) { + outexp += mp_unexp_add(n, radix, expdif, (DGTINT *)&in1[2], (DGTINT *)&in2[2], (DGTINT *)&out[2]); + } else { + outexp += mp_unexp_add(n, radix, -expdif, (DGTINT *)&in2[2], (DGTINT *)&in1[2], (DGTINT *)&out[2]); + } + } else { + outsgn = mp_unsgn_cmp(n, &in1[1], &in2[1]); + if (outsgn >= 0) { + expdif = mp_unexp_sub(n, radix, expdif, (DGTINT *)&in1[2], (DGTINT *)&in2[2], (DGTINT *)&out[2]); + } else { + expdif = mp_unexp_sub(n, radix, -expdif, (DGTINT *)&in2[2], (DGTINT *)&in1[2], (DGTINT *)&out[2]); + } + outexp -= expdif; + outsgn *= in1[0]; + if (expdif == n) { + outsgn = 0; + } + } + if (outsgn == 0) { + outexp = 0; + } + out[0] = outsgn; + out[1] = outexp; +} + +void mp_sub(int n, int radix, int in1[], int in2[], int out[]) { + int mp_unsgn_cmp(int n, int in1[], int in2[]); + int mp_unexp_add(int n, int radix, int expdif, DGTINT in1[], DGTINT in2[], DGTINT out[]); + int mp_unexp_sub(int n, int radix, int expdif, DGTINT in1[], DGTINT in2[], DGTINT out[]); + int outsgn, outexp, expdif; + + expdif = in1[1] - in2[1]; + outexp = in1[1]; + if (expdif < 0) { + outexp = in2[1]; + } + outsgn = in1[0] * in2[0]; + if (outsgn <= 0) { + if 
(outsgn < 0) { + outsgn = in1[0]; + } else { + outsgn = in1[0] - in2[0]; + outexp = in1[1] + in2[1]; + expdif = 0; + } + if (expdif >= 0) { + outexp += mp_unexp_add(n, radix, expdif, (DGTINT *)&in1[2], (DGTINT *)&in2[2], (DGTINT *)&out[2]); + } else { + outexp += mp_unexp_add(n, radix, -expdif, (DGTINT *)&in2[2], (DGTINT *)&in1[2], (DGTINT *)&out[2]); + } + } else { + outsgn = mp_unsgn_cmp(n, &in1[1], &in2[1]); + if (outsgn >= 0) { + expdif = mp_unexp_sub(n, radix, expdif, (DGTINT *)&in1[2], (DGTINT *)&in2[2], (DGTINT *)&out[2]); + } else { + expdif = mp_unexp_sub(n, radix, -expdif, (DGTINT *)&in2[2], (DGTINT *)&in1[2], (DGTINT *)&out[2]); + } + outexp -= expdif; + outsgn *= in1[0]; + if (expdif == n) { + outsgn = 0; + } + } + if (outsgn == 0) { + outexp = 0; + } + out[0] = outsgn; + out[1] = outexp; +} + +/* -------- mp_add child routines -------- */ + +int mp_unsgn_cmp(int n, int in1[], int in2[]) { + int j, cmp; + DGTINT *in1r, *in2r; + + in1r = ((DGTINT *)&in1[1]) - 1; + in2r = ((DGTINT *)&in2[1]) - 1; + cmp = in1[0] - in2[0]; + for (j = 1; j <= n && cmp == 0; j++) { + cmp = in1r[j] - in2r[j]; + } + if (cmp > 0) { + cmp = 1; + } else if (cmp < 0) { + cmp = -1; + } + return cmp; +} + +int mp_unexp_add(int n, int radix, int expdif, DGTINT in1[], DGTINT in2[], DGTINT out[]) { + int j, x, carry; + + carry = 0; + if (expdif == 0 && in1[0] + in2[0] >= radix) { + x = in1[n - 1] + in2[n - 1]; + carry = x >= radix ? -1 : 0; + for (j = n - 1; j > 0; j--) { + x = in1[j - 1] + in2[j - 1] - carry; + carry = x >= radix ? -1 : 0; + out[j] = (DGTINT)(x - (radix & carry)); + } + out[0] = (DGTINT)-carry; + } else { + if (expdif > n) { + expdif = n; + } + for (j = n - 1; j >= expdif; j--) { + x = in1[j] + in2[j - expdif] - carry; + carry = x >= radix ? -1 : 0; + out[j] = (DGTINT)(x - (radix & carry)); + } + for (j = expdif - 1; j >= 0; j--) { + x = in1[j] - carry; + carry = x >= radix ? 
-1 : 0; + out[j] = (DGTINT)(x - (radix & carry)); + } + if (carry != 0) { + for (j = n - 1; j > 0; j--) { + out[j] = out[j - 1]; + } + out[0] = (DGTINT)-carry; + } + } + return -carry; +} + +int mp_unexp_sub(int n, int radix, int expdif, DGTINT in1[], DGTINT in2[], DGTINT out[]) { + int j, x, borrow, ncancel; + + if (expdif > n) { + expdif = n; + } + borrow = 0; + for (j = n - 1; j >= expdif; j--) { + x = in1[j] - in2[j - expdif] + borrow; + borrow = x < 0 ? -1 : 0; + out[j] = (DGTINT)(x + (radix & borrow)); + } + for (j = expdif - 1; j >= 0; j--) { + x = in1[j] + borrow; + borrow = x < 0 ? -1 : 0; + out[j] = (DGTINT)(x + (radix & borrow)); + } + ncancel = 0; + for (j = 0; j < n && out[j] == 0; j++) { + ncancel = j + 1; + } + if (ncancel > 0 && ncancel < n) { + for (j = 0; j < n - ncancel; j++) { + out[j] = out[j + ncancel]; + } + for (j = n - ncancel; j < n; j++) { + out[j] = 0; + } + } + return ncancel; +} + +/* -------- mp_imul routines -------- */ + +void mp_imul(int n, int radix, int in1[], int in2, int out[]) { + void mp_unsgn_imul(int n, double dradix, int in1[], double din2, int out[]); + + if (in2 > 0) { + out[0] = in1[0]; + } else if (in2 < 0) { + out[0] = -in1[0]; + in2 = -in2; + } else { + out[0] = 0; + } + mp_unsgn_imul(n, radix, &in1[1], in2, &out[1]); + if (out[0] == 0) { + out[1] = 0; + } +} + +int mp_idiv(int n, int radix, int in1[], int in2, int out[]) { + void mp_load_0(int n, int radix, int out[]); + void mp_unsgn_idiv(int n, double dradix, int in1[], double din2, int out[]); + + if (in2 == 0) { + return -1; + } + if (in2 > 0) { + out[0] = in1[0]; + } else { + out[0] = -in1[0]; + in2 = -in2; + } + if (in1[0] == 0) { + mp_load_0(n, radix, out); + return 0; + } + mp_unsgn_idiv(n, radix, &in1[1], in2, &out[1]); + return 0; +} + +void mp_idiv_2(int n, int radix, int in[], int out[]) { + int j, ix, carry, shift; + DGTINT *inr, *outr; + + inr = ((DGTINT *)&in[2]) - 2; + outr = ((DGTINT *)&out[2]) - 2; + out[0] = in[0]; + shift = 0; + if (inr[2] == 1) 
{ + shift = 1; + } + out[1] = in[1] - shift; + carry = -shift; + for (j = 2; j <= n + 1 - shift; j++) { + ix = inr[j + shift] + (radix & carry); + carry = -(ix & 1); + outr[j] = (DGTINT)(ix >> 1); + } + if (shift > 0) { + outr[n + 1] = (DGTINT)((radix & carry) >> 1); + } +} + +/* -------- mp_imul child routines -------- */ + +void mp_unsgn_imul(int n, double dradix, int in1[], double din2, int out[]) { + int j, carry, shift; + double x, d1_radix; + DGTINT *in1r, *outr; + + in1r = ((DGTINT *)&in1[1]) - 1; + outr = ((DGTINT *)&out[1]) - 1; + d1_radix = 1.0 / dradix; + carry = 0; + for (j = n; j >= 1; j--) { + x = din2 * in1r[j] + carry + 0.5; + carry = (int)(d1_radix * x); + outr[j] = (DGTINT)(x - dradix * carry); + } + shift = 0; + x = carry + 0.5; + while (x > 1) { + x *= d1_radix; + shift++; + } + out[0] = in1[0] + shift; + if (shift > 0) { + while (shift > n) { + carry = (int)(d1_radix * carry + 0.5); + shift--; + } + for (j = n; j >= shift + 1; j--) { + outr[j] = outr[j - shift]; + } + for (j = shift; j >= 1; j--) { + x = carry + 0.5; + carry = (int)(d1_radix * x); + outr[j] = (DGTINT)(x - dradix * carry); + } + } +} + +void mp_unsgn_idiv(int n, double dradix, int in1[], double din2, int out[]) { + int j, ix, carry, shift; + double x, d1_in2; + DGTINT *in1r, *outr; + + in1r = ((DGTINT *)&in1[1]) - 1; + outr = ((DGTINT *)&out[1]) - 1; + d1_in2 = 1.0 / din2; + shift = 0; + x = 0; + do { + shift++; + x *= dradix; + if (shift <= n) { + x += in1r[shift]; + } + } while (x < din2 - 0.5); + x += 0.5; + ix = (int)(d1_in2 * x); + carry = (int)(x - din2 * ix); + outr[1] = (DGTINT)ix; + shift--; + out[0] = in1[0] - shift; + if (shift >= n) { + shift = n - 1; + } + for (j = 2; j <= n - shift; j++) { + x = in1r[j + shift] + dradix * carry + 0.5; + ix = (int)(d1_in2 * x); + carry = (int)(x - din2 * ix); + outr[j] = (DGTINT)ix; + } + for (j = n - shift + 1; j <= n; j++) { + x = dradix * carry + 0.5; + ix = (int)(d1_in2 * x); + carry = (int)(x - din2 * ix); + outr[j] = 
(DGTINT)ix; + } +} + +/* -------- mp_mul routines -------- */ + +double mp_mul_radix_test(int n, int radix, int nfft, double tmpfft[]) { + void mp_mul_csqu(int nfft, double d1[]); + double mp_mul_d2i_test(int radix, int nfft, double din[]); + int j, ndata, radix_2; + + ndata = (nfft >> 1) + 1; + if (ndata > n) { + ndata = n; + } + tmpfft[nfft + 1] = radix - 1; + for (j = nfft; j > ndata; j--) { + tmpfft[j] = 0; + } + radix_2 = (radix + 1) / 2; + for (j = ndata; j > 2; j--) { + tmpfft[j] = radix_2; + } + tmpfft[2] = radix; + tmpfft[1] = radix - 1; + tmpfft[0] = 0; + mp_mul_csqu(nfft, tmpfft); + return 2 * mp_mul_d2i_test(radix, nfft, tmpfft); +} + +void mp_mul(int n, int radix, int in1[], int in2[], int out[], int tmp[], int nfft, double tmp1fft[], double tmp2fft[], double tmp3fft[]) { + void mp_add(int n, int radix, int in1[], int in2[], int out[]); + void mp_mul_i2d(int n, int radix, int nfft, int shift, int in[], double dout[]); + void mp_mul_cmul_nt_out(int nfft, double d1[], double d2[]); + void mp_mul_cmul_nt_d2(int nfft, double d1[], double d2[]); + void mp_mul_cmul_nt_d1_add(int nfft, double d1[], double d2[], double d3[]); + void mp_mul_d2i(int n, int radix, int nfft, double din[], int out[]); + int n_h, shift; + DGTINT *in1r, *in2r; + + in1r = ((DGTINT *)&in1[2]) - 2; + in2r = ((DGTINT *)&in2[2]) - 2; + shift = (nfft >> 1) + 1; + while (n > shift) { + if (in1r[shift + 2] + in2r[shift + 2] != 0) { + break; + } + shift++; + } + n_h = n / 2 + 1; + if (n_h < n - shift) { + n_h = n - shift; + } + /* ---- tmp3fft = (upper) in1 * (lower) in2 ---- */ + mp_mul_i2d(n, radix, nfft, 0, in1, tmp1fft); + mp_mul_i2d(n, radix, nfft, shift, in2, tmp3fft); + mp_mul_cmul_nt_out(nfft, tmp1fft, tmp3fft); + /* ---- tmp = (upper) in1 * (upper) in2 ---- */ + mp_mul_i2d(n, radix, nfft, 0, in2, tmp2fft); + mp_mul_cmul_nt_d2(nfft, tmp2fft, tmp1fft); + mp_mul_d2i(n, radix, nfft, tmp1fft, tmp); + /* ---- tmp3fft += (upper) in2 * (lower) in1 ---- */ + mp_mul_i2d(n, radix, nfft, shift, 
in1, tmp1fft); + mp_mul_cmul_nt_d1_add(nfft, tmp2fft, tmp1fft, tmp3fft); + /* ---- out = tmp + tmp3fft ---- */ + mp_mul_d2i(n_h, radix, nfft, tmp3fft, out); + mp_add(n, radix, out, tmp, out); +} + +void mp_squ(int n, int radix, int in[], int out[], int tmp[], int nfft, double tmp1fft[], double tmp2fft[]) { + void mp_add(int n, int radix, int in1[], int in2[], int out[]); + void mp_mul_i2d(int n, int radix, int nfft, int shift, int in[], double dout[]); + void mp_mul_cmul(int nfft, double d1[], double d2[]); + void mp_mul_csqu_nt_d1(int nfft, double d1[]); + void mp_mul_d2i(int n, int radix, int nfft, double din[], int out[]); + int n_h, shift; + DGTINT *inr; + + inr = ((DGTINT *)&in[2]) - 2; + shift = (nfft >> 1) + 1; + while (n > shift) { + if (inr[shift + 2] != 0) { + break; + } + shift++; + } + n_h = n / 2 + 1; + if (n_h < n - shift) { + n_h = n - shift; + } + /* ---- tmp = 2 * (upper) in * (lower) in ---- */ + mp_mul_i2d(n, radix, nfft, 0, in, tmp1fft); + mp_mul_i2d(n, radix, nfft, shift, in, tmp2fft); + mp_mul_cmul(nfft, tmp1fft, tmp2fft); + mp_mul_d2i(n_h, radix, nfft, tmp2fft, tmp); + mp_add(n_h, radix, tmp, tmp, tmp); + /* ---- out = tmp + ((upper) in)^2 ---- */ + mp_mul_csqu_nt_d1(nfft, tmp1fft); + mp_mul_d2i(n, radix, nfft, tmp1fft, out); + mp_add(n, radix, out, tmp, out); +} + +void mp_mulhf(int n, int radix, int in1[], int in2[], int out[], int tmp[], int nfft, double in1fft[], double tmpfft[]) { + void mp_add(int n, int radix, int in1[], int in2[], int out[]); + void mp_mul_i2d(int n, int radix, int nfft, int shift, int in[], double dout[]); + void mp_mul_cmul(int nfft, double d1[], double d2[]); + void mp_mul_cmul_nt_d1(int nfft, double d1[], double d2[]); + void mp_mul_d2i(int n, int radix, int nfft, double din[], int out[]); + int n_h, shift; + DGTINT *in2r; + + in2r = ((DGTINT *)&in2[2]) - 2; + shift = (nfft >> 1) + 1; + while (n > shift) { + if (in2r[shift + 2] != 0) { + break; + } + shift++; + } + n_h = n / 2 + 1; + if (n_h < n - shift) { + n_h = 
n - shift; + } + /* ---- tmp = (upper) in1 * (upper) in2 ---- */ + mp_mul_i2d(n, radix, nfft, 0, in1, in1fft); + mp_mul_i2d(n, radix, nfft, 0, in2, tmpfft); + mp_mul_cmul(nfft, in1fft, tmpfft); + mp_mul_d2i(n, radix, nfft, tmpfft, tmp); + /* ---- out = tmp + (upper) in1 * (lower) in2 ---- */ + mp_mul_i2d(n, radix, nfft, shift, in2, tmpfft); + mp_mul_cmul_nt_d1(nfft, in1fft, tmpfft); + mp_mul_d2i(n_h, radix, nfft, tmpfft, out); + mp_add(n, radix, out, tmp, out); +} + +void mp_mulhf_use_in1fft(int n, int radix, double in1fft[], int in2[], int out[], int tmp[], int nfft, double tmpfft[]) { + void mp_add(int n, int radix, int in1[], int in2[], int out[]); + void mp_mul_i2d(int n, int radix, int nfft, int shift, int in[], double dout[]); + void mp_mul_cmul_nt_d1(int nfft, double d1[], double d2[]); + void mp_mul_d2i(int n, int radix, int nfft, double din[], int out[]); + int n_h, shift; + DGTINT *in2r; + + in2r = ((DGTINT *)&in2[2]) - 2; + shift = (nfft >> 1) + 1; + while (n > shift) { + if (in2r[shift + 2] != 0) { + break; + } + shift++; + } + n_h = n / 2 + 1; + if (n_h < n - shift) { + n_h = n - shift; + } + /* ---- tmp = (upper) in1fft * (upper) in2 ---- */ + mp_mul_i2d(n, radix, nfft, 0, in2, tmpfft); + mp_mul_cmul_nt_d1(nfft, in1fft, tmpfft); + mp_mul_d2i(n, radix, nfft, tmpfft, tmp); + /* ---- out = tmp + (upper) in1 * (lower) in2 ---- */ + mp_mul_i2d(n, radix, nfft, shift, in2, tmpfft); + mp_mul_cmul_nt_d1(nfft, in1fft, tmpfft); + mp_mul_d2i(n_h, radix, nfft, tmpfft, out); + mp_add(n, radix, out, tmp, out); +} + +void mp_squhf_use_infft(int n, int radix, double infft[], int in[], int out[], int tmp[], int nfft, double tmpfft[]) { + void mp_add(int n, int radix, int in1[], int in2[], int out[]); + void mp_mul_i2d(int n, int radix, int nfft, int shift, int in[], double dout[]); + void mp_mul_cmul_nt_d1(int nfft, double d1[], double d2[]); + void mp_mul_csqu_nt_d1(int nfft, double d1[]); + void mp_mul_d2i(int n, int radix, int nfft, double din[], int out[]); + int 
n_h, shift; + DGTINT *inr; + + inr = ((DGTINT *)&in[2]) - 2; + shift = (nfft >> 1) + 1; + while (n > shift) { + if (inr[shift + 2] != 0) { + break; + } + shift++; + } + n_h = n / 2 + 1; + if (n_h < n - shift) { + n_h = n - shift; + } + /* ---- tmp = (upper) infft * (lower) in ---- */ + mp_mul_i2d(n, radix, nfft, shift, in, tmpfft); + mp_mul_cmul_nt_d1(nfft, infft, tmpfft); + mp_mul_d2i(n_h, radix, nfft, tmpfft, tmp); + /* ---- out = tmp + ((upper) infft)^2 ---- */ + mp_mul_csqu_nt_d1(nfft, infft); + mp_mul_d2i(n, radix, nfft, infft, out); + mp_add(n, radix, out, tmp, out); +} + +void mp_mulh(int n, int radix, int in1[], int in2[], int out[], int nfft, double in1fft[], double outfft[]) { + void mp_mul_i2d(int n, int radix, int nfft, int shift, int in[], double dout[]); + void mp_mul_cmul(int nfft, double d1[], double d2[]); + void mp_mul_d2i(int n, int radix, int nfft, double din[], int out[]); + + mp_mul_i2d(n, radix, nfft, 0, in1, in1fft); + mp_mul_i2d(n, radix, nfft, 0, in2, outfft); + mp_mul_cmul(nfft, in1fft, outfft); + mp_mul_d2i(n, radix, nfft, outfft, out); +} + +void mp_mulh_use_in1fft(int n, int radix, double in1fft[], int shift, int in2[], int out[], int nfft, double outfft[]) { + void mp_mul_i2d(int n, int radix, int nfft, int shift, int in[], double dout[]); + void mp_mul_cmul_nt_d1(int nfft, double d1[], double d2[]); + void mp_mul_d2i(int n, int radix, int nfft, double din[], int out[]); + DGTINT *in2r; + + in2r = ((DGTINT *)&in2[2]) - 2; + while (n > shift) { + if (in2r[shift + 2] != 0) { + break; + } + shift++; + } + mp_mul_i2d(n, radix, nfft, shift, in2, outfft); + mp_mul_cmul_nt_d1(nfft, in1fft, outfft); + mp_mul_d2i(n, radix, nfft, outfft, out); +} + +void mp_squh(int n, int radix, int in[], int out[], int nfft, double outfft[]) { + void mp_mul_i2d(int n, int radix, int nfft, int shift, int in[], double dout[]); + void mp_mul_csqu(int nfft, double d1[]); + void mp_mul_d2i(int n, int radix, int nfft, double din[], int out[]); + + mp_mul_i2d(n, 
radix, nfft, 0, in, outfft); + mp_mul_csqu(nfft, outfft); + mp_mul_d2i(n, radix, nfft, outfft, out); +} + +void mp_squh_save_infft(int n, int radix, int in[], int out[], int nfft, double infft[], double outfft[]) { + void mp_mul_i2d(int n, int radix, int nfft, int shift, int in[], double dout[]); + void mp_mul_csqu_save_d1(int nfft, double d1[], double d2[]); + void mp_mul_d2i(int n, int radix, int nfft, double din[], int out[]); + + mp_mul_i2d(n, radix, nfft, 0, in, infft); + mp_mul_csqu_save_d1(nfft, infft, outfft); + mp_mul_d2i(n, radix, nfft, outfft, out); +} + +void mp_squh_use_in1fft(int n, int radix, double inoutfft[], int out[], int nfft) { + void mp_mul_csqu_nt_d1(int nfft, double d1[]); + void mp_mul_d2i(int n, int radix, int nfft, double din[], int out[]); + + mp_mul_csqu_nt_d1(nfft, inoutfft); + mp_mul_d2i(n, radix, nfft, inoutfft, out); +} + +/* -------- mp_mul child routines -------- */ + +void mp_mul_i2d(int n, int radix, int nfft, int shift, int in[], double dout[]) { + int j, x, carry, ndata, radix_2, topdgt; + DGTINT *inr; + + inr = ((DGTINT *)&in[2]) - 2; + ndata = 0; + topdgt = 0; + if (n > shift) { + topdgt = inr[shift + 2]; + ndata = (nfft >> 1) + 1; + if (ndata > n - shift) { + ndata = n - shift; + } + } + dout[nfft + 1] = in[0] * topdgt; + for (j = nfft; j > ndata; j--) { + dout[j] = 0; + } + /* ---- abs(dout[j]) <= radix/2 (to keep FFT precision) ---- */ + if (ndata > 1) { + radix_2 = radix / 2; + carry = 0; + for (j = ndata + 1; j > 3; j--) { + x = inr[j + shift] - carry; + carry = x >= radix_2 ? 
-1 : 0; + dout[j - 1] = x - (radix & carry); + } + dout[2] = inr[shift + 3] - carry; + } + dout[1] = topdgt; + dout[0] = in[1] - shift; +} + +void mp_mul_cmul(int nfft, double d1[], double d2[]) { + void cdft(int n, int isgn, double *a); + void mp_mul_rcmul(int n, double *a, double *b); + double xr, xi; + + cdft(nfft, 1, &d1[1]); + cdft(nfft, 1, &d2[1]); + d2[0] += d1[0]; + xr = d1[1] * d2[1] + d1[2] * d2[2]; + xi = d1[1] * d2[2] + d1[2] * d2[1]; + d2[1] = xr; + d2[2] = xi; + if (nfft > 2) { + mp_mul_rcmul(nfft, &d1[1], &d2[1]); + } + d2[nfft + 1] *= d1[nfft + 1]; + cdft(nfft, -1, &d2[1]); +} + +void mp_mul_cmul_nt_d1(int nfft, double d1[], double d2[]) { + void cdft(int n, int isgn, double *a); + void mp_mul_rcmul_nt_in1(int n, double *a, double *b); + double xr, xi; + + cdft(nfft, 1, &d2[1]); + d2[0] += d1[0]; + xr = d1[1] * d2[1] + d1[2] * d2[2]; + xi = d1[1] * d2[2] + d1[2] * d2[1]; + d2[1] = xr; + d2[2] = xi; + if (nfft > 2) { + mp_mul_rcmul_nt_in1(nfft, &d1[1], &d2[1]); + } + d2[nfft + 1] *= d1[nfft + 1]; + cdft(nfft, -1, &d2[1]); +} + +void mp_mul_cmul_nt_d2(int nfft, double d1[], double d2[]) { + void cdft(int n, int isgn, double *a); + void mp_mul_rcmul_nt_in2(int n, double *a, double *b); + double xr, xi; + + cdft(nfft, 1, &d1[1]); + d2[0] += d1[0]; + xr = d1[1] * d2[1] + d1[2] * d2[2]; + xi = d1[1] * d2[2] + d1[2] * d2[1]; + d2[1] = xr; + d2[2] = xi; + if (nfft > 2) { + mp_mul_rcmul_nt_in2(nfft, &d1[1], &d2[1]); + } + d2[nfft + 1] *= d1[nfft + 1]; + cdft(nfft, -1, &d2[1]); +} + +void mp_mul_cmul_nt_out(int nfft, double d1[], double d2[]) { + void cdft(int n, int isgn, double *a); + void mp_mul_rcmul_nt_out(int n, double *a, double *b); + double xr, xi; + + cdft(nfft, 1, &d1[1]); + cdft(nfft, 1, &d2[1]); + d2[0] += d1[0]; + xr = d1[1] * d2[1] + d1[2] * d2[2]; + xi = d1[1] * d2[2] + d1[2] * d2[1]; + d2[1] = xr; + d2[2] = xi; + if (nfft > 2) { + mp_mul_rcmul_nt_out(nfft, &d1[1], &d2[1]); + } + d2[nfft + 1] *= d1[nfft + 1]; +} + +void 
mp_mul_cmul_nt_d1_add(int nfft, double d1[], double d2[], double d3[]) { + void cdft(int n, int isgn, double *a); + void mp_mul_rcmul_nt_in1_add(int n, double *a, double *b, double *badd); + double xr, xi; + + cdft(nfft, 1, &d2[1]); + xr = d1[1] * d2[1] + d1[2] * d2[2]; + xi = d1[1] * d2[2] + d1[2] * d2[1]; + d3[1] += xr; + d3[2] += xi; + if (nfft > 2) { + mp_mul_rcmul_nt_in1_add(nfft, &d1[1], &d2[1], &d3[1]); + } + d3[nfft + 1] += d1[nfft + 1] * d2[nfft + 1]; + cdft(nfft, -1, &d3[1]); +} + +void mp_mul_csqu(int nfft, double d1[]) { + void cdft(int n, int isgn, double *a); + void mp_mul_rcsqu(int n, double *a); + double xr, xi; + + cdft(nfft, 1, &d1[1]); + d1[0] *= 2; + xr = d1[1] * d1[1] + d1[2] * d1[2]; + xi = 2 * d1[1] * d1[2]; + d1[1] = xr; + d1[2] = xi; + if (nfft > 2) { + mp_mul_rcsqu(nfft, &d1[1]); + } + d1[nfft + 1] *= d1[nfft + 1]; + cdft(nfft, -1, &d1[1]); +} + +void mp_mul_csqu_save_d1(int nfft, double d1[], double d2[]) { + void cdft(int n, int isgn, double *a); + void mp_mul_rcsqu_save(int n, double *a, double *b); + double xr, xi; + + cdft(nfft, 1, &d1[1]); + d2[0] = 2 * d1[0]; + xr = d1[1] * d1[1] + d1[2] * d1[2]; + xi = 2 * d1[1] * d1[2]; + d2[1] = xr; + d2[2] = xi; + if (nfft > 2) { + mp_mul_rcsqu_save(nfft, &d1[1], &d2[1]); + } + d2[nfft + 1] = d1[nfft + 1] * d1[nfft + 1]; + cdft(nfft, -1, &d2[1]); +} + +void mp_mul_csqu_nt_d1(int nfft, double d1[]) { + void cdft(int n, int isgn, double *a); + void mp_mul_rcsqu_nt_in(int n, double *a); + double xr, xi; + + d1[0] *= 2; + xr = d1[1] * d1[1] + d1[2] * d1[2]; + xi = 2 * d1[1] * d1[2]; + d1[1] = xr; + d1[2] = xi; + if (nfft > 2) { + mp_mul_rcsqu_nt_in(nfft, &d1[1]); + } + d1[nfft + 1] *= d1[nfft + 1]; + cdft(nfft, -1, &d1[1]); +} + +void mp_mul_d2i(int n, int radix, int nfft, double din[], int out[]) { + int j, carry, carry1, carry2, shift, ndata; + double x, scale, d1_radix, d1_radix2, pow_radix, topdgt; + DGTINT *outr; + + outr = ((DGTINT *)&out[2]) - 2; + scale = 2.0 / nfft; + d1_radix = 1.0 / 
radix; + d1_radix2 = d1_radix * d1_radix; + topdgt = din[nfft + 1]; + x = topdgt < 0 ? -topdgt : topdgt; + shift = x + 0.5 >= radix ? 1 : 0; + /* ---- correction of cyclic convolution of din[1] ---- */ + x *= nfft * 0.5; + din[nfft + 1] = din[1] - x; + din[1] = x; + /* ---- output of digits ---- */ + ndata = n; + if (n > nfft + 1 + shift) { + ndata = nfft + 1 + shift; + for (j = n + 1; j > ndata + 1; j--) { + outr[j] = 0; + } + } + x = 0; + pow_radix = 1; + for (j = ndata + 1 - shift; j <= nfft + 1; j++) { + x += pow_radix * din[j]; + pow_radix *= d1_radix; + if (pow_radix < DBL_EPSILON) { + break; + } + } + x = d1_radix2 * (scale * x + 0.5); + carry2 = ((int)x) - 1; + carry = (int)(radix * (x - carry2) + 0.5); + for (j = ndata; j > 1; j--) { + x = d1_radix2 * (scale * din[j - shift] + carry + 0.5); + carry = carry2; + carry2 = ((int)x) - 1; + x = radix * (x - carry2); + carry1 = (int)x; + outr[j + 1] = (DGTINT)(radix * (x - carry1)); + carry += carry1; + } + x = carry + ((double)radix) * carry2 + 0.5; + if (shift == 0) { + x += scale * din[1]; + } + carry = (int)(d1_radix * x); + outr[2] = (DGTINT)(x - ((double)radix) * carry); + if (carry > 0) { + for (j = n + 1; j > 2; j--) { + outr[j] = outr[j - 1]; + } + outr[2] = (DGTINT)carry; + shift++; + } + /* ---- output of exp, sgn ---- */ + x = din[0] + shift + 0.5; + shift = ((int)x) - 1; + out[1] = shift + ((int)(x - shift)); + out[0] = topdgt > 0.5 ? 
1 : -1; + if (outr[2] == 0) { + out[0] = 0; + out[1] = 0; + } +} + +double mp_mul_d2i_test(int radix, int nfft, double din[]) { + int j, carry, carry1, carry2; + double x, scale, d1_radix, d1_radix2, err; + + scale = 2.0 / nfft; + d1_radix = 1.0 / radix; + d1_radix2 = d1_radix * d1_radix; + /* ---- correction of cyclic convolution of din[1] ---- */ + x = din[nfft + 1] * nfft * 0.5; + if (x < 0) { + x = -x; + } + din[nfft + 1] = din[1] - x; + /* ---- check of digits ---- */ + err = 0; + carry = 0; + carry2 = 0; + for (j = nfft + 1; j > 1; j--) { + x = d1_radix2 * (scale * din[j] + carry + 0.5); + carry = carry2; + carry2 = ((int)x) - 1; + x = radix * (x - carry2); + carry1 = (int)x; + x = radix * (x - carry1); + carry += carry1; + x = x - 0.5 - ((int)x); + if (x > err) { + err = x; + } else if (-x > err) { + err = -x; + } + } + return err; +} + +/* -------- mp_mul child^2 routines (mix RFFT routines) -------- */ + +#ifndef M_PI_2 +#define M_PI_2 1.570796326794896619231321691639751442098584699687 +#endif + +#ifndef RDFT_LOOP_DIV /* control of the RDFT's speed & tolerance */ +#define RDFT_LOOP_DIV 64 +#endif + +void mp_mul_rcmul(int n, double *a, double *b) { + int i, i0, j, k; + double ec, w1r, w1i, wkr, wki, wdr, wdi, ss; + double xr, xi, yr, yi, ajr, aji, akr, aki, bjr, bji, bkr, bki; + + ec = 2 * M_PI_2 / n; + wkr = 0; + wki = 0; + wdi = cos(ec); + wdr = sin(ec); + wdi *= wdr; + wdr *= wdr; + w1r = 1 - 2 * wdr; + w1i = 2 * wdi; + ss = 2 * w1i; + i = n >> 1; + xr = a[i]; + xi = a[i + 1]; + yr = b[i]; + yi = b[i + 1]; + b[i] = xr * yr - xi * yi; + b[i + 1] = xr * yi + xi * yr; + for (;;) { + i0 = i - 4 * RDFT_LOOP_DIV; + if (i0 < 2) { + i0 = 2; + } + for (j = i - 2; j >= i0; j -= 2) { + k = n - j; + xr = wkr + ss * wdi; + xi = wki + ss * (0.5 - wdr); + wkr = wdr; + wki = wdi; + wdr = xr; + wdi = xi; + /* ---- transform CFFT data a[] into RFFT data ---- */ + xr = a[j] - a[k]; + xi = a[j + 1] + a[k + 1]; + yr = wkr * xr - wki * xi; + yi = wkr * xi + wki * xr; + ajr = 
a[j] - yr; + aji = a[j + 1] - yi; + akr = a[k] + yr; + aki = a[k + 1] - yi; + a[j] = ajr; + a[j + 1] = aji; + a[k] = akr; + a[k + 1] = aki; + /* ---- transform CFFT data b[] into RFFT data ---- */ + xr = b[j] - b[k]; + xi = b[j + 1] + b[k + 1]; + yr = wkr * xr - wki * xi; + yi = wkr * xi + wki * xr; + xr = b[j] - yr; + xi = b[j + 1] - yi; + yr = b[k] + yr; + yi = b[k + 1] - yi; + /* ---- cmul ---- */ + bjr = ajr * xr - aji * xi; + bji = ajr * xi + aji * xr; + bkr = akr * yr - aki * yi; + bki = akr * yi + aki * yr; + /* ---- transform RFFT data bxx into CFFT data ---- */ + xr = bjr - bkr; + xi = bji + bki; + yr = wkr * xr + wki * xi; + yi = wkr * xi - wki * xr; + b[j] = bjr - yr; + b[j + 1] = bji - yi; + b[k] = bkr + yr; + b[k + 1] = bki - yi; + } + if (i0 == 2) { + break; + } + wkr = 0.5 * sin(ec * i0); + wki = 0.5 * cos(ec * i0); + wdr = 0.5 - (wkr * w1r - wki * w1i); + wdi = wkr * w1i + wki * w1r; + wkr = 0.5 - wkr; + i = i0; + } +} + +void mp_mul_rcmul_nt_in1(int n, double *a, double *b) { + int i, i0, j, k; + double ec, w1r, w1i, wkr, wki, wdr, wdi, ss; + double xr, xi, yr, yi, bjr, bji, bkr, bki; + + ec = 2 * M_PI_2 / n; + wkr = 0; + wki = 0; + wdi = cos(ec); + wdr = sin(ec); + wdi *= wdr; + wdr *= wdr; + w1r = 1 - 2 * wdr; + w1i = 2 * wdi; + ss = 2 * w1i; + i = n >> 1; + xr = a[i]; + xi = a[i + 1]; + yr = b[i]; + yi = b[i + 1]; + b[i] = xr * yr - xi * yi; + b[i + 1] = xr * yi + xi * yr; + for (;;) { + i0 = i - 4 * RDFT_LOOP_DIV; + if (i0 < 2) { + i0 = 2; + } + for (j = i - 2; j >= i0; j -= 2) { + k = n - j; + xr = wkr + ss * wdi; + xi = wki + ss * (0.5 - wdr); + wkr = wdr; + wki = wdi; + wdr = xr; + wdi = xi; + /* ---- transform CFFT data b[] into RFFT data ---- */ + xr = b[j] - b[k]; + xi = b[j + 1] + b[k + 1]; + yr = wkr * xr - wki * xi; + yi = wkr * xi + wki * xr; + xr = b[j] - yr; + xi = b[j + 1] - yi; + yr = b[k] + yr; + yi = b[k + 1] - yi; + /* ---- cmul ---- */ + bjr = a[j] * xr - a[j + 1] * xi; + bji = a[j] * xi + a[j + 1] * xr; + bkr = a[k] * yr - 
a[k + 1] * yi; + bki = a[k] * yi + a[k + 1] * yr; + /* ---- transform RFFT data bxx into CFFT data ---- */ + xr = bjr - bkr; + xi = bji + bki; + yr = wkr * xr + wki * xi; + yi = wkr * xi - wki * xr; + b[j] = bjr - yr; + b[j + 1] = bji - yi; + b[k] = bkr + yr; + b[k + 1] = bki - yi; + } + if (i0 == 2) { + break; + } + wkr = 0.5 * sin(ec * i0); + wki = 0.5 * cos(ec * i0); + wdr = 0.5 - (wkr * w1r - wki * w1i); + wdi = wkr * w1i + wki * w1r; + wkr = 0.5 - wkr; + i = i0; + } +} + +void mp_mul_rcmul_nt_in2(int n, double *a, double *b) { + int i, i0, j, k; + double ec, w1r, w1i, wkr, wki, wdr, wdi, ss; + double xr, xi, yr, yi, bjr, bji, bkr, bki; + + ec = 2 * M_PI_2 / n; + wkr = 0; + wki = 0; + wdi = cos(ec); + wdr = sin(ec); + wdi *= wdr; + wdr *= wdr; + w1r = 1 - 2 * wdr; + w1i = 2 * wdi; + ss = 2 * w1i; + i = n >> 1; + xr = a[i]; + xi = a[i + 1]; + yr = b[i]; + yi = b[i + 1]; + b[i] = xr * yr - xi * yi; + b[i + 1] = xr * yi + xi * yr; + for (;;) { + i0 = i - 4 * RDFT_LOOP_DIV; + if (i0 < 2) { + i0 = 2; + } + for (j = i - 2; j >= i0; j -= 2) { + k = n - j; + xr = wkr + ss * wdi; + xi = wki + ss * (0.5 - wdr); + wkr = wdr; + wki = wdi; + wdr = xr; + wdi = xi; + /* ---- transform CFFT data a[] into RFFT data ---- */ + xr = a[j] - a[k]; + xi = a[j + 1] + a[k + 1]; + yr = wkr * xr - wki * xi; + yi = wkr * xi + wki * xr; + xr = a[j] - yr; + xi = a[j + 1] - yi; + yr = a[k] + yr; + yi = a[k + 1] - yi; + a[j] = xr; + a[j + 1] = xi; + a[k] = yr; + a[k + 1] = yi; + /* ---- cmul ---- */ + bjr = b[j] * xr - b[j + 1] * xi; + bji = b[j] * xi + b[j + 1] * xr; + bkr = b[k] * yr - b[k + 1] * yi; + bki = b[k] * yi + b[k + 1] * yr; + /* ---- transform RFFT data bxx into CFFT data ---- */ + xr = bjr - bkr; + xi = bji + bki; + yr = wkr * xr + wki * xi; + yi = wkr * xi - wki * xr; + b[j] = bjr - yr; + b[j + 1] = bji - yi; + b[k] = bkr + yr; + b[k + 1] = bki - yi; + } + if (i0 == 2) { + break; + } + wkr = 0.5 * sin(ec * i0); + wki = 0.5 * cos(ec * i0); + wdr = 0.5 - (wkr * w1r - wki * w1i); 
+ wdi = wkr * w1i + wki * w1r; + wkr = 0.5 - wkr; + i = i0; + } +} + +void mp_mul_rcmul_nt_out(int n, double *a, double *b) { + int i, i0, j, k; + double ec, w1r, w1i, wkr, wki, wdr, wdi, ss; + double xr, xi, yr, yi, ajr, aji, akr, aki; + + ec = 2 * M_PI_2 / n; + wkr = 0; + wki = 0; + wdi = cos(ec); + wdr = sin(ec); + wdi *= wdr; + wdr *= wdr; + w1r = 1 - 2 * wdr; + w1i = 2 * wdi; + ss = 2 * w1i; + i = n >> 1; + xr = a[i]; + xi = a[i + 1]; + yr = b[i]; + yi = b[i + 1]; + b[i] = xr * yr - xi * yi; + b[i + 1] = xr * yi + xi * yr; + for (;;) { + i0 = i - 4 * RDFT_LOOP_DIV; + if (i0 < 2) { + i0 = 2; + } + for (j = i - 2; j >= i0; j -= 2) { + k = n - j; + xr = wkr + ss * wdi; + xi = wki + ss * (0.5 - wdr); + wkr = wdr; + wki = wdi; + wdr = xr; + wdi = xi; + /* ---- transform CFFT data a[] into RFFT data ---- */ + xr = a[j] - a[k]; + xi = a[j + 1] + a[k + 1]; + yr = wkr * xr - wki * xi; + yi = wkr * xi + wki * xr; + ajr = a[j] - yr; + aji = a[j + 1] - yi; + akr = a[k] + yr; + aki = a[k + 1] - yi; + a[j] = ajr; + a[j + 1] = aji; + a[k] = akr; + a[k + 1] = aki; + /* ---- transform CFFT data b[] into RFFT data ---- */ + xr = b[j] - b[k]; + xi = b[j + 1] + b[k + 1]; + yr = wkr * xr - wki * xi; + yi = wkr * xi + wki * xr; + xr = b[j] - yr; + xi = b[j + 1] - yi; + yr = b[k] + yr; + yi = b[k + 1] - yi; + /* ---- cmul ---- */ + b[j] = ajr * xr - aji * xi; + b[j + 1] = ajr * xi + aji * xr; + b[k] = akr * yr - aki * yi; + b[k + 1] = akr * yi + aki * yr; + } + if (i0 == 2) { + break; + } + wkr = 0.5 * sin(ec * i0); + wki = 0.5 * cos(ec * i0); + wdr = 0.5 - (wkr * w1r - wki * w1i); + wdi = wkr * w1i + wki * w1r; + wkr = 0.5 - wkr; + i = i0; + } +} + +void mp_mul_rcmul_nt_in1_add(int n, double *a, double *b, double *badd) { + int i, i0, j, k; + double ec, w1r, w1i, wkr, wki, wdr, wdi, ss; + double xr, xi, yr, yi, bjr, bji, bkr, bki; + + ec = 2 * M_PI_2 / n; + wkr = 0; + wki = 0; + wdi = cos(ec); + wdr = sin(ec); + wdi *= wdr; + wdr *= wdr; + w1r = 1 - 2 * wdr; + w1i = 2 * wdi; + ss = 
2 * w1i; + i = n >> 1; + xr = a[i]; + xi = a[i + 1]; + yr = b[i]; + yi = b[i + 1]; + badd[i] += xr * yr - xi * yi; + badd[i + 1] += xr * yi + xi * yr; + for (;;) { + i0 = i - 4 * RDFT_LOOP_DIV; + if (i0 < 2) { + i0 = 2; + } + for (j = i - 2; j >= i0; j -= 2) { + k = n - j; + xr = wkr + ss * wdi; + xi = wki + ss * (0.5 - wdr); + wkr = wdr; + wki = wdi; + wdr = xr; + wdi = xi; + /* ---- transform CFFT data b[] into RFFT data ---- */ + xr = b[j] - b[k]; + xi = b[j + 1] + b[k + 1]; + yr = wkr * xr - wki * xi; + yi = wkr * xi + wki * xr; + xr = b[j] - yr; + xi = b[j + 1] - yi; + yr = b[k] + yr; + yi = b[k + 1] - yi; + /* ---- cmul + add ---- */ + bjr = badd[j] + (a[j] * xr - a[j + 1] * xi); + bji = badd[j + 1] + (a[j] * xi + a[j + 1] * xr); + bkr = badd[k] + (a[k] * yr - a[k + 1] * yi); + bki = badd[k + 1] + (a[k] * yi + a[k + 1] * yr); + /* ---- transform RFFT data bxx into CFFT data ---- */ + xr = bjr - bkr; + xi = bji + bki; + yr = wkr * xr + wki * xi; + yi = wkr * xi - wki * xr; + badd[j] = bjr - yr; + badd[j + 1] = bji - yi; + badd[k] = bkr + yr; + badd[k + 1] = bki - yi; + } + if (i0 == 2) { + break; + } + wkr = 0.5 * sin(ec * i0); + wki = 0.5 * cos(ec * i0); + wdr = 0.5 - (wkr * w1r - wki * w1i); + wdi = wkr * w1i + wki * w1r; + wkr = 0.5 - wkr; + i = i0; + } +} + +void mp_mul_rcsqu(int n, double *a) { + int i, i0, j, k; + double ec, w1r, w1i, wkr, wki, wdr, wdi, ss; + double xr, xi, yr, yi, ajr, aji, akr, aki; + + ec = 2 * M_PI_2 / n; + wkr = 0; + wki = 0; + wdi = cos(ec); + wdr = sin(ec); + wdi *= wdr; + wdr *= wdr; + w1r = 1 - 2 * wdr; + w1i = 2 * wdi; + ss = 2 * w1i; + i = n >> 1; + xr = a[i]; + xi = a[i + 1]; + a[i] = xr * xr - xi * xi; + a[i + 1] = 2 * xr * xi; + for (;;) { + i0 = i - 4 * RDFT_LOOP_DIV; + if (i0 < 2) { + i0 = 2; + } + for (j = i - 2; j >= i0; j -= 2) { + k = n - j; + xr = wkr + ss * wdi; + xi = wki + ss * (0.5 - wdr); + wkr = wdr; + wki = wdi; + wdr = xr; + wdi = xi; + /* ---- transform CFFT data a[] into RFFT data ---- */ + xr = a[j] - 
a[k]; + xi = a[j + 1] + a[k + 1]; + yr = wkr * xr - wki * xi; + yi = wkr * xi + wki * xr; + xr = a[j] - yr; + xi = a[j + 1] - yi; + yr = a[k] + yr; + yi = a[k + 1] - yi; + /* ---- csqu ---- */ + ajr = xr * xr - xi * xi; + aji = 2 * xr * xi; + akr = yr * yr - yi * yi; + aki = 2 * yr * yi; + /* ---- transform RFFT data axx into CFFT data ---- */ + xr = ajr - akr; + xi = aji + aki; + yr = wkr * xr + wki * xi; + yi = wkr * xi - wki * xr; + a[j] = ajr - yr; + a[j + 1] = aji - yi; + a[k] = akr + yr; + a[k + 1] = aki - yi; + } + if (i0 == 2) { + break; + } + wkr = 0.5 * sin(ec * i0); + wki = 0.5 * cos(ec * i0); + wdr = 0.5 - (wkr * w1r - wki * w1i); + wdi = wkr * w1i + wki * w1r; + wkr = 0.5 - wkr; + i = i0; + } +} + +void mp_mul_rcsqu_save(int n, double *a, double *b) { + int i, i0, j, k; + double ec, w1r, w1i, wkr, wki, wdr, wdi, ss; + double xr, xi, yr, yi, ajr, aji, akr, aki; + + ec = 2 * M_PI_2 / n; + wkr = 0; + wki = 0; + wdi = cos(ec); + wdr = sin(ec); + wdi *= wdr; + wdr *= wdr; + w1r = 1 - 2 * wdr; + w1i = 2 * wdi; + ss = 2 * w1i; + i = n >> 1; + xr = a[i]; + xi = a[i + 1]; + b[i] = xr * xr - xi * xi; + b[i + 1] = 2 * xr * xi; + for (;;) { + i0 = i - 4 * RDFT_LOOP_DIV; + if (i0 < 2) { + i0 = 2; + } + for (j = i - 2; j >= i0; j -= 2) { + k = n - j; + xr = wkr + ss * wdi; + xi = wki + ss * (0.5 - wdr); + wkr = wdr; + wki = wdi; + wdr = xr; + wdi = xi; + /* ---- transform CFFT data a[] into RFFT data ---- */ + xr = a[j] - a[k]; + xi = a[j + 1] + a[k + 1]; + yr = wkr * xr - wki * xi; + yi = wkr * xi + wki * xr; + xr = a[j] - yr; + xi = a[j + 1] - yi; + yr = a[k] + yr; + yi = a[k + 1] - yi; + a[j] = xr; + a[j + 1] = xi; + a[k] = yr; + a[k + 1] = yi; + /* ---- csqu ---- */ + ajr = xr * xr - xi * xi; + aji = 2 * xr * xi; + akr = yr * yr - yi * yi; + aki = 2 * yr * yi; + /* ---- transform RFFT data axx into CFFT data ---- */ + xr = ajr - akr; + xi = aji + aki; + yr = wkr * xr + wki * xi; + yi = wkr * xi - wki * xr; + b[j] = ajr - yr; + b[j + 1] = aji - yi; + b[k] = akr + 
yr; + b[k + 1] = aki - yi; + } + if (i0 == 2) { + break; + } + wkr = 0.5 * sin(ec * i0); + wki = 0.5 * cos(ec * i0); + wdr = 0.5 - (wkr * w1r - wki * w1i); + wdi = wkr * w1i + wki * w1r; + wkr = 0.5 - wkr; + i = i0; + } +} + +void mp_mul_rcsqu_nt_in(int n, double *a) { + int i, i0, j, k; + double ec, w1r, w1i, wkr, wki, wdr, wdi, ss; + double xr, xi, yr, yi, ajr, aji, akr, aki; + + ec = 2 * M_PI_2 / n; + wkr = 0; + wki = 0; + wdi = cos(ec); + wdr = sin(ec); + wdi *= wdr; + wdr *= wdr; + w1r = 1 - 2 * wdr; + w1i = 2 * wdi; + ss = 2 * w1i; + i = n >> 1; + xr = a[i]; + xi = a[i + 1]; + a[i] = xr * xr - xi * xi; + a[i + 1] = 2 * xr * xi; + for (;;) { + i0 = i - 4 * RDFT_LOOP_DIV; + if (i0 < 2) { + i0 = 2; + } + for (j = i - 2; j >= i0; j -= 2) { + k = n - j; + xr = wkr + ss * wdi; + xi = wki + ss * (0.5 - wdr); + wkr = wdr; + wki = wdi; + wdr = xr; + wdi = xi; + /* ---- csqu ---- */ + xr = a[j]; + xi = a[j + 1]; + yr = a[k]; + yi = a[k + 1]; + ajr = xr * xr - xi * xi; + aji = 2 * xr * xi; + akr = yr * yr - yi * yi; + aki = 2 * yr * yi; + /* ---- transform RFFT data axx into CFFT data ---- */ + xr = ajr - akr; + xi = aji + aki; + yr = wkr * xr + wki * xi; + yi = wkr * xi - wki * xr; + a[j] = ajr - yr; + a[j + 1] = aji - yi; + a[k] = akr + yr; + a[k + 1] = aki - yi; + } + if (i0 == 2) { + break; + } + wkr = 0.5 * sin(ec * i0); + wki = 0.5 * cos(ec * i0); + wdr = 0.5 - (wkr * w1r - wki * w1i); + wdi = wkr * w1i + wki * w1r; + wkr = 0.5 - wkr; + i = i0; + } +} + +/* -------- mp_inv routines -------- */ + +int mp_inv(int n, int radix, int in[], int out[], int tmp1[], int tmp2[], int nfft, double tmp1fft[], double tmp2fft[]) { + int mp_get_nfft_init(int radix, int nfft_max); + void mp_inv_init(int n, int radix, int in[], int out[]); + int mp_inv_newton(int n, int radix, int in[], int inout[], int tmp1[], int tmp2[], int nfft, double tmp1fft[], double tmp2fft[]); + int n_nwt, nfft_nwt, thr, prc; + + if (in[0] == 0) { + return -1; + } + nfft_nwt = mp_get_nfft_init(radix, 
nfft); + n_nwt = nfft_nwt + 2; + if (n_nwt > n) { + n_nwt = n; + } + mp_inv_init(n_nwt, radix, in, out); + thr = 8; + do { + n_nwt = nfft_nwt + 2; + if (n_nwt > n) { + n_nwt = n; + } + prc = mp_inv_newton(n_nwt, radix, in, out, tmp1, tmp2, nfft_nwt, tmp1fft, tmp2fft); +#ifdef DEBUG + printf("n=%d, nfft=%d, prc=%d\n", n_nwt, nfft_nwt, prc); +#endif + if (thr * nfft_nwt >= nfft) { + thr = 0; + if (2 * prc <= n_nwt - 2) { + nfft_nwt >>= 1; + } + } else { + if (3 * prc < n_nwt - 2) { + nfft_nwt >>= 1; + } + } + nfft_nwt <<= 1; + } while (nfft_nwt <= nfft); + return 0; +} + +int mp_sqrt(int n, int radix, int in[], int out[], int tmp1[], int tmp2[], int nfft, double tmp1fft[], double tmp2fft[]) { + void mp_load_0(int n, int radix, int out[]); + int mp_get_nfft_init(int radix, int nfft_max); + void mp_sqrt_init(int n, int radix, int in[], int out[], int out_rev[]); + int mp_sqrt_newton(int n, int radix, int in[], int inout[], int inout_rev[], int tmp[], int nfft, double tmp1fft[], double tmp2fft[], int *n_tmp1fft); + int n_nwt, nfft_nwt, thr, prc, n_tmp1fft; + + if (in[0] < 0) { + return -1; + } else if (in[0] == 0) { + mp_load_0(n, radix, out); + return 0; + } + nfft_nwt = mp_get_nfft_init(radix, nfft); + n_nwt = nfft_nwt + 2; + if (n_nwt > n) { + n_nwt = n; + } + mp_sqrt_init(n_nwt, radix, in, out, tmp1); + n_tmp1fft = 0; + thr = 8; + do { + n_nwt = nfft_nwt + 2; + if (n_nwt > n) { + n_nwt = n; + } + prc = mp_sqrt_newton(n_nwt, radix, in, out, tmp1, tmp2, nfft_nwt, tmp1fft, tmp2fft, &n_tmp1fft); +#ifdef DEBUG + printf("n=%d, nfft=%d, prc=%d\n", n_nwt, nfft_nwt, prc); +#endif + if (thr * nfft_nwt >= nfft) { + thr = 0; + if (2 * prc <= n_nwt - 2) { + nfft_nwt >>= 1; + } + } else { + if (3 * prc < n_nwt - 2) { + nfft_nwt >>= 1; + } + } + nfft_nwt <<= 1; + } while (nfft_nwt <= nfft); + return 0; +} + +int mp_invisqrt(int n, int radix, int in, int out[], int tmp1[], int tmp2[], int nfft, double tmp1fft[], double tmp2fft[]) { + int mp_get_nfft_init(int radix, int nfft_max); + 
void mp_invisqrt_init(int n, int radix, int in, int out[]); + int mp_invisqrt_newton(int n, int radix, int in, int inout[], int tmp1[], int tmp2[], int nfft, double tmp1fft[], double tmp2fft[]); + int n_nwt, nfft_nwt, thr, prc; + + if (in <= 0) { + return -1; + } + nfft_nwt = mp_get_nfft_init(radix, nfft); + n_nwt = nfft_nwt + 2; + if (n_nwt > n) { + n_nwt = n; + } + mp_invisqrt_init(n_nwt, radix, in, out); + thr = 8; + do { + n_nwt = nfft_nwt + 2; + if (n_nwt > n) { + n_nwt = n; + } + prc = mp_invisqrt_newton(n_nwt, radix, in, out, tmp1, tmp2, nfft_nwt, tmp1fft, tmp2fft); +#ifdef DEBUG + printf("n=%d, nfft=%d, prc=%d\n", n_nwt, nfft_nwt, prc); +#endif + if (thr * nfft_nwt >= nfft) { + thr = 0; + if (2 * prc <= n_nwt - 2) { + nfft_nwt >>= 1; + } + } else { + if (3 * prc < n_nwt - 2) { + nfft_nwt >>= 1; + } + } + nfft_nwt <<= 1; + } while (nfft_nwt <= nfft); + return 0; +} + +/* -------- mp_inv child routines -------- */ + +int mp_get_nfft_init(int radix, int nfft_max) { + int nfft_init; + double r; + + r = radix; + nfft_init = 1; + do { + r *= r; + nfft_init <<= 1; + } while (DBL_EPSILON * r < 1 && nfft_init < nfft_max); + return nfft_init; +} + +void mp_inv_init(int n, int radix, int in[], int out[]) { + void mp_unexp_d2mp(int n, int radix, double din, DGTINT out[]); + double mp_unexp_mp2d(int n, int radix, DGTINT in[]); + int outexp; + double din; + + out[0] = in[0]; + outexp = -in[1]; + din = 1.0 / mp_unexp_mp2d(n, radix, (DGTINT *)&in[2]); + while (din < 1) { + din *= radix; + outexp--; + } + out[1] = outexp; + mp_unexp_d2mp(n, radix, din, (DGTINT *)&out[2]); +} + +void mp_sqrt_init(int n, int radix, int in[], int out[], int out_rev[]) { + void mp_unexp_d2mp(int n, int radix, double din, DGTINT out[]); + double mp_unexp_mp2d(int n, int radix, DGTINT in[]); + int outexp; + double din; + + out[0] = 1; + out_rev[0] = 1; + outexp = in[1]; + din = mp_unexp_mp2d(n, radix, (DGTINT *)&in[2]); + if (outexp % 2 != 0) { + din *= radix; + outexp--; + } + outexp /= 2; + din 
= sqrt(din); + if (din < 1) { + din *= radix; + outexp--; + } + out[1] = outexp; + mp_unexp_d2mp(n, radix, din, (DGTINT *)&out[2]); + outexp = -outexp; + din = 1.0 / din; + while (din < 1) { + din *= radix; + outexp--; + } + out_rev[1] = outexp; + mp_unexp_d2mp(n, radix, din, (DGTINT *)&out_rev[2]); +} + +void mp_invisqrt_init(int n, int radix, int in, int out[]) { + void mp_unexp_d2mp(int n, int radix, double din, DGTINT out[]); + int outexp; + double dout; + + out[0] = 1; + outexp = 0; + dout = sqrt(1.0 / in); + while (dout < 1) { + dout *= radix; + outexp--; + } + out[1] = outexp; + mp_unexp_d2mp(n, radix, dout, (DGTINT *)&out[2]); +} + +void mp_unexp_d2mp(int n, int radix, double din, DGTINT out[]) { + int j, x; + + for (j = 0; j < n; j++) { + x = (int)din; + if (x >= radix) { + x = radix - 1; + din = radix; + } + din = radix * (din - x); + out[j] = (DGTINT)x; + } +} + +double mp_unexp_mp2d(int n, int radix, DGTINT in[]) { + int j; + double d1_radix, dout; + + d1_radix = 1.0 / radix; + dout = 0; + for (j = n - 1; j >= 0; j--) { + dout = d1_radix * dout + in[j]; + } + return dout; +} + +int mp_inv_newton(int n, int radix, int in[], int inout[], int tmp1[], int tmp2[], int nfft, double tmp1fft[], double tmp2fft[]) { + void mp_load_1(int n, int radix, int out[]); + void mp_round(int n, int radix, int m, int inout[]); + void mp_add(int n, int radix, int in1[], int in2[], int out[]); + void mp_sub(int n, int radix, int in1[], int in2[], int out[]); + void mp_mulh(int n, int radix, int in1[], int in2[], int out[], int nfft, double in1fft[], double outfft[]); + void mp_mulh_use_in1fft(int n, int radix, double in1fft[], int shift, int in2[], int out[], int nfft, double outfft[]); + int n_h, shift, prc; + + shift = (nfft >> 1) + 1; + n_h = n / 2 + 1; + if (n_h < n - shift) { + n_h = n - shift; + } + /* ---- tmp1 = inout * (upper) in (half to normal precision) ---- */ + mp_round(n, radix, shift, inout); + mp_mulh(n, radix, inout, in, tmp1, nfft, tmp1fft, tmp2fft); + /* 
---- tmp2 = 1 - tmp1 ---- */ + mp_load_1(n, radix, tmp2); + mp_sub(n, radix, tmp2, tmp1, tmp2); + /* ---- tmp2 -= inout * (lower) in (half precision) ---- */ + mp_mulh_use_in1fft(n, radix, tmp1fft, shift, in, tmp1, nfft, tmp2fft); + mp_sub(n_h, radix, tmp2, tmp1, tmp2); + /* ---- get precision ---- */ + prc = -tmp2[1]; + if (tmp2[0] == 0) { + prc = nfft + 1; + } + /* ---- tmp2 *= inout (half precision) ---- */ + mp_mulh_use_in1fft(n_h, radix, tmp1fft, 0, tmp2, tmp2, nfft, tmp2fft); + /* ---- inout += tmp2 ---- */ + mp_add(n, radix, inout, tmp2, inout); + return prc; +} + +int mp_sqrt_newton(int n, int radix, int in[], int inout[], int inout_rev[], int tmp[], int nfft, double tmp1fft[], double tmp2fft[], int *n_tmp1fft) { + void mp_round(int n, int radix, int m, int inout[]); + void mp_add(int n, int radix, int in1[], int in2[], int out[]); + void mp_sub(int n, int radix, int in1[], int in2[], int out[]); + void mp_idiv_2(int n, int radix, int in[], int out[]); + void mp_mulh(int n, int radix, int in1[], int in2[], int out[], int nfft, double in1fft[], double outfft[]); + void mp_squh(int n, int radix, int in[], int out[], int nfft, double outfft[]); + void mp_squh_use_in1fft(int n, int radix, double inoutfft[], int out[], int nfft); + int n_h, nfft_h, shift, prc; + + nfft_h = nfft >> 1; + shift = nfft_h + 1; + if (nfft_h < 2) { + nfft_h = 2; + } + n_h = n / 2 + 1; + if (n_h < n - shift) { + n_h = n - shift; + } + /* ---- tmp = inout_rev^2 (1/4 to half precision) ---- */ + mp_round(n_h, radix, (nfft_h >> 1) + 1, inout_rev); + if (*n_tmp1fft != nfft_h) { + mp_squh(n_h, radix, inout_rev, tmp, nfft_h, tmp1fft); + } else { + mp_squh_use_in1fft(n_h, radix, tmp1fft, tmp, nfft_h); + } + /* ---- tmp = inout_rev - inout * tmp (half precision) ---- */ + mp_round(n, radix, shift, inout); + mp_mulh(n_h, radix, inout, tmp, tmp, nfft, tmp1fft, tmp2fft); + mp_sub(n_h, radix, inout_rev, tmp, tmp); + /* ---- inout_rev += tmp ---- */ + mp_add(n_h, radix, inout_rev, tmp, inout_rev); + 
/* ---- tmp = in - inout^2 (half to normal precision) ---- */ + mp_squh_use_in1fft(n, radix, tmp1fft, tmp, nfft); + mp_sub(n, radix, in, tmp, tmp); + /* ---- get precision ---- */ + prc = in[1] - tmp[1]; + if (((DGTINT *)&in[2])[0] > ((DGTINT *)&tmp[2])[0]) { + prc++; + } + if (tmp[0] == 0) { + prc = nfft + 1; + } + /* ---- tmp = tmp * inout_rev / 2 (half precision) ---- */ + mp_round(n_h, radix, shift, inout_rev); + mp_mulh(n_h, radix, inout_rev, tmp, tmp, nfft, tmp1fft, tmp2fft); + *n_tmp1fft = nfft; + mp_idiv_2(n_h, radix, tmp, tmp); + /* ---- inout += tmp ---- */ + mp_add(n, radix, inout, tmp, inout); + return prc; +} + +int mp_invisqrt_newton(int n, int radix, int in, int inout[], int tmp1[], int tmp2[], int nfft, double tmp1fft[], double tmp2fft[]) { + void mp_load_1(int n, int radix, int out[]); + void mp_round(int n, int radix, int m, int inout[]); + void mp_add(int n, int radix, int in1[], int in2[], int out[]); + void mp_sub(int n, int radix, int in1[], int in2[], int out[]); + void mp_imul(int n, int radix, int in1[], int in2, int out[]); + void mp_idiv_2(int n, int radix, int in[], int out[]); + void mp_squh_save_infft(int n, int radix, int in[], int out[], int nfft, double infft[], double outfft[]); + void mp_mulh_use_in1fft(int n, int radix, double in1fft[], int shift, int in2[], int out[], int nfft, double outfft[]); + int n_h, shift, prc; + + shift = (nfft >> 1) + 1; + n_h = n / 2 + 1; + if (n_h < n - shift) { + n_h = n - shift; + } + /* ---- tmp1 = in * inout^2 (half to normal precision) ---- */ + mp_round(n, radix, shift, inout); + mp_squh_save_infft(n, radix, inout, tmp1, nfft, tmp1fft, tmp2fft); + mp_imul(n, radix, tmp1, in, tmp1); + /* ---- tmp2 = 1 - tmp1 ---- */ + mp_load_1(n, radix, tmp2); + mp_sub(n, radix, tmp2, tmp1, tmp2); + /* ---- get precision ---- */ + prc = -tmp2[1]; + if (tmp2[0] == 0) { + prc = nfft + 1; + } + /* ---- tmp2 *= inout / 2 (half precision) ---- */ + mp_mulh_use_in1fft(n_h, radix, tmp1fft, 0, tmp2, tmp2, nfft, 
tmp2fft); + mp_idiv_2(n_h, radix, tmp2, tmp2); + /* ---- inout += tmp2 ---- */ + mp_add(n, radix, inout, tmp2, inout); + return prc; +} + +/* -------- mp_io routines -------- */ + +void mp_sprintf(int n, int log10_radix, int in[], char out[]) { + int j, k, x, y, outexp, shift; + DGTINT *inr; + + inr = ((DGTINT *)&in[2]) - 2; + if (in[0] < 0) { + *out++ = '-'; + } + x = inr[2]; + shift = log10_radix; + for (k = log10_radix; k > 0; k--) { + y = x % 10; + x /= 10; + out[k] = '0' + y; + if (y != 0) { + shift = k; + } + } + out[0] = out[shift]; + out[1] = '.'; + for (k = 1; k <= log10_radix - shift; k++) { + out[k + 1] = out[k + shift]; + } + outexp = log10_radix - shift; + out += outexp + 2; + for (j = 3; j <= n + 1; j++) { + x = inr[j]; + for (k = log10_radix - 1; k >= 0; k--) { + y = x % 10; + x /= 10; + out[k] = '0' + y; + } + out += log10_radix; + } + *out++ = 'e'; + outexp += log10_radix * in[1]; + sprintf(out, "%d", outexp); +} + +void mp_sscanf(int n, int log10_radix, char in[], int out[]) { + char *s; + int j, x, outexp, outexp_mod; + DGTINT *outr; + + outr = ((DGTINT *)&out[2]) - 2; + while (*in == ' ') { + in++; + } + out[0] = 1; + if (*in == '-') { + out[0] = -1; + in++; + } else if (*in == '+') { + in++; + } + while (*in == ' ' || *in == '0') { + in++; + } + outexp = 0; + for (s = in; *s != '\0'; s++) { + if (*s == 'e' || *s == 'E' || *s == 'd' || *s == 'D') { + if (sscanf(++s, "%d", &outexp) != 1) { + outexp = 0; + } + break; + } + } + if (*in == '.') { + do { + outexp--; + while (*++in == ' '); + } while (*in == '0' && *in != '\0'); + } else if (*in != '\0') { + s = in; + while (*++s == ' '); + while (*s >= '0' && *s <= '9' && *s != '\0') { + outexp++; + while (*++s == ' '); + } + } + x = outexp / log10_radix; + outexp_mod = outexp - log10_radix * x; + if (outexp_mod < 0) { + x--; + outexp_mod += log10_radix; + } + out[1] = x; + x = 0; + j = 2; + for (s = in; *s != '\0'; s++) { + if (*s == '.' 
|| *s == ' ') { + continue; + } + if (*s < '0' || *s > '9') { + break; + } + x = 10 * x + (*s - '0'); + if (--outexp_mod < 0) { + if (j > n + 1) { + break; + } + outr[j++] = (DGTINT)x; + x = 0; + outexp_mod = log10_radix - 1; + } + } + while (outexp_mod-- >= 0) { + x *= 10; + } + while (j <= n + 1) { + outr[j++] = (DGTINT)x; + x = 0; + } + if (outr[2] == 0) { + out[0] = 0; + out[1] = 0; + } +} diff --git a/tests/performance/superpi/pi_fftcs.h b/tests/performance/superpi/pi_fftcs.h new file mode 100644 index 000000000..419b15613 --- /dev/null +++ b/tests/performance/superpi/pi_fftcs.h @@ -0,0 +1,47 @@ +/* + Based on "Calculation of PI(= 3.14159...) using FFT and AGM" by T.Ooura, Nov. 1999. + https://github.com/Fibonacci43/SuperPI + Modified for Arduino by Lucas Saavedra Vaz, 2024. +*/ + +#pragma once + +#include + +#define PI_FFTC_VER "ver. LG1.1.2-MP1.5.2a.memsave" + +/* Please check the following macros before compiling */ +#ifndef DBL_ERROR_MARGIN +#define DBL_ERROR_MARGIN 0.4 /* must be < 0.5 */ +#endif + +#define DGTINT short int /* sizeof(DGTINT) == 2 */ +#define DGTINT_MAX SHRT_MAX + +#define DGT_PACK 10 +#define DGT_PACK_LINE 5 +#define DGT_LINE_BLOCK 20 + +void pi_calc(int nfft); +void mp_load_0(int n, int radix, int out[]); +void mp_load_1(int n, int radix, int out[]); +void mp_round(int n, int radix, int m, int inout[]); +int mp_cmp(int n, int radix, int in1[], int in2[]); +void mp_add(int n, int radix, int in1[], int in2[], int out[]); +void mp_sub(int n, int radix, int in1[], int in2[], int out[]); +void mp_imul(int n, int radix, int in1[], int in2, int out[]); +int mp_idiv(int n, int radix, int in1[], int in2, int out[]); +void mp_idiv_2(int n, int radix, int in[], int out[]); +double mp_mul_radix_test(int n, int radix, int nfft, double tmpfft[]); +void mp_mul(int n, int radix, int in1[], int in2[], int out[], int tmp[], int nfft, double tmp1fft[], double tmp2fft[], double tmp3fft[]); +void mp_squ(int n, int radix, int in[], int out[], int tmp[], int 
nfft, double tmp1fft[], double tmp2fft[]); +void mp_mulhf(int n, int radix, int in1[], int in2[], int out[], int tmp[], int nfft, double in1fft[], double tmpfft[]); +void mp_mulhf_use_in1fft(int n, int radix, double in1fft[], int in2[], int out[], int tmp[], int nfft, double tmpfft[]); +void mp_squhf_use_infft(int n, int radix, double infft[], int in[], int out[], int tmp[], int nfft, double tmpfft[]); +void mp_mulh(int n, int radix, int in1[], int in2[], int out[], int nfft, double in1fft[], double outfft[]); +void mp_squh(int n, int radix, int in[], int out[], int nfft, double outfft[]); +int mp_inv(int n, int radix, int in[], int out[], int tmp1[], int tmp2[], int nfft, double tmp1fft[], double tmp2fft[]); +int mp_sqrt(int n, int radix, int in[], int out[], int tmp1[], int tmp2[], int nfft, double tmp1fft[], double tmp2fft[]); +int mp_invisqrt(int n, int radix, int in, int out[], int tmp1[], int tmp2[], int nfft, double tmp1fft[], double tmp2fft[]); +void mp_sprintf(int n, int log10_radix, int in[], char out[]); +void mp_sscanf(int n, int log10_radix, char in[], int out[]); diff --git a/tests/performance/superpi/superpi.ino b/tests/performance/superpi/superpi.ino new file mode 100644 index 000000000..ffa6c932b --- /dev/null +++ b/tests/performance/superpi/superpi.ino @@ -0,0 +1,41 @@ +/* + Based on "Calculation of PI(= 3.14159...) using FFT and AGM" by T.Ooura, Nov. 1999. + https://github.com/Fibonacci43/SuperPI + Modified for Arduino by Lucas Saavedra Vaz, 2024. 
+*/ + +#include + +#include "pi_fftcs.h" + +// Number of runs to average +#define N_RUNS 3 + +// Number of decimal digits to calculate +#define DIGITS (1 << 14) + +void setup() { + Serial.begin(115200); + while (!Serial) { + delay(10); + } + + log_d("Starting PI calculation"); + Serial.printf("Runs: %d\n", N_RUNS); + Serial.printf("Digits: %d\n", DIGITS); + Serial.flush(); + for (int i = 0; i < N_RUNS; i++) { + Serial.printf("Run %d", i); + unsigned long start = millis(); + pi_calc(DIGITS); + unsigned long elapsed = millis() - start; + Serial.printf("Time: %lu.%03lu s\n", elapsed / 1000, elapsed % 1000); + Serial.flush(); + } + + log_d("PI calculation test done"); +} + +void loop() { + vTaskDelete(NULL); +} diff --git a/tests/performance/superpi/test_superpi.py b/tests/performance/superpi/test_superpi.py new file mode 100644 index 000000000..0bd7a3477 --- /dev/null +++ b/tests/performance/superpi/test_superpi.py @@ -0,0 +1,53 @@ +import json +import logging +import os + + +def test_superpi(dut, request): + LOGGER = logging.getLogger(__name__) + + # Match "Runs: %d" + res = dut.expect(r"Runs: (\d+)", timeout=60) + runs = int(res.group(0).decode("utf-8").split(" ")[1]) + LOGGER.info("Number of runs: {}".format(runs)) + + # Match "Digits: %d" + res = dut.expect(r"Digits: (\d+)", timeout=60) + digits = int(res.group(0).decode("utf-8").split(" ")[1]) + LOGGER.info("Number of decimal digits: {}".format(digits)) + + list_time = [] + + for i in range(runs): + # Match "Run %d" + res = dut.expect(r"Run (\d+)", timeout=120) + run = int(res.group(0).decode("utf-8").split(" ")[1]) + LOGGER.info("Run {}".format(run)) + assert run == i, "Invalid run number" + + # Match "Time: %lu.%03lu s" + res = dut.expect(r"Time: (\d+)\.(\d+) s", timeout=300) + time = float(res.group(0).decode("utf-8").split(" ")[1]) + LOGGER.info("Time on run {}: {} s".format(i, time)) + assert time > 0 and time < 1000, "Invalid time" + list_time.append(time) + + avg_time = round(sum(list_time) / 
len(list_time), 3) + + # Create JSON with results and write it to file + # Always create a JSON with this format (so it can be merged later on): + # { TEST_NAME_STR: TEST_RESULTS_DICT } + results = {"superpi": {"runs": runs, "digits": digits, "avg_time": avg_time}} + + current_folder = os.path.dirname(request.path) + file_index = 0 + report_file = os.path.join(current_folder, "result_superpi" + str(file_index) + ".json") + while os.path.exists(report_file): + report_file = report_file.replace(str(file_index) + ".json", str(file_index + 1) + ".json") + file_index += 1 + + with open(report_file, "w") as f: + try: + f.write(json.dumps(results)) + except Exception as e: + LOGGER.warning("Failed to write results to file: {}".format(e)) diff --git a/tests/requirements.txt b/tests/requirements.txt index 896699b57..289166dfe 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -1,5 +1,5 @@ cryptography>=2.1.4 --only-binary cryptography pytest-cov -pytest-embedded-serial-esp>=1.3.4 -pytest-embedded-arduino>=1.3.4 +pytest-embedded-serial-esp>=1.10.0 +pytest-embedded-arduino>=1.10.0 diff --git a/tests/democfg/cfg.json b/tests/validation/democfg/cfg.json similarity index 100% rename from tests/democfg/cfg.json rename to tests/validation/democfg/cfg.json diff --git a/tests/democfg/democfg.ino b/tests/validation/democfg/democfg.ino similarity index 100% rename from tests/democfg/democfg.ino rename to tests/validation/democfg/democfg.ino diff --git a/tests/democfg/test_democfg.py b/tests/validation/democfg/test_democfg.py similarity index 100% rename from tests/democfg/test_democfg.py rename to tests/validation/democfg/test_democfg.py diff --git a/tests/hello_world/hello_world.ino b/tests/validation/hello_world/hello_world.ino similarity index 100% rename from tests/hello_world/hello_world.ino rename to tests/validation/hello_world/hello_world.ino diff --git a/tests/hello_world/test_hello_world.py b/tests/validation/hello_world/test_hello_world.py similarity index 
100% rename from tests/hello_world/test_hello_world.py rename to tests/validation/hello_world/test_hello_world.py diff --git a/tests/nvs/cfg.json b/tests/validation/nvs/cfg.json similarity index 100% rename from tests/nvs/cfg.json rename to tests/validation/nvs/cfg.json diff --git a/tests/nvs/nvs.ino b/tests/validation/nvs/nvs.ino similarity index 100% rename from tests/nvs/nvs.ino rename to tests/validation/nvs/nvs.ino diff --git a/tests/nvs/test_nvs.py b/tests/validation/nvs/test_nvs.py similarity index 100% rename from tests/nvs/test_nvs.py rename to tests/validation/nvs/test_nvs.py diff --git a/tests/periman/periman.ino b/tests/validation/periman/periman.ino similarity index 100% rename from tests/periman/periman.ino rename to tests/validation/periman/periman.ino diff --git a/tests/periman/test_periman.py b/tests/validation/periman/test_periman.py similarity index 100% rename from tests/periman/test_periman.py rename to tests/validation/periman/test_periman.py diff --git a/tests/timer/test_timer.py b/tests/validation/timer/test_timer.py similarity index 100% rename from tests/timer/test_timer.py rename to tests/validation/timer/test_timer.py diff --git a/tests/timer/timer.ino b/tests/validation/timer/timer.ino similarity index 100% rename from tests/timer/timer.ino rename to tests/validation/timer/timer.ino diff --git a/tests/touch/test_touch.py b/tests/validation/touch/test_touch.py similarity index 100% rename from tests/touch/test_touch.py rename to tests/validation/touch/test_touch.py diff --git a/tests/touch/touch.ino b/tests/validation/touch/touch.ino similarity index 100% rename from tests/touch/touch.ino rename to tests/validation/touch/touch.ino diff --git a/tests/uart/test_uart.py b/tests/validation/uart/test_uart.py similarity index 100% rename from tests/uart/test_uart.py rename to tests/validation/uart/test_uart.py diff --git a/tests/uart/uart.ino b/tests/validation/uart/uart.ino similarity index 100% rename from tests/uart/uart.ino rename to 
tests/validation/uart/uart.ino diff --git a/tests/unity/test_unity.py b/tests/validation/unity/test_unity.py similarity index 100% rename from tests/unity/test_unity.py rename to tests/validation/unity/test_unity.py diff --git a/tests/unity/unity.ino b/tests/validation/unity/unity.ino similarity index 100% rename from tests/unity/unity.ino rename to tests/validation/unity/unity.ino