From 7974f0ff82eedc467fa7eda6ac2067f4026b0d0d Mon Sep 17 00:00:00 2001
From: Veikko Sariola <veikko.sariola@gmail.com>
Date: Wed, 30 Dec 2020 19:50:38 +0200
Subject: [PATCH] fix(x86): denormalize delay damp filters

After the input was switched off, the damp filters denormalized, causing CPU usage to spike and the tracker audio to start chopping.
---
 README.md                       | 8 ++++++++
 templates/amd64-386/effects.asm | 7 +++++--
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 3802dfc..9c7ded9 100644
--- a/README.md
+++ b/README.md
@@ -274,6 +274,14 @@ Future goals
     combining multiple signals into one sync. Oh, and we probably should dump
     the whole thing also as a texture to the shader; to fly through the song, in
     a very literal way.
+  - **Find a solution for denormalized signals**. Denormalized floating point
+    numbers (floating point numbers that are very very small) can result in 100x
+    CPU slowdown. We got hit by this already: the damp filters in delay units
+    were denormalizing, resulting in the synth being unusable in real time. Need
+    to investigate a) where denormalization can happen; b) how to prevent it:
+    add & subtract a value; c) make this optional for the user. For a quick
+    explanation of the potential massive CPU hit, see
+    https://stackoverflow.com/questions/36781881/why-denormalized-floats-are-so-much-slower-than-other-floats-from-hardware-arch
 
 Crazy ideas
 -----------
diff --git a/templates/amd64-386/effects.asm b/templates/amd64-386/effects.asm
index 4efff3b..1eed166 100644
--- a/templates/amd64-386/effects.asm
+++ b/templates/amd64-386/effects.asm
@@ -315,6 +315,9 @@ su_op_delay_loop:
         fld     dword [{{.Input "delay" "damp"}}]         ; da s*(1-da) dr*y+s p*p*x
         fmul    dword [{{.CX}}+su_delayline_wrk.filtstate]  ; o*da s*(1-da) dr*y+s p*p*x, where o is stored
         faddp   st1, st0                                ; o*da+s*(1-da) dr*y+s p*p*x
+        {{- .Float 0.5 | .Prepare | indent 4}}
+        fadd    dword [{{.Float 0.5 | .Use}}]           ; add and sub small offset to prevent denormalization. WARNING: this is highly important, as the damp filters might denormalize and give 100x CPU penalty
+        fsub    dword [{{.Float 0.5 | .Use}}]           ; See for example: https://stackoverflow.com/questions/36781881/why-denormalized-floats-are-so-much-slower-than-other-floats-from-hardware-arch
         fst     dword [{{.CX}}+su_delayline_wrk.filtstate]  ; o'=o*da+s*(1-da), o' dr*y+s p*p*x
         fmul    dword [{{.Input "delay" "feedback"}}]     ; f*o' dr*y+s p*p*x
         fadd    st0, st2                                ; f*o'+p*p*x dr*y+s p*p*x
@@ -333,8 +336,8 @@ su_op_delay_loop:
     fst     dword [{{.CX}}+su_delayline_wrk.dcin]   ; i'=s, s c*o-i
     faddp   st1                                 ; s+c*o-i
 {{- .Float 0.5 | .Prepare | indent 4}}
-    fadd    dword [{{.Float 0.5 | .Use}}]                     ; add and sub small offset to prevent denormalization
-    fsub    dword [{{.Float 0.5 | .Use}}]
+    fadd    dword [{{.Float 0.5 | .Use}}]          ; add and sub small offset to prevent denormalization. WARNING: this is highly important, as low pass filters might denormalize and give 100x CPU penalty
+    fsub    dword [{{.Float 0.5 | .Use}}]          ; See for example: https://stackoverflow.com/questions/36781881/why-denormalized-floats-are-so-much-slower-than-other-floats-from-hardware-arch
     fst     dword [{{.CX}}+su_delayline_wrk.dcout]  ; o'=s+c*o-i
     ret
 {{end}}