sointu/vm/compiler/templates/amd64-386/effects.asm

{{- if .HasOp "hold"}}
;-------------------------------------------------------------------------------
;   HOLD opcode: sample and hold the signal, reducing sample rate
;-------------------------------------------------------------------------------
;   Mono version:   holds the signal at a rate defined by the freq parameter
;   Stereo version: holds both channels
;-------------------------------------------------------------------------------
{{.Func "su_op_hold" "Opcode"}}
{{- if .Stereo "hold"}}
    {{.Call "su_effects_stereohelper"}}
{{- end}}
    fld     dword [{{.Input "hold" "holdfreq"}}]    ; f x
    fmul    st0, st0                        ; f^2 x
    fchs                                    ; -f^2 x
    fadd    dword [{{.WRK}}]              ; p-f^2 x
    fst     dword [{{.WRK}}]              ; p <- p-f^2
    fldz                                    ; 0 p x
    fucomip st1                             ; p x
    fstp    dword [{{.SP}}-4]                   ; t=p, x
    jc      short su_op_hold_holding        ; if (0 < p) goto holding
    fld1                                    ; 1 x
    fadd    dword [{{.SP}}-4]                   ; 1+t x
    fstp    dword [{{.WRK}}]   ; x
    fst     dword [{{.WRK}}+4] ; save holded value
    ret                                     ; x
su_op_hold_holding:
    fstp    st0                             ;
    fld     dword [{{.WRK}}+4] ; x
    ret
{{end}}


{{- if .HasOp "crush"}}
;-------------------------------------------------------------------------------
;   CRUSH opcode: quantize the signal to finite number of levels
;-------------------------------------------------------------------------------
;   Mono:   x   ->  e*int(x/e)              where e=2**(-24*resolution)
;   Stereo: l r ->  e*int(l/e) e*int(r/e)
;-------------------------------------------------------------------------------
{{.Func "su_op_crush" "Opcode"}}
{{- if .Stereo "crush"}}
    {{.Call "su_effects_stereohelper"}}
{{- end}}
    xor     eax, eax
    {{.Call "su_nonlinear_map"}}
    fxch    st0, st1
    fdiv    st0, st1
    frndint
    fmulp   st1, st0
    ret
{{end}}


{{- if .HasOp "gain"}}
;-------------------------------------------------------------------------------
;   GAIN opcode: apply gain on the signal
;-------------------------------------------------------------------------------
;   Mono:   x   ->  x*g
;   Stereo: l r ->  l*g r*g
;-------------------------------------------------------------------------------
{{.Func "su_op_gain" "Opcode"}}
{{- if .Stereo "gain"}}
    fld     dword [{{.Input "gain" "gain"}}] ; g l (r)
{{- if .Mono "invgain"}}
    jnc     su_op_gain_mono
{{- end}}
    fmul    st2, st0                             ; g l r/g
su_op_gain_mono:
    fmulp   st1, st0                             ; l/g (r/)
    ret
{{- else}}
    fmul    dword [{{.Input "gain" "gain"}}]
    ret
{{- end}}
{{end}}


{{- if .HasOp "invgain"}}
;-------------------------------------------------------------------------------
;   INVGAIN opcode: apply inverse gain on the signal
;-------------------------------------------------------------------------------
;   Mono:   x   ->  x/g
;   Stereo: l r ->  l/g r/g
;-------------------------------------------------------------------------------
{{.Func "su_op_invgain" "Opcode"}}
{{- if .Stereo "invgain"}}
    fld     dword [{{.Input "invgain" "invgain"}}] ; g l (r)
{{- if .Mono "invgain"}}
    jnc     su_op_invgain_mono
{{- end}}
    fdiv    st2, st0                             ; g l r/g
su_op_invgain_mono:
    fdivp   st1, st0                             ; l/g (r/)
    ret
{{- else}}
    fdiv    dword [{{.Input "invgain" "invgain"}}]
    ret
{{- end}}
{{end}}

{{- if .HasOp "dbgain"}}
;-------------------------------------------------------------------------------
;   DBGAIN opcode: apply gain on the signal, with gain given in decibels
;-------------------------------------------------------------------------------
;   Mono:   x   ->  x*g, where g = 2**((2*d-1)*6.643856189774724) i.e. -40dB to 40dB, d=[0..1]
;   Stereo: l r ->  l*g r*g
;-------------------------------------------------------------------------------
{{.Func "su_op_dbgain" "Opcode"}}
{{- if .Stereo "dbgain"}}
    fld     dword [{{.Input "dbgain" "decibels"}}] ; d l r
{{- .Prepare (.Float 0.5)}}
    fsub    dword [{{.Use (.Float 0.5)}}]          ; d-.5
    fadd    st0, st0                               ; 2*d-1
{{- .Prepare (.Float 6.643856189774724)}}
    fmul    dword [{{.Use (.Float 6.643856189774724)}}] ; (2*d-1)*6.643856189774724
    {{.Call "su_power"}}
{{- if .Mono "dbgain"}}
    jnc     su_op_dbgain_mono
{{- end}}
    fmul    st2, st0                             ; g l r/g
su_op_dbgain_mono:
    fmulp   st1, st0                             ; l/g (r/)
    ret
{{- else}}
    fld     dword [{{.Input "dbgain" "decibels"}}] ; d l
{{- .Prepare (.Float 0.5)}}
    fsub    dword [{{.Use (.Float 0.5)}}]          ; d-.5
    fadd    st0, st0                               ; 2*d-1
{{- .Prepare (.Float 6.643856189774724)}}
    fmul    dword [{{.Use (.Float 6.643856189774724)}}] ; (2*d-1)*6.643856189774724
    {{.Call "su_power"}}
    fmulp   st1, st0
    ret
{{- end}}
{{end}}

{{- if .HasOp "filter"}}
;-------------------------------------------------------------------------------
;   FILTER opcode: perform low/high/band-pass/notch etc. filtering on the signal
;-------------------------------------------------------------------------------
;   Mono:   x   ->  filtered(x)
;   Stereo: l r ->  filtered(l) filtered(r)
;-------------------------------------------------------------------------------
{{.Func "su_op_filter" "Opcode"}}
    lodsb ; load the flags to al
{{- if .Stereo "filter"}}
    {{.Call "su_effects_stereohelper"}}
{{- end}}
    fld     dword [{{.Input "filter" "resonance"}}] ; r x
    fld     dword [{{.Input "filter" "frequency"}}]; f r x
    fmul    st0, st0                        ; f2 x (square the input so we never get negative and also have a smoother behaviour in the lower frequencies)
    fst     dword [{{.WRK}}+12]                   ; f2 r x
    fmul    dword [{{.WRK}}+8]  ; f2*b r x
    fadd    dword [{{.WRK}}]   ; f2*b+l r x
    fst     dword [{{.WRK}}]   ; l'=f2*b+l r x
    fsubp   st2, st0                        ; r x-l'
    fmul    dword [{{.WRK}}+8]  ; r*b x-l'
    fsubp   st1, st0                        ; x-l'-r*b
    {{- .Float 0.5 | .Prepare | indent 4}}
    fadd    dword [{{.Float 0.5 | .Use}}]           ; add and sub small offset to prevent denormalization
    fsub    dword [{{.Float 0.5 | .Use}}]           ; See for example: https://stackoverflow.com/questions/36781881/why-denormalized-floats-are-so-much-slower-than-other-floats-from-hardware-arch
    fst     dword [{{.WRK}}+4]  ; h'=x-l'-r*b
    fmul    dword [{{.WRK}}+12]                   ; f2*h'
    fadd    dword [{{.WRK}}+8]  ; f2*h'+b
    fstp    dword [{{.WRK}}+8]  ; b'=f2*h'+b
    fldz                                    ; 0
{{- if .SupportsParamValue "filter" "lowpass" 1}}
    test    al, byte 0x40
    jz      short su_op_filter_skiplowpass
    fadd    dword [{{.WRK}}]
su_op_filter_skiplowpass:
{{- end}}
{{- if .SupportsParamValue "filter" "bandpass" 1}}
    test    al, byte 0x20
    jz      short su_op_filter_skipbandpass
    fadd    dword [{{.WRK}}+8]
su_op_filter_skipbandpass:
{{- end}}
{{- if .SupportsParamValue "filter" "highpass" 1}}
    test    al, byte 0x10
    jz      short su_op_filter_skiphighpass
    fadd    dword [{{.WRK}}+4]
su_op_filter_skiphighpass:
{{- end}}
{{- if .SupportsParamValue "filter" "bandpass" -1}}
    test    al, byte 0x08
    jz      short su_op_filter_skipnegbandpass
    fsub    dword [{{.WRK}}+8]
su_op_filter_skipnegbandpass:
{{- end}}
{{- if .SupportsParamValue "filter" "highpass" -1}}
    test    al, byte 0x04
    jz      short su_op_filter_skipneghighpass
    fsub    dword [{{.WRK}}+4]
su_op_filter_skipneghighpass:
{{- end}}
    ret
{{end}}


{{- if .HasOp "belleq"}}
;-------------------------------------------------------------------------------
;   BELLEQ opcode: perform second order bell eq filtering on the signal
;-------------------------------------------------------------------------------
;   Mono:   x   ->  eq(x)
;   Stereo: l r ->  eq(l) eq(r)
;-------------------------------------------------------------------------------
{{.Func "su_op_belleq" "Opcode"}}
{{- if .Stereo "belleq"}}
    {{.Call "su_effects_stereohelper"}}
{{- end}}
    ; Note: we calculate the gain first because su_power needs temp stack and everything here was crafted to stay altogether below max 4 temp stack
    ; The cost of staying at max 4 stack was a few extra instructions because of stack juggling.
    ; The bell filter biquad coefficients (see go_synth.go):
    ;   b0, b1, b2 = 1+u, -2*cos(w), 1-u
	;   a0, a1, a2 = 1+v, b1, 1-v
    ; where w=freq*freq, u=alpha*A, v=alpha/A, alpha=sin(w)*2*bandwidth, A=gain. The filter is implemented as:
    ;  y = (b0*x+s1)/a0 = ((1+u)*x + s1) / (1+v) = (x+u*x+s1)/(1+v)
    ;  s1' = b1*x - a1*y + s2 = b1*(x-y)+s2 = 2*cos(w)*(y-x)+s2
	;  s2' = b2*x - a2*y = (1-u)*x-(1-v)*y = x-y-u*x+v*y
    fld     dword [{{.Input "belleq" "gain"}}]        ; g x
	{{- .Float 0.5 | .Prepare | indent 4}}
    fsub    dword [{{.Float 0.5 | .Use}}]           ; g-0.5 x
    {{- .Float 6.643856189774724 | .Prepare | indent 4}}
    fmul    dword [{{.Use (.Float 6.643856189774724)}}]   ; (g-0.5)*6.643856189774724 x
    {{.Call "su_power"}}                            ; A=2^((g-0.5)*6.643856189774724) x
    fld     dword [{{.Input "belleq" "frequency"}}] ; f A x
    fmul    st0, st0                                ; f*f A x
    fadd    st0, st0                                ; w=2*f*f
    fsincos                                         ; cos(w) sin(w) A x
    fadd    st0, st0                                ; r=2*cos(w) sin(w) A x
    fld     dword [{{.Input "belleq" "bandwidth"}}]   ; b r sin(w) A x
    fadd    st0, st0                                ; 2*b r sin(w) A x
    fmulp   st2, st0                                ; r alpha=sin(w)*2*b A x
	fxch    st0, st1                                ; alpha r A x
    fdivr   st2, st0                                ; alpha r v=alpha/A x
	fmul    st0, st0                                ; alpha*alpha r v x
    fdiv    st0, st2                                ; u=alpha*A r v x
    fld1                                            ; 1 u r v x
    faddp   st3, st0                                ; u r v+1 x
    fmul    st0, st3                                ; u*x r v+1 x
    fld     dword [{{.WRK}}]                        ; s1 u*x r v+1 x
    fadd    st0, st1                                ; s1+u*x u*x r v+1 x
    fadd    st0, st4                                ; s1+u*x+x u*x r v+1 x
    fdiv    st0, st3                                ; y=(s1+u*x+x)/(v+1) u*x r v+1 x
    {{- .Float 0.5 | .Prepare | indent 4}}
    fadd    dword [{{.Float 0.5 | .Use}}]           ; add and sub small offset to prevent denormalization
    fsub    dword [{{.Float 0.5 | .Use}}]           ; See for example: https://stackoverflow.com/questions/36781881/why-denormalized-floats-are-so-much-slower-than-other-floats-from-hardware-arch
    fmul    st3, st0                                ; y u*x r v*y+y x
    fsub    st3, st0                                ; y u*x r v*y x
    fxch    st4, st0                                ; x u*x r v*y y
    fsubr   st0, st4                                ; y-x u*x r v*y y
    fmul    st2, st0                                ; y-x u*x r*(y-x) v*y y
    fsubp   st3, st0                                ; u*x r*(y-x) x-y+v*y y
    fsubp   st2, st0                                ; r*(y-x) x-y+v*y-u*x y
    fadd    dword [{{.WRK}}+4]                      ; s2+r*(y-x) x-y+v*y-u*x y
    fstp    dword [{{.WRK}}]                        ; x-y+v*y-u*x y
    fstp    dword [{{.WRK}}+4]                      ; y
    ret
{{end}}


{{- if .HasOp "clip"}}
;-------------------------------------------------------------------------------
;   CLIP opcode: clips the signal into [-1,1] range
;-------------------------------------------------------------------------------
;   Mono:   x   ->  min(max(x,-1),1)
;   Stereo: l r ->  min(max(l,-1),1) min(max(r,-1),1)
;-------------------------------------------------------------------------------
{{.Func "su_op_clip" "Opcode"}}
{{- if .Stereo "clip"}}
    {{.Call "su_effects_stereohelper"}}
{{- end}}
    {{.TailCall "su_clip"}}
{{end}}


{{- if .HasOp "pan" -}}
;-------------------------------------------------------------------------------
;   PAN opcode: pan the signal
;-------------------------------------------------------------------------------
;   Mono:   s   ->  s*(1-p) s*p
;   Stereo: l r ->  l*(1-p) r*p
;
;   where p is the panning in [0,1] range
;-------------------------------------------------------------------------------
{{.Func "su_op_pan" "Opcode"}}
{{- if .Stereo "pan"}}
    jc      su_op_pan_do    ; this time, if this is mono op...
    fld     st0             ;   ...we duplicate the mono into stereo first
su_op_pan_do:
    fld     dword [{{.Input "pan" "panning"}}]    ; p l r
    fld1                                        ; 1 p l r
    fsub    st1                                 ; 1-p p l r
    fmulp   st2                                 ; p (1-p)*l r
    fmulp   st2                                 ; (1-p)*l p*r
    ret
{{- else}}
    fld     dword [{{.Input "pan" "panning"}}]    ; p s
    fmul    st1                                 ; p*s s
    fsub    st1, st0                            ; p*s s-p*s
                                                ; Equal to
                                                ; s*p s*(1-p)
    fxch                                        ; s*(1-p) s*p SHOULD PROBABLY DELETE, WHY BOTHER
    ret
{{- end}}
{{end}}


{{- if .HasOp "delay"}}
;-------------------------------------------------------------------------------
;   DELAY opcode: adds delay effect to the signal
;-------------------------------------------------------------------------------
;   Mono:   perform delay on ST0, using delaycount delaylines starting
;           at delayindex from the delaytable
;   Stereo: perform delay on ST1, using delaycount delaylines starting
;           at delayindex + delaycount from the delaytable (so the right delays
;           can be different)
;-------------------------------------------------------------------------------
{{.Func "su_op_delay" "Opcode"}}
    lodsw                           ; al = delay index, ah = delay count
    {{- .PushRegs .VAL "DelayVal" .COM "DelayCom" | indent 4}}
    movzx   ebx, al
{{- if .Library}}
    mov     {{.SI}}, [{{.Stack "DelayTable"}}] ; when using runtime tables, delaytimes is pulled from the stack so can be a pointer to heap
    lea     {{.BX}}, [{{.SI}} + {{.BX}}*2]
{{- else}}
{{- .Prepare "su_delay_times" | indent 4}}
    lea     {{.BX}},[{{.Use "su_delay_times"}} + {{.BX}}*2]                  ; BX now points to the right position within delay time table
{{- end}}
    movzx   esi, word [{{.Stack "GlobalTick"}}]          ; notice that we load word, so we wrap at 65536
    mov     {{.CX}}, {{.PTRWORD}} [{{.Stack "DelayWorkSpace"}}]   ; {{.WRK}} is now the separate delay workspace, as they require a lot more space
{{- if .StereoAndMono "delay"}}
    jnc     su_op_delay_mono
{{- end}}
{{- if .Stereo "delay"}}
    push    {{.AX}}                 ; save _ah (delay count)
    fxch                        ; r l
    call    su_op_delay_do      ; D(r) l        process delay for the right channel
    pop     {{.AX}}                 ; restore the count for second run
    fxch                        ; l D(r)
su_op_delay_mono:               ; flow into mono delay
{{- end}}
    call    su_op_delay_do      ; when stereo delay is not enabled, we could inline this to save 5 bytes, but I expect stereo delay to be farely popular so maybe not worth the hassle
    mov     {{.PTRWORD}} [{{.Stack "DelayWorkSpace"}}],{{.CX}}   ; move delay workspace pointer back to stack.
    {{- .PopRegs .VAL .COM | indent 4}}
{{- if .SupportsModulation "delay" "delaytime"}}
    xor     eax, eax
    mov     dword [{{.Modulation "delay" "delaytime"}}], eax
{{- end}}
    ret

;-------------------------------------------------------------------------------
;   su_op_delay_do: executes the actual delay
;-------------------------------------------------------------------------------
;   Pseudocode:
;   q = dr*x
;   for (i = 0;i < count;i++)
;     s = b[(t-delaytime[i+offset])&65535]
;     q += s
;     o[i] = o[i]*da+s*(1-da)
;     b[t] = f*o[i] +p^2*x
;  Perform dc-filtering q and output q
;-------------------------------------------------------------------------------
{{.Func "su_op_delay_do"}}                         ; x y
    fld     st0
    fmul    dword [{{.Input "delay" "pregain"}}]  ; p*x y
    fmul    dword [{{.Input "delay" "pregain"}}]  ; p*p*x y
    fxch                                        ; y p*p*x
    fmul    dword [{{.Input "delay" "dry"}}]      ; dr*y p*p*x
su_op_delay_loop:
        {{- if or (.SupportsModulation "delay" "delaytime") (.SupportsParamValue "delay" "notetracking" 1)}} ; delaytime modulation or note syncing require computing the delay time in floats
        fild    word [{{.BX}}]         ; k dr*y p*p*x, where k = delay time
        {{- if .SupportsModulation "delay" "delaytime"}}
        fld     dword [{{.Modulation "delay" "delaytime"}}]
        {{- .Float 32767.0 | .Prepare | indent 8}}
        fmul    dword [{{.Float 32767.0 | .Use}}] ; scale it up, as the modulations would be too small otherwise
        faddp   st1, st0
        {{- end}}
        {{- if .SupportsParamValue "delay" "notetracking" 1}}
        test    ah, 1 ; note syncing is the least significant bit of ah, 0 = ON, 1 = OFF
        jne     su_op_delay_skipnotesync
        fild    dword [{{.INP}}-su_voice.inputs+su_voice.note]
        {{.Int 0x3DAAAAAA | .Prepare | indent 8}}
        fmul    dword [{{.Int 0x3DAAAAAA | .Use}}]
        {{.Call "su_power"}}
        fdivp   st1, st0                 ; use 10787 for delaytime to have neutral transpose
        su_op_delay_skipnotesync:
        {{- end}}
        fistp   dword [{{.SP}}-4]                       ; dr*y p*p*x, dword [{{.SP}}-4] = integer amount of delay (samples)
        mov     edi, esi                            ; edi = esi = current time
        sub     di, word [{{.SP}}-4]                    ; we perform the math in 16-bit to wrap around
        {{- else}}
        mov     edi, esi
        sub     di, word [{{.BX}}]                      ; we perform the math in 16-bit to wrap around
        {{- end}}
        fld     dword [{{.CX}}+su_delayline_wrk.buffer+{{.DI}}*4]; s dr*y p*p*x, where s is the sample from delay buffer
        fadd    st1, st0                                ; s dr*y+s p*p*x (add comb output to current output)
        fld1                                            ; 1 s dr*y+s p*p*x
        fsub    dword [{{.Input "delay" "damp"}}]         ; 1-da s dr*y+s p*p*x
        fmulp   st1, st0                                ; s*(1-da) dr*y+s p*p*x
        fld     dword [{{.Input "delay" "damp"}}]         ; da s*(1-da) dr*y+s p*p*x
        fmul    dword [{{.CX}}+su_delayline_wrk.filtstate]  ; o*da s*(1-da) dr*y+s p*p*x, where o is stored
        faddp   st1, st0                                ; o*da+s*(1-da) dr*y+s p*p*x
        {{- .Float 0.5 | .Prepare | indent 4}}
        fadd    dword [{{.Float 0.5 | .Use}}]           ; add and sub small offset to prevent denormalization. WARNING: this is highly important, as the damp filters might denormalize and give 100x CPU penalty
        fsub    dword [{{.Float 0.5 | .Use}}]           ; See for example: https://stackoverflow.com/questions/36781881/why-denormalized-floats-are-so-much-slower-than-other-floats-from-hardware-arch
        fst     dword [{{.CX}}+su_delayline_wrk.filtstate]  ; o'=o*da+s*(1-da), o' dr*y+s p*p*x
        fmul    dword [{{.Input "delay" "feedback"}}]     ; f*o' dr*y+s p*p*x
        fadd    st0, st2                                ; f*o'+p*p*x dr*y+s p*p*x
        fstp    dword [{{.CX}}+su_delayline_wrk.buffer+{{.SI}}*4]; save f*o'+p*p*x to delay buffer
        add     {{.BX}},2                                   ; move to next index
        add     {{.CX}}, su_delayline_wrk.size              ; go to next delay delay workspace
        sub     ah, 2
        jg      su_op_delay_loop                        ; if ah > 0, goto loop
    fstp    st1                                 ; dr*y+s1+s2+s3+...
    ; DC-filtering
    fld     dword [{{.CX}}+su_delayline_wrk.dcout]  ; o s
{{- .Float 0.99609375 | .Prepare | indent 4}}
    fmul    dword [{{.Float 0.99609375 | .Use}}]                ; c*o s
    fsub    dword [{{.CX}}+su_delayline_wrk.dcin]   ; c*o-i s
    fxch                                        ; s c*o-i
    fst     dword [{{.CX}}+su_delayline_wrk.dcin]   ; i'=s, s c*o-i
    faddp   st1                                 ; s+c*o-i
{{- .Float 0.5 | .Prepare | indent 4}}
    fadd    dword [{{.Float 0.5 | .Use}}]          ; add and sub small offset to prevent denormalization. WARNING: this is highly important, as low pass filters might denormalize and give 100x CPU penalty
    fsub    dword [{{.Float 0.5 | .Use}}]          ; See for example: https://stackoverflow.com/questions/36781881/why-denormalized-floats-are-so-much-slower-than-other-floats-from-hardware-arch
    fst     dword [{{.CX}}+su_delayline_wrk.dcout]  ; o'=s+c*o-i
    ret
{{end}}


{{- if .HasOp "compressor"}}
;-------------------------------------------------------------------------------
;   COMPRES opcode: push compressor gain to stack
;-------------------------------------------------------------------------------
;   Mono:   push g on stack, where g is a suitable gain for the signal
;           you can either MULP to compress the signal or SEND it to a GAIN
;           somewhere else for compressor side-chaining.
;   Stereo: push g g on stack, where g is calculated using l^2 + r^2
;-------------------------------------------------------------------------------
{{.Func "su_op_compressor" "Opcode"}}
    fld     st0                                 ; x x
    fmul    st0, st0                            ; x^2 x
{{- if .StereoAndMono "compressor"}}
    jnc     su_op_compressor_mono
{{- end}}
{{- if .Stereo "compressor"}}
    fld     st2                                 ; r x^2 l r
    fst     st3                                 ; y x^2 l r
    fmul    st0, st0                            ; y^2 x^2 l r
    faddp   st1, st0                            ; y^2+x^2 l r
{{- if .StereoAndMono "compressor"}}
    call    su_op_compressor_mono               ; So, for stereo, we square both left & right and add them up
    fld     st0                                 ; and return the computed gain two times, ready for MULP STEREO
    ret
su_op_compressor_mono:
{{- end}}
{{- end}}
    fld     dword [{{.WRK}}]    ; l x^2 x
    fucomi  st0, st1
    setnb   al                                  ; if (st0 >= st1) al = 1; else al = 0;
    fsubp   st1, st0                            ; x^2-l x
    {{.Call "su_nonlinear_map"}}                ; c x^2-l x, c is either attack or release parameter mapped in a nonlinear way
    fmulp   st1, st0                            ; c*(x^2-l) x
    fadd    dword [{{.WRK}}]    ; l+c*(x^2-l) x   // we could've kept level in the stack and save a few bytes, but su_env_map uses 3 stack (c + 2 temp), so the stack was getting quite big.
    ; TODO: make this denormalization optional, if the user wants to save some space
    {{- .Float 0.5 | .Prepare | indent 4}}
    fadd    dword [{{.Float 0.5 | .Use}}]           ; add and sub small offset to prevent denormalization. WARNING: this is highly important, as the damp filters might denormalize and give 100x CPU penalty
    fsub    dword [{{.Float 0.5 | .Use}}]           ; See for example: https://stackoverflow.com/questions/36781881/why-denormalized-floats-are-so-much-slower-than-other-floats-from-hardware-arch
    fst     dword [{{.WRK}}]    ; l'=l+c*(x^2-l), l' x
    fld     dword [{{.Input "compressor" "threshold"}}] ; t l' x
    fmul    st0, st0                            ; t*t l' x
    fxch                                        ; l' t*t x
    fucomi  st0, st1                            ; if l' < t*t
    fcmovb  st0, st1                            ;   l'=t*t
    fdivp   st1, st0                            ; t*t/l' x
    fld     dword [{{.Input "compressor" "ratio"}}]  ; r t*t/l' x
{{.Float 0.5 | .Prepare | indent 4}}
    fmul    dword [{{.Float 0.5 | .Use}}]       ; p=r/2 t*t/l' x
    fxch                                        ; t*t/l' p x
    fyl2x                                       ; p*log2(t*t/l') x
    {{.Call "su_power"}}                        ; 2^(p*log2(t*t/l')) x
                                                ; Equal to:
                                                ; (t*t/l')^p x
                                                ; if ratio is at minimum => p=0 => 1 x
                                                ; if ratio is at maximum => p=0.5 => t/x => t/x*x=t
    fdiv    dword [{{.Input "compressor" "invgain"}}]; this used to be pregain but that ran into problems with getting back up to 0 dB so postgain should be better at that
{{- if and (.Stereo "compressor") (not (.Mono "compressor"))}}
    fld     st0                                 ; and return the computed gain two times, ready for MULP STEREO
{{- end}}
    ret
{{- end}}