feat(compiler): Add support for targeting WebAssembly.

The working principle is the same as for the existing x86 target, but instead of outputting .asm, the compiler outputs .wat. The .wat file can then be compiled into .wasm with the wat2wasm assembler.
2026-07-13 14:50:55 -04:00 · 2020-12-26 23:16:18 +02:00
parent 7e4bcf18e4
commit e4490faa2e
32 changed files with 2138 additions and 170 deletions
--- a/templates/amd64-386/arithmetic.asm
+++ b/templates/amd64-386/arithmetic.asm
@@ -0,0 +1,205 @@
+{{- if .HasOp "pop"}}
+;-------------------------------------------------------------------------------
+;   POP opcode: remove (discard) the topmost signal from the stack
+;-------------------------------------------------------------------------------
+{{- if .Mono "pop"}}
+;   Mono:   a -> (empty)
+{{- end}}
+{{- if .Stereo "pop"}}
+;   Stereo: a b -> (empty)
+{{- end}}
+;-------------------------------------------------------------------------------
+{{.Func "su_op_pop" "Opcode"}}
+{{- if .StereoAndMono "pop"}}
+    jnc     su_op_pop_mono                  ; carry clear selects the mono variant: pop one value only
+{{- end}}
+{{- if .Stereo "pop"}}
+    fstp    st0                             ; stereo: discard the first of the two signals
+{{- end}}
+{{- if .StereoAndMono "pop"}}
+su_op_pop_mono:
+{{- end}}
+    fstp    st0                             ; discard the (remaining) topmost signal
+    ret
+{{end}}
+
+
+{{- if .HasOp "add"}}
+;-------------------------------------------------------------------------------
+;   ADD opcode: add the two top most signals on the stack
+;-------------------------------------------------------------------------------
+{{- if .Mono "add"}}
+;   Mono:   a b -> a+b b
+{{- end}}
+{{- if .Stereo "add"}}
+;   Stereo: a b c d -> a+c b+d c d
+{{- end}}
+;-------------------------------------------------------------------------------
+{{.Func "su_op_add" "Opcode"}}
+{{- if .StereoAndMono "add"}}
+    jnc     su_op_add_mono                  ; carry clear selects the mono variant
+{{- end}}
+{{- if .Stereo "add"}}
+    fadd    st0, st2                        ; a+c b c d
+    fxch                                    ; b a+c c d
+    fadd    st0, st3                        ; b+d a+c c d
+    fxch                                    ; a+c b+d c d
+    ret
+{{- end}}
+{{- if .StereoAndMono "add"}}
+su_op_add_mono:
+{{- end}}
+{{- if .Mono "add"}}
+    fadd    st1                             ; a+b b
+    ret
+{{- end}}
+{{end}}
+
+
+{{- if .HasOp "addp"}}
+;-------------------------------------------------------------------------------
+;   ADDP opcode: sum the two topmost signals and pop, leaving only the sum
+;-------------------------------------------------------------------------------
+;   Mono:   a b -> a+b
+;   Stereo: a b c d -> a+c b+d
+;-------------------------------------------------------------------------------
+{{.Func "su_op_addp" "Opcode"}}
+{{- if .StereoAndMono "addp"}}
+    jnc     su_op_addp_mono                 ; carry clear selects the mono variant
+{{- end}}
+{{- if .Stereo "addp"}}
+    faddp   st2, st0                        ; b a+c d
+    faddp   st2, st0                        ; a+c b+d
+    ret
+{{- end}}
+{{- if .StereoAndMono "addp"}}
+su_op_addp_mono:
+{{- end}}
+{{- if .Mono "addp"}}
+    faddp   st1, st0                        ; a+b
+    ret
+{{- end}}
+{{end}}
+
+
+{{- if .HasOp "loadnote"}}
+;-------------------------------------------------------------------------------
+;   LOADNOTE opcode: load the current note, scaled to [-1,1]
+;-------------------------------------------------------------------------------
+{{if (.Mono "loadnote") -}}  ;   Mono:   (empty) -> n, where n is the note{{end}}
+{{if (.Stereo "loadnote") -}};   Stereo: (empty) -> n n{{end}}
+;-------------------------------------------------------------------------------
+{{.Func "su_op_loadnote" "Opcode"}}
+{{- if .StereoAndMono "loadnote"}}
+    jnc     su_op_loadnote_mono
+{{- end}}
+{{- if .Stereo "loadnote"}}
+    call    su_op_loadnote_mono             ; stereo: run the body once via call, then fall through and run it again
+    su_op_loadnote_mono:
+{{- end}}
+    fild    dword [{{.INP}}-su_voice.inputs+su_voice.note] ; n, the note as an integer
+    {{.Prepare (.Float 0.0078125)}}
+    fmul    dword [{{.Use (.Float 0.0078125)}}]  ; s=n/128.0
+    {{.Prepare (.Float 0.5)}}
+    fsub    dword [{{.Use (.Float 0.5)}}]        ; s-.5
+    fadd    st0, st0                        ; 2*s-1
+    ret
+{{end}}
+
+
+{{- if .HasOp "mul"}}
+;-------------------------------------------------------------------------------
+;   MUL opcode: multiply the two top most signals on the stack
+;-------------------------------------------------------------------------------
+;   Mono:   a b -> a*b b
+;   Stereo: a b c d -> a*c b*d c d
+;-------------------------------------------------------------------------------
+{{.Func "su_op_mul" "Opcode"}}
+    jnc su_op_mul_mono                      ; carry clear selects the mono variant
+    fmul    st0, st2                        ; a*c b c d
+    fxch                                    ; b a*c c d
+    fmul    st0, st3                        ; b*d a*c c d (must be fmul, not fadd, to match the stereo contract)
+    fxch                                    ; a*c b*d c d
+    ret
+su_op_mul_mono:
+    fmul    st1                             ; a*b b
+    ret
+{{end}}
+
+
+{{- if .HasOp "mulp"}}
+;-------------------------------------------------------------------------------
+;   MULP opcode: multiply the two top most signals on the stack and pop
+;-------------------------------------------------------------------------------
+;   Mono:   a b -> a*b
+;   Stereo: a b c d -> a*c b*d
+;-------------------------------------------------------------------------------
+{{.Func "su_op_mulp" "Opcode"}}
+{{- if .StereoAndMono "mulp"}}
+    jnc     su_op_mulp_mono                 ; carry clear selects the mono variant
+{{- end}}
+{{- if .Stereo "mulp"}}
+    fmulp   st2, st0                        ; b a*c d
+    fmulp   st2, st0                        ; a*c b*d
+    ret
+{{- end}}
+{{- if .StereoAndMono "mulp"}}
+su_op_mulp_mono:
+{{- end}}
+{{- if .Mono "mulp"}}
+    fmulp   st1                             ; a*b
+    ret
+{{- end}}
+{{end}}
+
+
+{{- if .HasOp "push"}}
+;-------------------------------------------------------------------------------
+;   PUSH opcode: duplicate the topmost signal on the stack
+;-------------------------------------------------------------------------------
+;   Mono:   a -> a a
+;   Stereo: a b -> a b a b
+;-------------------------------------------------------------------------------
+{{.Func "su_op_push" "Opcode"}}
+{{- if .StereoAndMono "push"}}
+    jnc     su_op_push_mono                 ; carry clear selects the mono variant
+{{- end}}
+{{- if .Stereo "push"}}
+    fld     st1                             ; b a b
+    fld     st1                             ; a b a b
+    ret
+{{- end}}
+{{- if .StereoAndMono "push"}}
+su_op_push_mono:
+{{- end}}
+{{- if .Mono "push"}}
+    fld     st0                             ; a a
+    ret
+{{- end}}
+{{end}}
+
+
+{{- if .HasOp "xch"}}
+;-------------------------------------------------------------------------------
+;   XCH opcode: exchange the signals on the stack
+;-------------------------------------------------------------------------------
+;   Mono:   a b -> b a
+;   Stereo: a b c d -> c d a b
+;-------------------------------------------------------------------------------
+{{.Func "su_op_xch" "Opcode"}}
+{{- if .StereoAndMono "xch"}}
+    jnc     su_op_xch_mono
+{{- end}}
+{{- if .Stereo "xch"}}
+    fxch    st0, st2 ; c b a d
+    fxch    st0, st1 ; b c a d
+    fxch    st0, st3 ; d c a b
+{{- end}}
+{{- if .StereoAndMono "xch"}}
+su_op_xch_mono:
+{{- end}}
+    fxch    st0, st1 ; mono: b a; stereo falls through to here and finishes as c d a b
+    ret
+{{end}}
--- a/templates/amd64-386/effects.asm
+++ b/templates/amd64-386/effects.asm
@@ -0,0 +1,394 @@
+{{- if .HasOp "distort"}}
+;-------------------------------------------------------------------------------
+;   DISTORT opcode: apply distortion on the signal
+;-------------------------------------------------------------------------------
+;   Mono:   x   ->  x*a/(1-a+(2*a-1)*abs(x))            where x is clamped first
+;   Stereo: l r ->  l*a/(1-a+(2*a-1)*abs(l)) r*a/(1-a+(2*a-1)*abs(r))
+;-------------------------------------------------------------------------------
+{{.Func "su_op_distort" "Opcode"}}
+{{- if .Stereo "distort"}}
+    {{.Call "su_effects_stereohelper"}}
+{{- end}}
+    fld     dword [{{.Input "distort" "drive"}}] ; a x, where a is the drive parameter
+    {{.TailCall "su_waveshaper"}}
+{{end}}
+
+
+{{- if .HasOp "hold"}}
+;-------------------------------------------------------------------------------
+;   HOLD opcode: sample and hold the signal, reducing sample rate
+;-------------------------------------------------------------------------------
+;   Mono version:   holds the signal at a rate defined by the freq parameter
+;   Stereo version: holds both channels
+;
+;   Workspace: dword at WRK   = phase p, decremented by f^2 on every call
+;              dword at WRK+4 = the currently held sample
+;   While 0 < p the held sample replaces the input; otherwise p is incremented
+;   by 1 and the current input is latched and passed through.
+;-------------------------------------------------------------------------------
+{{.Func "su_op_hold" "Opcode"}}
+{{- if .Stereo "hold"}}
+    {{.Call "su_effects_stereohelper"}}
+{{- end}}
+    fld     dword [{{.Input "hold" "holdfreq"}}]    ; f x
+    fmul    st0, st0                        ; f^2 x
+    fchs                                    ; -f^2 x
+    fadd    dword [{{.WRK}}]              ; p-f^2 x
+    fst     dword [{{.WRK}}]              ; p <- p-f^2
+    fldz                                    ; 0 p x
+    fucomip st1                             ; p x, carry set if 0 < p
+    fstp    dword [{{.SP}}-4]                   ; t=p, x (scratch dword just below the stack pointer)
+    jc      short su_op_hold_holding        ; if (0 < p) goto holding
+    fld1                                    ; 1 x
+    fadd    dword [{{.SP}}-4]                   ; 1+t x
+    fstp    dword [{{.WRK}}]   ; x
+    fst     dword [{{.WRK}}+4] ; save the held value
+    ret                                     ; x
+su_op_hold_holding:
+    fstp    st0                             ; discard the incoming sample
+    fld     dword [{{.WRK}}+4] ; x, the previously held value
+    ret
+{{end}}
+
+
+{{- if .HasOp "crush"}}
+;-------------------------------------------------------------------------------
+;   CRUSH opcode: quantize the signal to finite number of levels
+;-------------------------------------------------------------------------------
+;   Mono:   x   ->  e*int(x/e)
+;   Stereo: l r ->  e*int(l/e) e*int(r/e)
+;
+;   where e is the resolution parameter
+;-------------------------------------------------------------------------------
+{{.Func "su_op_crush" "Opcode"}}
+{{- if .Stereo "crush"}}
+    {{.Call "su_effects_stereohelper"}}
+{{- end}}
+    fdiv    dword [{{.Input "crush" "resolution"}}] ; x/e
+    frndint                                 ; int(x/e), rounded according to the current FPU rounding mode
+    fmul    dword [{{.Input "crush" "resolution"}}] ; e*int(x/e)
+    ret
+{{end}}
+
+
+{{- if .HasOp "gain"}}
+;-------------------------------------------------------------------------------
+;   GAIN opcode: apply gain on the signal
+;-------------------------------------------------------------------------------
+;   Mono:   x   ->  x*g
+;   Stereo: l r ->  l*g r*g
+;-------------------------------------------------------------------------------
+{{.Func "su_op_gain" "Opcode"}}
+{{- if .Stereo "gain"}}
+    fld     dword [{{.Input "gain" "gain"}}] ; g l (r)
+{{- if .Mono "gain"}}
+    jnc     su_op_gain_mono                      ; carry clear selects the mono variant: skip the right channel
+{{- end}}
+    fmul    st2, st0                             ; g l r*g
+su_op_gain_mono:
+    fmulp   st1, st0                             ; l*g (r*g)
+    ret
+{{- else}}
+    fmul    dword [{{.Input "gain" "gain"}}]     ; x*g
+    ret
+{{- end}}
+{{end}}
+
+
+{{- if .HasOp "invgain"}}
+;-------------------------------------------------------------------------------
+;   INVGAIN opcode: apply inverse gain on the signal
+;-------------------------------------------------------------------------------
+;   Mono:   x   ->  x/g
+;   Stereo: l r ->  l/g r/g
+;-------------------------------------------------------------------------------
+{{.Func "su_op_invgain" "Opcode"}}
+{{- if .Stereo "invgain"}}
+    fld     dword [{{.Input "invgain" "invgain"}}] ; g l (r)
+{{- if .Mono "invgain"}}
+    jnc     su_op_invgain_mono
+{{- end}}
+    fdiv    st2, st0                             ; g l r/g
+su_op_invgain_mono:
+    fdivp   st1, st0                             ; l/g (r/g)
+    ret
+{{- else}}
+    fdiv    dword [{{.Input "invgain" "invgain"}}] ; x/g
+    ret
+{{- end}}
+{{end}}
+
+
+{{- if .HasOp "filter"}}
+;-------------------------------------------------------------------------------
+;   FILTER opcode: perform low/high/band-pass/notch etc. filtering on the signal
+;-------------------------------------------------------------------------------
+;   Mono:   x   ->  filtered(x)
+;   Stereo: l r ->  filtered(l) filtered(r)
+;
+;   Workspace: dword at WRK = low l, WRK+4 = high h, WRK+8 = band b
+;   Flag byte (read into al): 0x40 lowpass, 0x20 bandpass, 0x10 highpass,
+;   0x08 negative bandpass, 0x04 negative highpass. The output is the sum of
+;   the selected components.
+;-------------------------------------------------------------------------------
+{{.Func "su_op_filter" "Opcode"}}
+    lodsb ; load the flags to al
+{{- if .Stereo "filter"}}
+    {{.Call "su_effects_stereohelper"}}
+{{- end}}
+    fld     dword [{{.Input "filter" "resonance"}}] ; r x
+    fld     dword [{{.Input "filter" "frequency"}}]; f r x
+    fmul    st0, st0                        ; f2 r x (square the input so we never get negative and also have a smoother behaviour in the lower frequencies)
+    fst     dword [{{.SP}}-4]                   ; f2 r x, f2 kept in the scratch dword below the stack pointer
+    fmul    dword [{{.WRK}}+8]  ; f2*b r x
+    fadd    dword [{{.WRK}}]   ; f2*b+l r x
+    fst     dword [{{.WRK}}]   ; l'=f2*b+l r x
+    fsubp   st2, st0                        ; r x-l'
+    fmul    dword [{{.WRK}}+8]  ; r*b x-l'
+    fsubp   st1, st0                        ; x-l'-r*b
+    fst     dword [{{.WRK}}+4]  ; h'=x-l'-r*b
+    fmul    dword [{{.SP}}-4]                   ; f2*h'
+    fadd    dword [{{.WRK}}+8]  ; f2*h'+b
+    fstp    dword [{{.WRK}}+8]  ; b'=f2*h'+b
+    fldz                                    ; 0, the output accumulator
+{{- if .SupportsParamValue "filter" "lowpass" 1}}
+    test    al, byte 0x40
+    jz      short su_op_filter_skiplowpass
+    fadd    dword [{{.WRK}}]                ; += l'
+su_op_filter_skiplowpass:
+{{- end}}
+{{- if .SupportsParamValue "filter" "bandpass" 1}}
+    test    al, byte 0x20
+    jz      short su_op_filter_skipbandpass
+    fadd    dword [{{.WRK}}+8]              ; += b'
+su_op_filter_skipbandpass:
+{{- end}}
+{{- if .SupportsParamValue "filter" "highpass" 1}}
+    test    al, byte 0x10
+    jz      short su_op_filter_skiphighpass
+    fadd    dword [{{.WRK}}+4]              ; += h'
+su_op_filter_skiphighpass:
+{{- end}}
+{{- if .SupportsParamValue "filter" "negbandpass" 1}}
+    test    al, byte 0x08
+    jz      short su_op_filter_skipnegbandpass
+    fsub    dword [{{.WRK}}+8]              ; -= b'
+su_op_filter_skipnegbandpass:
+{{- end}}
+{{- if .SupportsParamValue "filter" "neghighpass" 1}}
+    test    al, byte 0x04
+    jz      short su_op_filter_skipneghighpass
+    fsub    dword [{{.WRK}}+4]              ; -= h'
+su_op_filter_skipneghighpass:
+{{- end}}
+    ret
+{{end}}
+
+
+{{- if .HasOp "clip"}}
+;-------------------------------------------------------------------------------
+;   CLIP opcode: clips the signal into [-1,1] range
+;-------------------------------------------------------------------------------
+;   Mono:   x   ->  min(max(x,-1),1)
+;   Stereo: l r ->  min(max(l,-1),1) min(max(r,-1),1)
+;
+;   The clipping itself is delegated to su_clip through a tail call.
+;-------------------------------------------------------------------------------
+{{.Func "su_op_clip" "Opcode"}}
+{{- if .Stereo "clip"}}
+    {{.Call "su_effects_stereohelper"}}
+{{- end}}
+    {{.TailCall "su_clip"}}
+{{end}}
+
+
+{{- if .HasOp "pan" -}}
+;-------------------------------------------------------------------------------
+;   PAN opcode: pan the signal
+;-------------------------------------------------------------------------------
+;   Mono:   s   ->  s*(1-p) s*p
+;   Stereo: l r ->  l*(1-p) r*p
+;
+;   where p is the panning in [0,1] range
+;-------------------------------------------------------------------------------
+{{.Func "su_op_pan" "Opcode"}}
+{{- if .Stereo "pan"}}
+    jc      su_op_pan_do    ; this time, if this is mono op...
+    fld     st0             ;   ...we duplicate the mono into stereo first
+su_op_pan_do:
+    fld     dword [{{.Input "pan" "panning"}}]    ; p l r
+    fld1                                        ; 1 p l r
+    fsub    st1                                 ; 1-p p l r
+    fmulp   st2                                 ; p (1-p)*l r
+    fmulp   st2                                 ; (1-p)*l p*r
+    ret
+{{- else}}
+    fld     dword [{{.Input "pan" "panning"}}]    ; p s
+    fmul    st1                                 ; p*s s
+    fsub    st1, st0                            ; p*s s-p*s
+                                                ; Equal to
+                                                ; s*p s*(1-p)
+    fxch                                        ; s*(1-p) s*p SHOULD PROBABLY DELETE, WHY BOTHER
+    ret
+{{- end}}
+{{end}}
+
+
+{{- if .HasOp "delay"}}
+;-------------------------------------------------------------------------------
+;   DELAY opcode: adds delay effect to the signal
+;-------------------------------------------------------------------------------
+;   Mono:   perform delay on ST0, using delaycount delaylines starting
+;           at delayindex from the delaytable
+;   Stereo: perform delay on ST1, using delaycount delaylines starting
+;           at delayindex + delaycount from the delaytable (so the right delays
+;           can be different)
+;
+;   Register roles while su_op_delay_do runs:
+;     BX = pointer to the current 16-bit entry of the delay time table
+;     SI = global tick, loaded as a word so it wraps at 65536
+;     CX = pointer to the current delay line workspace
+;     ah = remaining delay count (decremented by 2 per line; bit 0 = note sync off)
+;-------------------------------------------------------------------------------
+{{.Func "su_op_delay" "Opcode"}}
+    lodsw                           ; al = delay index, ah = delay count
+    {{- .PushRegs .VAL "DelayVal" .COM "DelayCom" | indent 4}}
+    movzx   ebx, al
+{{- if .Library}}
+    mov     {{.SI}}, [{{.Stack "DelayTable"}}] ; when using runtime tables, delaytimes is pulled from the stack so can be a pointer to heap
+    lea     {{.BX}}, [{{.SI}} + {{.BX}}*2]
+{{- else}}
+{{- .Prepare "su_delay_times" | indent 4}}
+    lea     {{.BX}},[{{.Use "su_delay_times"}} + {{.BX}}*2]                  ; BX now points to the right position within delay time table
+{{- end}}
+    movzx   esi, word [{{.Stack "GlobalTick"}}]          ; notice that we load word, so we wrap at 65536
+    mov     {{.CX}}, {{.PTRWORD}} [{{.Stack "DelayWorkSpace"}}]   ; {{.WRK}} is now the separate delay workspace, as they require a lot more space
+{{- if .StereoAndMono "delay"}}
+    jnc     su_op_delay_mono
+{{- end}}
+{{- if .Stereo "delay"}}
+    push    {{.AX}}                 ; save _ah (delay count)
+    fxch                        ; r l
+    call    su_op_delay_do      ; D(r) l        process delay for the right channel
+    pop     {{.AX}}                 ; restore the count for second run
+    fxch                        ; l D(r)
+su_op_delay_mono:               ; flow into mono delay
+{{- end}}
+    call    su_op_delay_do      ; when stereo delay is not enabled, we could inline this to save 5 bytes, but I expect stereo delay to be fairly popular so maybe not worth the hassle
+    mov     {{.PTRWORD}} [{{.Stack "DelayWorkSpace"}}],{{.CX}}   ; move delay workspace pointer back to stack.
+    {{- .PopRegs .VAL .COM | indent 4}}
+{{- if .SupportsModulation "delay" "delaytime"}}
+    xor     eax, eax
+    mov     dword [{{.Modulation "delay" "delaytime"}}], eax ; reset the delaytime modulation to zero
+{{- end}}
+    ret
+
+;-------------------------------------------------------------------------------
+;   su_op_delay_do: executes the actual delay
+;-------------------------------------------------------------------------------
+;   Pseudocode:
+;   q = dr*x
+;   for (i = 0;i < count;i++)
+;     s = b[(t-delaytime[i+offset])&65535]
+;     q += s
+;     o[i] = o[i]*da+s*(1-da)
+;     b[t] = f*o[i] +p^2*x
+;  Perform dc-filtering q and output q
+;-------------------------------------------------------------------------------
+{{.Func "su_op_delay_do"}}                         ; x y
+    fld     st0
+    fmul    dword [{{.Input "delay" "pregain"}}]  ; p*x y
+    fmul    dword [{{.Input "delay" "pregain"}}]  ; p*p*x y
+    fxch                                        ; y p*p*x
+    fmul    dword [{{.Input "delay" "dry"}}]      ; dr*y p*p*x
+su_op_delay_loop:
+        {{- if or (.SupportsModulation "delay" "delaytime") (.SupportsParamValue "delay" "notetracking" 1)}} ; delaytime modulation or note syncing require computing the delay time in floats
+        fild    word [{{.BX}}]         ; k dr*y p*p*x, where k = delay time
+        {{- if .SupportsParamValue "delay" "notetracking" 1}}
+        test    ah, 1 ; note syncing is the least significant bit of ah, 0 = ON, 1 = OFF
+        jne     su_op_delay_skipnotesync
+        fild    dword [{{.INP}}-su_voice.inputs+su_voice.note]
+        {{.Int 0x3DAAAAAA | .Prepare | indent 8}}
+        fmul    dword [{{.Int 0x3DAAAAAA | .Use}}]
+        {{.Call "su_power"}}
+        fdivp   st1, st0                 ; use 10787 for delaytime to have neutral transpose
+        su_op_delay_skipnotesync:
+        {{- end}}
+        {{- if .SupportsModulation "delay" "delaytime"}}
+        fld     dword [{{.Modulation "delay" "delaytime"}}]
+        {{- .Float 32767.0 | .Prepare | indent 8}}
+        fmul    dword [{{.Float 32767.0 | .Use}}] ; scale it up, as the modulations would be too small otherwise
+        faddp   st1, st0
+        {{- end}}
+        fistp   dword [{{.SP}}-4]                       ; dr*y p*p*x, dword [{{.SP}}-4] = integer amount of delay (samples)
+        mov     edi, esi                            ; edi = esi = current time
+        sub     di, word [{{.SP}}-4]                    ; we perform the math in 16-bit to wrap around
+        {{- else}}
+        mov     edi, esi
+        sub     di, word [{{.BX}}]                      ; we perform the math in 16-bit to wrap around
+        {{- end}}
+        fld     dword [{{.CX}}+su_delayline_wrk.buffer+{{.DI}}*4]; s dr*y p*p*x, where s is the sample from delay buffer
+        fadd    st1, st0                                ; s dr*y+s p*p*x (add comb output to current output)
+        fld1                                            ; 1 s dr*y+s p*p*x
+        fsub    dword [{{.Input "delay" "damp"}}]         ; 1-da s dr*y+s p*p*x
+        fmulp   st1, st0                                ; s*(1-da) dr*y+s p*p*x
+        fld     dword [{{.Input "delay" "damp"}}]         ; da s*(1-da) dr*y+s p*p*x
+        fmul    dword [{{.CX}}+su_delayline_wrk.filtstate]  ; o*da s*(1-da) dr*y+s p*p*x, where o is stored
+        faddp   st1, st0                                ; o*da+s*(1-da) dr*y+s p*p*x
+        fst     dword [{{.CX}}+su_delayline_wrk.filtstate]  ; o'=o*da+s*(1-da), o' dr*y+s p*p*x
+        fmul    dword [{{.Input "delay" "feedback"}}]     ; f*o' dr*y+s p*p*x
+        fadd    st0, st2                                ; f*o'+p*p*x dr*y+s p*p*x
+        fstp    dword [{{.CX}}+su_delayline_wrk.buffer+{{.SI}}*4]; save f*o'+p*p*x to delay buffer
+        add     {{.BX}},2                                   ; move to next index
+        add     {{.CX}}, su_delayline_wrk.size              ; go to next delay workspace
+        sub     ah, 2
+        jg      su_op_delay_loop                        ; if ah > 0, goto loop
+    fstp    st1                                 ; dr*y+s1+s2+s3+...
+    ; DC-filtering
+    fld     dword [{{.CX}}+su_delayline_wrk.dcout]  ; o s
+{{- .Float 0.99609375 | .Prepare | indent 4}}
+    fmul    dword [{{.Float 0.99609375 | .Use}}]                ; c*o s
+    fsub    dword [{{.CX}}+su_delayline_wrk.dcin]   ; c*o-i s
+    fxch                                        ; s c*o-i
+    fst     dword [{{.CX}}+su_delayline_wrk.dcin]   ; i'=s, s c*o-i
+    faddp   st1                                 ; s+c*o-i
+{{- .Float 0.5 | .Prepare | indent 4}}
+    fadd    dword [{{.Float 0.5 | .Use}}]                     ; add and sub small offset to prevent denormalization
+    fsub    dword [{{.Float 0.5 | .Use}}]
+    fst     dword [{{.CX}}+su_delayline_wrk.dcout]  ; o'=s+c*o-i
+    ret
+{{end}}
+
+
+{{- if .HasOp "compressor"}}
+;-------------------------------------------------------------------------------
+;   COMPRESSOR opcode: push compressor gain to stack
+;-------------------------------------------------------------------------------
+;   Mono:   push g on stack, where g is a suitable gain for the signal
+;           you can either MULP to compress the signal or SEND it to a GAIN
+;           somewhere else for compressor side-chaining.
+;   Stereo: push g g on stack, where g is calculated using l^2 + r^2
+;
+;   Workspace: dword at WRK = the tracked signal level l
+;-------------------------------------------------------------------------------
+{{.Func "su_op_compressor" "Opcode"}}
+    fdiv    dword [{{.Input "compressor" "invgain"}}]; l/g, we'll call this pre inverse gained signal x from now on
+    fld     st0                                 ; x x
+    fmul    st0, st0                            ; x^2 x
+{{- if .StereoAndMono "compressor"}}
+    jnc     su_op_compressor_mono
+{{- end}}
+{{- if .Stereo "compressor"}}
+    fld     st2                                 ; r x^2 l/g r
+    fdiv    dword [{{.Input "compressor" "invgain"}}]; r/g, we'll call this pre inverse gained signal y from now on
+    fst     st3                                 ; y x^2 l/g r/g
+    fmul    st0, st0                            ; y^2 x^2 l/g r/g
+    faddp   st1, st0                            ; y^2+x^2 l/g r/g
+    call    su_op_compressor_mono               ; So, for stereo, we square both left & right and add them up
+    fld     st0                                 ; and return the computed gain two times, ready for MULP STEREO
+    ret
+su_op_compressor_mono:
+{{- end}}
+    fld     dword [{{.WRK}}]    ; l x^2 x
+    fucomi  st0, st1
+    setnb   al                                  ; if (st0 >= st1) al = 1; else al = 0;
+    fsubp   st1, st0                            ; x^2-l x
+    {{.Call "su_nonlinear_map"}}                ; c x^2-l x, c is either attack or release parameter mapped in a nonlinear way
+    fmulp   st1, st0                            ; c*(x^2-l) x
+    fadd    dword [{{.WRK}}]    ; l+c*(x^2-l) x   // we could've kept level in the stack and save a few bytes, but su_env_map uses 3 stack (c + 2 temp), so the stack was getting quite big.
+    fst     dword [{{.WRK}}]    ; l'=l+c*(x^2-l), l' x
+    fld     dword [{{.Input "compressor" "threshold"}}] ; t l' x
+    fmul    st0, st0                            ; t*t l' x
+    fxch                                        ; l' t*t x
+    fucomi  st0, st1                            ; if l' < t*t
+    fcmovb  st0, st1                            ;   l'=t*t
+    fdivp   st1, st0                            ; t*t/l' x
+    fld     dword [{{.Input "compressor" "ratio"}}]  ; r t*t/l' x
+{{.Float 0.5 | .Prepare | indent 4}}
+    fmul    dword [{{.Float 0.5 | .Use}}]       ; p=r/2 t*t/l' x
+    fxch                                        ; t*t/l' p x
+    fyl2x                                       ; p*log2(t*t/l') x
+    {{.TailCall "su_power"}}                     ; 2^(p*log2(t*t/l')) x
+    ; tail call                                 ; Equal to:
+                                                ; (t*t/l')^p x
+                                                ; if ratio is at minimum => p=0 => 1 x
+                                                ; if ratio is at maximum => p=0.5 => t/x => t/x*x=t
+{{- end}}
--- a/templates/amd64-386/flowcontrol.asm
+++ b/templates/amd64-386/flowcontrol.asm
@@ -0,0 +1,23 @@
+{{- if .HasOp "speed" -}}
+;-------------------------------------------------------------------------------
+;   SPEED opcode: modulate the speed (bpm) of the song based on ST0
+;-------------------------------------------------------------------------------
+;   Mono: adds or subtracts the ticks, a value of 0.5 is neutral & will
+;   result in no speed change.
+;   There is no STEREO version.
+;
+;   Workspace: dword at WRK = fractional tick remainder carried between calls
+;-------------------------------------------------------------------------------
+{{.Func "su_op_speed" "Opcode"}}
+{{- .Float 2.206896551724138 | .Prepare | indent 4}}
+    fmul    dword [{{.Float 2.206896551724138 | .Use}}]         ; (2*s-1)*64/24, let's call this p from now on
+    {{.Call "su_power"}}
+    fld1                                 ; 1 2^p
+    fsubp   st1, st0                     ; 2^p-1, the player is advancing 1 tick by its own
+    fadd    dword [{{.WRK}}] ; t+2^p-1, t is the remainder from previous rounds as ticks have to be rounded to 1
+    push    {{.AX}}                          ; reserve a stack slot for the integer conversion
+    fist    dword [{{.SP}}]                  ; Main stack: k=int(t+2^p-1)
+    fisub   dword [{{.SP}}]                  ; t+2^p-1-k, the remainder
+    pop     {{.AX}}                          ; eax = k
+    add     dword [{{.Stack "Sample"}}], eax          ; add the whole ticks to row tick count
+    fstp    dword [{{.WRK}}] ; save the remainder for future
+    ret
+{{end}}
--- a/templates/amd64-386/gmdls.asm
+++ b/templates/amd64-386/gmdls.asm
@@ -0,0 +1,60 @@
+{{- if .SupportsParamValue "oscillator" "type" .Sample}}
+
+{{- if eq .OS "windows"}}
+;-------------------------------------------------------------------------------
+;   su_load_gmdls: read the gm.dls sample bank into su_sample_table
+;-------------------------------------------------------------------------------
+;   Tries su_gmdls_path1 and then su_gmdls_path2 with OpenFile, then reads
+;   3440660 bytes from the opened file into su_sample_table with ReadFile.
+;   The start of su_sample_table doubles as the OFSTRUCT and as the
+;   bytes_read output, both of which are overwritten or irrelevant afterwards.
+;-------------------------------------------------------------------------------
+{{.ExportFunc "su_load_gmdls"}}
+{{- if .Amd64}}
+    extern OpenFile ; requires windows
+    extern ReadFile ; requires windows
+    ;        Win64 ABI: RCX, RDX, R8, and R9
+    ; NOTE(review): rcx/rdx are pushed right below the call, i.e. into the
+    ; 32 bytes the Win64 ABI lets the callee use as home space; this relies on
+    ; OpenFile leaving those two slots intact -- confirm.
+    sub     rsp, 40         ; Win64 ABI requires "shadow space" + space for one parameter.
+    mov     rdx, qword su_sample_table
+    mov     rcx, qword su_gmdls_path1
+    su_gmdls_pathloop:
+        xor     r8,r8 ; OF_READ
+        push    rdx                 ; &ofstruct, blatantly reuse the sample table
+        push    rcx
+        call    OpenFile            ; eax = OpenFile(path,&ofstruct,OF_READ)
+        pop     rcx
+        add     rcx, su_gmdls_path2 - su_gmdls_path1 ; if we ever get to third, then crash
+        pop     rdx
+        cmp     eax, -1             ; eax == INVALID?
+        je      su_gmdls_pathloop
+    movsxd  rcx, eax        ; handle to file
+    mov     qword [rsp+32], 0 ; fifth parameter: NULL
+    mov     r9, rdx         ; &bytes_read, reusing the sample table
+    mov     r8d, 3440660   ; number of bytes to read
+    call    ReadFile                ; Readfile(handle,&su_sample_table,SAMPLE_TABLE_SIZE,&bytes_read,NULL)
+    add     rsp, 40         ; shadow space, as required by Win64 ABI
+    ret
+{{else}}
+    ; NOTE(review): ecx and edx are caller-saved under stdcall, yet both are
+    ; read again after the calls below; this relies on OpenFile preserving
+    ; them -- confirm.
+    mov     edx, su_sample_table
+    mov     ecx, su_gmdls_path1
+    su_gmdls_pathloop:
+        push    0                   ; OF_READ
+        push    edx                 ; &ofstruct, blatantly reuse the sample table
+        push    ecx                 ; path
+        call    _OpenFile@12        ; eax = OpenFile(path,&ofstruct,OF_READ)
+        add     ecx, su_gmdls_path2 - su_gmdls_path1 ; if we ever get to third, then crash
+        cmp     eax, -1             ; eax == INVALID?
+        je      su_gmdls_pathloop
+    push    0                       ; NULL
+    push    edx                     ; &bytes_read, reusing sample table again; it does not matter that the first four bytes are trashed
+    push    3440660                 ; number of bytes to read
+    push    edx                     ; here we actually pass the sample table to readfile
+    push    eax                     ; handle to file
+    call    _ReadFile@20            ; Readfile(handle,&su_sample_table,SAMPLE_TABLE_SIZE,&bytes_read,NULL)
+    ret
+extern _OpenFile@12 ; requires windows
+extern _ReadFile@20 ; requires windows
+{{end}}
+
+{{.Data "su_gmdls_path1"}}
+    db 'drivers/gm.dls',0
+su_gmdls_path2:
+    db 'drivers/etc/gm.dls',0
+{{end}}
+
+{{.SectBss "susamtable"}}
+su_sample_table:
+    resb    3440660    ; size of gmdls.
+{{end}}
--- a/templates/amd64-386/library.asm
+++ b/templates/amd64-386/library.asm
@@ -0,0 +1,134 @@
+{{template "structs.asm" .}}
+
+; Layout of the synth object handed to su_render. The field order mirrors the
+; C struct Synth declared in library.h; su_render addresses the fields through
+; the su_synth.* offsets defined here.
+struc su_synth
+    .synth_wrk  resb    su_synthworkspace.size          ; voices + left/right outputs
+    .delay_wrks resb    su_delayline_wrk.size * 64      ; 64 delay line workspaces
+    .delaytimes resw    768                             ; 16-bit delay times
+    .sampleoffs resb    su_sample_offset.size * 256     ; sample offset table
+    .randseed   resd    1                               ; loaded at start of su_render, written back at the end
+    .globaltime resd    1                               ; loaded at start of su_render, written back at the end
+    .commands   resb    32 * 64                         ; command stream passed to the VM
+    .values     resb    32 * 64 * 8                     ; value stream passed to the VM
+    .polyphony  resd    1                               ; pushed as PolyphonyBitmask for every sample
+    .numvoices  resd    1                               ; pushed as VoicesRemain for every sample
+endstruc
+
+;-------------------------------------------------------------------------------
+;   su_render function: renders stereo samples to a buffer by running the VM
+;   once per sample
+;-------------------------------------------------------------------------------
+;   C signature (see library.h):
+;       int su_render(Synth* synth, float* buffer, int* samples, int* time)
+;   The loop stops when the masked FPU status word is non-zero, when the row
+;   tick counter reaches *time, or when *samples samples have been rendered.
+;   On exit *samples and *time are overwritten with the number of samples
+;   rendered and ticks advanced, and the masked FPU status word is returned.
+;   The 386 variant returns with "ret 16", i.e. the callee removes the four
+;   arguments from the stack.
+;-------------------------------------------------------------------------------
+{{.ExportFunc "su_render" "SynthStateParam" "BufferPtrParam" "SamplesParam" "TimeParam"}}
+    {{- if .Amd64}}
+    {{- if eq .OS "windows"}}
+    {{- .PushRegs "rdi" "NonVolatileRDI" "rsi" "NonVolatileRSI" "rbx" "NonVolatileRBX"  "rbp" "NonVolatileRBP"  | indent 4}}
+    mov     rsi, r8 ; rsi = &samples
+    mov     rbx, r9 ; rbx = &time
+    {{- else}} ; SystemV amd64 ABI, linux mac or hopefully something similar
+    {{- .PushRegs "rbx" "NonVolatileRBX"  "rbp" "NonVolatileRBP"  | indent 4}}
+    mov     rbx, rcx ; rbx points to time
+    xchg    rsi, rdx ; rdx points to buffer, rsi points to samples
+    mov     rcx, rdi ; rcx = &Synthstate
+    {{- end}}
+    {{- else}}
+    {{- .PushRegs | indent 4 }} ; push registers
+    mov     ecx, [{{.Stack "SynthStateParam"}}] ; ecx = &synthState
+    mov     edx, [{{.Stack "BufferPtrParam"}}]  ; edx = &buffer
+    mov     esi, [{{.Stack "SamplesParam"}}]  ; esi = &samples
+    mov     ebx, [{{.Stack "TimeParam"}}]  ; ebx = &time
+    {{- end}}
+    {{.SaveFPUState | indent 4}}       ; save the FPU state to stack & reset the FPU
+    {{.Push .SI "Samples"}}
+    {{.Push .BX "Time"}}
+    xor     eax, eax    ; samplenumber starts at 0
+    {{.Push .AX "BufSample"}}
+    mov     esi, [{{.SI}}]  ; zero extend dereferenced pointer
+    {{.Push .SI "BufSize"}}
+    {{.Push .DX "BufPtr"}}
+    {{.Push .CX "SynthState"}}
+    lea     {{.AX}}, [{{.CX}} + su_synth.sampleoffs]
+    {{.Push .AX "SampleTable"}}
+    lea     {{.AX}}, [{{.CX}} + su_synth.delaytimes]
+    {{.Push .AX "DelayTable"}}
+    mov     eax, [{{.CX}} + su_synth.randseed]
+    {{.Push .AX "RandSeed"}}
+    mov     eax, [{{.CX}} + su_synth.globaltime]
+    {{.Push .AX "GlobalTick"}}
+    mov     ebx, dword [{{.BX}}]           ; zero extend dereferenced pointer
+    {{.Push .BX "RowLength"}}             ; the nominal rowlength should be time_in
+    xor     eax, eax                   ; rowtick starts at 0
+su_render_samples_loop:
+        push    {{.DI}}
+        fnstsw  [{{.SP}}]                         ; store the FPU status flag to stack top
+        pop     {{.DI}}                           ; {{.DI}} = FPU status flag
+        and     {{.DI}}, 0b0011100001000101        ; mask TOP pointer, stack error, zero divide and invalid operation
+        test    {{.DI}},{{.DI}}                       ; all the aforementioned bits should be 0!
+        jne     su_render_samples_time_finish ; otherwise, we exit due to error
+        cmp     eax, [{{.Stack "RowLength"}}]                    ; if rowtick >= maxtime
+        jge     su_render_samples_time_finish ;   goto finish
+        mov     ecx, [{{.Stack "BufSize"}}]        ; ecx = buffer length in samples
+        cmp     [{{.Stack "BufSample"}}], ecx        ; if samples >= maxsamples
+        jge     su_render_samples_time_finish ;   goto finish
+        inc     eax                           ; time++
+        inc     dword [{{.Stack "BufSample"}}]       ; samples++
+        mov     {{.CX}}, [{{.Stack "SynthState"}}]
+        {{.Push .AX "Sample"}}
+        mov     eax, [{{.CX}} + su_synth.polyphony]
+        {{.Push .AX "PolyphonyBitmask"}}
+        mov     eax, [{{.CX}} + su_synth.numvoices]
+        {{.Push .AX "VoicesRemain"}}
+        lea     {{.DX}}, [{{.CX}}+ su_synth.synth_wrk]
+        lea     {{.COM}}, [{{.CX}}+ su_synth.commands]
+        lea     {{.VAL}}, [{{.CX}}+ su_synth.values]
+        lea     {{.WRK}}, [{{.DX}} + su_synthworkspace.voices]
+        lea     {{.CX}}, [{{.CX}}+ su_synth.delay_wrks - su_delayline_wrk.filtstate]
+        {{.Call "su_run_vm"}}
+        {{.Pop .AX}}
+        {{.Pop .AX}}
+        mov     {{.DI}}, [{{.Stack "BufPtr"}}] ; edi contains buffer ptr
+        mov     {{.CX}}, [{{.Stack "SynthState"}}]
+        lea     {{.SI}}, [{{.CX}} + su_synth.synth_wrk + su_synthworkspace.left]
+        movsd   ; copy left channel to output buffer
+        movsd   ; copy right channel to output buffer
+        mov     [{{.Stack "BufPtr"}}], {{.DI}} ; save back the updated ptr
+        lea     {{.DI}}, [{{.SI}}-8]
+        xor     eax, eax
+        stosd   ; clear left channel so the VM is ready to write them again
+        stosd   ; clear right channel so the VM is ready to write them again
+        {{.Pop .AX}}
+        inc     dword [{{.Stack "GlobalTick"}}] ; increment global time, used by delays
+        jmp     su_render_samples_loop
+su_render_samples_time_finish:
+    {{.Pop .CX}}
+    {{.Pop .BX}}
+    {{.Pop .DX}}
+    {{.Pop .CX}}
+    {{.Pop .CX}}
+    {{.Pop .CX}}
+    mov     [{{.CX}} + su_synth.randseed], edx
+    mov     [{{.CX}} + su_synth.globaltime], ebx
+    {{.Pop .BX}}
+    {{.Pop .BX}}
+    {{.Pop .DX}}
+    {{.Pop .BX}}
+    {{.Pop .SI}}
+    mov     dword [{{.SI}}], edx    ; *samples = samples rendered
+    mov     dword [{{.BX}}], eax    ; *time = time ticks rendered
+    mov     {{.AX}},{{.DI}}             ; {{.DI}} was the masked FPU status flag, {{.AX}} is return value
+    {{.LoadFPUState | indent 4}}       ; load the FPU state from stack
+    {{- if .Amd64}}
+    {{- if eq .OS "windows"}}
+    {{- .PopRegs "rdi" "rsi" "rbx" "rbp" | indent 4}}
+    {{- else}} ; SystemV amd64 ABI, linux mac or hopefully something similar
+    {{- .PopRegs "rbx" "rbp" | indent 4}}
+    {{- end}}
+    ret
+    {{- else}}
+    mov     [{{.Stack "eax"}}],eax ; we want to return eax, but popad pops everything, so put eax to stack for popad to pop
+    {{- .PopRegs | indent 4 }} ; popad
+    ret     16
+    {{- end}}
+
+
+{{template "patch.asm" .}}
+
+;-------------------------------------------------------------------------------
+;    Constants
+;-------------------------------------------------------------------------------
+{{.SectData "constants"}}
+{{.Constants}}
--- a/templates/amd64-386/library.h
+++ b/templates/amd64-386/library.h
@@ -0,0 +1,100 @@
+#ifndef _SOINTU_H
+#define _SOINTU_H
+
+#pragma pack(push,1) // this should be fine for both Go and assembly
+// Workspace of a single unit: 16 floats.
+typedef struct Unit {
+    float State[8];
+    float Ports[8];   // modulation inputs; the VM adds these to the unit parameters and then zeroes them
+} Unit;
+
+// Workspace of a single voice. With 4-byte int/float this is 4096 bytes, which
+// the player relies on when it turns a voice index into an offset with a shift by 12.
+typedef struct Voice {
+    int Note;
+    int Release;         // incremented by the player when a voice is released
+    float Inputs[8];     // the VM stores the transformed parameters of the running unit here
+    float Reserved[6];
+    struct Unit Units[63];
+} Voice;
+
+// Workspace of a single delay line.
+typedef struct DelayWorkspace {
+    float Buffer[65536];
+    float Dcin;
+    float Dcout;
+    float Filtstate;
+} DelayWorkspace;
+
+// Per-synth workspace: the voices plus the output of the current sample.
+typedef struct SynthWorkspace {
+    unsigned char Curvoices[32]; // presumably the current voice number per track, as used by su_update_voices in the player
+    float Left;                  // left output; copied to the buffer and cleared after each VM run
+    float Right;                 // right output; copied to the buffer and cleared after each VM run
+    float Aux[6];
+    struct Voice Voices[32];
+} SynthWorkspace;
+
+// One entry of the sample offset table.
+typedef struct SampleOffset {
+    unsigned int Start;
+    unsigned short LoopStart;
+    unsigned short LoopLength;
+} SampleOffset;
+
+// The synth object passed to su_render. The field order matches struc su_synth
+// in library.asm, which is what the assembly code uses to address the fields.
+typedef struct Synth {
+    struct SynthWorkspace SynthWrk;
+    struct DelayWorkspace DelayWrks[64]; // let's keep this as 64 for now, so the delays take 16 meg. If that's too little or too much, we can change this in future.
+    unsigned short DelayTimes[768];
+    struct SampleOffset SampleOffsets[256];
+    unsigned int RandSeed;    // read at the start of su_render and written back at the end
+    unsigned int GlobalTick;  // read at the start of su_render and written back at the end
+    unsigned char Commands[32 * 64];
+    unsigned char Values[32 * 64 * 8];
+    unsigned int Polyphony;
+    unsigned int NumVoices;
+} Synth;
+#pragma pack(pop)
+
+// Select the calling convention of the exported functions. On 32-bit targets
+// the assembly removes its own arguments from the stack (stdcall); on 64-bit
+// targets the platform's native ABI is used and no attribute is needed.
+// <stdint.h> must be included here: if UINTPTR_MAX were undefined, the
+// preprocessor would treat it as 0 and a 32-bit build would silently take the
+// 64-bit branch, declaring the functions with the wrong calling convention.
+#include <stdint.h>
+#if UINTPTR_MAX == 0xffffffff // are we 32-bit?
+#if defined(__clang__) || defined(__GNUC__)
+#define CALLCONV __attribute__ ((stdcall))
+#elif defined(_WIN32)
+#define CALLCONV __stdcall // on 32-bit platforms, we just use stdcall, as all know it
+#endif
+#else // 64-bit
+#define CALLCONV  // the asm will honor the correct x64 ABI on all 64-bit platforms
+#endif
+
+// void su_load_gmdls(void):
+//      Reads the gm.dls sample bank into the synth's sample table. The
+//      implementation calls OpenFile / ReadFile, so it requires Windows.
+void CALLCONV su_load_gmdls(void);
+
+// int su_render(Synth* synth, float* buffer, int* samples, int* time):
+//      Renders samples until 'samples' number of samples are reached or 'time' number of
+//      modulated time ticks are reached, whichever happens first. 'samples' and 'time'
+//      are passed by reference, as the function modifies them to tell how many samples
+//      were actually rendered and how many time ticks were actually advanced.
+//
+// Parameters:
+//      synth       pointer to the synthesizer used. RandSeed should be > 0 e.g. 1
+//      buffer      audio sample buffer, L R L R ...
+//      samples     pointer to the maximum number of samples to be rendered.
+//                  buffer should have a length of 2 * maxsamples as the audio
+//                  is stereo.
+//      time        pointer to the maximum modulated time rendered.
+//
+// The value referred by samples is changed to contain the actual number of samples rendered.
+// Similarly, the value referred by time is changed to contain the number of time ticks advanced.
+// The time limit is checked before the sample limit on every iteration, so, when no
+// error occurred:
+// If samples_out < samples_in, then rendering stopped because of the time limit and
+// it must be that time_out >= time_in. Note that it could happen that
+// time_out > time_in, as it is modulated and the time could advance by 2 or more, so the loop
+// exit condition would fire when the current time is already past time_in.
+// If samples_out == samples_in, then the whole buffer was filled; time_out is then less
+// than time_in, unless both limits were reached on the same sample.
+//
+// Returns an error code, which is actually just a masked version of the FPU Status Word.
+// On a successful run, the return value should be 0.
+// Error code bits:
+//    bit  0        FPU invalid operation (stack over/underflow OR invalid arithmetic e.g. NaNs)
+//    bit  2        Divide by zero occurred
+//    bit  6        Stack overflow or underflow occurred
+//    bits 11-13    The top pointer of the fpu stack. Any other value than 0 indicates that some values were left on the stack.
+int CALLCONV su_render(Synth* synth, float* buffer, int* samples, int* time);
+
+// Opcode IDs as they appear in the command stream. All IDs are even: the VM
+// shifts out the least significant bit of every command byte and uses it as
+// the stereo flag, so instruction number i (counting from 0) gets ID 2*(i+1).
+// ID 0 is reserved for "advance", which moves on to the next voice.
+#define SU_ADVANCE_ID       0
+{{- range $index, $element := .Instructions}}
+#define {{printf "su_%v_id" $element | upper | printf "%-20v"}}{{add1 $index | mul 2}}
+{{- end}}
+
+#endif // _SOINTU_H
--- a/templates/amd64-386/output_sound.asm
+++ b/templates/amd64-386/output_sound.asm
@@ -0,0 +1,44 @@
+{{- if not .Song.Output16Bit }}
+    {{- if not .Clip }}
+            mov     {{.DI}}, [{{.Stack "OutputBufPtr"}}] ; edi contains ptr
+            mov     {{.SI}}, {{.PTRWORD}} su_synth_obj + su_synthworkspace.left
+            movsd   ; copy left channel to output buffer
+            movsd   ; copy right channel to output buffer
+            mov     [{{.Stack "OutputBufPtr"}}], {{.DI}} ; save back the updated ptr
+            lea     {{.DI}}, [{{.SI}}-8]
+            xor     eax, eax
+            stosd   ; clear left channel so the VM is ready to write them again
+            stosd   ; clear right channel so the VM is ready to write them again
+    {{ else }}
+            ; float output with clipping: same loop shape as the 16-bit branch
+            ; below, but the clamped sample is stored as a float
+            mov     {{.SI}}, [{{.Stack "OutputBufPtr"}}] ; esi points to the output buffer
+            mov     {{.DI}}, {{.PTRWORD}} su_synth_obj+su_synthworkspace.left ; edi points to the left sample
+            mov     ecx, 2
+            output_sound_clip_loop: ; loop over two channels, left & right
+                    fld     dword [{{.DI}}]
+                    {{.Call "su_clip"}}
+                    fstp    dword [{{.SI}}]     ; store the clamped sample to the output buffer
+                    xor     eax,eax
+                    stosd                       ; clear the sample so the VM is ready to write it; edi moves to the next channel
+                    add     {{.SI}},4
+                    loop    output_sound_clip_loop
+            mov     [{{.Stack "OutputBufPtr"}}], {{.SI}} ; save esi back to stack
+    {{ end }}
+{{- else}}
+            mov     {{.SI}}, [{{.Stack "OutputBufPtr"}}] ; esi points to the output buffer
+            mov     {{.DI}}, {{.PTRWORD}} su_synth_obj+su_synthworkspace.left
+            mov     ecx, 2
+            output_sound16bit_loop: ; loop over two channels, left & right
+                    fld     dword [{{.DI}}]
+                    {{.Call "su_clip"}}
+            {{- .Float 32767.0 | .Prepare | indent 16}}
+                    fmul    dword [{{.Float 32767.0 | .Use}}]
+                    push    {{.AX}}
+                    fistp   dword [{{.SP}}]
+                    pop     {{.AX}}
+                    mov     word [{{.SI}}],ax   ; // store the integer converted sample
+                    xor     eax,eax
+                    stosd                       ; clear the sample so the VM is ready to write it; edi moves to the next channel
+                    add     {{.SI}},2
+                    loop    output_sound16bit_loop
+            mov     [{{.Stack "OutputBufPtr"}}], {{.SI}} ; save esi back to stack
+{{- end }}
--- a/templates/amd64-386/patch.asm
+++ b/templates/amd64-386/patch.asm
@@ -0,0 +1,188 @@
+;-------------------------------------------------------------------------------
+;   su_run_vm function: runs the entire virtual machine once, creating 1 sample
+;-------------------------------------------------------------------------------
+;   Input:      su_synth_obj.left   :   Set to 0 before calling
+;               su_synth_obj.right  :   Set to 0 before calling
+;               CX register         :   Pointer to delay workspace (if needed)
+;               DX register         :   Pointer to synth object
+;               COM                 :   Pointer to command stream
+;               VAL                 :   Pointer to value stream
+;               WRK                 :   Pointer to the last workspace processed
+;   Stack:      The caller pushes "VoicesRemain" (and, before it,
+;               "PolyphonyBitmask") right before the call; the advance code
+;               below reads them through their stack names.
+;   Output:     su_synth_obj.left   :   left sample
+;               su_synth_obj.right  :   right sample
+;   Dirty:      everything
+;-------------------------------------------------------------------------------
+;   Each command byte is (opcode << 1) | stereo bit. Opcode 0 means "advance to
+;   the next voice". Valid opcodes therefore start from 1, which is why both
+;   lookup tables are addressed one entry before their first element.
+;-------------------------------------------------------------------------------
+{{.Func "su_run_vm"}}
+    {{- .PushRegs .CX "DelayWorkSpace" .DX "Synth" .COM "CommandStream" .WRK "Voice" .VAL "ValueStream" | indent 4}}
+su_run_vm_loop:                                     ; loop until all voices done
+    movzx   edi, byte [{{.COM}}]                         ; edi = command byte
+    inc     {{.COM}}                                     ; move to next instruction
+    add     {{.WRK}}, su_unit.size                       ; move WRK to next unit
+    shr     edi, 1                                  ; shift out the LSB bit = stereo bit
+    je      su_run_vm_advance                ; the opcode is zero, jump to advance
+    mov     {{.INP}}, [{{.Stack "Voice"}}]         ; reset INP to point to the inputs part of voice
+    add     {{.INP}}, su_voice.inputs
+    xor     ecx, ecx                                ; counter = 0
+    xor     eax, eax                                ; clear out high bits of eax, as lodsb only sets al
+su_transform_values_loop:
+    {{- .Prepare "su_vm_transformcounts-1" | indent 4}}
+    cmp     cl, byte [{{.Use "su_vm_transformcounts-1"}}+{{.DI}}]   ; compare the counter to the value in the param count table
+    je      su_transform_values_out
+    lodsb                                           ; load the byte value from VAL stream
+    push    {{.AX}}                                     ; push it to memory so FPU can read it
+    fild    dword [{{.SP}}]                             ; load the value to FPU stack
+    {{- .Prepare (.Float 0.0078125) | indent 4}}
+    fmul    dword [{{.Use (.Float 0.0078125)}}]          ; divide it by 128 (0 => 0, 128 => 1.0)
+    fadd    dword [{{.WRK}}+su_unit.ports+{{.CX}}*4]         ; add the modulations in the current workspace
+    fstp    dword [{{.INP}}+{{.CX}}*4]                       ; store the modulated value in the inputs section of voice
+    xor     eax, eax
+    mov     dword [{{.WRK}}+su_unit.ports+{{.CX}}*4], eax    ; clear out the modulation ports
+    pop     {{.AX}}
+    inc     ecx
+    jmp     su_transform_values_loop
+su_transform_values_out:
+    bt      dword [{{.COM}}-1],0                         ; LSB of COM = stereo bit => carry
+    {{- .SaveStack "Opcode"}}
+    {{- $x := printf "su_vm_jumptable-%v" .PTRSIZE}}
+    {{- .Prepare $x | indent 4}}
+    call    [{{.Use $x}}+{{.DI}}*{{.PTRSIZE}}]       ; call the function corresponding to the instruction
+    jmp     su_run_vm_loop
+su_run_vm_advance:
+    {{- if .SupportsPolyphony}}
+    mov     {{.WRK}}, [{{.Stack "Voice"}}]         ; WRK points to start of current voice
+    add     {{.WRK}}, su_voice.size              ; move to next voice
+    mov     [{{.Stack "Voice"}}], {{.WRK}}         ; update the pointer in the stack to point to the new voice
+    mov     ecx, [{{.Stack "VoicesRemain"}}]     ; ecx = how many voices remain to process
+    dec     ecx                             ; decrement number of voices to process
+    bt      dword [{{.Stack "PolyphonyBitmask"}}], ecx ; if voice bit of su_polyphonism not set
+    jnc     su_op_advance_next_instrument   ; goto next_instrument
+    mov     {{.VAL}}, [{{.Stack "ValueStream"}}] ; if it was set, then repeat the opcodes for the current voice
+    mov     {{.COM}}, [{{.Stack "CommandStream"}}]
+su_op_advance_next_instrument:
+    mov     [{{.Stack "ValueStream"}}], {{.VAL}} ; save current VAL as a checkpoint
+    mov     [{{.Stack "CommandStream"}}], {{.COM}} ; save current COM as a checkpoint
+su_op_advance_finish:
+    mov     [{{.Stack "VoicesRemain"}}], ecx
+    jne     su_run_vm_loop  ; ZF was set by dec ecx; the bt and mov instructions in between leave ZF untouched
+    {{- else}}
+    mov     {{.WRK}}, {{.PTRWORD}} [{{.Stack "Voice"}}] ; load pointer to voice to register
+    add     {{.WRK}}, su_voice.size              ; shift it to point to following voice
+    mov     {{.PTRWORD}} [{{.Stack "Voice"}}], {{.WRK}} ; save back to stack
+    dec     dword [{{.Stack "VoicesRemain"}}]  ; voices--
+    jne     su_run_vm_loop                          ;   if there's more voices to process, goto vm_loop
+    {{- end}}
+    {{- .PopRegs .CX .DX .COM .WRK .VAL | indent 4}}
+    ret
+
+{{- template "arithmetic.asm" .}}
+{{- template "effects.asm" .}}
+{{- template "flowcontrol.asm" .}}
+{{- template "sinks.asm" .}}
+{{- template "sources.asm" .}}
+{{- template "gmdls.asm" .}}
+
+{{- if .HasCall "su_nonlinear_map"}}
+;-------------------------------------------------------------------------------
+;   su_nonlinear_map function: returns 2^(-24*x) of parameter number AX
+;-------------------------------------------------------------------------------
+;   Input:      AX      :   parameter number (e.g. for envelope: 0 = attack, 1 = decay...)
+;               INP     :   pointer to transformed values
+;   Output:     st0     :   2^(-24*x), where x is the parameter in the range 0-1
+;-------------------------------------------------------------------------------
+;   Note: this function has no ret of its own. It only computes -24*x and then
+;   falls through into su_power, which is always emitted right after it when
+;   su_nonlinear_map is in use, and which does the 2^x part and returns.
+;-------------------------------------------------------------------------------
+{{.Func "su_nonlinear_map"}}
+    fld     dword [{{.INP}}+{{.AX}}*4]   ; x, where x is the parameter in the range 0-1
+    {{.Prepare (.Int 24)}}
+    fimul   dword [{{.Use (.Int 24)}}]      ; 24*x
+    fchs                        ; -24*x
+
+{{end}}
+
+{{- if or (.HasCall "su_power") (.HasCall "su_nonlinear_map")}}
+;-------------------------------------------------------------------------------
+;   su_power function: computes 2^x
+;-------------------------------------------------------------------------------
+;   Input:      st0     :   x
+;   Output:     st0     :   2^x
+;-------------------------------------------------------------------------------
+;   The exponent is split with fprem into its fractional part, which f2xm1 can
+;   handle, and the rest, which fscale applies as a power of two.
+;   When su_nonlinear_map is present it falls through into this code, so a
+;   new text section is started only when su_nonlinear_map is absent.
+;-------------------------------------------------------------------------------
+{{- if not (.HasCall "su_nonlinear_map")}}{{.SectText "su_power"}}{{end}}
+su_power:
+    fld1          ; 1 x
+    fld st1       ; x 1 x
+    fprem         ; mod(x,1) 1 x
+    f2xm1         ; 2^mod(x,1)-1 1 x
+    faddp st1,st0 ; 2^mod(x,1) x
+    fscale        ; 2^mod(x,1)*2^trunc(x) x
+                  ; Equal to:
+                  ; 2^x x
+    fstp st1      ; 2^x
+    ret
+
+{{end}}
+
+{{- if .HasCall "su_effects_stereohelper" }}
+;-------------------------------------------------------------------------------
+;   su_effects_stereohelper: moves the workspace to next, does the filtering for
+;   right channel (pulling the calling address from stack), rewinds the
+;   workspace and returns
+;-------------------------------------------------------------------------------
+;   Input:      carry   :   stereo bit of the current command
+;               st0 st1 :   l r (when stereo)
+;   In the stereo case the helper calls its own return address, so the code
+;   following the caller's call instruction runs once for the right channel
+;   with WRK advanced by 16 bytes. When that code returns, the channels are
+;   swapped back, WRK is restored, and the final ret lets the same code run
+;   again for the left channel. In the mono case the helper just returns.
+;-------------------------------------------------------------------------------
+{{.Func "su_effects_stereohelper"}}
+    jnc     su_effects_stereohelper_mono ; carry is still the stereo bit
+    add     {{.WRK}}, 16
+    fxch                  ; r l
+    call    [{{.SP}}]         ; call whoever called me...
+    fxch                  ; l r
+    sub     {{.WRK}}, 16       ; move WRK back to where it was
+su_effects_stereohelper_mono:
+    ret                   ; return to process l/mono sound
+
+{{end}}
+
+{{- if .HasCall "su_waveshaper" }}
+;-------------------------------------------------------------------------------
+;   su_waveshaper function: clamps the signal and applies the waveshaping curve
+;-------------------------------------------------------------------------------
+;   Input:      st0     :   a, the shaping parameter
+;               st1     :   x, the signal
+;   Output:     st0     :   x'*a/(1-a+(2*a-1)*abs(x')), where x' is x clamped
+;                           by su_clip
+;   Both inputs are consumed; the stack is one entry shorter on return.
+;-------------------------------------------------------------------------------
+{{.Func "su_waveshaper"}}
+    fxch                                    ; x a
+    {{.Call "su_clip"}}
+    fxch                                    ; a x' (from now on just called x)
+    fld     st0                             ; a a x
+    {{.Prepare (.Float 0.5)}}
+    fsub    dword [{{.Use (.Float 0.5)}}]                 ; a-.5 a x
+    fadd    st0                             ; 2*a-1 a x
+    fld     st2                             ; x 2*a-1 a x
+    fabs                                    ; abs(x) 2*a-1 a x
+    fmulp   st1                             ; (2*a-1)*abs(x) a x
+    fld1                                    ; 1 (2*a-1)*abs(x) a x
+    faddp   st1                             ; 1+(2*a-1)*abs(x) a x
+    fsub    st1                             ; 1-a+(2*a-1)*abs(x) a x
+    fdivp   st1, st0                        ; a/(1-a+(2*a-1)*abs(x)) x
+    fmulp   st1                             ; x*a/(1-a+(2*a-1)*abs(x))
+    ret
+{{end}}
+
+{{- if .HasCall "su_clip"}}
+;-------------------------------------------------------------------------------
+;   su_clip function: clamps the signal to the range -1..1
+;-------------------------------------------------------------------------------
+;   Input:      st0     :   x
+;   Output:     st0     :   x', where x' = 1 if x >= 1, -1 if x <= -1, else x
+;   Only the FPU stack and the flags are touched; the entries below st0 are
+;   left as they were.
+;-------------------------------------------------------------------------------
+{{.Func "su_clip"}}
+    fld1                                    ; 1 x a
+    fucomi  st1                             ; if (1 <= x)
+    jbe     short su_clip_do                ;   goto Clip_Do
+    fchs                                    ; -1 x a
+    fucomi  st1                             ; if (-1 < x)
+    fcmovb  st0, st1                        ;   x x a
+su_clip_do:
+    fstp    st1                             ; x' a, where x' = clamp(x)
+    ret
+{{end}}
+
+;-------------------------------------------------------------------------------
+; The opcode table jump table. This is constructed to only include the opcodes
+; that are used so that the jump table is as small as possible.
+; The first entry belongs to opcode 1 (opcode 0 is "advance" and never reaches
+; the table), so su_run_vm indexes it from one pointer before the label.
+;-------------------------------------------------------------------------------
+{{.Data "su_vm_jumptable"}}
+{{- range .Instructions}}
+    {{$.DPTR}}    su_op_{{.}}
+{{- end}}
+
+;-------------------------------------------------------------------------------
+; The number of transformed parameters each opcode takes
+; One byte per instruction, in the same order as the jump table; su_run_vm
+; indexes it from one byte before the label for the same reason.
+;-------------------------------------------------------------------------------
+{{.Data "su_vm_transformcounts"}}
+{{- range .Instructions}}
+    db    {{$.TransformCount .}}
+{{- end}}
--- a/templates/amd64-386/player.asm
+++ b/templates/amd64-386/player.asm
@@ -0,0 +1,236 @@
+{{template "structs.asm" .}}
+;-------------------------------------------------------------------------------
+;   Uninitialized data: The synth object
+;   Layout: one synth workspace immediately followed by the delay line
+;   workspaces; the player computes the delay workspace pointer from
+;   su_synthworkspace.size.
+;-------------------------------------------------------------------------------
+{{.SectBss "synth_object"}}
+su_synth_obj:
+    resb    su_synthworkspace.size
+    resb    {{.NumDelayLines}}*su_delayline_wrk.size
+
+;-------------------------------------------------------------------------------
+;   su_render_song function: the entry point for the synth
+;-------------------------------------------------------------------------------
+;   Has the signature su_render_song(void *ptr), where ptr is a pointer to
+;   the output buffer. Renders the compile time hard-coded song to the buffer.
+;   Stack:  output_ptr
+;   The outer loop runs once per row and updates the voices; the inner loop
+;   runs the VM once per sample and writes the sample to the buffer.
+;   The 386 variant returns with "ret 4", i.e. the callee removes the argument.
+;   NOTE(review): on Windows the stack names given to rdi and rsi
+;   ("NonVolatileRsi" / "NonVolatile") look mislabeled; confirm that nothing
+;   looks these locations up by name before renaming them.
+;-------------------------------------------------------------------------------
+{{.ExportFunc "su_render_song" "OutputBufPtr"}}
+    {{-  if .Amd64}}
+    {{- if eq .OS "windows"}}
+    {{- .PushRegs "rcx" "OutputBufPtr" "rdi" "NonVolatileRsi" "rsi" "NonVolatile" "rbx" "NonVolatileRbx" "rbp" "NonVolatileRbp" | indent 4}} ; rcx = ptr to buf. rdi,rsi,rbx,rbp  nonvolatile
+    {{- else}} ; SystemV amd64 ABI, linux mac or hopefully something similar
+    {{- .PushRegs "rdi" "OutputBufPtr" "rbx" "NonVolatileRbx" "rbp" "NonVolatileRbp" | indent 4}}
+    {{- end}}
+    {{- else}}
+    {{- .PushRegs | indent 4}}
+    {{- end}}
+    {{- $prologsize := len .Stacklocs}}
+    xor     eax, eax
+    {{- if ne .VoiceTrackBitmask 0}}
+    {{.Push (.VoiceTrackBitmask | printf "%v") "VoiceTrackBitmask"}}
+    {{- end}}
+    {{.Push "1" "RandSeed"}}
+    {{.Push .AX "GlobalTick"}}
+su_render_rowloop:                      ; loop through every row in the song
+        {{.Push .AX "Row"}}
+        {{.Call "su_update_voices"}}   ; update instruments for the new row
+        xor     eax, eax                ; eax is the current sample within row
+su_render_sampleloop:                   ; loop through every sample in the row
+            {{.Push .AX "Sample"}}
+            {{- if .SupportsPolyphony}}
+            {{.Push (.PolyphonyBitmask | printf "%v") "PolyphonyBitmask"}} ; does the next voice reuse the current opcodes?
+            {{- end}}
+            {{.Push (.Song.Patch.TotalVoices | printf "%v") "VoicesRemain"}}
+            mov     {{.DX}}, {{.PTRWORD}} su_synth_obj                       ; {{.DX}} points to the synth object
+            mov     {{.COM}}, {{.PTRWORD}} su_patch_code           ; COM points to vm code
+            mov     {{.VAL}}, {{.PTRWORD}} su_patch_parameters             ; VAL points to unit params
+            {{- if .HasOp "delay"}}
+            lea     {{.CX}}, [{{.DX}} + su_synthworkspace.size - su_delayline_wrk.filtstate]
+            {{- end}}
+            lea     {{.WRK}}, [{{.DX}} + su_synthworkspace.voices]            ; WRK points to the first voice
+            {{.Call "su_run_vm"}} ; run through the VM code
+            {{.Pop .AX}}
+            {{- if .SupportsPolyphony}}
+            {{.Pop .AX}}
+            {{- end}}
+            {{- template "output_sound.asm" .}}                ; *ptr++ = left, *ptr++ = right
+            {{.Pop .AX}}
+            inc     dword [{{.Stack "GlobalTick"}}] ; increment global time, used by delays
+            inc     eax
+            cmp     eax, {{.Song.SamplesPerRow}}
+            jl      su_render_sampleloop
+        {{.Pop .AX}}                  ; Stack: pushad ptr
+        inc     eax
+        cmp     eax, {{.Song.TotalRows}}
+        jl      su_render_rowloop
+    ; rewind the stack the entropy of multiple pop {{.AX}} is probably lower than add
+    {{- range slice .Stacklocs $prologsize}}
+    {{$.Pop $.AX}}
+    {{- end}}
+    {{-  if .Amd64}}
+    {{- if eq .OS "windows"}}
+    ; Windows64 ABI, rdi rsi rbx rbp non-volatile
+    {{- .PopRegs "rcx" "rdi" "rsi" "rbx" "rbp" | indent 4}}
+    {{- else}}
+    ; SystemV64 ABI (linux mac or hopefully something similar), rbx rbp non-volatile
+    {{- .PopRegs "rdi" "rbx" "rbp" | indent 4}}
+    {{- end}}
+    ret
+    {{- else}}
+    {{- .PopRegs | indent 4}}
+    ret     4
+    {{- end}}
+
+;-------------------------------------------------------------------------------
+;   su_update_voices function: polyphonic & chord implementation
+;-------------------------------------------------------------------------------
+;   Input:      eax     :   current row within song
+;   Dirty:      pretty much everything
+;   For every track, the note of the current row is looked up. A hold note
+;   leaves the voice alone; anything else releases the voice that is currently
+;   playing, and a note above hold additionally triggers a voice: the note is
+;   stored and the rest of the voice workspace is cleared.
+;-------------------------------------------------------------------------------
+{{.Func "su_update_voices"}}
+{{- if ne .VoiceTrackBitmask 0}}
+; The more complicated implementation: one track can trigger multiple voices
+    xor     edx, edx
+    mov     ebx, {{.Song.PatternRows}}                   ; we could do xor ebx,ebx; mov bl,PATTERN_SIZE, but that would limit patternsize to 256...
+    div     ebx                                 ; eax = current pattern, edx = current row in pattern
+    {{.Prepare "su_tracks"}}
+    lea     {{.SI}}, [{{.Use "su_tracks"}}+{{.AX}}]  ; esi points to the pattern data for current track
+    xor     eax, eax                            ; eax is the first voice of next track
+    xor     ebx, ebx                            ; ebx is the first voice of current track
+    mov     {{.BP}}, {{.PTRWORD}} su_synth_obj           ; ebp points to the current_voiceno array
+su_update_voices_trackloop:
+        movzx   eax, byte [{{.SI}}]                     ; eax = current pattern
+        imul    eax, {{.Song.PatternRows}}                   ; eax = offset to current pattern data
+{{- .Prepare "su_patterns" .AX | indent 4}}
+        movzx   eax,byte [{{.Use "su_patterns" .AX}} + {{.DX}}]  ; eax = note
+        push    {{.DX}}                                 ; Stack: ptrnrow
+        xor     edx, edx                            ; edx=0
+        mov     ecx, ebx                            ; ecx=first voice of the track to be done
+su_calculate_voices_loop:                           ; do {
+        bt      dword [{{.Stack "VoiceTrackBitmask"}}],ecx ; test voicetrack_bitmask// notice that the incs don't set carry
+        inc     edx                                 ;   edx++   // edx=numvoices
+        inc     ecx                                 ;   ecx++   // ecx=the first voice of next track
+        jc      su_calculate_voices_loop            ; } while bit ecx-1 of bitmask is on
+        push    {{.CX}}                                 ; Stack: next_instr ptrnrow
+        cmp     al, {{.Song.Hold}}                    ; anything but hold causes action
+        je      short su_update_voices_nexttrack
+        mov     cl, byte [{{.BP}}]
+        mov     edi, ecx
+        add     edi, ebx
+        shl     edi, 12           ; each unit = 64 bytes and there are 1<<MAX_UNITS_SHIFT units + small header
+{{- .Prepare "su_synth_obj" | indent 4}}
+        inc     dword [{{.Use "su_synth_obj"}} + su_synthworkspace.voices + su_voice.release + {{.DI}}] ; set the voice currently active to release; notice that it could increment any number of times
+        cmp     al, {{.Song.Hold}}                    ; if al < HLD (no new note triggered)
+        jl      su_update_voices_nexttrack          ;   goto nexttrack
+        inc     ecx                                 ; curvoice++
+        cmp     ecx, edx                            ; if (curvoice >= num_voices)
+        jl      su_update_voices_skipreset
+        xor     ecx,ecx                             ;   curvoice = 0
+su_update_voices_skipreset:
+        mov     byte [{{.BP}}],cl
+        add     ecx, ebx
+        shl     ecx, 12                           ; each unit = 64 bytes and there are 1<<6 units + small header
+        lea     {{.DI}},[{{.Use "su_synth_obj"}} + su_synthworkspace.voices + {{.CX}}]
+        stosd                                       ; save note
+        mov     ecx, (su_voice.size - su_voice.release)/4
+        xor     eax, eax
+        rep stosd                                   ; clear the workspace of the new voice, retriggering oscillators
+su_update_voices_nexttrack:
+        pop     {{.BX}}                                 ; ebx=first voice of next instrument, Stack: ptrnrow
+        pop     {{.DX}}                                 ; edx=patrnrow
+        add     {{.SI}}, {{.Song.SequenceLength}}
+        inc     {{.BP}}
+{{- $addrname := len .Song.Tracks | printf "su_synth_obj + %v"}}
+{{- .Prepare $addrname | indent 8}}
+        cmp     {{.BP}},{{.Use $addrname}}
+        jl      su_update_voices_trackloop
+    ret
+{{- else}}
+; The simple implementation: each track triggers always the same voice
+    xor     edx, edx
+    xor     ebx, ebx
+    mov     bl, {{.Song.PatternRows}}           ; rows per pattern
+    div     ebx                                 ; eax = current pattern, edx = current row in pattern
+{{- .Prepare "su_tracks" | indent 4}}
+    lea     {{.SI}}, [{{.Use "su_tracks"}}+{{.AX}}]; esi points to the pattern data for current track
+    mov     {{.DI}}, {{.PTRWORD}} su_synth_obj+su_synthworkspace.voices
+    mov     bl, {{len .Song.Tracks}}                      ; MAX_TRACKS is always <= 32 so this is ok
+su_update_voices_trackloop:
+        movzx   eax, byte [{{.SI}}]                     ; eax = current pattern
+        imul    eax, {{.Song.PatternRows}}           ; multiply by rows per pattern, eax = offset to current pattern data
+{{- .Prepare "su_patterns" .AX | indent 8}}
+        movzx   eax, byte [{{.Use "su_patterns" .AX}} + {{.DX}}]  ; eax = note
+        cmp     al, {{.Song.Hold}}                   ; anything but hold causes action
+        je      short su_update_voices_nexttrack
+        inc     dword [{{.DI}}+su_voice.release]        ; set the voice currently active to release; notice that it could increment any number of times
+        jb      su_update_voices_nexttrack          ; if al < HLD (no new note triggered)  goto nexttrack; inc leaves the carry of the cmp above intact
+su_update_voices_retrigger:
+        stosd                                       ; save note
+        mov     ecx, (su_voice.size - su_voice.release)/4  ; could be xor ecx, ecx; mov ch,...>>8, but will it actually be smaller after compression?
+        xor     eax, eax
+        rep stosd                                   ; clear the workspace of the new voice, retriggering oscillators
+        jmp     short su_update_voices_skipadd
+su_update_voices_nexttrack:
+        add     {{.DI}}, su_voice.size
+su_update_voices_skipadd:
+        add     {{.SI}}, {{.Song.SequenceLength}}
+        dec     ebx
+        jnz     short su_update_voices_trackloop
+    ret
+{{- end}}
+
+{{template "patch.asm" .}}
+
+;-------------------------------------------------------------------------------
+;    Patterns
+;    One db line per pattern, one byte per row; su_update_voices indexes this
+;    table with pattern number * rows per pattern + row.
+;-------------------------------------------------------------------------------
+{{.Data "su_patterns"}}
+{{- range .Song.Patterns}}
+    db {{. | toStrings | join ","}}
+{{- end}}
+
+;-------------------------------------------------------------------------------
+;    Tracks
+;    One db line per track: the sequence of pattern numbers the track plays.
+;-------------------------------------------------------------------------------
+{{.Data "su_tracks"}}
+{{- range .Song.Tracks}}
+    db {{.Sequence | toStrings | join ","}}
+{{- end}}
+
+{{- if gt (.SampleOffsets | len) 0}}
+;-------------------------------------------------------------------------------
+;    Sample offsets
+;    Each entry is a dword start followed by two words, loop start and loop
+;    length.
+;-------------------------------------------------------------------------------
+{{.Data "su_sample_offsets"}}
+{{- range .SampleOffsets}}
+    dd {{.Start}}
+    dw {{.LoopStart}}
+    dw {{.LoopLength}}
+{{- end}}
+{{end}}
+
+{{- if gt (.DelayTimes | len ) 0}}
+;-------------------------------------------------------------------------------
+;    Delay times
+;    Emitted as 16-bit words.
+;-------------------------------------------------------------------------------
+{{.Data "su_delay_times"}}
+    dw {{.DelayTimes | toStrings | join ","}}
+{{end}}
+
+;-------------------------------------------------------------------------------
+;    The code for this patch, basically indices to vm jump table
+;    su_render_song points COM here before running the VM.
+;-------------------------------------------------------------------------------
+{{.Data "su_patch_code"}}
+    db {{.Commands | toStrings | join ","}}
+
+;-------------------------------------------------------------------------------
+;    The parameters / inputs to each opcode
+;    su_render_song points VAL here before running the VM.
+;-------------------------------------------------------------------------------
+{{.Data "su_patch_parameters"}}
+    db {{.Values | toStrings | join ","}}
+
+;-------------------------------------------------------------------------------
+;    Constants
+;-------------------------------------------------------------------------------
+{{.SectData "constants"}}
+{{.Constants}}
--- a/templates/amd64-386/player.h
+++ b/templates/amd64-386/player.h
@@ -0,0 +1,50 @@
+// auto-generated by Sointu, editing not recommended
+#ifndef SU_RENDER_H
+#define SU_RENDER_H
+
+#define SU_MAX_SAMPLES     {{.MaxSamples}}
+#define SU_BUFFER_LENGTH   (SU_MAX_SAMPLES*2)
+
+#define SU_SAMPLE_RATE     44100
+#define SU_BPM             {{.Song.BPM}}
+#define SU_PATTERN_SIZE    {{.Song.PatternRows}}
+#define SU_MAX_PATTERNS    {{.Song.SequenceLength}}
+#define SU_TOTAL_ROWS      (SU_MAX_PATTERNS*SU_PATTERN_SIZE)
+#define SU_SAMPLES_PER_ROW (SU_SAMPLE_RATE*4*60/(SU_BPM*16))
+
+#include <stdint.h>
+#if UINTPTR_MAX == 0xffffffff
+    #if defined(__clang__) || defined(__GNUC__)
+        #define SU_CALLCONV __attribute__ ((stdcall))
+    #elif defined(_WIN32)
+        #define SU_CALLCONV __stdcall
+    #endif
+#else
+    #define SU_CALLCONV
+#endif
+
+{{- if .Song.Output16Bit}}
+typedef short SUsample;
+#define SU_SAMPLE_RANGE 32767.0
+{{- else}}
+typedef float SUsample;
+#define SU_SAMPLE_RANGE 1.0
+{{- end}}
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void SU_CALLCONV su_render_song(SUsample *buffer);
+{{- if gt (.SampleOffsets | len) 0}}
+void SU_CALLCONV su_load_gmdls();
+#define SU_LOAD_GMDLS
+{{- end}}
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/templates/amd64-386/sinks.asm
+++ b/templates/amd64-386/sinks.asm
@@ -0,0 +1,126 @@
+{{- if .HasOp "out"}}
+;-------------------------------------------------------------------------------
+;   OUT opcode: outputs and pops the signal
+;-------------------------------------------------------------------------------
+{{- if .Mono "out"}}
+;   Mono: add gain*ST0 to main left port, then pop
+{{- end}}
+{{- if .Stereo "out"}}
+;   Stereo: add gain*ST0 to left out and gain*ST1 to right out, then pop both
+{{- end}}
+;-------------------------------------------------------------------------------
+{{.Func "su_op_out" "Opcode"}}   ; l r
+    mov     {{.AX}}, [{{.Stack "Synth"}}] ; AX points to the synth object
+{{- if .StereoAndMono "out" }}
+    jnc     su_op_out_mono ; carry clear selects the mono variant
+{{- end }}
+{{- if .Stereo "out" }}
+    call    su_op_out_mono ; stereo: run the code below once for the left channel, popping l
+    add     {{.AX}}, 4 ; shift from left to right channel
+su_op_out_mono:
+{{- end}}
+    fmul    dword [{{.Input "out" "gain"}}] ; multiply by gain
+    fadd    dword [{{.AX}} + su_synthworkspace.left]   ; add current value of the output
+    fstp    dword [{{.AX}} + su_synthworkspace.left]   ; store the new value of the output
+    ret
+{{end}}
+
+
+{{- if .HasOp "outaux"}}
+;-------------------------------------------------------------------------------
+;   OUTAUX opcode: outputs to main and aux1 outputs and pops the signal
+;-------------------------------------------------------------------------------
+;   Mono: add outgain*ST0 to main left port and auxgain*ST0 to aux1 left
+;   Stereo: also add outgain*ST1 to main right port and auxgain*ST1 to aux1 right
+;-------------------------------------------------------------------------------
+{{.Func "su_op_outaux" "Opcode"}} ; l r
+    mov     {{.AX}}, [{{.Stack "Synth"}}] ; AX points to the synth object
+{{- if .StereoAndMono "outaux" }}
+    jnc     su_op_outaux_mono ; carry clear selects the mono variant
+{{- end}}
+{{- if .Stereo "outaux" }}
+    call    su_op_outaux_mono ; stereo: run the code below once for the left channel, popping l
+    add     {{.AX}}, 4 ; second pass addresses the dwords after .left and .aux, i.e. the right channels
+su_op_outaux_mono:
+{{- end}}
+    fld     st0                                     ; l l
+    fmul    dword [{{.Input "outaux" "outgain"}}]   ; g*l
+    fadd    dword [{{.AX}} + su_synthworkspace.left]             ; g*l+o
+    fstp    dword [{{.AX}} + su_synthworkspace.left]             ; o'=g*l+o
+    fmul    dword [{{.Input "outaux" "auxgain"}}]   ; h*l
+    fadd    dword [{{.AX}} + su_synthworkspace.aux]              ; h*l+a
+    fstp    dword [{{.AX}} + su_synthworkspace.aux]              ; a'=h*l+a
+    ret
+{{end}}
+
+
+{{- if .HasOp "aux"}}
+;-------------------------------------------------------------------------------
+;   AUX opcode: outputs the signal to aux (or main) port and pops the signal
+;-------------------------------------------------------------------------------
+;   Mono: add gain*ST0 to left port
+;   Stereo: also add gain*ST1 to right port
+;-------------------------------------------------------------------------------
+{{.Func "su_op_aux" "Opcode"}} ; l r
+    lodsb ; al = next byte of the value stream, used below as a dword index counted from .left
+    mov     {{.DI}}, [{{.Stack "Synth"}}] ; DI points to the synth object
+{{- if .StereoAndMono "aux" }}
+    jnc     su_op_aux_mono ; carry clear selects the mono variant
+{{- end}}
+{{- if .Stereo "aux" }}
+    call    su_op_aux_mono ; stereo: run the code below once for the left channel, popping l
+    add     {{.DI}}, 4 ; second pass addresses the following dword, i.e. the right channel
+su_op_aux_mono:
+{{- end}}
+    fmul    dword [{{.Input "aux" "gain"}}]     ; g*l
+    fadd    dword [{{.DI}} + su_synthworkspace.left + {{.AX}}*4] ; g*l+o
+    fstp    dword [{{.DI}} + su_synthworkspace.left + {{.AX}}*4] ; o'=g*l+o
+    ret
+{{end}}
+
+
+{{- if .HasOp "send"}}
+;-------------------------------------------------------------------------------
+;   SEND opcode: adds the signal, scaled by 2*amount-1, to a port
+;-------------------------------------------------------------------------------
+;   Mono: adds signal to a memory address, defined by a word in VAL stream
+;   Stereo: also add right signal to the following address
+;-------------------------------------------------------------------------------
+{{.Func "su_op_send" "Opcode"}}
+    lodsw ; ax = send address word; bit 0x8 is the POP flag, bit 0x8000 the global flag
+    mov     {{.CX}}, [{{.Stack "Voice"}}]  ; load pointer to voice
+{{- if .StereoAndMono "send"}}
+    jnc     su_op_send_mono ; carry clear selects the mono variant
+{{- end}}
+{{- if .Stereo "send"}}
+    mov     {{.DI}}, {{.AX}} ; keep the original address word, AX is modified by the mono code
+    inc     {{.AX}}  ; send the right channel first
+    fxch                        ; r l
+    call    su_op_send_mono     ; (r) l
+    mov     {{.AX}}, {{.DI}}            ; move back to original address
+    test    {{.AX}}, 0x8    ; if r was not popped and is still in the stack
+    jnz     su_op_send_mono ; POP bit set: r is gone, continue directly with l
+    fxch                        ; swap them back: l r
+su_op_send_mono:
+{{- end}}
+{{- if .SupportsParamValueOtherThan "send" "voice" 0}}
+    test    {{.AX}}, 0x8000 ; global bit set?
+    jz      su_op_send_skipglobal ; no: keep CX pointing to the voice
+    mov     {{.CX}}, [{{.Stack "Synth"}}] ; NOTE(review): in the stereo path this runs inside "call su_op_send_mono", with an extra return address on the stack - confirm the offset is still right
+su_op_send_skipglobal:
+{{- end}}
+    test    {{.AX}}, 0x8        ; if the SEND_POP bit is not set
+    jnz     su_op_send_skippush
+    fld     st0                 ; duplicate the signal on stack: s s
+su_op_send_skippush:            ; there is signal s, but maybe also another: s (s)
+    fld     dword [{{.Input "send" "amount"}}]   ; a l (l)
+{{- .Float 0.5 | .Prepare | indent 4}}
+    fsub    dword [{{.Float 0.5 | .Use}}]                    ; a-.5 l (l)
+    fadd    st0                                ; g=2*a-1 l (l)
+    and     ah, 0x7f ; eax = send address, clear the global bit
+    or      al, 0x8 ; set the POP bit always, at the same time shifting to ports instead of wrk
+    fmulp   st1, st0                           ; g*l (l)
+    fadd    dword [{{.CX}} + {{.AX}}*4]     ; g*l+L (l),where L is the current value
+    fstp    dword [{{.CX}} + {{.AX}}*4]     ; (l)
+    ret
+{{end}}
--- a/templates/amd64-386/sources.asm
+++ b/templates/amd64-386/sources.asm
@@ -0,0 +1,418 @@
+{{if .HasOp "envelope" -}}
+;-------------------------------------------------------------------------------
+;   ENVELOPE opcode: pushes an ADSR envelope value on stack [0,1]
+;-------------------------------------------------------------------------------
+;   Mono:   push the envelope value (multiplied by gain) on stack
+;   Stereo: push the envelope value on stack twice
+;-------------------------------------------------------------------------------
+{{.Func "su_op_envelope" "Opcode"}}
+{{- if .StereoAndMono "envelope"}}
+    jnc     su_op_envelope_mono ; carry clear selects the mono variant
+{{- end}}
+{{- if .Stereo "envelope"}}
+    call    su_op_envelope_mono ; stereo: compute the envelope once...
+    fld     st0 ; ...and duplicate it for the other channel
+    ret
+su_op_envelope_mono:
+{{- end}}
+    mov     eax, dword [{{.INP}}-su_voice.inputs+su_voice.release] ; eax = su_voice.release of this voice
+    test    eax, eax                            ; if (eax == 0)
+    je      su_op_envelope_process              ;   goto process
+    mov     dword [{{.WRK}}], {{.InputNumber "envelope" "release"}}  ; [state]=RELEASE
+su_op_envelope_process:
+    mov     eax, dword [{{.WRK}}]  ; al=[state]
+    fld     dword [{{.WRK}}+4]       ; x=[level]
+    cmp     al, {{.InputNumber "envelope" "sustain"}}               ; if (al==SUSTAIN)
+    je      short su_op_envelope_leave2         ;   goto leave2
+su_op_envelope_attac:
+    cmp     al, {{.InputNumber "envelope" "attack"}}                 ; if (al!=ATTAC)
+    jne     short su_op_envelope_decay          ;   goto decay
+    {{.Call "su_nonlinear_map"}}                ; a x, where a=attack
+    faddp   st1, st0                            ; a+x
+    fld1                                        ; 1 a+x
+    fucomi  st1                                 ; if (a+x<=1) // is attack complete?
+    fcmovnb st0, st1                            ;   a+x a+x
+    jbe     short su_op_envelope_statechange    ; else goto statechange
+su_op_envelope_decay:
+    cmp     al, {{.InputNumber "envelope" "decay"}}                 ; if (al!=DECAY)
+    jne     short su_op_envelope_release        ;   goto release
+    {{.Call "su_nonlinear_map"}}                ; d x, where d=decay
+    fsubp   st1, st0                            ; x-d
+    fld     dword [{{.Input "envelope" "sustain"}}]    ; s x-d, where s=sustain
+    fucomi  st1                                 ; if (x-d>s) // is decay complete?
+    fcmovb  st0, st1                            ;   x-d x-d
+    jnc     short su_op_envelope_statechange    ; else goto statechange
+su_op_envelope_release:
+    cmp     al, {{.InputNumber "envelope" "release"}}               ; if (al!=RELEASE)
+    jne     short su_op_envelope_leave          ;   goto leave
+    {{.Call "su_nonlinear_map"}}                ; r x, where r=release
+    fsubp   st1, st0                            ; x-r
+    fldz                                        ; 0 x-r
+    fucomi  st1                                 ; if (x-r>0) // is release complete?
+    fcmovb  st0, st1                            ;   x-r x-r, then goto leave
+    jc      short su_op_envelope_leave
+su_op_envelope_statechange:
+    inc     dword [{{.WRK}}]       ; [state]++
+su_op_envelope_leave:
+    fstp    st1                                 ; x', where x' is the new value
+    fst     dword [{{.WRK}}+4]       ; [level]=x'
+su_op_envelope_leave2:
+    fmul    dword [{{.Input "envelope" "gain"}}]       ; [gain]*x'
+    ret
+{{end}}
+
+
+{{- if .HasOp "noise"}}
+;-------------------------------------------------------------------------------
+;   NOISE opcode: creates noise
+;-------------------------------------------------------------------------------
+;   Mono:   push a random value, waveshaped and multiplied by gain, on stack
+;   Stereo: push two (different) random values on stack
+;-------------------------------------------------------------------------------
+{{.Func "su_op_noise" "Opcode"}}
+    lea     {{.CX}},[{{.Stack "RandSeed"}}] ; CX = address of the random seed on the stack
+{{- if .StereoAndMono "noise"}}
+    jnc     su_op_noise_mono ; carry clear selects the mono variant
+{{- end}}
+{{- if .Stereo "noise"}}
+    call    su_op_noise_mono ; stereo: run the code below twice (call, then fall through)
+su_op_noise_mono:
+{{- end}}
+    imul    eax, [{{.CX}}],16007 ; eax = seed*16007
+    mov     [{{.CX}}],eax ; store it as the new seed
+    fild    dword [{{.CX}}] ; push the new seed as a signed 32-bit integer
+{{- .Prepare (.Int 2147483648)}}
+    fidiv   dword [{{.Use (.Int 2147483648)}}] ; divide by the integer constant 2147483648 = 65536*32768
+    fld     dword [{{.Input "noise" "shape"}}] ; shape noise
+    {{.Call "su_waveshaper"}}
+    fld     dword [{{.Input "noise" "gain"}}] ; gain shaped-noise
+    fmulp   st1, st0 ; gain*shaped-noise
+    ret
+{{end}}
+
+
+{{- if .HasOp "oscillator"}}
+;-------------------------------------------------------------------------------
+;   OSCILLAT opcode: oscillator, the heart of the synth
+;-------------------------------------------------------------------------------
+;   Mono:   push oscillator value on stack
+;   Stereo: push l r on stack, where l has opposite detune compared to r
+;-------------------------------------------------------------------------------
+{{.Func "su_op_oscillator" "Opcode"}}
+    lodsb                                   ; load the flags
+{{- if .Library}}
+    mov     {{.DI}}, [{{.Stack "SampleTable"}}]; we need to put this in a register, as the stereo & unisons screw the stack positions
+                                 ; ain't we lucky that {{.DI}} was unused throughout
+{{- end}}
+    fld     dword [{{.Input "oscillator" "detune"}}] ; e, where e is the detune [0,1]
+{{- .Prepare (.Float 0.5)}}
+    fsub    dword [{{.Use (.Float 0.5)}}]                 ; e-.5
+    fadd    st0, st0                        ; d=2*e-1, where d is the detune [-1,1]
+{{- if .StereoAndMono "oscillator"}}
+    jnc     su_op_oscillat_mono ; carry clear selects the mono variant
+{{- end}}
+{{- if .Stereo "oscillator"}}
+    fld     st0                             ; d d
+    call    su_op_oscillat_mono             ; r d
+    ;; WARNING: this is a bug. WRK should be nonvolatile, but we are changing it. It does not cause immediate problems but modulations will be off.
+    ;; Figure out how to do this; maybe $WRK should be volatile (pushed by the virtual machine)
+    add     {{.WRK}}, 4                     ; state vars: r1 l1 r2 l2 r3 l3 r4 l4, for the unison osc phases-
+    fxch                                    ; d r
+    fchs                                    ; -d r, negate the detune for second round
+su_op_oscillat_mono:
+{{- end}}
+{{- if .SupportsParamValueOtherThan "oscillator" "unison" 0}}
+    {{.PushRegs .AX "" .WRK "OscWRK" .AX "OscFlags"}}
+    fldz                            ; 0 d
+    fxch                            ; d a=0, "accumulated signal"
+su_op_oscillat_unison_loop:
+    fst     dword [{{.SP}}]             ; save the current detune, d. We could keep it in fpu stack but it was getting big.
+    call    su_op_oscillat_single   ; s a
+    faddp   st1, st0                ; a+=s
+    test    al, 3                   ; the two lowest bits of the flags count the unison oscillators still to do
+    je      su_op_oscillat_unison_out
+    ;; WARNING: this is a bug. WRK should be nonvolatile, but we are changing it. It does not cause immediate problems but modulations will be off.
+    ;; Figure out how to do this; maybe $WRK should be volatile (pushed by the virtual machine)
+    add     {{.WRK}}, 8
+    fld     dword [{{.Input "oscillator" "phase"}}] ; p s
+{{.Int 0x3DAAAAAA | .Prepare}}
+    fadd    dword [{{.Int 0x3DAAAAAA | .Use}}]  ; p+1/12 s, add some little phase offset to unison oscillators so they don't start in sync
+    fstp    dword [{{.Input "oscillator" "phase"}}] ; s    note that this changes the phase for second, possible stereo run. That's probably ok
+    fld     dword [{{.SP}}]             ; d s
+{{.Float 0.5 | .Prepare}}
+    fmul    dword [{{.Float 0.5 | .Use}}]         ; .5*d s    // halve the detune of each oscillator
+    fchs                            ; -.5*d s   // negate and halve the detune of each oscillator
+    dec     eax                     ; one unison oscillator less to go
+    jmp     short su_op_oscillat_unison_loop
+su_op_oscillat_unison_out:
+    {{.PopRegs .AX .WRK .AX}}
+    ret
+su_op_oscillat_single:
+{{- end}}
+    fld     dword [{{.Input "oscillator" "transpose"}}] ; t d
+{{- .Float 0.5 | .Prepare}}
+    fsub    dword [{{.Float 0.5 | .Use}}] ; t-.5 d
+{{- .Float 0.0078125 | .Prepare}}
+    fdiv    dword [{{.Float 0.0078125 | .Use}}] ; (t-.5)/0.0078125 = 128*(t-.5) d
+    faddp   st1 ; 128*(t-.5)+d
+    test    al, byte 0x08 ; flag 0x08 (treated as lfo below): do not add the note
+    jnz     su_op_oscillat_skipnote
+    fiadd   dword [{{.INP}}-su_voice.inputs+su_voice.note]   ; // st0 is note, st1 is t+d offset
+su_op_oscillat_skipnote:
+{{- .Int 0x3DAAAAAA | .Prepare}}
+    fmul    dword [{{.Int 0x3DAAAAAA | .Use}}] ; multiply by the float with bit pattern 0x3DAAAAAA (the "1/12" used above)
+    {{.Call "su_power"}}
+    test    al, byte 0x08 ; flag 0x08 set: use the lfo frequency scale
+    jz      short su_op_oscillat_normalize_note
+{{- .Float 0.000038 | .Prepare}}
+    fmul    dword [{{.Float 0.000038 | .Use}}]  ; // st0 is now frequency for lfo
+    jmp     short su_op_oscillat_normalized
+su_op_oscillat_normalize_note:
+{{- .Float 0.000092696138 | .Prepare}}
+    fmul    dword [{{.Float 0.000092696138 | .Use}}]   ; // st0 is now frequency
+su_op_oscillat_normalized:
+    fadd    dword [{{.WRK}}] ; add the frequency to the phase accumulator stored at [WRK]
+    fst     dword [{{.WRK}}] ; store the advanced phase
+    fadd    dword [{{.Input "oscillator" "phase"}}] ; add the phase input
+{{- if .SupportsParamValue "oscillator" "type" .Sample}}
+    test    al, byte 0x80 ; flag 0x80: sample oscillator
+    jz      short su_op_oscillat_not_sample
+    {{.Call "su_oscillat_sample"}}
+    jmp     su_op_oscillat_shaping ; skip the rest to avoid color phase normalization and colorloading
+su_op_oscillat_not_sample:
+{{- end}}
+    fld1 ; 1 p
+    fadd    st1, st0 ; 1 p+1
+    fxch ; p+1 1
+    fprem ; partial remainder of (p+1) divided by 1, on top of the 1
+    fstp    st1 ; drop the 1, leaving the wrapped phase
+    fld     dword [{{.Input "oscillator" "color"}}]               ; // c      p
+    ; every oscillator test included if needed
+{{- if .SupportsParamValue "oscillator" "type" .Sine}}
+    test    al, byte 0x40 ; flag 0x40: sine
+    jz      short su_op_oscillat_notsine
+    {{.Call "su_oscillat_sine"}}
+su_op_oscillat_notsine:
+{{- end}}
+{{- if .SupportsParamValue "oscillator" "type" .Trisaw}}
+    test    al, byte 0x20 ; flag 0x20: trisaw
+    jz      short su_op_oscillat_not_trisaw
+    {{.Call "su_oscillat_trisaw"}}
+su_op_oscillat_not_trisaw:
+{{- end}}
+{{- if .SupportsParamValue "oscillator" "type" .Pulse}}
+    test    al, byte 0x10 ; flag 0x10: pulse
+    jz      short su_op_oscillat_not_pulse
+    {{.Call "su_oscillat_pulse"}}
+su_op_oscillat_not_pulse:
+{{- end}}
+{{- if .SupportsParamValue "oscillator" "type" .Gate}}
+    test    al, byte 0x04 ; flag 0x04: gate
+    jz      short su_op_oscillat_not_gate
+    {{.Call "su_oscillat_gate"}}
+    jmp     su_op_oscillat_gain ; skip waveshaping as the shape parameter is reused for gateshigh
+su_op_oscillat_not_gate:
+{{- end}}
+su_op_oscillat_shaping:
+    ; finally, shape the oscillator and apply gain
+    fld     dword [{{.Input "oscillator" "shape"}}]
+    {{.Call "su_waveshaper"}}
+su_op_oscillat_gain:
+    fld     dword [{{.Input "oscillator" "gain"}}]
+    fmulp   st1, st0
+    ret
+{{end}}
+
+
+{{- if .HasCall "su_oscillat_pulse"}}
+{{.Func "su_oscillat_pulse"}} ; pulse wave: in c p (color, phase); out +1 if the jnc below is taken (c >= p), else -1
+    fucomi  st1                             ; // c      p
+    fld1
+    jnc     short su_oscillat_pulse_up      ; // +1     c       p
+    fchs                                    ; // -1     c       p
+su_oscillat_pulse_up:
+    fstp    st1                             ; // +-1    p
+    fstp    st1                             ; // +-1
+    ret
+{{end}}
+
+
+{{- if .HasCall "su_oscillat_trisaw"}}
+{{.Func "su_oscillat_trisaw"}} ; triangle/saw: in c p (color, phase); out 2*p/c-1 if c >= p, else 2*(1-p)/(1-c)-1
+    fucomi  st1                             ; // c      p
+    jnc     short su_oscillat_trisaw_up     ; // c >= p: rising part, use p and c as they are
+    fld1                                    ; // 1      c       p
+    fsubr   st2, st0                        ; // 1      c       1-p
+    fsubrp  st1, st0                        ; // 1-c    1-p
+su_oscillat_trisaw_up:
+    fdivp   st1, st0                        ; // tp'/tc
+    fadd    st0                             ; // 2*''
+    fld1                                    ; // 1      2*''
+    fsubp   st1, st0                        ; // 2*''-1
+    ret
+{{end}}
+
+
+{{- if .HasCall "su_oscillat_sine"}}
+{{.Func "su_oscillat_sine"}} ; sine: in c p (color, phase); out sin(2*pi*p/c) if c >= p, else 0
+    fucomi  st1                             ; // c      p
+    jnc     short su_oscillat_sine_do       ; // c >= p: inside the sine part
+    fstp    st1                             ; // c
+    fsub    st0, st0                        ; // 0
+    ret
+su_oscillat_sine_do:
+    fdivp   st1, st0                        ; // p/c
+    fldpi                                   ; // pi     p/c
+    fadd    st0                             ; // 2*pi   p/c
+    fmulp   st1, st0                        ; // 2*pi*p/c
+    fsin                                    ; // sin(2*pi*p/c)
+    ret
+{{end}}
+
+
+{{- if .HasCall "su_oscillat_gate"}}
+{{.Func "su_oscillat_gate"}} ; gate: in c p; out a low-passed 0/1 chosen by bit int(16*p)&15 of the gate word
+    fxch                                    ; p c
+    fstp    st1                             ; p
+{{- .Float 16.0 | .Prepare | indent 4}}
+    fmul    dword [{{.Float 16.0 | .Use}}]                  ; 16*p
+    push    {{.AX}}                             ; save AX, restored by the last pop below
+    push    {{.AX}}                             ; reserve a stack slot for the integer conversion
+    fistp   dword [{{.SP}}]                     ; s=int(16*p), stack empty
+    fld1                                    ; 1
+    pop     {{.AX}}
+    and     al, 0xf                         ; ax=int(16*p) & 15, stack: 1
+    bt      word [{{.VAL}}-4],ax                 ; if bit ax of the gate word is set
+    jc      su_oscillat_gate_bit                ;   goto gate_bit
+    fsub    st0, st0                        ; stack: 0
+su_oscillat_gate_bit:                           ; stack: 0/1, let's call it x
+    fld     dword [{{.WRK}}+16] ; g x, g is gatestate, x is the input to this filter 0/1
+    fsub    st1                             ; g-x x
+{{- .Float 0.99609375 | .Prepare | indent 4}}
+    fmul    dword [{{.Float 0.99609375 | .Use}}]            ; c(g-x) x
+    faddp   st1, st0                        ; x+c(g-x)
+    fst     dword [{{.WRK}}+16]; g'=x+c(g-x) NOTE THAT UNISON 2 & UNISON 3 ALSO USE {{.WRK}}+16, so gate and unison 2 & 3 don't work. Probably should delete that low pass altogether
+    pop     {{.AX}}                             ; Another way to see this (c~0.996)
+    ret                                     ; g'=cg+(1-c)x
+    ; This is a low-pass to smooth the gate transitions
+{{end}}
+
+
+{{- if .HasCall "su_oscillat_sample"}}
+{{.Func "su_oscillat_sample"}}
+    {{- .PushRegs .AX "SampleAx" .DX "SampleDx" .CX "SampleCx" .BX "SampleBx" | indent 4}}                              ; edx must be saved, eax & ecx if this is stereo osc
+    push    {{.AX}}                                             ; reserve a stack slot for the integer conversion below
+    mov     al, byte [{{.VAL}}-4]                                ; reuse "color" as the sample number
+{{- if .Library}}
+    lea     {{.DI}}, [{{.DI}} + {{.AX}}*8]                           ; edi points now to the sample table entry; NOTE(review): DI is overwritten here - confirm it is restored or reloaded before a second stereo/unison pass
+{{- else}}
+{{- .Prepare "su_sample_offsets" | indent 4}}
+    lea     {{.DI}}, [{{.Use "su_sample_offsets"}} + {{.AX}}*8]; edi points now to the sample table entry
+{{- end}}
+{{- .Float 84.28074964676522 | .Prepare | indent 4}}
+    fmul    dword [{{.Float 84.28074964676522 | .Use}}]                  ; p*r
+    fistp   dword [{{.SP}}]                                     ; store int(p*r) into the slot pushed above
+    pop     {{.DX}}                                             ; edx is now the sample number
+    movzx   ebx, word [{{.DI}} + 4]    ; ebx = loopstart
+    sub     edx, ebx                                        ; if sample number < loop start
+    jl      su_oscillat_sample_not_looping                  ;   then we're not looping yet
+    mov     eax, edx                                        ; eax = sample number - loopstart
+    movzx   ecx, word [{{.DI}} + 6]   ; ecx is now the loop length
+    xor     edx, edx                                        ; div wants edx to be empty
+    div     ecx                                             ; edx is now the remainder
+su_oscillat_sample_not_looping:
+    add     edx, ebx                                        ; sampleno += loopstart
+    add     edx, dword [{{.DI}}]                            ; sampleno += start of this sample
+{{- .Prepare "su_sample_table" | indent 4}}
+    fild    word [{{.Use "su_sample_table"}} + {{.DX}}*2]   ; load the 16-bit sample as an integer
+{{- .Float 32767.0 | .Prepare | indent 4}}
+    fdiv    dword [{{.Float 32767.0 | .Use}}]               ; sample/32767
+    {{- .PopRegs .AX .DX .CX .BX | indent 4}}
+    ret
+{{end}}
+
+
+{{- if .HasOp "loadval"}}
+;-------------------------------------------------------------------------------
+;   LOADVAL opcode: pushes a value derived from the "value" input on the stack
+;-------------------------------------------------------------------------------
+{{- if .Mono "loadval"}}
+;   Mono: push 2*v-1 on stack, where v is the input to port "value"
+{{- end}}
+{{- if .Stereo "loadval"}}
+;   Stereo: push 2*v-1 twice on stack
+{{- end}}
+;-------------------------------------------------------------------------------
+{{.Func "su_op_loadval" "Opcode"}}
+    {{- if .StereoAndMono "loadval" }}
+    jnc     su_op_loadval_mono ; carry clear selects the mono variant
+    {{- end}}
+    {{- if .Stereo "loadval" }}
+    call    su_op_loadval_mono ; stereo: run the code below twice (call, then fall through)
+su_op_loadval_mono:
+    {{- end }}
+    fld     dword [{{.Input "loadval" "value"}}] ; v
+{{- .Float 0.5 | .Prepare | indent 4}}
+    fsub    dword [{{.Float 0.5 | .Use}}] ; v-.5
+    fadd    st0                                 ; 2*v-1
+    ret
+{{end}}
+
+
+{{- if .HasOp "receive"}}
+;-------------------------------------------------------------------------------
+;   RECEIVE opcode: pushes the value(s) of the first port(s) of this unit and clears them
+;-------------------------------------------------------------------------------
+{{- if .Mono "receive"}}
+;   Mono:   push l on stack, where l is the left channel received
+{{- end}}
+{{- if .Stereo "receive"}}
+;   Stereo: push l r on stack
+{{- end}}
+;-------------------------------------------------------------------------------
+{{.Func "su_op_receive" "Opcode"}}
+    lea     {{.DI}}, [{{.WRK}}+su_unit.ports] ; DI = address of the ports of this unit
+{{- if .StereoAndMono "receive"}}
+    jnc     su_op_receive_mono ; carry clear selects the mono variant
+{{- end}}
+{{- if .Stereo "receive"}}
+    xor     ecx,ecx ; ecx = 0, used to clear the port
+    fld     dword [{{.DI}}+4] ; r, the second port
+    mov     dword [{{.DI}}+4],ecx ; clear the second port
+{{- end}}
+{{- if .StereoAndMono "receive"}}
+su_op_receive_mono:
+    xor     ecx,ecx ; needed again: the jnc above skips the first xor
+{{- end}}
+    fld     dword [{{.DI}}] ; l (r), the first port
+    mov     dword [{{.DI}}],ecx ; clear the first port; NOTE(review): a mono-only build emits no xor ecx,ecx above, so this presumably relies on ecx already being 0 - confirm
+    ret
+{{end}}
+
+
+{{- if .HasOp "in"}}
+;-------------------------------------------------------------------------------
+;   IN opcode: inputs and clears a global port
+;-------------------------------------------------------------------------------
+;   Mono: push the left channel of a global port (out or aux)
+;   Stereo: also push the right channel (stack in l r order)
+;-------------------------------------------------------------------------------
+{{.Func "su_op_in" "Opcode"}}
+    lodsb ; al = next byte of the value stream, used below as a dword index counted from .left / .right
+    mov     {{.DI}}, [{{.Stack "Synth"}}] ; DI points to the synth object
+{{- if .StereoAndMono "in"}}
+    jnc     su_op_in_mono ; carry clear selects the mono variant
+{{- end}}
+{{- if .Stereo "in"}}
+    xor     ecx, ecx ; we cannot xor before jnc, so we have to do it mono & stereo. LAHF / SAHF could do it, but is the same number of bytes with more entropy
+    fld     dword [{{.DI}} + su_synthworkspace.right + {{.AX}}*4] ; r
+    mov     dword [{{.DI}} + su_synthworkspace.right + {{.AX}}*4], ecx ; clear the right channel
+{{- end}}
+{{- if .StereoAndMono "in"}}
+su_op_in_mono:
+    xor     ecx, ecx ; needed again: the jnc above skips the first xor
+{{- end}}
+    fld     dword [{{.DI}} + su_synthworkspace.left + {{.AX}}*4] ; l (r)
+    mov     dword [{{.DI}} + su_synthworkspace.left + {{.AX}}*4], ecx ; clear the left channel; NOTE(review): a mono-only build emits no xor ecx,ecx above, so this presumably relies on ecx already being 0 - confirm
+    ret
+{{end}}
--- a/templates/amd64-386/structs.asm
+++ b/templates/amd64-386/structs.asm
@@ -0,0 +1,53 @@
+;-------------------------------------------------------------------------------
+;   unit struct: 16 dwords (64 bytes) of working memory of one unit
+;-------------------------------------------------------------------------------
+struc su_unit
+    .state      resd    8 ; 8 dwords of state
+    .ports      resd    8 ; 8 dwords of ports; su_op_receive reads and clears the first two
+    .size:
+endstruc
+
+;-------------------------------------------------------------------------------
+;   voice struct: 16 header dwords followed by the workspaces of 63 units
+;-------------------------------------------------------------------------------
+struc su_voice
+    .note       resd    1 ; integer, added to the oscillator pitch with fiadd
+    .release    resd    1 ; nonzero makes su_op_envelope switch to the release state
+    .inputs     resd    8 ; the opcodes address .note and .release relative to this field
+    .reserved   resd    6 ; this is done so that the whole voice is 2^n long, see polyphonic player
+    .workspace  resb    63 * su_unit.size ; header (64 bytes) + 63 units of 64 bytes = 4096 bytes
+    .size:
+endstruc
+
+;-------------------------------------------------------------------------------
+;   synthworkspace struct: the global output ports followed by all the voices
+;-------------------------------------------------------------------------------
+struc su_synthworkspace
+    .curvoices  resb    32      ; these are used by the multitrack player to store which voice is playing on which track
+    .left       resd    1       ; main left output
+    .right      resd    1       ; main right output, must directly follow .left (stereo code adds 4 to the address)
+    .aux        resd    6       ; 3 auxiliary signals, each a left/right pair of dwords
+    .voices     resb    32 * su_voice.size ; 32 voices
+    .size:
+endstruc
+
+;-------------------------------------------------------------------------------
+;   su_delayline_wrk struct: three state dwords followed by a 65536-dword buffer
+;-------------------------------------------------------------------------------
+struc   su_delayline_wrk
+    .dcin       resd    1
+    .dcout      resd    1
+    .filtstate  resd    1
+    .buffer     resd    65536 ; 65536 dwords = 256 KiB
+    .size:
+endstruc
+
+;-------------------------------------------------------------------------------
+;   su_sample_offset struct: same layout as the dd/dw/dw entries of su_sample_offsets
+;-------------------------------------------------------------------------------
+struc   su_sample_offset  ; length conveniently 8 bytes, so easy to index
+    .start      resd    1 ; dword, added to the sample index in su_oscillat_sample
+    .loopstart  resw    1 ; word, read at offset 4
+    .looplength resw    1 ; word, read at offset 6
+    .size:
+endstruc