diff --git a/CMakeSettings.json b/CMakeSettings.json
index 552c223..801a729 100644
--- a/CMakeSettings.json
+++ b/CMakeSettings.json
@@ -23,6 +23,30 @@
       "ctestCommandArgs": "",
       "inheritEnvironments": [ "msvc_x86" ],
       "variables": []
+    },
+    {
+      "name": "x64-Debug",
+      "generator": "Ninja",
+      "configurationType": "Debug",
+      "buildRoot": "${projectDir}\\out\\build\\${name}",
+      "installRoot": "${projectDir}\\out\\install\\${name}",
+      "cmakeCommandArgs": "",
+      "buildCommandArgs": "-v",
+      "ctestCommandArgs": "",
+      "inheritEnvironments": [ "msvc_x64_x64" ],
+      "variables": []
+    },
+    {
+      "name": "x64-Release",
+      "generator": "Ninja",
+      "configurationType": "RelWithDebInfo",
+      "buildRoot": "${projectDir}\\out\\build\\${name}",
+      "installRoot": "${projectDir}\\out\\install\\${name}",
+      "cmakeCommandArgs": "",
+      "buildCommandArgs": "-v",
+      "ctestCommandArgs": "",
+      "inheritEnvironments": [ "msvc_x64_x64" ],
+      "variables": []
     }
   ]
 }
\ No newline at end of file
diff --git a/README.md b/README.md
index 3c99947..644186c 100644
--- a/README.md
+++ b/README.md
@@ -77,6 +77,9 @@ New features since fork
     detuned and added up to together. Great for trance leads (supersaw). Unison
     of up to 4, or 8 if you make stereo unison oscillator and add up both left 
     and right channels. See [this example](tests/test_oscillat_unison.asm).
+  - **Supports 32 and 64 bit builds**. The 64-bit version is done with minimal
+    changes to get it work, mainly for the future prospect of running the MIDI
+    instrument in 64-bit mode. All the tests are passing so it seems to work.
 
 Future goals
 ------------
@@ -90,7 +93,6 @@ Future goals
     case the signal entering skip and the signal leaving out are both close to
     zero.
   - **Even more opcodes**. Maybe an equalizer? DC-offset removal?
-  - **Support for 64-bit targets**.
   - **Browser-based GUI and MIDI instrument**. Modern browsers support WebMIDI,
      WebAudio and, most importantly, they are cross-platform and come installed
      on pretty much any computer. The only thing needed is to be able to
diff --git a/src/opcodes/arithmetic.asm b/src/opcodes/arithmetic.asm
index b266367..3816caa 100644
--- a/src/opcodes/arithmetic.asm
+++ b/src/opcodes/arithmetic.asm
@@ -70,8 +70,8 @@ EXPORT MANGLE_FUNC(su_op_loadnote,0)
     call    su_op_loadnote_mono
 su_op_loadnote_mono:
 %endif
-    fild    dword [ecx+su_unit.size-su_voice.workspace+su_voice.note]
-    fmul    dword [c_i128]
+    fild    dword [_CX+su_unit.size-su_voice.workspace+su_voice.note]
+    apply fmul dword, c_i128
     ret
 
 %endif
diff --git a/src/opcodes/effects.asm b/src/opcodes/effects.asm
index ae0d0ac..db56238 100644
--- a/src/opcodes/effects.asm
+++ b/src/opcodes/effects.asm
@@ -14,7 +14,7 @@ EXPORT MANGLE_FUNC(su_op_distort,0)
         call su_effects_stereohelper
         %define INCLUDE_EFFECTS_STEREOHELPER
     %endif
-    fld     dword [edx+su_distort_ports.drive]
+    fld     dword [INP+su_distort_ports.drive]
     %define SU_INCLUDE_WAVESHAPER
     ; flow into waveshaper
 %endif
@@ -25,7 +25,7 @@ su_waveshaper:
     call    su_clip
     fxch                                    ; a x' (from now on just called x)
     fld     st0                             ; a a x
-    fsub    dword [c_0_5]                   ; a-.5 a x
+    apply fsub dword,c_0_5                  ; a-.5 a x
     fadd    st0                             ; 2*a-1 a x
     fld     st2                             ; x 2*a-1 a x
     fabs                                    ; abs(x) 2*a-1 a x
@@ -53,17 +53,17 @@ EXPORT MANGLE_FUNC(su_op_hold,0)
         call    su_effects_stereohelper
         %define INCLUDE_EFFECTS_STEREOHELPER
     %endif
-    fld     dword [edx+su_hold_ports.freq]    ; f x
+    fld     dword [INP+su_hold_ports.freq]    ; f x
     fmul    st0, st0                        ; f^2 x
     fchs                                    ; -f^2 x
     fadd    dword [WRK+su_hold_wrk.phase]   ; p-f^2 x
     fst     dword [WRK+su_hold_wrk.phase]   ; p <- p-f^2
     fldz                                    ; 0 p x
     fucomip st1                             ; p x
-    fstp    dword [esp-4]                   ; t=p, x
+    fstp    dword [_SP-4]                   ; t=p, x
     jc      short su_op_hold_holding        ; if (0 < p) goto holding
     fld1                                    ; 1 x
-    fadd    dword [esp-4]                   ; 1+t x
+    fadd    dword [_SP-4]                   ; 1+t x
     fstp    dword [WRK+su_hold_wrk.phase]   ; x
     fst     dword [WRK+su_hold_wrk.holdval] ; save holded value
     ret                                     ; x
@@ -85,10 +85,10 @@ EXPORT MANGLE_FUNC(su_op_crush,0)
     %ifdef INCLUDE_STEREO_CRUSH    
         call    su_effects_stereohelper
         %define INCLUDE_EFFECTS_STEREOHELPER
-    %endif    
-    fdiv    dword [edx+su_crush_ports.resolution]
+    %endif
+    fdiv    dword [INP+su_crush_ports.resolution]
     frndint
-    fmul    dword [edx+su_crush_ports.resolution]
+    fmul    dword [INP+su_crush_ports.resolution]
     ret
 
 %endif ; CRUSH_ID > -1
@@ -101,15 +101,15 @@ EXPORT MANGLE_FUNC(su_op_crush,0)
 SECT_TEXT(sugain)
     %ifdef INCLUDE_STEREO_GAIN
         EXPORT MANGLE_FUNC(su_op_gain,0)
-            fld     dword [edx+su_gain_ports.gain] ; g l (r)
+            fld     dword [INP+su_gain_ports.gain] ; g l (r)
             jnc     su_op_gain_mono
-            fmul    st2, st0                             ; g l r/g    
+            fmul    st2, st0                             ; g l r/g
         su_op_gain_mono:
             fmulp   st1, st0                             ; l/g (r/)
             ret
     %else
         EXPORT MANGLE_FUNC(su_op_gain,0)
-            fmul    dword [edx+su_gain_ports.gain]    
+            fmul    dword [INP+su_gain_ports.gain]
             ret
     %endif
 %endif ; GAIN_ID > -1
@@ -122,7 +122,7 @@ SECT_TEXT(sugain)
 SECT_TEXT(suingain)
     %ifdef INCLUDE_STEREO_INVGAIN
         EXPORT MANGLE_FUNC(su_op_invgain,0)
-            fld     dword [edx+su_invgain_ports.invgain] ; g l (r)
+            fld     dword [INP+su_invgain_ports.invgain] ; g l (r)
             jnc     su_op_invgain_mono
             fdiv    st2, st0                             ; g l r/g    
         su_op_invgain_mono:
@@ -130,7 +130,7 @@ SECT_TEXT(suingain)
             ret
     %else
         EXPORT MANGLE_FUNC(su_op_invgain,0)
-            fdiv    dword [edx+su_invgain_ports.invgain]    
+            fdiv    dword [INP+su_invgain_ports.invgain]
             ret
     %endif
 %endif ; INVGAIN_ID > -1
@@ -150,14 +150,14 @@ SECT_TEXT(sufilter)
 
 EXPORT MANGLE_FUNC(su_op_filter,0)
     lodsb ; load the flags to al
-    %ifdef INCLUDE_STEREO_FILTER    
+    %ifdef INCLUDE_STEREO_FILTER
         call    su_effects_stereohelper
         %define INCLUDE_EFFECTS_STEREOHELPER
     %endif
-    fld     dword [edx+su_filter_ports.res] ; r x
-    fld     dword [edx+su_filter_ports.freq]; f r x
+    fld     dword [INP+su_filter_ports.res] ; r x
+    fld     dword [INP+su_filter_ports.freq]; f r x
     fmul    st0, st0                        ; f2 x (square the input so we never get negative and also have a smoother behaviour in the lower frequencies)
-    fst     dword [esp-4]                   ; f2 r x
+    fst     dword [_SP-4]                   ; f2 r x
     fmul    dword [WRK+su_filter_wrk.band]  ; f2*b r x
     fadd    dword [WRK+su_filter_wrk.low]   ; f2*b+l r x
     fst     dword [WRK+su_filter_wrk.low]   ; l'=f2*b+l r x
@@ -165,7 +165,7 @@ EXPORT MANGLE_FUNC(su_op_filter,0)
     fmul    dword [WRK+su_filter_wrk.band]  ; r*b x-l'
     fsubp   st1, st0                        ; x-l'-r*b
     fst     dword [WRK+su_filter_wrk.high]  ; h'=x-l'-r*b
-    fmul    dword [esp-4]                   ; f2*h'
+    fmul    dword [_SP-4]                   ; f2*h'
     fadd    dword [WRK+su_filter_wrk.band]  ; f2*h'+b
     fstp    dword [WRK+su_filter_wrk.band]  ; b'=f2*h'+b
     fldz                                    ; 0
@@ -212,9 +212,9 @@ SECT_TEXT(suclip)
 
 %if CLIP_ID > -1
     EXPORT MANGLE_FUNC(su_op_clip,0)
-    %ifdef INCLUDE_STEREO_CLIP        
+    %ifdef INCLUDE_STEREO_CLIP
         call    su_effects_stereohelper
-        %define INCLUDE_EFFECTS_STEREOHELPER    
+        %define INCLUDE_EFFECTS_STEREOHELPER
     %endif
     %define SU_INCLUDE_CLIP
     ; flow into su_doclip
@@ -255,7 +255,7 @@ EXPORT MANGLE_FUNC(su_op_pan,0)
     jc      su_op_pan_do    ; this time, if this is mono op...
     fld     st0             ;   ...we duplicate the mono into stereo first
 su_op_pan_do:
-    fld     dword [edx+su_pan_ports.panning]    ; p l r
+    fld     dword [INP+su_pan_ports.panning]    ; p l r
     fld1                                        ; 1 p l r
     fsub    st1                                 ; 1-p p l r
     fmulp   st2                                 ; p (1-p)*l r
@@ -265,7 +265,7 @@ su_op_pan_do:
 %else ; ifndef INCLUDE_STEREO_PAN
 
 EXPORT MANGLE_FUNC(su_op_pan,0)
-    fld     dword [edx+su_pan_ports.panning]    ; p s
+    fld     dword [INP+su_pan_ports.panning]    ; p s
     fmul    st1                                 ; p*s s
     fsub    st1, st0                            ; p*s s-p*s
                                                 ; Equal to
@@ -288,7 +288,7 @@ su_effects_stereohelper:
     jnc     su_effects_stereohelper_mono ; carry is still the stereo bit
     add     WRK, 16
     fxch                  ; r l
-    call    dword [esp]   ; call whoever called me...
+    call    [_SP]         ; call whoever called me...
     fxch                  ; l r
     sub     WRK, 16       ; move WRK back to where it was
 su_effects_stereohelper_mono:
@@ -324,68 +324,68 @@ EXPORT MANGLE_FUNC(su_op_delay,0)
     add     edi, eax ; the second delay is done with the delay time index added by count
 su_op_delay_mono:
 %endif
-    pushad
+    push_registers  _AX, _CX, _BX, WRK, _SI, _DI
     mov     ebx, edi; ugly register juggling, refactor
 %ifdef DELAY_NOTE_SYNC
     test    ebx, ebx ; note s
     jne     su_op_delay_skipnotesync
     fld1
-    fild    dword [ecx+su_unit.size-su_voice.workspace+su_voice.note]
-    fmul    dword [c_i12]
+    fild    dword [_CX+su_unit.size-su_voice.workspace+su_voice.note]
+    apply fmul dword, c_i12
     call    MANGLE_FUNC(su_power,0)
-    fmul    dword [c_freq_normalize]  ; // normalize
+    apply fmul dword, c_freq_normalize ; // normalize
     fdivp   st1, st0                ; // invert to get numer of samples
-    fistp   word [MANGLE_DATA(su_delay_times)]  ; store current comb size
+    apply fistp word, MANGLE_DATA(su_delay_times)  ; store current comb size
 su_op_delay_skipnotesync:
 %endif
 kmDLL_func_process:
     mov     ecx, eax                            ;// ecx is the number of parallel delays
-    mov     WRK, dword [MANGLE_DATA(su_delay_buffer_ofs)] ;// ebp is current delay
+    apply {mov WRK, PTRWORD},MANGLE_DATA(su_delay_buffer_ofs) ;// ebp is current delay
     fld     st0                                 ; x x
-    fmul    dword [edx+su_delay_ports.dry]      ; dr*x x
+    fmul    dword [INP+su_delay_ports.dry]      ; dr*x x
     fxch                                        ; x dr*x
-    fmul    dword [edx+su_delay_ports.pregain]  ; p*x dr*x
-    fmul    dword [edx+su_delay_ports.pregain]  ; p^2*x dr*x
+    fmul    dword [INP+su_delay_ports.pregain]  ; p*x dr*x
+    fmul    dword [INP+su_delay_ports.pregain]  ; p^2*x dr*x
 
 kmDLL_func_loop:
         mov     edi, dword [WRK + su_delayline_wrk.time]
         inc     edi
         and     edi, MAX_DELAY-1
         mov     dword [WRK + su_delayline_wrk.time],edi
-        movzx   esi, word [MANGLE_DATA(su_delay_times)+ebx*2] ; esi = comb size from the delay times table
+        apply {movzx esi, word},MANGLE_DATA(su_delay_times),_BX*2,{} ; esi = comb size from the delay times table
         mov     eax, edi
         sub     eax, esi
         and     eax, MAX_DELAY-1
-        fld     dword [WRK+eax*4+su_delayline_wrk.buffer] ; s p^2*x dr*x, where s is the sample from delay buffer
+        apply fld dword, su_delayline_wrk.buffer, WRK, _AX*4,{} ; s p^2*x dr*x, where s is the sample from delay buffer
         ;// add comb output to current output
         fadd    st2, st0                            ; s p^2*x dr*x+s
         fld1                                        ; 1 s p^2*x dr*x+s
-        fsub    dword [edx+su_delay_ports.damp]     ; 1-da s p^2*x dr*x+s
+        fsub    dword [INP+su_delay_ports.damp]     ; 1-da s p^2*x dr*x+s
         fmulp   st1, st0                            ; s*(1-da) p^2*x dr*x+s
-        fld     dword [edx+su_delay_ports.damp]     ; da s*(1-da) p^2*x dr*x+s
+        fld     dword [INP+su_delay_ports.damp]     ; da s*(1-da) p^2*x dr*x+s
         fmul    dword [WRK+su_delayline_wrk.filtstate]      ; o*da s*(1-da) p^2*x dr*x+s, where o is stored
         faddp   st1, st0                            ; o*da+s*(1-da) p^2*x dr*x+s
         fst     dword [WRK+su_delayline_wrk.filtstate]      ; o'=o*da+s*(1-da), o' p^2*x dr*x+s
-        fmul    dword [edx+su_delay_ports.feedback] ; f*o' p^2*x dr*x+s
+        fmul    dword [INP+su_delay_ports.feedback] ; f*o' p^2*x dr*x+s
         fadd    st0, st1                            ; f*o'+p^2*x p^2*x dr*x+s
-        fstp    dword [WRK+edi*4+su_delayline_wrk.buffer]; save f*o'+p^2*x to delay buffer
+        fstp    dword [WRK+_DI*4+su_delayline_wrk.buffer]; save f*o'+p^2*x to delay buffer
         inc     ebx                                 ;// go to next delay lenkmh index
         add     WRK, su_delayline_wrk.size               ;// go to next delay
-        mov     dword [MANGLE_DATA(su_delay_buffer_ofs)], WRK ;// store next delay offset
+        apply mov PTRWORD, MANGLE_DATA(su_delay_buffer_ofs),{, WRK} ;// store next delay offset
         loopne  kmDLL_func_loop
     fstp    st0                                 ; dr*x+s1+s2+s3+...
     ; DC-filtering
     sub     WRK, su_delayline_wrk.size ; the reason to use the last su_delayline_wrk instead of su_delay_wrk is that su_delay_wrk is wiped by retriggering
     fld     dword [WRK+su_delayline_wrk.dcout]  ; o s
-    fmul    dword [c_dc_const]              ; c*o s
+    apply fmul dword, c_dc_const                      ; c*o s
     fsub    dword [WRK+su_delayline_wrk.dcin]   ; c*o-i s
     fxch                                    ; s c*o-i
     fst     dword [WRK+su_delayline_wrk.dcin]   ; i'=s, s c*o-i
     faddp   st1                             ; s+c*o-i
-    fadd    dword [c_0_5]                       ;// add and sub small offset to prevent denormalization
-    fsub    dword [c_0_5]
+    apply fadd dword, c_0_5                       ;// add and sub small offset to prevent denormalization
+    apply fsub dword, c_0_5
     fst     dword [WRK+su_delayline_wrk.dcout]  ; o'=s+c*o-i
-    popad
+    pop_registers  _AX, _CX, _BX, WRK, _SI, _DI
     ret
 
 ;-------------------------------------------------------------------------------
@@ -394,7 +394,8 @@ kmDLL_func_loop:
 SECT_BSS(sudelbuf)
 
 EXPORT MANGLE_DATA(su_delay_buffer_ofs)
-                        resd    1
+                        RESPTR  1
+
 EXPORT MANGLE_DATA(su_delay_buffer)
                         resb   NUM_DELAY_LINES*su_delayline_wrk.size
 
@@ -443,7 +444,7 @@ su_op_compressor_releasing:
     fmulp   st2, st0                            ; l c*(x^2-l) x
     faddp   st1, st0                            ; l+c*(x^2-l) x
     fst     dword [WRK+su_compres_wrk.level] ; l'=l+c*(x^2-l), l' x
-    fld     dword [edx+su_compres_ports.threshold] ; t l' x
+    fld     dword [INP+su_compres_ports.threshold] ; t l' x
     fmul    st0, st0                            ; t*t
     fucomi  st0, st1                            ; if threshold < l'
     jb      su_op_compressor_compress           ;    then we actually do compression
@@ -453,8 +454,8 @@ su_op_compressor_releasing:
     ret                                         ; return unity gain when we are below threshold
 su_op_compressor_compress:                      ; l' x
     fdivrp  st1, st0                            ; t*t/l' x
-    fld     dword [edx+su_compres_ports.ratio]  ; r t*t/l' x
-    fmul    dword [c_0_5]                       ; p=r/2 t*t/l' x
+    fld     dword [INP+su_compres_ports.ratio]  ; r t*t/l' x
+    apply fmul dword, c_0_5                           ; p=r/2 t*t/l' x
     fxch                                        ; t*t/l' p x
     fyl2x                                       ; p*log2(t*t/l') x
     jmp     MANGLE_FUNC(su_power,0)             ; 2^(p*log2(t*t/l')) x
diff --git a/src/opcodes/flowcontrol.asm b/src/opcodes/flowcontrol.asm
index c3c301e..d0ec213 100644
--- a/src/opcodes/flowcontrol.asm
+++ b/src/opcodes/flowcontrol.asm
@@ -21,31 +21,29 @@ SECT_TEXT(suopadvn)
 %ifdef INCLUDE_POLYPHONY
 
 EXPORT MANGLE_FUNC(su_op_advance,0)     ; Stack: addr voice wrkptr valptr comptr
-    mov     WRK, dword [esp+8]          ; WRK = wrkptr
+    mov     WRK, [_SP+PTRSIZE*2]          ; WRK = wrkptr
     add     WRK, su_voice.size          ; move to next voice
-    mov     dword [esp+8], WRK          ; update stack
-    mov     ecx, dword [esp+4]          ; ecx = voice
-    bt      dword [su_polyphony_bitmask],ecx ; if voice bit of su_polyphonism not set
+    mov     [_SP+PTRSIZE*2], WRK        ; update stack
+    mov     ecx, [_SP+PTRSIZE]          ; ecx = voice
+    apply bt dword,su_polyphony_bitmask,{,ecx} ; if voice bit of su_polyphonism not set
     jnc     su_op_advance_next_instrument ; goto next_instrument
-    mov     VAL, dword [esp+12]         ; rollback to where we were earlier
-    mov     COM, dword [esp+16]
+    mov     VAL, PTRWORD [_SP+PTRSIZE*3]         ; rollback to where we were earlier
+    mov     COM, PTRWORD [_SP+PTRSIZE*4]
     jmp     short su_op_advance_finish
 su_op_advance_next_instrument:
-    mov     dword [esp+12], VAL         ; save current VAL as a checkpoint
-    mov     dword [esp+16], COM         ; save current COM as a checkpoint
+    mov     PTRWORD [_SP+PTRSIZE*3], VAL         ; save current VAL as a checkpoint
+    mov     PTRWORD [_SP+PTRSIZE*4], COM         ; save current COM as a checkpoint
 su_op_advance_finish:
-    inc     dword [esp+4]
+    inc     PTRWORD [_SP+PTRSIZE]
     ret
 
 %else
-
-EXPORT MANGLE_FUNC(su_op_advance,0)     ; Stack: addr voice wrkptr valptr comptr
-    mov     WRK, dword [esp+8]          ; WRK = wrkptr
-    add     WRK, su_voice.size          ; move to next voice
-    mov     dword [esp+8], WRK          ; update stack
-    inc     dword [esp+4]               ; voice++
-    ret
-
+    EXPORT MANGLE_FUNC(su_op_advance,0)         ; Stack: addr voice wrkptr valptr comptr
+        mov     WRK, PTRWORD [_SP+PTRSIZE*2]    ; WRK = wrkptr
+        add     WRK, su_voice.size              ; move to next voice
+        mov     PTRWORD [_SP+PTRSIZE*2], WRK      ; update stack
+        inc     PTRWORD [_SP+PTRSIZE]             ; voice++
+        ret
 %endif
 
 ;-------------------------------------------------------------------------------
@@ -56,18 +54,18 @@ EXPORT MANGLE_FUNC(su_op_advance,0)     ; Stack: addr voice wrkptr valptr comptr
 SECT_TEXT(suspeed)
 
 EXPORT MANGLE_FUNC(su_op_speed,0)
-    fsub    dword [c_0_5]                ; s-.5
+    apply fsub dword, c_0_5              ; s-.5
     fadd    st0, st0                     ; 2*s-1
-    fmul    dword [c_bpmscale]           ; (2*s-1)*64/24, let's call this p from now on
+    apply fmul dword, c_bpmscale         ; (2*s-1)*64/24, let's call this p from now on
     call    MANGLE_FUNC(su_power,0)      ; 2^p, this is how many ticks we should be taking
     fld1                                 ; 1 2^p
     fsubp   st1, st0                     ; 2^p-1, the player is advancing 1 tick by its own
     fadd    dword [WRK+su_speed_wrk.remainder] ; t+2^p-1, t is the remainder from previous rounds as ticks have to be rounded to 1
-    push    eax
-    fist    dword [esp]                  ; Main stack: k=int(t+2^p-1)
-    fisub   dword [esp]                  ; t+2^p-1-k, the remainder
-    pop     eax
-    add     dword [esp+24], eax          ; add the whole ticks to song tick count, [esp+24] is the current tick in the row
+    push    _AX
+    fist    dword [_SP]                  ; Main stack: k=int(t+2^p-1)
+    fisub   dword [_SP]                  ; t+2^p-1-k, the remainder
+    pop     _AX
+    add     dword [_SP+6*PTRSIZE], eax          ; add the whole ticks to song tick count, [esp+24] is the current tick in the row
     fstp    dword [WRK+su_speed_wrk.remainder] ; save the remainder for future
     ret
 
diff --git a/src/opcodes/sinks.asm b/src/opcodes/sinks.asm
index 1585d91..cc40606 100644
--- a/src/opcodes/sinks.asm
+++ b/src/opcodes/sinks.asm
@@ -6,16 +6,16 @@
 SECT_TEXT(suopout)
 
 EXPORT MANGLE_FUNC(su_op_out,0) ; l r
-    mov     eax, su_synth_obj + su_synth.left
-%ifdef INCLUDE_STEREO_OUT
-    jnc     su_op_out_mono
-    call    su_op_out_mono
-    add     eax, 4
-su_op_out_mono:
-%endif
-    fmul    dword [edx+su_out_ports.gain] ; g*l
-    fadd    dword [eax]                   ; g*l+o
-    fstp    dword [eax]                   ; o'=g*l+o
+    mov     _AX, PTRWORD su_synth_obj + su_synth.left
+    %ifdef INCLUDE_STEREO_OUT
+        jnc     su_op_out_mono
+        call    su_op_out_mono
+        add     _AX, 4
+    su_op_out_mono:
+    %endif
+    fmul    dword [INP+su_out_ports.gain] ; g*l
+    fadd    dword [_AX]                   ; g*l+o
+    fstp    dword [_AX]                   ; o'=g*l+o
     ret
 
 %endif ; SU_OUT_ID > -1
@@ -38,33 +38,33 @@ EXPORT MANGLE_FUNC(su_op_send,0)
     lodsw
 %ifdef INCLUDE_STEREO_SEND
     jnc     su_op_send_mono
-    mov     edi, eax
-    inc     eax  ; send the right channel first
+    mov     _DI, _AX
+    inc     _AX  ; send the right channel first
     fxch                        ; r l
     call    su_op_send_mono     ; (r) l
-    mov     eax, edi            ; move back to original address
-    test    eax, SEND_POP       ; if r was not popped and is still in the stack
+    mov     _AX, _DI            ; move back to original address
+    test    _AX, SEND_POP       ; if r was not popped and is still in the stack
     jnz     su_op_send_mono
     fxch                        ; swap them back: l r
 su_op_send_mono:
 %endif
 %ifdef INCLUDE_GLOBAL_SEND
-    test    eax, SEND_GLOBAL
+    test    _AX, SEND_GLOBAL
     jz      su_op_send_skipglobal
-    mov     ecx, su_synth_obj - su_unit.size
+    mov     _CX, PTRWORD su_synth_obj - su_unit.size
 su_op_send_skipglobal:
 %endif
-    test    eax, SEND_POP       ; if the SEND_POP bit is not set
+    test    _AX, SEND_POP       ; if the SEND_POP bit is not set
     jnz     su_op_send_skippush
     fld     st0                 ; duplicate the signal on stack: s s
 su_op_send_skippush:            ; there is signal s, but maybe also another: s (s)
-    fld     dword [edx+su_send_ports.amount]   ; a l (l)
-    fsub    dword [c_0_5]                      ; a-.5 l (l)
+    fld     dword [INP+su_send_ports.amount]   ; a l (l)
+    apply   fsub dword, c_0_5                        ; a-.5 l (l)
     fadd    st0                                ; g=2*a-1 l (l)
-    and     eax, 0x0000ffff - SEND_POP - SEND_GLOBAL ; eax = send address
+    and     _AX, 0x0000ffff - SEND_POP - SEND_GLOBAL ; eax = send address
     fmulp   st1, st0                           ; g*l (l)
-    fadd    dword [ecx+su_unit.size+eax*4]     ; g*l+L (l),where L is the current value
-    fstp    dword [ecx+su_unit.size+eax*4]     ; (l)
+    fadd    dword [_CX+su_unit.size+_AX*4]     ; g*l+L (l),where L is the current value
+    fstp    dword [_CX+su_unit.size+_AX*4]     ; (l)
     ret
 
 %endif ; SU_USE_SEND > -1
diff --git a/src/opcodes/sources.asm b/src/opcodes/sources.asm
index ffca345..4294a89 100644
--- a/src/opcodes/sources.asm
+++ b/src/opcodes/sources.asm
@@ -22,7 +22,7 @@ EXPORT MANGLE_FUNC(su_op_envelope,0)
 su_op_envelope_mono:
 %endif
 kmENV_func_do:
-    mov     eax, dword [ecx+su_unit.size-su_voice.workspace+su_voice.release] ; eax = su_instrument.release
+    mov     eax, dword [_CX+su_unit.size-su_voice.workspace+su_voice.release] ; eax = su_instrument.release
     test    eax, eax                            ; if (eax == 0)
     je      kmENV_func_process                  ;   goto process
     mov     dword [WRK+su_env_work.state], ENV_STATE_RELEASE  ; [state]=RELEASE
@@ -45,7 +45,7 @@ kmENV_func_decay:
     jne     short kmENV_func_release            ;   goto release
     call    su_env_map                          ; d x, where d=decay
     fsubp   st1, st0                            ; x-d
-    fld     dword [edx+su_env_ports.sustain]       ; s x-d, where s=sustain
+    fld     dword [INP+su_env_ports.sustain]       ; s x-d, where s=sustain
     fucomi  st1                                 ; if (x-d>s) // is decay complete?
     fcmovb  st0, st1                            ;   x-d x-d
     jnc     short kmENV_func_statechange        ; else goto statechange
@@ -64,7 +64,7 @@ kmENV_func_leave:
     fstp    st1                                 ; x', where x' is the new value
     fst     dword [WRK+su_env_work.level]         ; [level]=x'
 kmENV_func_leave2:
-    fmul    dword [edx+su_env_ports.gain]          ; [gain]*x'
+    fmul    dword [INP+su_env_ports.gain]          ; [gain]*x'
     ret
 
 %endif ; SU_USE_ENVELOPE
@@ -83,9 +83,9 @@ EXPORT MANGLE_FUNC(su_op_noise,0)
 su_op_noise_mono:
 %endif
     call    MANGLE_FUNC(FloatRandomNumber,0)
-    fld     dword [edx+su_noise_ports.shape]
+    fld     dword [INP+su_noise_ports.shape]
     call    su_waveshaper
-    fld     dword [edx+su_noise_ports.gain]
+    fld     dword [INP+su_noise_ports.gain]
     fmulp   st1, st0
     ret
 
@@ -102,8 +102,8 @@ SECT_TEXT(suoscill)
 
 EXPORT MANGLE_FUNC(su_op_oscillat,0)
     lodsb                                   ; load the flags
-    fld     dword [edx+su_osc_ports.detune] ; e, where e is the detune [0,1]
-    fsub    dword [c_0_5]                   ; e-.5
+    fld     dword [INP+su_osc_ports.detune] ; e, where e is the detune [0,1]
+    apply fsub dword,c_0_5                  ; e-.5
     fadd    st0, st0                        ; d=2*e-.5, where d is the detune [-1,1]
 %ifdef INCLUDE_STEREO_OSCILLAT
     jnc     su_op_oscillat_mono
@@ -115,49 +115,49 @@ EXPORT MANGLE_FUNC(su_op_oscillat,0)
     su_op_oscillat_mono:
 %endif
 %ifdef INCLUDE_UNISONS
-    pushad                          ; push eax, WRK, WRK would suffice but this is shorter
+    push_registers _AX, WRK, _AX
     fldz                            ; 0 d
     fxch                            ; d a=0, "accumulated signal"
 su_op_oscillat_unison_loop:
-    fst     dword [esp]             ; save the current detune, d. We could keep it in fpu stack but it was getting big.
+    fst     dword [_SP]             ; save the current detune, d. We could keep it in fpu stack but it was getting big.
     call    su_op_oscillat_single   ; s a
     faddp   st1, st0                ; a+=s
     test    al, UNISON4
-    je      su_op_oscillat_unison_out    
+    je      su_op_oscillat_unison_out
     add     WRK, 8
-    fld     dword [edx+su_osc_ports.phaseofs] ; p s
-    fadd    dword [c_i12]                     ; p s, add some little phase offset to unison oscillators so they don't start in sync
-    fstp    dword [edx+su_osc_ports.phaseofs] ; s    note that this changes the phase for second, possible stereo run. That's probably ok
-    fld     dword [esp]             ; d s
-    fmul    dword [c_0_5]           ; .5*d s    // negate and halve the detune of each oscillator
+    fld     dword [INP+su_osc_ports.phaseofs] ; p s
+    apply fadd dword, c_i12                         ; p s, add some little phase offset to unison oscillators so they don't start in sync
+    fstp    dword [INP+su_osc_ports.phaseofs] ; s    note that this changes the phase for second, possible stereo run. That's probably ok
+    fld     dword [_SP]             ; d s
+    apply fmul dword, c_0_5               ; .5*d s    // negate and halve the detune of each oscillator
     fchs                            ; -.5*d s   // negate and halve the detune of each oscillator
     dec     eax
     jmp     short su_op_oscillat_unison_loop
 su_op_oscillat_unison_out:
-    popad                           ; similarly, pop WRK, WRK, eax would suffice
+    pop_registers _AX, WRK, _AX
     ret
 su_op_oscillat_single:
 %endif
-    fld     dword [edx+su_osc_ports.transpose]
-    fsub    dword [c_0_5]
-    fdiv    dword [c_i128]
+    fld     dword [INP+su_osc_ports.transpose]
+    apply fsub dword,c_0_5
+    apply fdiv dword,c_i128
     faddp   st1
     test    al, byte LFO
     jnz     su_op_oscillat_skipnote
-    fiadd   dword [ecx+su_unit.size-su_voice.workspace+su_voice.note]               ; // st0 is note, st1 is t+d offset
+    fiadd   dword [_CX+su_unit.size-su_voice.workspace+su_voice.note]               ; // st0 is note, st1 is t+d offset
 su_op_oscillat_skipnote:
-    fmul    dword [c_i12]
+    apply fmul dword,c_i12
     call    MANGLE_FUNC(su_power,0)
     test    al, byte LFO
     jz      short su_op_oscillat_normalize_note
-    fmul    dword [c_lfo_normalize]  ; // st0 is now frequency for lfo
+    apply fmul dword,c_lfo_normalize  ; // st0 is now frequency for lfo
     jmp     short su_op_oscillat_normalized
 su_op_oscillat_normalize_note:
-    fmul    dword [c_freq_normalize]  ; // st0 is now frequency
+    apply fmul dword,c_freq_normalize   ; // st0 is now frequency
 su_op_oscillat_normalized:
     fadd    dword [WRK+su_osc_wrk.phase]
     fst     dword [WRK+su_osc_wrk.phase]
-    fadd    dword [edx+su_osc_ports.phaseofs]
+    fadd    dword [INP+su_osc_ports.phaseofs]
 %ifdef INCLUDE_SAMPLES
     test    al, byte SAMPLE
     jz      short su_op_oscillat_not_sample
@@ -170,7 +170,7 @@ su_op_oscillat_not_sample:
     fxch
     fprem
     fstp    st1
-    fld     dword [edx+su_osc_ports.color]               ; // c      p
+    fld     dword [INP+su_osc_ports.color]               ; // c      p
     ; every oscillator test included if needed
 %ifdef INCLUDE_SINE
     test    al, byte SINE
@@ -199,10 +199,10 @@ su_op_oscillat_not_gate:
 %endif
 su_op_oscillat_shaping:
     ; finally, shape the oscillator and apply gain
-    fld     dword [edx+su_osc_ports.shape]
+    fld     dword [INP+su_osc_ports.shape]
     call    su_waveshaper
 su_op_oscillat_gain:
-    fld     dword [edx+su_osc_ports.gain]
+    fld     dword [INP+su_osc_ports.gain]
     fmulp   st1, st0
     ret
     %define SU_INCLUDE_WAVESHAPER
@@ -281,12 +281,12 @@ SECT_TEXT(sugate)
 su_oscillat_gate:
     fxch                                    ; p c
     fstp    st1                             ; p
-    fmul    dword [c_16]                    ; 16*p
-    push    eax
-    push    eax
-    fistp   dword [esp]                     ; s=int(16*p), stack empty
+    apply fmul dword, c_16                        ; 16*p
+    push    _AX
+    push    _AX
+    fistp   dword [_SP]                     ; s=int(16*p), stack empty
     fld1                                    ; 1
-    pop     eax
+    pop     _AX
     and     al, 0xf                         ; ax=int(16*p) & 15, stack: 1
     bt      word [VAL-4],ax                 ; if bit ax of the gate word is set
     jc      go4kVCO_gate_bit                ;   goto gate_bit
@@ -294,10 +294,10 @@ su_oscillat_gate:
 go4kVCO_gate_bit:                           ; stack: 0/1, let's call it x
     fld     dword [WRK+su_osc_wrk.gatestate] ; g x, g is gatestate, x is the input to this filter 0/1
     fsub    st1                             ; g-x x
-    fmul    dword [c_dc_const]              ; c(g-x) x
+    apply fmul dword,c_dc_const                   ; c(g-x) x
     faddp   st1, st0                        ; x+c(g-x)
     fst     dword [WRK+su_osc_wrk.gatestate] ; g'=x+c(g-x)
-    pop     eax                             ; Another way to see this (c~0.996)
+    pop     _AX                             ; Another way to see this (c~0.996)
     ret                                     ; g'=cg+(1-c)x
     ; This is a low-pass to smooth the gate transitions
 
@@ -321,26 +321,26 @@ SECT_DATA(suconst)
 SECT_TEXT(suoscsam)
 
 su_oscillat_sample:                                         ; p
-    pushad                                                  ; edx must be saved, eax & ecx if this is stereo osc
-    push    edx
+    push_registers _AX,_DX,_CX,_BX                              ; edx must be saved, eax & ecx if this is stereo osc
+    push    _AX
     mov     al, byte [VAL-4]                                ; reuse "color" as the sample number
-    lea     edi, [MANGLE_DATA(su_sample_offsets) + eax*8]   ; edi points now to the sample table entry
-    fmul    dword [c_samplefreq_scaling]                    ; p*r
-    fistp   dword [esp]
-    pop     edx                                             ; edx is now the sample number
-    movzx   ebx, word [edi + su_sample_offset.loopstart]    ; ecx = loopstart
+    apply {lea _DI,}, MANGLE_DATA(su_sample_offsets), _AX*8,{}  ; edi points now to the sample table entry
+    apply fmul dword, c_samplefreq_scaling                        ; p*r
+    fistp   dword [_SP]
+    pop     _DX                                             ; edx is now the sample number
+    movzx   ebx, word [_DI + su_sample_offset.loopstart]    ; ecx = loopstart
     sub     edx, ebx                                        ; if sample number < loop start
     jl      su_oscillat_sample_not_looping                  ;   then we're not looping yet
     mov     eax, edx                                        ; eax = sample number
-    movzx   ecx, word [edi + su_sample_offset.looplength]   ; edi is now the loop length
+    movzx   ecx, word [_DI + su_sample_offset.looplength]   ; edi is now the loop length
     xor     edx, edx                                        ; div wants edx to be empty
     div     ecx                                             ; edx is now the remainder
 su_oscillat_sample_not_looping:
     add     edx, ebx                                        ; sampleno += loopstart
-    add     edx, dword [edi + su_sample_offset.start]
-    fild    word [MANGLE_DATA(su_sample_table) + edx*2]
-    fdiv    dword [c_32767]
-    popad
+    add     edx, dword [_DI + su_sample_offset.start]
+    apply fild word, MANGLE_DATA(su_sample_table), _DX*2,{}
+    apply fdiv dword, c_32767
+    pop_registers _AX,_DX,_CX,_BX
     ret
 
 SECT_DATA(suconst)
@@ -369,8 +369,8 @@ EXPORT MANGLE_FUNC(su_op_loadval,0)
     call    su_op_loadval_mono
 su_op_loadval_mono:
 %endif
-    fld     dword [edx+su_load_val_ports.value] ; v
-    fsub    dword [c_0_5]                       ; v-.5
+    fld     dword [INP+su_load_val_ports.value] ; v
+    apply   fsub dword, c_0_5
     fadd    st0                                 ; 2*v-1
     ret
 
@@ -388,18 +388,18 @@ su_op_loadval_mono:
 SECT_TEXT(sureceiv)
 
 EXPORT MANGLE_FUNC(su_op_receive,0)
-    lea     ecx, dword [WRK+su_unit.ports]    
+    lea     _CX, [WRK+su_unit.ports]
 %ifdef INCLUDE_STEREO_RECEIVE
     jnc     su_op_receive_mono
     xor     eax,eax
-    fld     dword [ecx+su_receive_ports.right]
-    mov     dword [ecx+su_receive_ports.right],eax
+    fld     dword [_CX+su_receive_ports.right]
+    mov     dword [_CX+su_receive_ports.right],eax
 su_op_receive_mono:
 %else
     xor     eax,eax
 %endif
-    fld     dword [ecx+su_receive_ports.left]
-    mov     dword [ecx+su_receive_ports.left],eax
+    fld     dword [_CX+su_receive_ports.left]
+    mov     dword [_CX+su_receive_ports.left],eax
     ret
 
 %endif ; RECEIVE_ID > -1
diff --git a/src/player.asm b/src/player.asm
index 2cf3f2c..ece24c6 100644
--- a/src/player.asm
+++ b/src/player.asm
@@ -1,3 +1,26 @@
+%if BITS == 32
+    %define BUFFER_STACK_LOC 44
+    %define render_prologue pushad ; stdcall & everything nonvolatile except eax, ecx, edx
+    %macro render_epilogue 0
+        popad
+        ret     4 ; clean the passed parameter from stack.
+    %endmacro
+%elifidn __OUTPUT_FORMAT__,win64
+    %define BUFFER_STACK_LOC 48
+    %define render_prologue push_registers rcx,rdi,rsi,rbx,rbp  ; rcx = ptr to buf. rdi,rsi,rbx,rbp  nonvolatile
+    %macro render_epilogue 0
+        pop_registers rcx,rdi,rsi,rbx,rbp
+        ret
+    %endmacro
+%else ; 64 bit mac & linux
+    %define BUFFER_STACK_LOC 48
+    %define render_prologue push_registers rdi,rbx,rbp ; rdi = ptr to buf. rbx & rbp nonvolatile
+    %macro render_epilogue 0
+        pop_registers rdi,rbx,rbp
+        ret
+    %endmacro
+%endif
+
 ;-------------------------------------------------------------------------------
 ;    Uninitialized data
 ;-------------------------------------------------------------------------------
@@ -35,46 +58,46 @@ SECT_DATA(suconst)
 %macro output_sound 0
     %ifndef SU_USE_16BIT_OUTPUT
         %ifndef SU_CLIP_OUTPUT ; The modern way. No need to clip; OS can do it.
-            mov     edi, dword [esp+44] ; edi containts ptr
-            mov     esi, su_synth_obj+su_synth.left
+            mov     _DI, [_SP+BUFFER_STACK_LOC] ; edi containts ptr
+            mov     _SI, PTRWORD su_synth_obj + su_synth.left
             movsd   ; copy left channel to output buffer
             movsd   ; copy right channel to output buffer
-            mov     dword [esp+44], edi ; save back the updated ptr
-            lea     edi, [esi-8]
-            xor     eax,eax
+            mov     [_SP+BUFFER_STACK_LOC], _DI ; save back the updated ptr
+            lea     _DI, [_SI-8]
+            xor     eax, eax
             stosd   ; clear left channel so the VM is ready to write them again
             stosd   ; clear right channel so the VM is ready to write them again
         %else
-            mov     esi, dword [esp+44] ; esi points to the output buffer
-            xor     ecx,ecx
+            mov     _SI, qword [_SP+BUFFER_STACK_LOC] ; esi points to the output buffer
+            xor     _CX,_CX
             xor     eax,eax
             %%loop: ; loop over two channels, left & right
-                fld     dword [su_synth_obj+su_synth.left+ecx*4]
+                apply fld dword,su_synth_obj+su_synth.left,_CX*4,{}
                 call    su_clip
-                fstp    dword [esi]
-                mov     dword [su_synth_obj+su_synth.left+ecx*4],eax ; clear the sample so the VM is ready to write it
-                add     esi,4
+                fstp    dword [_SI]
+                apply mov dword,su_synth_obj+su_synth.left,_CX*4,{,eax} ; clear the sample so the VM is ready to write it
+                add     _SI,4
                 cmp     ecx,2
                 jl      %%loop
-            mov     dword [esp+44], esi ; save esi back to stack
+            mov     dword [_SP+BUFFER_STACK_LOC], _SI ; save esi back to stack
         %endif
     %else ; 16-bit output, always clipped. This is a bit legacy method.
-        mov     esi, dword [esp+44] ; esi points to the output buffer
-        mov     edi, su_synth_obj+su_synth.left
+        mov     _SI, [_SP+BUFFER_STACK_LOC] ; esi points to the output buffer
+        mov     _DI, PTRWORD su_synth_obj+su_synth.left
         mov     ecx, 2
         %%loop: ; loop over two channels, left & right
-            fld     dword [edi]
+            fld     dword [_DI]
             call    su_clip
-            fmul    dword [c_32767]
-            push    eax
-            fistp   dword [esp]
-            pop     eax
-            mov     word [esi],ax   ; // store integer converted right sample
+            apply fmul dword, c_32767
+            push    _AX
+            fistp   dword [_SP]
+            pop     _AX
+            mov     word [_SI],ax   ; // store integer converted right sample
             xor     eax,eax
             stosd
-            add     esi,2
+            add     _SI,2
             loop    %%loop
-        mov     dword [esp+44], esi ; save esi back to stack
+        mov     [_SP+BUFFER_STACK_LOC], _SI ; save esi back to stack
     %endif
 %endmacro
 
@@ -87,30 +110,29 @@ SECT_DATA(suconst)
 ;-------------------------------------------------------------------------------
 SECT_TEXT(surender)
 
-EXPORT MANGLE_FUNC(su_render,4)         ; Stack: ptr
-    pushad                              ; Stack: pushad ptr
+EXPORT MANGLE_FUNC(su_render,PTRSIZE)   ; Stack: ptr
+    render_prologue
 %ifdef INCLUDE_GMDLS
     call    su_gmdls_load
 %endif
     xor     eax, eax                    ; ecx is the current row
 su_render_rowloop:                      ; loop through every row in the song
-        push    eax                     ; Stack: row pushad ptr
+        push    _AX                     ; Stack: row pushad ptr
         call    su_update_voices        ; update instruments for the new row
         xor     eax, eax                ; ecx is the current sample within row
 su_render_sampleloop:                   ; loop through every sample in the row
-            push    eax                 ; Stack: sample row pushad ptr
+            push    _AX                 ; Stack: sample row pushad ptr
             call    MANGLE_FUNC(su_run_vm,0) ; run through the VM code
             output_sound                ; *ptr++ = left, *ptr++ = right
-            pop     eax                 ; Stack: row pushad ptr
+            pop     _AX                 ; Stack: row pushad ptr
             inc     eax
             cmp     eax, SAMPLES_PER_ROW
             jl      su_render_sampleloop
-        pop     eax                     ; Stack: pushad ptr
+        pop     _AX                     ; Stack: pushad ptr
         inc     eax
         cmp     eax, TOTAL_ROWS
         jl      su_render_rowloop
-    popad                               ; Stack: ptr
-    ret     4                           ; Stack emptied by ret
+    render_epilogue
 
 ;-------------------------------------------------------------------------------
 ;   su_update_voices function: polyphonic & chord implementation
@@ -126,30 +148,30 @@ su_update_voices: ; Stack: retaddr row
     xor     edx, edx
     mov     ebx, PATTERN_SIZE                   ; we could do xor ebx,ebx; mov bl,PATTERN_SIZE, but that would limit patternsize to 256...
     div     ebx                                 ; eax = current pattern, edx = current row in pattern
-    lea     esi, [MANGLE_DATA(su_tracks)+eax]   ; esi points to the pattern data for current track
+    apply {lea _SI,},MANGLE_DATA(su_tracks),_AX,{} ; esi points to the pattern data for current track
     xor     eax, eax                            ; eax is the first voice of next track
     xor     ebx, ebx                            ; ebx is the first voice of current track
-    mov     ebp, su_current_voiceno             ; ebp points to the current_voiceno array
+    mov     _BP, PTRWORD su_current_voiceno     ; ebp points to the current_voiceno array
 su_update_voices_trackloop:
-        movzx   eax, byte [esi]                     ; eax = current pattern
+        movzx   eax, byte [_SI]                     ; eax = current pattern
         imul    eax, PATTERN_SIZE                   ; eax = offset to current pattern data
-        movzx   eax, byte [MANGLE_DATA(su_patterns)+eax+edx]  ; eax = note
-        push    edx                                 ; Stack: ptrnrow
+        apply {movzx eax,byte},MANGLE_DATA(su_patterns),_AX,_DX,{}  ; eax = note
+        push    _DX                                 ; Stack: ptrnrow
         xor     edx, edx                            ; edx=0
         mov     ecx, ebx                            ; ecx=first voice of the track to be done
 su_calculate_voices_loop:                           ; do {
-        bt      dword [su_voicetrack_bitmask],ecx   ;   // notice that the incs don't set carry
+        apply bt dword, su_voicetrack_bitmask,{,ecx};   // notice that the incs don't set carry
         inc     edx                                 ;   edx++   // edx=numvoices
         inc     ecx                                 ;   ecx++   // ecx=the first voice of next track
         jc      su_calculate_voices_loop            ; } while bit ecx-1 of bitmask is on
-        push    ecx                                 ; Stack: next_instr ptrnrow
+        push    _CX                                 ; Stack: next_instr ptrnrow
         cmp     al, HLD                             ; anything but hold causes action
         je      short su_update_voices_nexttrack
-        mov     ecx, dword [ebp]
+        mov     ecx, dword [_BP]
         mov     edi, ecx
         add     edi, ebx
         shl     edi, MAX_UNITS_SHIFT + 6            ; each unit = 64 bytes and there are 1<<MAX_UNITS_SHIFT units + small header
-        inc     dword [su_synth_obj+su_synth.voices+edi+su_voice.release] ; set the voice currently active to release; notice that it could increment any number of times
+        apply inc dword, su_synth_obj+su_synth.voices+su_voice.release,_DI,{} ; set the voice currently active to release; notice that it could increment any number of times
         cmp     al, HLD                             ; if cl < HLD (no new note triggered)
         jl      su_update_voices_nexttrack          ;   goto nexttrack
         inc     ecx                                 ; curvoice++
@@ -157,52 +179,53 @@ su_calculate_voices_loop:                           ; do {
         jl      su_update_voices_skipreset
         xor     ecx,ecx                             ;   curvoice = 0
 su_update_voices_skipreset:
-        mov     dword [ebp],ecx
-        add     ecx, ebx        
+        mov     dword [_BP],ecx
+        add     ecx, ebx
         shl     ecx, MAX_UNITS_SHIFT + 6            ; each unit = 64 bytes and there are 1<<MAX_UNITS_SHIFT units + small header
-        lea     edi, [su_synth_obj+su_synth.voices+ecx]
+        apply {lea _DI,},su_synth_obj+su_synth.voices,_CX,{}
         stosd                                       ; save note
         mov     ecx, (su_voice.size - su_voice.release)/4
         xor     eax, eax
         rep stosd                                   ; clear the workspace of the new voice, retriggering oscillators
 su_update_voices_nexttrack:
-        pop     ebx                                 ; ebx=first voice of next instrument, Stack: ptrnrow
-        pop     edx                                 ; edx=patrnrow
-        add     esi, MAX_PATTERNS
-        add     ebp, 4
-        cmp     ebp, su_current_voiceno+MAX_TRACKS*4
-        jl      short su_update_voices_trackloop
+        pop     _BX                                 ; ebx=first voice of next instrument, Stack: ptrnrow
+        pop     _DX                                 ; edx=patrnrow
+        add     _SI, MAX_PATTERNS
+        add     _BP, 4
+        apply {cmp _BP,},su_current_voiceno,MAX_TRACKS*4,{}
+        jl      su_update_voices_trackloop
     ret
 
-%else ; INCLUDE_MULTIVOICE_TRACKS not defined -> one voice per track version
+
+%else ; INCLUDE_MULTIVOICE_TRACKS not defined -> one voice per track ve_SIon
 
 su_update_voices: ; Stack: retaddr row
     xor     edx, edx
     xor     ebx, ebx
     mov     bl, PATTERN_SIZE
     div     ebx                                 ; eax = current pattern, edx = current row in pattern
-    lea     esi, [MANGLE_DATA(su_tracks)+eax]   ; esi points to the pattern data for current track
-    lea     edi, [su_synth_obj+su_synth.voices]
+    apply {lea _SI,},MANGLE_DATA(su_tracks),_AX,{}; esi points to the pattern data for current track
+    mov     _DI, PTRWORD su_synth_obj+su_synth.voices
     mov     bl, MAX_TRACKS                      ; MAX_TRACKS is always <= 32 so this is ok
 su_update_voices_trackloop:
-        movzx   eax, byte [esi]                     ; eax = current pattern
+        movzx   eax, byte [_SI]                     ; eax = current pattern
         imul    eax, PATTERN_SIZE                   ; eax = offset to current pattern data
-        movzx   eax, byte [MANGLE_DATA(su_patterns)+eax+edx]  ; ecx = note
+        apply {movzx eax, byte}, MANGLE_DATA(su_patterns), _AX, _DX, {}  ; ecx = note
         cmp     al, HLD                             ; anything but hold causes action
         je      short su_update_voices_nexttrack
-        inc     dword [edi+su_voice.release]        ; set the voice currently active to release; notice that it could increment any number of times
+        inc     dword [_DI+su_voice.release]        ; set the voice currently active to release; notice that it could increment any number of times
         cmp     al, HLD
         jl      su_update_voices_nexttrack          ; if cl < HLD (no new note triggered)  goto nexttrack
 su_update_voices_retrigger:
-        stosd                                       ; save note        
+        stosd                                       ; save note
         mov     ecx, (su_voice.size - su_voice.release)/4  ; could be xor ecx, ecx; mov ch,...>>8, but will it actually be smaller after compression?
         xor     eax, eax
         rep stosd                                   ; clear the workspace of the new voice, retriggering oscillators
         jmp     short su_update_voices_skipadd
 su_update_voices_nexttrack:
-        add     edi, su_voice.size
+        add     _DI, su_voice.size
 su_update_voices_skipadd:
-        add     esi, MAX_PATTERNS
+        add     _SI, MAX_PATTERNS
         dec     ebx
         jnz     short su_update_voices_trackloop
     ret
diff --git a/src/sointu.asm b/src/sointu.asm
index b57ba57..3e23bfd 100644
--- a/src/sointu.asm
+++ b/src/sointu.asm
@@ -1,6 +1,97 @@
-%define WRK ebp             ; // alias for unit workspace
-%define VAL esi             ; // alias for unit values (transformed/untransformed)
-%define COM ebx             ; // alias for instrument opcodes
+%if BITS == 64
+    %define WRK rbp ; alias for unit workspace
+    %define VAL rsi ; alias for unit values (transformed/untransformed)
+    %define COM rbx ; alias for instrument opcodes
+    %define INP rdx ; alias for transformed inputs
+    %define _AX rax ; push and offsets have to be r* on 64-bit and e* on 32-bit
+    %define _BX rbx
+    %define _CX rcx
+    %define _DX rdx
+    %define _SP rsp
+    %define _SI rsi
+    %define _DI rdi
+    %define _BP rbp
+    %define PTRSIZE 8
+    %define PTRWORD qword
+    %define RESPTR resq
+    %define DPTR dq
+
+    %macro apply 2
+        mov r9, qword %2
+        %1 [r9]
+    %endmacro
+
+    %macro apply 3
+        mov r9, qword %2
+        %1 [r9] %3
+    %endmacro
+
+    %macro apply 4
+        mov r9, qword %2
+        %1 [r9+%3] %4
+    %endmacro
+
+    %macro apply 5
+        mov r9, qword %2
+        lea r9, [r9+%3]
+        %1 [r9+%4] %5
+    %endmacro
+
+    %macro  push_registers 1-*
+        %rep  %0
+            push    %1
+            %rotate 1
+        %endrep
+    %endmacro
+
+    %macro  pop_registers 1-*
+        %rep %0
+            %rotate -1
+            pop     %1
+        %endrep
+    %endmacro
+%else
+    %define WRK ebp ; alias for unit workspace
+    %define VAL esi ; alias for unit values (transformed/untransformed)
+    %define COM ebx ; alias for instrument opcodes
+    %define INP edx ; alias for transformed inputs
+    %define _AX eax
+    %define _BX ebx
+    %define _CX ecx
+    %define _DX edx
+    %define _SP esp
+    %define _SI esi
+    %define _DI edi
+    %define _BP ebp
+    %define PTRSIZE 4
+    %define PTRWORD dword
+    %define RESPTR resd
+    %define DPTR dd
+
+    %macro apply 2
+        %1 [%2]
+    %endmacro
+
+    %macro apply 3
+        %1 [%2] %3
+    %endmacro
+
+    %macro apply 4
+        %1 [%2+%3] %4
+    %endmacro
+
+    %macro apply 5
+        %1 [%2+%3+%4] %5
+    %endmacro
+
+    %macro  push_registers 1-*
+        pushad ; in 32-bit mode, this is the easiest way to store all the registers
+    %endmacro
+
+    %macro  pop_registers 1-*
+        popad
+    %endmacro
+%endif
 
 ;===============================================================================
 ;   Uninitialized data: The one and only synth object
@@ -16,16 +107,14 @@ su_transformed_values   resd    16
 ;===============================================================================
 SECT_DATA(suoptabl)
 
-su_synth_commands
-                        dd      OPCODES
+su_synth_commands       DPTR    OPCODES
 
 ;===============================================================================
 ; The number of transformed parameters each opcode takes
 ;===============================================================================
 SECT_DATA(suparcnt)
 
-su_opcode_numparams
-                        db      NUMPARAMS
+su_opcode_numparams     db      NUMPARAMS
 
 ;-------------------------------------------------------------------------------
 ;   Constants used by the common functions
@@ -58,34 +147,40 @@ su_polyphony_bitmask    dd      POLYPHONY_BITMASK ; does the next voice reuse th
 SECT_TEXT(surunvm)
 
 EXPORT MANGLE_FUNC(su_run_vm,0)
-    mov     COM, MANGLE_DATA(su_commands)           ; COM points to vm code
-    mov     VAL, MANGLE_DATA(su_params)             ; VAL points to unit params
+    mov     COM, PTRWORD MANGLE_DATA(su_commands)           ; COM points to vm code
+    mov     VAL, PTRWORD MANGLE_DATA(su_params)             ; VAL points to unit params
     ; su_unit.size will be added back before WRK is used
-    mov     WRK, su_synth_obj + su_synth.voices + su_voice.workspace - su_unit.size
+    mov     WRK, PTRWORD su_synth_obj + su_synth.voices + su_voice.workspace - su_unit.size
     push    COM                                     ; Stack: COM
     push    VAL                                     ; Stack: VAL COM
     push    WRK                                     ; Stack: WRK VAL COM
-%if DELAY_ID > -1    
-    mov     dword [MANGLE_DATA(su_delay_buffer_ofs)], MANGLE_DATA(su_delay_buffer) ; reset delaywrk to first delayline
+%if DELAY_ID > -1
+    %if BITS == 64 ; TODO: find a way to do this with a macro
+        mov     r9,PTRWORD MANGLE_DATA(su_delay_buffer_ofs)
+        mov     _AX,PTRWORD MANGLE_DATA(su_delay_buffer)
+        mov     qword [r9],_AX                      ; reset delaywrk to first delayline
+    %else
+        mov     dword [MANGLE_DATA(su_delay_buffer_ofs)],MANGLE_DATA(su_delay_buffer) ; reset delaywrk to first
+    %endif
 %endif
     xor     ecx, ecx                                ; voice = 0
-    push    ecx                                     ; Stack: voice WRK VAL COM
+    push    _CX                                     ; Stack: voice WRK VAL COM
 su_run_vm_loop:                                     ; loop until all voices done
     movzx   eax, byte [COM]                         ; eax = command byte
     inc     COM                                     ; move to next instruction
     add     WRK, su_unit.size                       ; move WRK to next unit
-    push    eax
+    push    _AX
     shr     eax,1
-    mov     al,byte [eax+su_opcode_numparams]
-    push    eax
+    apply {mov al,byte},su_opcode_numparams,_AX,{}
+    push    _AX
     call    su_transform_values
-    mov     ecx, dword [esp+8]
-    pop     eax
+    mov     _CX, PTRWORD [_SP+2*PTRSIZE]
+    pop     _AX
     shr     eax,1
-    call    dword [eax*4+su_synth_commands]         ; call the function corresponding to the instruction
-    cmp     dword [esp],MAX_VOICES                  ; if (voice < MAX_VOICES)
+    apply call,su_synth_commands,_AX*PTRSIZE,{}     ; call the function corresponding to the instruction
+    cmp     dword [_SP],MAX_VOICES                  ; if (voice < MAX_VOICES)
     jl      su_run_vm_loop                          ;   goto vm_loop
-    add     esp, 16                                 ; Stack cleared
+    add     _SP, 4*PTRSIZE                          ; Stack cleared
     ret
 
 ;-------------------------------------------------------------------------------
@@ -96,12 +191,12 @@ su_run_vm_loop:                                     ; loop until all voices done
 SECT_TEXT(surandom)
 
 EXPORT MANGLE_FUNC(FloatRandomNumber,0)
-    push    eax
-    imul    eax,dword [MANGLE_DATA(RandSeed)],16007
-    mov     dword [MANGLE_DATA(RandSeed)], eax
-    fild    dword [MANGLE_DATA(RandSeed)]
-    fidiv   dword [c_RandDiv]
-    pop     eax
+    push    _AX
+    apply {imul eax,},MANGLE_DATA(RandSeed),{,16007}
+    apply mov,MANGLE_DATA(RandSeed),{, eax}
+    apply fild dword,MANGLE_DATA(RandSeed)
+    apply fidiv dword,c_RandDiv
+    pop     _AX
     ret
 
 ;-------------------------------------------------------------------------------
@@ -117,31 +212,26 @@ EXPORT MANGLE_FUNC(FloatRandomNumber,0)
 SECT_TEXT(sutransf)
 
 su_transform_values:
-    push    ecx
+    push    _CX
     xor     ecx, ecx
     xor     eax, eax
-    mov     edx, su_transformed_values
+    mov     INP, PTRWORD su_transformed_values
 su_transform_values_loop:
-    cmp     ecx, dword [esp+8]
+    cmp     ecx, dword [_SP+2*PTRSIZE]
     jge     su_transform_values_out
     lodsb
-    push    eax
-    fild    dword [esp]
-    fmul    dword [c_i128]
-    fadd    dword [WRK+su_unit.ports+ecx*4]
-    fstp    dword [edx+ecx*4]
-    mov     dword [WRK+su_unit.ports+ecx*4], 0
-    pop     eax
+    push    _AX
+    fild    dword [_SP]
+    apply fmul dword, c_i128
+    fadd    dword [WRK+su_unit.ports+_CX*4]
+    fstp    dword [INP+_CX*4]
+    mov     dword [WRK+su_unit.ports+_CX*4], 0
+    pop     _AX
     inc     ecx
     jmp     su_transform_values_loop
 su_transform_values_out:
-    pop     ecx
-    ret     4
-
-%macro TRANSFORM_VALUES 1
-    push %1 %+ .params/4
-    call su_transform_values
-%endmacro
+    pop     _CX
+    ret     PTRSIZE
 
 ;-------------------------------------------------------------------------------
 ;   su_env_map function: computes 2^(-24*x) of the envelope parameter
@@ -154,8 +244,8 @@ SECT_TEXT(supower)
 
 %if ENVELOPE_ID > -1 ; TODO: compressor also uses this, so should be compiled if either
 su_env_map:
-    fld     dword [edx+eax*4]   ; x, where x is the parameter in the range 0-1
-    fimul   dword [c_24]        ; 24*x
+    fld     dword [INP+_AX*4]   ; x, where x is the parameter in the range 0-1
+    apply   fimul dword,c_24          ; 24*x
     fchs                        ; -24*x
     ; flow into Power function, which outputs 2^(-24*x)
 %endif
@@ -189,6 +279,13 @@ EXPORT MANGLE_FUNC(su_power,0)
 ; sources, as sources.asm defines SU_USE_WAVESHAPER
 ; if needed.
 %include "opcodes/effects.asm"
-%include "player.asm"
 %include "introspection.asm"
-%include "gmdls.asm"
+%include "player.asm"
+
+%ifidn __OUTPUT_FORMAT__,win64
+    %include "win64/gmdls_win64.asm"
+%endif
+
+%ifidn __OUTPUT_FORMAT__,win32
+    %include "win32/gmdls_win32.asm"
+%endif
\ No newline at end of file
diff --git a/src/sointu.inc b/src/sointu.inc
index a457f2e..b9f2705 100644
--- a/src/sointu.inc
+++ b/src/sointu.inc
@@ -19,17 +19,33 @@
     ; on win32, function f with n parameters is mangled as "_f@n"
     %define MANGLE_FUNC(f,n) _ %+ f %+ @ %+ n
     %define WIN_OR_MAC
+    %assign BITS 32
+    ; On windows and mac, data label d is mangled as "_d"
+    %define MANGLE_DATA(d) _ %+ d
+%endif
+
+%ifidn __OUTPUT_FORMAT__,win64
+    ; on win32, function f with n parameters is mangled as "_f@n"
+    %define MANGLE_FUNC(f,n) f
+    %define WIN_OR_MAC
+    %assign BITS 64
+    ; On windows and mac, data label d is mangled as "_d"
+    %define MANGLE_DATA(d) d
 %endif
 
 %ifidn __OUTPUT_FORMAT__,elf32
     ; on linux, function f with n parameters is mangled as "f"
     %define MANGLE_FUNC(f,n) f
+    ; On linux, data label d is mangled as "d"
+    %define MANGLE_DATA(d) d
 %endif
 
 %ifidn __OUTPUT_FORMAT__,macho32
     ; on mac, function f with x parameters is mangled as "_f"
     %define MANGLE_FUNC(f,n) _f
     %define WIN_OR_MAC
+    ; On windows and mac, data label d is mangled as "_d"
+    %define MANGLE_DATA(d) _ %+ d
 %endif
 
 %ifdef WIN_OR_MAC
@@ -44,8 +60,6 @@
         %define SECT_DATA(n) section .data align=1
         %define SECT_TEXT(n) section .code align=1
     %endif
-    ; On windows and mac, data label d is mangled as "_d"
-    %define MANGLE_DATA(d) _ %+ d
 %else
     ; Linux
     %ifdef USE_SECTIONS
@@ -57,8 +71,6 @@
         %define SECT_DATA(n) section .data. progbits alloc noexec write align=1
         %define SECT_TEXT(n) section .text. progbits alloc exec nowrite align=1
     %endif
-    ; On linux, data label d is mangled as "d"
-    %define MANGLE_DATA(d) d
 %endif
 
 %ifdef SU_USE_ALL
diff --git a/src/gmdls.asm b/src/win32/gmdls_win32.asm
similarity index 87%
rename from src/gmdls.asm
rename to src/win32/gmdls_win32.asm
index 7670f78..50b7c99 100644
--- a/src/gmdls.asm
+++ b/src/win32/gmdls_win32.asm
@@ -8,15 +8,15 @@ extern _ReadFile@20 ; requires windows
 SECT_TEXT(sugmdls)
 
 su_gmdls_load:
-    mov     edi, MANGLE_DATA(su_sample_table)      
-    mov     esi, su_gmdls_path1   
-    su_gmdls_pathloop:            
+    mov     edi, MANGLE_DATA(su_sample_table)
+    mov     esi, su_gmdls_path1
+    su_gmdls_pathloop:
         push    0                   ; OF_READ
         push    edi                 ; &ofstruct, blatantly reuse the sample table
         push    esi                 ; path
         call    _OpenFile@12        ; eax = OpenFile(path,&ofstruct,OF_READ)
         add     esi, su_gmdls_path2 - su_gmdls_path1 ; if we ever get to third, then crash
-        cmp     eax, -1             ; eax == INVALID? 
+        cmp     eax, -1             ; eax == INVALID?
         je      su_gmdls_pathloop
     push    0                       ; NULL
     push    edi                     ; &bytes_read, reusing sample table again; it does not matter that the first four bytes are trashed
@@ -31,7 +31,7 @@ SECT_DATA(sugmpath)
 su_gmdls_path1:
     db 'drivers/gm.dls',0
 su_gmdls_path2:
-    db 'drivers/etc/gm.dls',0    
+    db 'drivers/etc/gm.dls',0
 
 SECT_DATA(suconst)
     c_samplefreq_scaling    dd      84.28074964676522       ; o = 0.000092696138, n = 72, f = 44100*o*2**(n/12), scaling = 22050/f <- so note 72 plays at the "normal rate"
diff --git a/src/win64/gmdls_win64.asm b/src/win64/gmdls_win64.asm
new file mode 100644
index 0000000..724287d
--- /dev/null
+++ b/src/win64/gmdls_win64.asm
@@ -0,0 +1,44 @@
+%ifdef INCLUDE_GMDLS
+
+%define SAMPLE_TABLE_SIZE 3440660 ; size of gmdls
+
+extern OpenFile ; requires windows
+extern ReadFile ; requires windows
+
+SECT_TEXT(sugmdls)
+;        Win64 ABI: RCX, RDX, R8, and R9
+su_gmdls_load:
+    sub     rsp, 40         ; Win64 ABI requires "shadow space" + space for one parameter.
+    mov     rdi, PTRWORD MANGLE_DATA(su_sample_table)
+    mov     rsi, PTRWORD su_gmdls_path1
+    su_gmdls_pathloop:
+        xor     r8,r8 ; OF_READ
+        mov     rdx, rdi             ; &ofstruct, blatantly reuse the sample table
+        mov     rcx, rsi        ; path
+        call    OpenFile            ; eax = OpenFile(path,&ofstruct,OF_READ)
+        add     rsi, su_gmdls_path2 - su_gmdls_path1 ; if we ever get to third, then crash
+        movsxd  rcx,eax
+        cmp     rcx, -1             ; ecx == INVALID?
+        je      su_gmdls_pathloop
+    mov     qword [rsp+32],0
+    mov     r9, rdi
+    mov     r8d, SAMPLE_TABLE_SIZE   ; number of bytes to read
+    mov     rdx, rdi
+    call    ReadFile                ; Readfile(handle,&su_sample_table,SAMPLE_TABLE_SIZE,&bytes_read,NULL)
+    add     rsp, 40         ; shadow space, as required by Win64 ABI
+    ret
+
+SECT_DATA(sugmpath)
+
+su_gmdls_path1:
+    db 'drivers/gm.dls',0
+su_gmdls_path2:
+    db 'drivers/etc/gm.dls',0
+
+SECT_DATA(suconst)
+    c_samplefreq_scaling    dd      84.28074964676522       ; o = 0.000092696138, n = 72, f = 44100*o*2**(n/12), scaling = 22050/f <- so note 72 plays at the "normal rate"
+
+SECT_BSS(susamtbl)
+    EXPORT MANGLE_DATA(su_sample_table)    resb    SAMPLE_TABLE_SIZE    ; size of gmdls.
+
+%endif
\ No newline at end of file
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 509ab3e..e1c521b 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -8,7 +8,14 @@ function(regression_test testname)
     add_executable(${testname} ${source}.asm test_renderer.c)
 
     # the tests include the entire ASM but we still want to rebuild when they change
-    file(GLOB SOINTU ${PROJECT_SOURCE_DIR}/src/*.inc ${PROJECT_SOURCE_DIR}/src/*.asm ${PROJECT_SOURCE_DIR}/src/opcodes/*.asm ${PROJECT_SOURCE_DIR}/src/opcodes/*.inc)
+    file(GLOB SOINTU ${PROJECT_SOURCE_DIR}/src/*.inc
+                     ${PROJECT_SOURCE_DIR}/src/*.asm
+                     ${PROJECT_SOURCE_DIR}/src/opcodes/*.asm
+                     ${PROJECT_SOURCE_DIR}/src/opcodes/*.inc
+                     ${PROJECT_SOURCE_DIR}/src/win32/*.asm
+                     ${PROJECT_SOURCE_DIR}/src/win32/*.inc
+                     ${PROJECT_SOURCE_DIR}/src/win64/*.asm
+                     ${PROJECT_SOURCE_DIR}/src/win64/*.inc)
     set_source_files_properties(${source}.asm PROPERTIES OBJECT_DEPENDS "${SOINTU}")
     set_source_files_properties(${FOURKLANG} PROPERTIES HEADER_FILE_ONLY TRUE)