diff --git a/src/x86-64.S b/src/x86-64.S
index 7c38964..dbf437a 100644
--- a/src/x86-64.S
+++ b/src/x86-64.S
@@ -31,10 +31,55 @@
 
 #include "arch.h"
 
+#ifdef _WIN64
+/*
+ * MS use a different x64 calling convention than everyone else:
+ * Arguments: RCX, RDX, R8, R9 then stack right-to-left.
+ * Non-volatile: R12:R15, RBX, RBP, RDI, RSI, RSP, XMM6:XMM15
+ * Return: RAX.
+ */
+#define ARG1 %rcx
+#define PROLOGUE \
+	subq $(8+10*16), %rsp; \
+	movapd %xmm6, 0*16(%rsp); \
+	movapd %xmm7, 1*16(%rsp); \
+	movapd %xmm8, 2*16(%rsp); \
+	movapd %xmm9, 3*16(%rsp); \
+	movapd %xmm10, 4*16(%rsp); \
+	movapd %xmm11, 5*16(%rsp); \
+	movapd %xmm12, 6*16(%rsp); \
+	movapd %xmm13, 7*16(%rsp); \
+	movapd %xmm14, 8*16(%rsp); \
+	movapd %xmm15, 9*16(%rsp)
+#define EPILOGUE \
+	movapd 0*16(%rsp), %xmm6; \
+	movapd 1*16(%rsp), %xmm7; \
+	movapd 2*16(%rsp), %xmm8; \
+	movapd 3*16(%rsp), %xmm9; \
+	movapd 4*16(%rsp), %xmm10; \
+	movapd 5*16(%rsp), %xmm11; \
+	movapd 6*16(%rsp), %xmm12; \
+	movapd 7*16(%rsp), %xmm13; \
+	movapd 8*16(%rsp), %xmm14; \
+	movapd 9*16(%rsp), %xmm15; \
+	addq $(8+10*16), %rsp
+#else
+/*
+ * System V AMD64 ABI (followed by everybody else including linux-X32):
+ * Arguments: RDI, RSI, RDX, RCX, R8, R9 then stack right-to-left.
+ * Non-volatile: R12:R15, RBX, RBP
+ * Return: RAX.
+ */
+#define ARG1 %rdi
+#define PROLOGUE
+#define EPILOGUE
+#endif
+
 /*
  * Some broken systems don't offer section alignments larger than 8 bytes,
  * while for the SSE code we need at least a 16 byte alignment. ALIGN_FIX
  * is here to work around this issue when we happen to get bad addresses.
+ * We haven't actually seen this on any 64-bit system as of yet.
  */
 #ifndef ALIGN_FIX
 #ifdef ALIGN_LOG
@@ -699,7 +744,7 @@ DO_SPACE(nptr(48))
 #define k(i) nptr(i)(k_ptr)
 
 #define tmp1 %rcx
-#define tmp2 %rsi
+#define tmp2 %r9 /* %rsi is non-volatile on win64 */
 
 #define xor_E(i) \
 	movq E(i),tmp1; \
@@ -1030,6 +1075,7 @@ DES_bs_init_asm:
 DO_ALIGN(6)
 .globl DES_bs_crypt
 DES_bs_crypt:
+	PROLOGUE
 	cmpl $0,DES_bs_all_keys_changed(%rip)
 	jz DES_bs_crypt_body
 	pushq %rdi
@@ -1083,17 +1129,20 @@ DES_bs_crypt_swap:
 	movl $0x108,rounds_and_swapped
 	subl $1,iterations
 	jnz DES_bs_crypt_swap
+	EPILOGUE
 	ret
 DES_bs_crypt_next:
 	subq $nvec(0x300-48),k_ptr
 	movl $8,rounds_and_swapped
 	subl $1,iterations
 	jnz DES_bs_crypt_start
+	EPILOGUE
 	ret
 
 DO_ALIGN(6)
 .globl DES_bs_crypt_25
 DES_bs_crypt_25:
+	PROLOGUE
 	cmpl $0,DES_bs_all_keys_changed(%rip)
 	jnz DES_bs_finalize_keys_25
 DES_bs_crypt_25_body:
@@ -1145,6 +1194,7 @@ DES_bs_crypt_25_swap:
 	movl $0x108,rounds_and_swapped
 	subl $1,iterations
 	jnz DES_bs_crypt_25_swap
+	EPILOGUE
 	ret
 DES_bs_crypt_25_next:
 	subq $nvec(0x300-48),k_ptr
@@ -1214,7 +1264,8 @@ DES_bs_finalize_keys_expand_loop:
 DO_ALIGN(6)
 .globl DES_bs_crypt_LM
 DES_bs_crypt_LM:
-	movl (%rdi),%r8d
+	PROLOGUE
+	movl (ARG1),%r8d
 	movdqa mask01,%xmm7
 	movdqa mask02,%xmm8
 	leaq DES_bs_all_xkeys(%rip),v_ptr
@@ -1370,6 +1421,7 @@ DES_bs_crypt_LM_loop:
 	subl $1,rounds
 	jnz DES_bs_crypt_LM_loop
 	xchgq %r8,%rax
+	EPILOGUE
 	ret
 
 #define rounds %eax
@@ -1377,6 +1429,7 @@ DES_bs_crypt_LM_loop:
 DO_ALIGN(6)
 .globl DES_bs_crypt_plain
 DES_bs_crypt_plain:
+	PROLOGUE
 	movdqa mask01,%xmm7
 	movdqa mask02,%xmm8
 	leaq DES_bs_all_xkeys(%rip),v_ptr
@@ -1561,6 +1614,7 @@ DES_bs_crypt_plain_loop:
 	S8(B(4), B(26), B(14), B(20))
 	subl $1,rounds
 	jnz DES_bs_crypt_plain_loop
+	EPILOGUE
 	ret
 #endif
 
@@ -1939,7 +1993,8 @@ const_stage3:
 
 DO_ALIGN(6)
 nt_crypt_all_x86_64:
-	movl (%rdi),%r8d
+	PROLOGUE
+	movl (ARG1),%r8d
 	movdqa const_stage2(%rip), t3
 	movdqa const_stage3(%rip), t4
 
@@ -1949,10 +2004,12 @@ nt_crypt_all_x86_64:
 	NT_CRYPT_BODY(3)
 
 	xchgq %r8,%rax
+	EPILOGUE
 	ret
 
 nt_crypt_all_8859_1_x86_64:
-	movl (%rdi),%r8d
+	PROLOGUE
+	movl (ARG1),%r8d
 	movdqa const_stage2(%rip), t3
 	movdqa const_stage3(%rip), t4
 
@@ -1962,6 +2019,7 @@ nt_crypt_all_8859_1_x86_64:
 	NT_CRYPT_BODY_8859_1(3)
 
 	xchgq %r8,%rax
+	EPILOGUE
 	ret
 
 #if defined(__ELF__) && defined(__linux__)
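
A note on the PROLOGUE stack adjustment in the hunk above: on function entry %rsp is 8 modulo 16 because the call just pushed an 8-byte return address, so "subq $(8+10*16), %rsp" reserves 160 bytes for the ten non-volatile XMM registers plus 8 filler bytes that restore 16-byte alignment; without the extra 8, the aligned movapd stores would fault. The sketch below is not part of the patch (the file name and the load_first_dword symbol are made up for illustration); it applies the same ARG1/PROLOGUE/EPILOGUE idea with a single saved XMM register so the arithmetic is easy to follow.

/* demo.S (hypothetical, not from the patch): same ARG1/PROLOGUE/EPILOGUE
 * idea as above, cut down to one saved XMM register.  Build with a
 * C-preprocessing assembler driver, e.g. "cc -c demo.S". */
#ifdef _WIN64
#define ARG1 %rcx
#define PROLOGUE \
	subq $(8+1*16), %rsp; \
	movapd %xmm6, 0*16(%rsp)
#define EPILOGUE \
	movapd 0*16(%rsp), %xmm6; \
	addq $(8+1*16), %rsp
#else
#define ARG1 %rdi
#define PROLOGUE
#define EPILOGUE
#endif

.text
.globl load_first_dword
/* int load_first_dword(const int *p); returns *p and clobbers %xmm6 */
load_first_dword:
	PROLOGUE
	movl (ARG1),%eax	/* first integer argument: %rcx on Win64, %rdi on SysV */
	pxor %xmm6,%xmm6	/* safe only because PROLOGUE saved %xmm6 on Win64 */
	EPILOGUE
	ret

The patch scales the same pattern to ten registers (10*16 = 160 bytes plus the 8-byte filler), and callers never notice the difference because EPILOGUE restores %rsp before each ret.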