From d7c208a92e6b15cdcd159e30cd1fc0177fd967e9 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 5 Aug 2009 16:13:20 -0700 Subject: [PATCH 01/17] Add new optimized C 'block-sha1' routines Based on the mozilla SHA1 routine, but doing the input data accesses a word at a time and with 'htonl()' instead of loading bytes and shifting. It requires an architecture that is ok with unaligned 32-bit loads and a fast htonl(). Signed-off-by: Linus Torvalds Signed-off-by: Junio C Hamano --- Makefile | 9 +++ block-sha1/sha1.c | 145 ++++++++++++++++++++++++++++++++++++++++++++++ block-sha1/sha1.h | 21 +++++++ 3 files changed, 175 insertions(+) create mode 100644 block-sha1/sha1.c create mode 100644 block-sha1/sha1.h diff --git a/Makefile b/Makefile index daf4296706..e6df8ecde6 100644 --- a/Makefile +++ b/Makefile @@ -84,6 +84,10 @@ all:: # specify your own (or DarwinPort's) include directories and # library directories by defining CFLAGS and LDFLAGS appropriately. # +# Define BLK_SHA1 environment variable if you want the C version +# of the SHA1 that assumes you can do unaligned 32-bit loads and +# have a fast htonl() function. +# # Define PPC_SHA1 environment variable when running make to make use of # a bundled SHA1 routine optimized for PowerPC. # @@ -1167,6 +1171,10 @@ ifdef NO_DEFLATE_BOUND BASIC_CFLAGS += -DNO_DEFLATE_BOUND endif +ifdef BLK_SHA1 + SHA1_HEADER = "block-sha1/sha1.h" + LIB_OBJS += block-sha1/sha1.o +else ifdef PPC_SHA1 SHA1_HEADER = "ppc/sha1.h" LIB_OBJS += ppc/sha1.o ppc/sha1ppc.o @@ -1184,6 +1192,7 @@ else endif endif endif +endif ifdef NO_PERL_MAKEMAKER export NO_PERL_MAKEMAKER endif diff --git a/block-sha1/sha1.c b/block-sha1/sha1.c new file mode 100644 index 0000000000..50b2b42b03 --- /dev/null +++ b/block-sha1/sha1.c @@ -0,0 +1,145 @@ +/* + * Based on the Mozilla SHA1 (see mozilla-sha1/sha1.c), + * optimized to do word accesses rather than byte accesses, + * and to avoid unnecessary copies into the context array. 
+ */ + +#include <string.h> +#include <arpa/inet.h> + +#include "sha1.h" + +/* Hash one 64-byte block of data */ +static void blk_SHA1Block(blk_SHA_CTX *ctx, const unsigned int *data); + +void blk_SHA1_Init(blk_SHA_CTX *ctx) +{ + ctx->lenW = 0; + ctx->size = 0; + + /* Initialize H with the magic constants (see FIPS180 for constants) + */ + ctx->H[0] = 0x67452301; + ctx->H[1] = 0xefcdab89; + ctx->H[2] = 0x98badcfe; + ctx->H[3] = 0x10325476; + ctx->H[4] = 0xc3d2e1f0; +} + + +void blk_SHA1_Update(blk_SHA_CTX *ctx, const void *data, unsigned long len) +{ + int lenW = ctx->lenW; + + ctx->size += len; + + /* Read the data into W and process blocks as they get full + */ + if (lenW) { + int left = 64 - lenW; + if (len < left) + left = len; + memcpy(lenW + (char *)ctx->W, data, left); + lenW = (lenW + left) & 63; + len -= left; + data += left; + ctx->lenW = lenW; + if (lenW) + return; + blk_SHA1Block(ctx, ctx->W); + } + while (len >= 64) { + blk_SHA1Block(ctx, data); + data += 64; + len -= 64; + } + if (len) { + memcpy(ctx->W, data, len); + ctx->lenW = len; + } +} + + +void blk_SHA1_Final(unsigned char hashout[20], blk_SHA_CTX *ctx) +{ + static const unsigned char pad[64] = { 0x80 }; + unsigned int padlen[2]; + int i; + + /* Pad with a binary 1 (ie 0x80), then zeroes, then length + */ + padlen[0] = htonl(ctx->size >> (32 - 3)); + padlen[1] = htonl(ctx->size << 3); + + blk_SHA1_Update(ctx, pad, 1+ (63 & (55 - ctx->lenW))); + blk_SHA1_Update(ctx, padlen, 8); + + /* Output hash + */ + for (i = 0; i < 5; i++) + ((unsigned int *)hashout)[i] = htonl(ctx->H[i]); +} + +#define SHA_ROT(X,n) (((X) << (n)) | ((X) >> (32-(n)))) + +static void blk_SHA1Block(blk_SHA_CTX *ctx, const unsigned int *data) +{ + int t; + unsigned int A,B,C,D,E,TEMP; + unsigned int W[80]; + + for (t = 0; t < 16; t++) + W[t] = htonl(data[t]); + + /* Unroll it?
*/ + for (t = 16; t <= 79; t++) + W[t] = SHA_ROT(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1); + + A = ctx->H[0]; + B = ctx->H[1]; + C = ctx->H[2]; + D = ctx->H[3]; + E = ctx->H[4]; + +#define T_0_19(t) \ + TEMP = SHA_ROT(A,5) + (((C^D)&B)^D) + E + W[t] + 0x5a827999; \ + E = D; D = C; C = SHA_ROT(B, 30); B = A; A = TEMP; + + T_0_19( 0); T_0_19( 1); T_0_19( 2); T_0_19( 3); T_0_19( 4); + T_0_19( 5); T_0_19( 6); T_0_19( 7); T_0_19( 8); T_0_19( 9); + T_0_19(10); T_0_19(11); T_0_19(12); T_0_19(13); T_0_19(14); + T_0_19(15); T_0_19(16); T_0_19(17); T_0_19(18); T_0_19(19); + +#define T_20_39(t) \ + TEMP = SHA_ROT(A,5) + (B^C^D) + E + W[t] + 0x6ed9eba1; \ + E = D; D = C; C = SHA_ROT(B, 30); B = A; A = TEMP; + + T_20_39(20); T_20_39(21); T_20_39(22); T_20_39(23); T_20_39(24); + T_20_39(25); T_20_39(26); T_20_39(27); T_20_39(28); T_20_39(29); + T_20_39(30); T_20_39(31); T_20_39(32); T_20_39(33); T_20_39(34); + T_20_39(35); T_20_39(36); T_20_39(37); T_20_39(38); T_20_39(39); + +#define T_40_59(t) \ + TEMP = SHA_ROT(A,5) + ((B&C)|(D&(B|C))) + E + W[t] + 0x8f1bbcdc; \ + E = D; D = C; C = SHA_ROT(B, 30); B = A; A = TEMP; + + T_40_59(40); T_40_59(41); T_40_59(42); T_40_59(43); T_40_59(44); + T_40_59(45); T_40_59(46); T_40_59(47); T_40_59(48); T_40_59(49); + T_40_59(50); T_40_59(51); T_40_59(52); T_40_59(53); T_40_59(54); + T_40_59(55); T_40_59(56); T_40_59(57); T_40_59(58); T_40_59(59); + +#define T_60_79(t) \ + TEMP = SHA_ROT(A,5) + (B^C^D) + E + W[t] + 0xca62c1d6; \ + E = D; D = C; C = SHA_ROT(B, 30); B = A; A = TEMP; + + T_60_79(60); T_60_79(61); T_60_79(62); T_60_79(63); T_60_79(64); + T_60_79(65); T_60_79(66); T_60_79(67); T_60_79(68); T_60_79(69); + T_60_79(70); T_60_79(71); T_60_79(72); T_60_79(73); T_60_79(74); + T_60_79(75); T_60_79(76); T_60_79(77); T_60_79(78); T_60_79(79); + + ctx->H[0] += A; + ctx->H[1] += B; + ctx->H[2] += C; + ctx->H[3] += D; + ctx->H[4] += E; +} diff --git a/block-sha1/sha1.h b/block-sha1/sha1.h new file mode 100644 index 0000000000..7be2d93a2a --- /dev/null +++ b/block-sha1/sha1.h @@ -0,0 +1,21 @@ +/* + * Based on the Mozilla SHA1 (see mozilla-sha1/sha1.h), + * optimized to do word accesses rather than byte accesses, + * and to avoid unnecessary copies into the context array. + */ + +typedef struct { + unsigned int H[5]; + unsigned int W[16]; + int lenW; + unsigned long long size; +} blk_SHA_CTX; + +void blk_SHA1_Init(blk_SHA_CTX *ctx); +void blk_SHA1_Update(blk_SHA_CTX *ctx, const void *dataIn, unsigned long len); +void blk_SHA1_Final(unsigned char hashout[20], blk_SHA_CTX *ctx); + +#define git_SHA_CTX blk_SHA_CTX +#define git_SHA1_Init blk_SHA1_Init +#define git_SHA1_Update blk_SHA1_Update +#define git_SHA1_Final blk_SHA1_Final From b26a9d50899a5d65bafcb521c5495f03b2e2e0e9 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Thu, 6 Aug 2009 13:56:19 -0700 Subject: [PATCH 02/17] block-sha1: undo ctx->size change Undo the change I picked up from the mailing list discussion suggested by Nico, not because it is wrong, but it will be done at the end of the follow-up series. 
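A stand-alone illustration (added for exposition, not part of the patch): whether ctx->size counts bytes and is shifted by 3 at the end, or counts bits with the shift done on the way in, the two 32-bit words of the FIPS 180 length trailer come out the same. A minimal check, assuming nothing beyond standard C:

	#include <assert.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t nbytes = 0x123456789ULL;	/* ~4.5 GiB, so the high word is non-zero */
		uint64_t nbits = nbytes << 3;		/* counting bits up front, as this patch does */

		/* counting bytes, shifting at the end */
		uint32_t hi_bytes = (uint32_t)(nbytes >> 29);
		uint32_t lo_bytes = (uint32_t)(nbytes << 3);

		/* counting bits up front */
		uint32_t hi_bits = (uint32_t)(nbits >> 32);
		uint32_t lo_bits = (uint32_t)nbits;

		assert(hi_bytes == hi_bits && lo_bytes == lo_bits);
		return 0;
	}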
Signed-off-by: Junio C Hamano --- block-sha1/sha1.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/block-sha1/sha1.c b/block-sha1/sha1.c index 50b2b42b03..eef32f7859 100644 --- a/block-sha1/sha1.c +++ b/block-sha1/sha1.c @@ -31,7 +31,7 @@ void blk_SHA1_Update(blk_SHA_CTX *ctx, const void *data, unsigned long len) { int lenW = ctx->lenW; - ctx->size += len; + ctx->size += (unsigned long long) len << 3; /* Read the data into W and process blocks as they get full */ @@ -68,8 +68,8 @@ void blk_SHA1_Final(unsigned char hashout[20], blk_SHA_CTX *ctx) /* Pad with a binary 1 (ie 0x80), then zeroes, then length */ - padlen[0] = htonl(ctx->size >> (32 - 3)); - padlen[1] = htonl(ctx->size << 3); + padlen[0] = htonl(ctx->size >> 32); + padlen[1] = htonl(ctx->size); blk_SHA1_Update(ctx, pad, 1+ (63 & (55 - ctx->lenW))); blk_SHA1_Update(ctx, padlen, 8); From b8e48a89b8f581eaf95b57782bb8e620ca30e968 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 5 Aug 2009 19:42:15 -0700 Subject: [PATCH 03/17] block-sha1: try to use rol/ror appropriately Use the one with the smaller constant. It _can_ generate slightly smaller code (a constant of 1 is special), but perhaps more importantly it's possibly faster on any uarch that does a rotate with a loop. Signed-off-by: Linus Torvalds Signed-off-by: Junio C Hamano --- block-sha1/sha1.c | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/block-sha1/sha1.c b/block-sha1/sha1.c index eef32f7859..a45a3dec1e 100644 --- a/block-sha1/sha1.c +++ b/block-sha1/sha1.c @@ -80,7 +80,19 @@ void blk_SHA1_Final(unsigned char hashout[20], blk_SHA_CTX *ctx) ((unsigned int *)hashout)[i] = htonl(ctx->H[i]); } -#define SHA_ROT(X,n) (((X) << (n)) | ((X) >> (32-(n)))) +#if defined(__i386__) || defined(__x86_64__) + +#define SHA_ASM(op, x, n) ({ unsigned int __res; asm(op " %1,%0":"=r" (__res):"i" (n), "0" (x)); __res; }) +#define SHA_ROL(x,n) SHA_ASM("rol", x, n) +#define SHA_ROR(x,n) SHA_ASM("ror", x, n) + +#else + +#define SHA_ROT(X,n) (((X) << (l)) | ((X) >> (r))) +#define SHA_ROL(X,n) SHA_ROT(X,n,32-(n)) +#define SHA_ROR(X,n) SHA_ROT(X,32-(n),n) + +#endif static void blk_SHA1Block(blk_SHA_CTX *ctx, const unsigned int *data) { @@ -93,7 +105,7 @@ static void blk_SHA1Block(blk_SHA_CTX *ctx, const unsigned int *data) /* Unroll it? 
*/ for (t = 16; t <= 79; t++) - W[t] = SHA_ROT(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1); + W[t] = SHA_ROL(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1); A = ctx->H[0]; B = ctx->H[1]; @@ -102,8 +114,8 @@ static void blk_SHA1Block(blk_SHA_CTX *ctx, const unsigned int *data) E = ctx->H[4]; #define T_0_19(t) \ - TEMP = SHA_ROT(A,5) + (((C^D)&B)^D) + E + W[t] + 0x5a827999; \ - E = D; D = C; C = SHA_ROT(B, 30); B = A; A = TEMP; + TEMP = SHA_ROL(A,5) + (((C^D)&B)^D) + E + W[t] + 0x5a827999; \ + E = D; D = C; C = SHA_ROR(B, 2); B = A; A = TEMP; T_0_19( 0); T_0_19( 1); T_0_19( 2); T_0_19( 3); T_0_19( 4); T_0_19( 5); T_0_19( 6); T_0_19( 7); T_0_19( 8); T_0_19( 9); @@ -111,8 +123,8 @@ static void blk_SHA1Block(blk_SHA_CTX *ctx, const unsigned int *data) T_0_19(15); T_0_19(16); T_0_19(17); T_0_19(18); T_0_19(19); #define T_20_39(t) \ - TEMP = SHA_ROT(A,5) + (B^C^D) + E + W[t] + 0x6ed9eba1; \ - E = D; D = C; C = SHA_ROT(B, 30); B = A; A = TEMP; + TEMP = SHA_ROL(A,5) + (B^C^D) + E + W[t] + 0x6ed9eba1; \ + E = D; D = C; C = SHA_ROR(B, 2); B = A; A = TEMP; T_20_39(20); T_20_39(21); T_20_39(22); T_20_39(23); T_20_39(24); T_20_39(25); T_20_39(26); T_20_39(27); T_20_39(28); T_20_39(29); @@ -120,8 +132,8 @@ static void blk_SHA1Block(blk_SHA_CTX *ctx, const unsigned int *data) T_20_39(35); T_20_39(36); T_20_39(37); T_20_39(38); T_20_39(39); #define T_40_59(t) \ - TEMP = SHA_ROT(A,5) + ((B&C)|(D&(B|C))) + E + W[t] + 0x8f1bbcdc; \ - E = D; D = C; C = SHA_ROT(B, 30); B = A; A = TEMP; + TEMP = SHA_ROL(A,5) + ((B&C)|(D&(B|C))) + E + W[t] + 0x8f1bbcdc; \ + E = D; D = C; C = SHA_ROR(B, 2); B = A; A = TEMP; T_40_59(40); T_40_59(41); T_40_59(42); T_40_59(43); T_40_59(44); T_40_59(45); T_40_59(46); T_40_59(47); T_40_59(48); T_40_59(49); @@ -129,8 +141,8 @@ static void blk_SHA1Block(blk_SHA_CTX *ctx, const unsigned int *data) T_40_59(55); T_40_59(56); T_40_59(57); T_40_59(58); T_40_59(59); #define T_60_79(t) \ - TEMP = SHA_ROT(A,5) + (B^C^D) + E + W[t] + 0xca62c1d6; \ - E = D; D = C; C = SHA_ROT(B, 30); B = A; A = TEMP; + TEMP = SHA_ROL(A,5) + (B^C^D) + E + W[t] + 0xca62c1d6; \ + E = D; D = C; C = SHA_ROR(B, 2); B = A; A = TEMP; T_60_79(60); T_60_79(61); T_60_79(62); T_60_79(63); T_60_79(64); T_60_79(65); T_60_79(66); T_60_79(67); T_60_79(68); T_60_79(69); From fd536d3439fa2a06730884df31e2e98c9006c947 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Thu, 6 Aug 2009 13:52:58 -0700 Subject: [PATCH 04/17] block-sha1: minor fixups Bert Wesarg noticed non-x86 version of SHA_ROT() had a typo. Also spell in-line assembly as __asm__(), otherwise I seem to get error: implicit declaration of function 'asm' from my compiler. 
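For illustration, a stand-alone check (not from the patch) that the repaired fallback macros behave as rotates; gcc and clang typically recognize this shift-or idiom and emit a single rotate instruction even without the x86 asm path.

	#include <assert.h>

	#define SHA_ROT(X,l,r)	(((X) << (l)) | ((X) >> (r)))
	#define SHA_ROL(X,n)	SHA_ROT(X,n,32-(n))
	#define SHA_ROR(X,n)	SHA_ROT(X,32-(n),n)

	int main(void)
	{
		unsigned int x = 0x12345678;

		/* rotating left by n is the same as rotating right by 32-n */
		assert(SHA_ROL(x, 5) == SHA_ROR(x, 27));
		assert(SHA_ROL(0x80000000u, 1) == 1);
		assert(SHA_ROR(1u, 2) == 0x40000000u);
		return 0;
	}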
Signed-off-by: Junio C Hamano --- block-sha1/sha1.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block-sha1/sha1.c b/block-sha1/sha1.c index a45a3dec1e..698e435a39 100644 --- a/block-sha1/sha1.c +++ b/block-sha1/sha1.c @@ -82,13 +82,13 @@ void blk_SHA1_Final(unsigned char hashout[20], blk_SHA_CTX *ctx) #if defined(__i386__) || defined(__x86_64__) -#define SHA_ASM(op, x, n) ({ unsigned int __res; asm(op " %1,%0":"=r" (__res):"i" (n), "0" (x)); __res; }) +#define SHA_ASM(op, x, n) ({ unsigned int __res; __asm__(op " %1,%0":"=r" (__res):"i" (n), "0" (x)); __res; }) #define SHA_ROL(x,n) SHA_ASM("rol", x, n) #define SHA_ROR(x,n) SHA_ASM("ror", x, n) #else -#define SHA_ROT(X,n) (((X) << (l)) | ((X) >> (r))) +#define SHA_ROT(X,l,r) (((X) << (l)) | ((X) >> (r))) #define SHA_ROL(X,n) SHA_ROT(X,n,32-(n)) #define SHA_ROR(X,n) SHA_ROT(X,32-(n),n) From 139e3456ecf18fc03a75eda7a77441e8fec344b9 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 5 Aug 2009 20:28:07 -0700 Subject: [PATCH 05/17] block-sha1: make the 'ntohl()' part of the first SHA1 loop This helps a teeny bit. But what I -really- want to do is to avoid the whole 80-array loop, and do the xor updates as I go along.. Signed-off-by: Linus Torvalds Signed-off-by: Junio C Hamano --- block-sha1/sha1.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/block-sha1/sha1.c b/block-sha1/sha1.c index 698e435a39..13da511b78 100644 --- a/block-sha1/sha1.c +++ b/block-sha1/sha1.c @@ -100,27 +100,31 @@ static void blk_SHA1Block(blk_SHA_CTX *ctx, const unsigned int *data) unsigned int A,B,C,D,E,TEMP; unsigned int W[80]; - for (t = 0; t < 16; t++) - W[t] = htonl(data[t]); - - /* Unroll it? */ - for (t = 16; t <= 79; t++) - W[t] = SHA_ROL(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1); - A = ctx->H[0]; B = ctx->H[1]; C = ctx->H[2]; D = ctx->H[3]; E = ctx->H[4]; -#define T_0_19(t) \ +#define T_0_15(t) \ + TEMP = htonl(data[t]); W[t] = TEMP; \ + TEMP += SHA_ROL(A,5) + (((C^D)&B)^D) + E + 0x5a827999; \ + E = D; D = C; C = SHA_ROR(B, 2); B = A; A = TEMP; \ + + T_0_15( 0); T_0_15( 1); T_0_15( 2); T_0_15( 3); T_0_15( 4); + T_0_15( 5); T_0_15( 6); T_0_15( 7); T_0_15( 8); T_0_15( 9); + T_0_15(10); T_0_15(11); T_0_15(12); T_0_15(13); T_0_15(14); + T_0_15(15); + + /* Unroll it? */ + for (t = 16; t <= 79; t++) + W[t] = SHA_ROL(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1); + +#define T_16_19(t) \ TEMP = SHA_ROL(A,5) + (((C^D)&B)^D) + E + W[t] + 0x5a827999; \ E = D; D = C; C = SHA_ROR(B, 2); B = A; A = TEMP; - T_0_19( 0); T_0_19( 1); T_0_19( 2); T_0_19( 3); T_0_19( 4); - T_0_19( 5); T_0_19( 6); T_0_19( 7); T_0_19( 8); T_0_19( 9); - T_0_19(10); T_0_19(11); T_0_19(12); T_0_19(13); T_0_19(14); - T_0_19(15); T_0_19(16); T_0_19(17); T_0_19(18); T_0_19(19); + T_16_19(16); T_16_19(17); T_16_19(18); T_16_19(19); #define T_20_39(t) \ TEMP = SHA_ROL(A,5) + (B^C^D) + E + W[t] + 0x6ed9eba1; \ From 7b5075fcfb069fc36ba4cfe5567234974793ab58 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 5 Aug 2009 20:49:41 -0700 Subject: [PATCH 06/17] block-sha1: re-use the temporary array as we calculate the SHA1 The mozilla-SHA1 code did this 80-word array for the 80 iterations. But the SHA1 state is really just 512 bits, and you can actually keep it in a kind of "circular queue" of just 16 words instead. This requires us to do the xor updates as we go along (rather than as a pre-phase), but that's really what we want to do anyway. This gets me really close to the OpenSSL performance on my Nehalem. 
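As a stand-alone demonstration (added for exposition, not something from the patch), the claim that a 16-word circular queue suffices can be checked directly against the full 80-word expansion:

	#include <assert.h>

	#define ROL(x,n)	(((x) << (n)) | ((x) >> (32-(n))))
	#define W(x)	(array[(x)&15])

	int main(void)
	{
		unsigned int full[80], array[16];
		int t;

		for (t = 0; t < 16; t++)	/* arbitrary input block */
			full[t] = array[t] = t * 0x9e3779b9u;
		for (t = 16; t < 80; t++)	/* classic 80-word expansion */
			full[t] = ROL(full[t-3] ^ full[t-8] ^ full[t-14] ^ full[t-16], 1);
		for (t = 16; t < 80; t++) {
			/* in-place mixing over the 16-word window */
			W(t) = ROL(W(t+13) ^ W(t+8) ^ W(t+2) ^ W(t), 1);
			assert(W(t) == full[t]);
		}
		return 0;
	}

It works because by the time W(t) is recomputed only the previous 16 words are still live, and the taps (t-3, t-8, t-14, t-16) map to (t+13, t+8, t+2, t) modulo 16, which is exactly what the SHA_XOR() in the diff below uses.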
Look ma, all C code (ok, there's the rol/ror hack, but that one doesn't strictly even matter on my Nehalem, it's just a local optimization). Signed-off-by: Linus Torvalds Signed-off-by: Junio C Hamano --- block-sha1/sha1.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/block-sha1/sha1.c b/block-sha1/sha1.c index 13da511b78..8c4c216f93 100644 --- a/block-sha1/sha1.c +++ b/block-sha1/sha1.c @@ -96,9 +96,8 @@ void blk_SHA1_Final(unsigned char hashout[20], blk_SHA_CTX *ctx) static void blk_SHA1Block(blk_SHA_CTX *ctx, const unsigned int *data) { - int t; unsigned int A,B,C,D,E,TEMP; - unsigned int W[80]; + unsigned int array[16]; A = ctx->H[0]; B = ctx->H[1]; @@ -107,8 +106,8 @@ static void blk_SHA1Block(blk_SHA_CTX *ctx, const unsigned int *data) E = ctx->H[4]; #define T_0_15(t) \ - TEMP = htonl(data[t]); W[t] = TEMP; \ - TEMP += SHA_ROL(A,5) + (((C^D)&B)^D) + E + 0x5a827999; \ + TEMP = htonl(data[t]); array[t] = TEMP; \ + TEMP += SHA_ROL(A,5) + (((C^D)&B)^D) + E + 0x5a827999; \ E = D; D = C; C = SHA_ROR(B, 2); B = A; A = TEMP; \ T_0_15( 0); T_0_15( 1); T_0_15( 2); T_0_15( 3); T_0_15( 4); @@ -116,18 +115,21 @@ static void blk_SHA1Block(blk_SHA_CTX *ctx, const unsigned int *data) T_0_15(10); T_0_15(11); T_0_15(12); T_0_15(13); T_0_15(14); T_0_15(15); - /* Unroll it? */ - for (t = 16; t <= 79; t++) - W[t] = SHA_ROL(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1); +/* This "rolls" over the 512-bit array */ +#define W(x) (array[(x)&15]) +#define SHA_XOR(t) \ + TEMP = SHA_ROL(W(t+13) ^ W(t+8) ^ W(t+2) ^ W(t), 1); W(t) = TEMP; #define T_16_19(t) \ - TEMP = SHA_ROL(A,5) + (((C^D)&B)^D) + E + W[t] + 0x5a827999; \ - E = D; D = C; C = SHA_ROR(B, 2); B = A; A = TEMP; + SHA_XOR(t); \ + TEMP += SHA_ROL(A,5) + (((C^D)&B)^D) + E + 0x5a827999; \ + E = D; D = C; C = SHA_ROR(B, 2); B = A; A = TEMP; \ T_16_19(16); T_16_19(17); T_16_19(18); T_16_19(19); #define T_20_39(t) \ - TEMP = SHA_ROL(A,5) + (B^C^D) + E + W[t] + 0x6ed9eba1; \ + SHA_XOR(t); \ + TEMP += SHA_ROL(A,5) + (B^C^D) + E + 0x6ed9eba1; \ E = D; D = C; C = SHA_ROR(B, 2); B = A; A = TEMP; T_20_39(20); T_20_39(21); T_20_39(22); T_20_39(23); T_20_39(24); @@ -136,7 +138,8 @@ static void blk_SHA1Block(blk_SHA_CTX *ctx, const unsigned int *data) T_20_39(35); T_20_39(36); T_20_39(37); T_20_39(38); T_20_39(39); #define T_40_59(t) \ - TEMP = SHA_ROL(A,5) + ((B&C)|(D&(B|C))) + E + W[t] + 0x8f1bbcdc; \ + SHA_XOR(t); \ + TEMP += SHA_ROL(A,5) + ((B&C)|(D&(B|C))) + E + 0x8f1bbcdc; \ E = D; D = C; C = SHA_ROR(B, 2); B = A; A = TEMP; T_40_59(40); T_40_59(41); T_40_59(42); T_40_59(43); T_40_59(44); @@ -145,7 +148,8 @@ static void blk_SHA1Block(blk_SHA_CTX *ctx, const unsigned int *data) T_40_59(55); T_40_59(56); T_40_59(57); T_40_59(58); T_40_59(59); #define T_60_79(t) \ - TEMP = SHA_ROL(A,5) + (B^C^D) + E + W[t] + 0xca62c1d6; \ + SHA_XOR(t); \ + TEMP += SHA_ROL(A,5) + (B^C^D) + E + 0xca62c1d6; \ E = D; D = C; C = SHA_ROR(B, 2); B = A; A = TEMP; T_60_79(60); T_60_79(61); T_60_79(62); T_60_79(63); T_60_79(64); From ab14c823dfbf1245712c8179952b51822135d8a8 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 6 Aug 2009 07:20:54 -0700 Subject: [PATCH 07/17] block-sha1: macroize the rounds a bit further Avoid repeating the shared parts of the different rounds by adding a macro layer or two. It was already more cpp than C. 
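A sanity note on the boolean functions that the new SHA_ROUND macro takes as an argument (see the diff below): round 1's (((C^D)&B)^D) is the classic SHA-1 "choice" function (B&C)|(~B&D) with one operation shaved off. A stand-alone check of that identity (illustrative, not from the patch):

	#include <assert.h>

	int main(void)
	{
		static const unsigned int v[] =
			{ 0x00000000u, 0xffffffffu, 0x12345678u, 0x9abcdef0u };
		int i, j, k;

		for (i = 0; i < 4; i++)
			for (j = 0; j < 4; j++)
				for (k = 0; k < 4; k++) {
					unsigned int b = v[i], c = v[j], d = v[k];
					/* both compute the bitwise "b ? c : d" */
					assert((((c^d)&b)^d) == ((b&c)|(~b&d)));
				}
		return 0;
	}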
Signed-off-by: Linus Torvalds Signed-off-by: Junio C Hamano --- block-sha1/sha1.c | 56 ++++++++++++++++++++++------------------------- 1 file changed, 26 insertions(+), 30 deletions(-) diff --git a/block-sha1/sha1.c b/block-sha1/sha1.c index 8c4c216f93..53c93ba603 100644 --- a/block-sha1/sha1.c +++ b/block-sha1/sha1.c @@ -94,6 +94,27 @@ void blk_SHA1_Final(unsigned char hashout[20], blk_SHA_CTX *ctx) #endif +/* This "rolls" over the 512-bit array */ +#define W(x) (array[(x)&15]) + +/* + * Where do we get the source from? The first 16 iterations get it from + * the input data, the next mix it from the 512-bit array. + */ +#define SHA_SRC(t) htonl(data[t]) +#define SHA_MIX(t) SHA_ROL(W(t+13) ^ W(t+8) ^ W(t+2) ^ W(t), 1) + +#define SHA_ROUND(t, input, fn, constant) \ + TEMP = input(t); W(t) = TEMP; \ + TEMP += SHA_ROL(A,5) + (fn) + E + (constant); \ + E = D; D = C; C = SHA_ROR(B, 2); B = A; A = TEMP + +#define T_0_15(t) SHA_ROUND(t, SHA_SRC, (((C^D)&B)^D) , 0x5a827999 ) +#define T_16_19(t) SHA_ROUND(t, SHA_MIX, (((C^D)&B)^D) , 0x5a827999 ) +#define T_20_39(t) SHA_ROUND(t, SHA_MIX, (B^C^D) , 0x6ed9eba1 ) +#define T_40_59(t) SHA_ROUND(t, SHA_MIX, ((B&C)|(D&(B|C))) , 0x8f1bbcdc ) +#define T_60_79(t) SHA_ROUND(t, SHA_MIX, (B^C^D) , 0xca62c1d6 ) + static void blk_SHA1Block(blk_SHA_CTX *ctx, const unsigned int *data) { unsigned int A,B,C,D,E,TEMP; @@ -105,53 +126,28 @@ static void blk_SHA1Block(blk_SHA_CTX *ctx, const unsigned int *data) D = ctx->H[3]; E = ctx->H[4]; -#define T_0_15(t) \ - TEMP = htonl(data[t]); array[t] = TEMP; \ - TEMP += SHA_ROL(A,5) + (((C^D)&B)^D) + E + 0x5a827999; \ - E = D; D = C; C = SHA_ROR(B, 2); B = A; A = TEMP; \ - + /* Round 1 - iterations 0-16 take their input from 'data' */ T_0_15( 0); T_0_15( 1); T_0_15( 2); T_0_15( 3); T_0_15( 4); T_0_15( 5); T_0_15( 6); T_0_15( 7); T_0_15( 8); T_0_15( 9); T_0_15(10); T_0_15(11); T_0_15(12); T_0_15(13); T_0_15(14); T_0_15(15); -/* This "rolls" over the 512-bit array */ -#define W(x) (array[(x)&15]) -#define SHA_XOR(t) \ - TEMP = SHA_ROL(W(t+13) ^ W(t+8) ^ W(t+2) ^ W(t), 1); W(t) = TEMP; - -#define T_16_19(t) \ - SHA_XOR(t); \ - TEMP += SHA_ROL(A,5) + (((C^D)&B)^D) + E + 0x5a827999; \ - E = D; D = C; C = SHA_ROR(B, 2); B = A; A = TEMP; \ - + /* Round 1 - tail. 
Input from 512-bit mixing array */ T_16_19(16); T_16_19(17); T_16_19(18); T_16_19(19); -#define T_20_39(t) \ - SHA_XOR(t); \ - TEMP += SHA_ROL(A,5) + (B^C^D) + E + 0x6ed9eba1; \ - E = D; D = C; C = SHA_ROR(B, 2); B = A; A = TEMP; - + /* Round 2 */ T_20_39(20); T_20_39(21); T_20_39(22); T_20_39(23); T_20_39(24); T_20_39(25); T_20_39(26); T_20_39(27); T_20_39(28); T_20_39(29); T_20_39(30); T_20_39(31); T_20_39(32); T_20_39(33); T_20_39(34); T_20_39(35); T_20_39(36); T_20_39(37); T_20_39(38); T_20_39(39); -#define T_40_59(t) \ - SHA_XOR(t); \ - TEMP += SHA_ROL(A,5) + ((B&C)|(D&(B|C))) + E + 0x8f1bbcdc; \ - E = D; D = C; C = SHA_ROR(B, 2); B = A; A = TEMP; - + /* Round 3 */ T_40_59(40); T_40_59(41); T_40_59(42); T_40_59(43); T_40_59(44); T_40_59(45); T_40_59(46); T_40_59(47); T_40_59(48); T_40_59(49); T_40_59(50); T_40_59(51); T_40_59(52); T_40_59(53); T_40_59(54); T_40_59(55); T_40_59(56); T_40_59(57); T_40_59(58); T_40_59(59); -#define T_60_79(t) \ - SHA_XOR(t); \ - TEMP += SHA_ROL(A,5) + (B^C^D) + E + 0xca62c1d6; \ - E = D; D = C; C = SHA_ROR(B, 2); B = A; A = TEMP; - + /* Round 4 */ T_60_79(60); T_60_79(61); T_60_79(62); T_60_79(63); T_60_79(64); T_60_79(65); T_60_79(66); T_60_79(67); T_60_79(68); T_60_79(69); T_60_79(70); T_60_79(71); T_60_79(72); T_60_79(73); T_60_79(74); From e869e113c8f91999f9a433436e0b863fe2727b61 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 6 Aug 2009 07:27:57 -0700 Subject: [PATCH 08/17] block-sha1: Use '(B&C)+(D&(B^C))' instead of '(B&C)|(D&(B|C))' in round 3 It's an equivalent expression, but the '+' gives us some freedom in instruction selection (for example, we can use 'lea' rather than 'add'), and associates with the other additions around it to give some minor scheduling freedom. Suggested-by: linux@horizon.com Signed-off-by: Linus Torvalds Signed-off-by: Junio C Hamano --- block-sha1/sha1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block-sha1/sha1.c b/block-sha1/sha1.c index 53c93ba603..5bf1b36bd1 100644 --- a/block-sha1/sha1.c +++ b/block-sha1/sha1.c @@ -112,7 +112,7 @@ void blk_SHA1_Final(unsigned char hashout[20], blk_SHA_CTX *ctx) #define T_0_15(t) SHA_ROUND(t, SHA_SRC, (((C^D)&B)^D) , 0x5a827999 ) #define T_16_19(t) SHA_ROUND(t, SHA_MIX, (((C^D)&B)^D) , 0x5a827999 ) #define T_20_39(t) SHA_ROUND(t, SHA_MIX, (B^C^D) , 0x6ed9eba1 ) -#define T_40_59(t) SHA_ROUND(t, SHA_MIX, ((B&C)|(D&(B|C))) , 0x8f1bbcdc ) +#define T_40_59(t) SHA_ROUND(t, SHA_MIX, ((B&C)+(D&(B^C))) , 0x8f1bbcdc ) #define T_60_79(t) SHA_ROUND(t, SHA_MIX, (B^C^D) , 0xca62c1d6 ) static void blk_SHA1Block(blk_SHA_CTX *ctx, const unsigned int *data) From 5d5210c35aa83342163ab0ab80b8e6d6fa3ce931 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 6 Aug 2009 07:45:46 -0700 Subject: [PATCH 09/17] block-sha1: get rid of redundant 'lenW' context .. and simplify the ctx->size logic. We now count the size in bytes, which means that 'lenW' was always just the low 6 bits of the total size, so we don't carry it around separately any more. And we do the 'size in bits' shift at the end. Suggested by Nicolas Pitre and linux@horizon.com. 
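A quick stand-alone check (added for exposition) of the padding arithmetic that survives this cleanup: for any residue lenW, writing 1+ (63 & (55 - lenW)) pad bytes plus the 8-byte length field always brings the total to a 64-byte boundary.

	#include <assert.h>

	int main(void)
	{
		int lenW;

		for (lenW = 0; lenW < 64; lenW++) {
			int total = lenW + 1 + (63 & (55 - lenW)) + 8;
			assert(total % 64 == 0);
		}
		return 0;
	}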
Signed-off-by: Linus Torvalds Signed-off-by: Junio C Hamano --- block-sha1/sha1.c | 17 +++++++---------- block-sha1/sha1.h | 1 - 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/block-sha1/sha1.c b/block-sha1/sha1.c index 5bf1b36bd1..fdd400f22d 100644 --- a/block-sha1/sha1.c +++ b/block-sha1/sha1.c @@ -14,7 +14,6 @@ static void blk_SHA1Block(blk_SHA_CTX *ctx, const unsigned int *data); void blk_SHA1_Init(blk_SHA_CTX *ctx) { - ctx->lenW = 0; ctx->size = 0; /* Initialize H with the magic constants (see FIPS180 for constants) @@ -29,9 +28,9 @@ void blk_SHA1_Init(blk_SHA_CTX *ctx) void blk_SHA1_Update(blk_SHA_CTX *ctx, const void *data, unsigned long len) { - int lenW = ctx->lenW; + int lenW = ctx->size & 63; - ctx->size += (unsigned long long) len << 3; + ctx->size += len; /* Read the data into W and process blocks as they get full */ @@ -43,7 +42,6 @@ void blk_SHA1_Update(blk_SHA_CTX *ctx, const void *data, unsigned long len) lenW = (lenW + left) & 63; len -= left; data += left; - ctx->lenW = lenW; if (lenW) return; blk_SHA1Block(ctx, ctx->W); @@ -53,10 +51,8 @@ void blk_SHA1_Update(blk_SHA_CTX *ctx, const void *data, unsigned long len) data += 64; len -= 64; } - if (len) { + if (len) memcpy(ctx->W, data, len); - ctx->lenW = len; - } } @@ -68,10 +64,11 @@ void blk_SHA1_Update(blk_SHA_CTX *ctx, const void *data, unsigned long len) /* Pad with a binary 1 (ie 0x80), then zeroes, then length */ - padlen[0] = htonl(ctx->size >> 32); - padlen[1] = htonl(ctx->size); + padlen[0] = htonl(ctx->size >> 29); + padlen[1] = htonl(ctx->size << 3); - blk_SHA1_Update(ctx, pad, 1+ (63 & (55 - ctx->lenW))); + i = ctx->size & 63; + blk_SHA1_Update(ctx, pad, 1+ (63 & (55 - i))); blk_SHA1_Update(ctx, padlen, 8); /* Output hash diff --git a/block-sha1/sha1.h b/block-sha1/sha1.h index 7be2d93a2a..c1ae74d3da 100644 --- a/block-sha1/sha1.h +++ b/block-sha1/sha1.h @@ -7,7 +7,6 @@ typedef struct { unsigned int H[5]; unsigned int W[16]; - int lenW; unsigned long long size; } blk_SHA_CTX; From 30d12d4c16abc052e8961c07651f97bea2c061bd Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 6 Aug 2009 12:41:00 -0700 Subject: [PATCH 10/17] block-sha1: perform register rotation using cpp Instead of letting the compiler figure out the optimal way to rotate register usage, explicitly rotate the register names with cpp.
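A toy sketch of the idea (illustrative only; ROUND's body is a stand-in, not the real SHA-1 round): each call site renames which variable plays the A/B/C/D/E role, so the round body only ever writes two variables and the old five-way "E = D; D = C; ..." shuffle disappears.

	#include <stdio.h>

	/* stand-ins for the real round computation and B = SHA_ROR(B, 2) */
	#define ROUND(A, B, C, D, E) do { E += A; B = ~B; } while (0)

	int main(void)
	{
		unsigned int A = 1, B = 2, C = 3, D = 4, E = 5;

		ROUND(A, B, C, D, E);	/* the result accumulates in E... */
		ROUND(E, A, B, C, D);	/* ...which plays the "A" role here */
		ROUND(D, E, A, B, C);
		ROUND(C, D, E, A, B);
		ROUND(B, C, D, E, A);	/* after five rounds the names recycle */
		printf("%u %u %u %u %u\n", A, B, C, D, E);
		return 0;
	}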
Signed-off-by: Linus Torvalds Signed-off-by: Junio C Hamano --- block-sha1/sha1.c | 117 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 90 insertions(+), 27 deletions(-) diff --git a/block-sha1/sha1.c b/block-sha1/sha1.c index fdd400f22d..b715916675 100644 --- a/block-sha1/sha1.c +++ b/block-sha1/sha1.c @@ -101,20 +101,20 @@ void blk_SHA1_Final(unsigned char hashout[20], blk_SHA_CTX *ctx) #define SHA_SRC(t) htonl(data[t]) #define SHA_MIX(t) SHA_ROL(W(t+13) ^ W(t+8) ^ W(t+2) ^ W(t), 1) -#define SHA_ROUND(t, input, fn, constant) \ - TEMP = input(t); W(t) = TEMP; \ - TEMP += SHA_ROL(A,5) + (fn) + E + (constant); \ - E = D; D = C; C = SHA_ROR(B, 2); B = A; A = TEMP +#define SHA_ROUND(t, input, fn, constant, A, B, C, D, E) do { \ + unsigned int TEMP = input(t); W(t) = TEMP; \ + TEMP += E + SHA_ROL(A,5) + (fn) + (constant); \ + B = SHA_ROR(B, 2); E = TEMP; } while (0) -#define T_0_15(t) SHA_ROUND(t, SHA_SRC, (((C^D)&B)^D) , 0x5a827999 ) -#define T_16_19(t) SHA_ROUND(t, SHA_MIX, (((C^D)&B)^D) , 0x5a827999 ) -#define T_20_39(t) SHA_ROUND(t, SHA_MIX, (B^C^D) , 0x6ed9eba1 ) -#define T_40_59(t) SHA_ROUND(t, SHA_MIX, ((B&C)+(D&(B^C))) , 0x8f1bbcdc ) -#define T_60_79(t) SHA_ROUND(t, SHA_MIX, (B^C^D) , 0xca62c1d6 ) +#define T_0_15(t, A, B, C, D, E) SHA_ROUND(t, SHA_SRC, (((C^D)&B)^D) , 0x5a827999, A, B, C, D, E ) +#define T_16_19(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (((C^D)&B)^D) , 0x5a827999, A, B, C, D, E ) +#define T_20_39(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (B^C^D) , 0x6ed9eba1, A, B, C, D, E ) +#define T_40_59(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, ((B&C)+(D&(B^C))) , 0x8f1bbcdc, A, B, C, D, E ) +#define T_60_79(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (B^C^D) , 0xca62c1d6, A, B, C, D, E ) static void blk_SHA1Block(blk_SHA_CTX *ctx, const unsigned int *data) { - unsigned int A,B,C,D,E,TEMP; + unsigned int A,B,C,D,E; unsigned int array[16]; A = ctx->H[0]; @@ -124,31 +124,94 @@ static void blk_SHA1Block(blk_SHA_CTX *ctx, const unsigned int *data) E = ctx->H[4]; /* Round 1 - iterations 0-16 take their input from 'data' */ - T_0_15( 0); T_0_15( 1); T_0_15( 2); T_0_15( 3); T_0_15( 4); - T_0_15( 5); T_0_15( 6); T_0_15( 7); T_0_15( 8); T_0_15( 9); - T_0_15(10); T_0_15(11); T_0_15(12); T_0_15(13); T_0_15(14); - T_0_15(15); + T_0_15( 0, A, B, C, D, E); + T_0_15( 1, E, A, B, C, D); + T_0_15( 2, D, E, A, B, C); + T_0_15( 3, C, D, E, A, B); + T_0_15( 4, B, C, D, E, A); + T_0_15( 5, A, B, C, D, E); + T_0_15( 6, E, A, B, C, D); + T_0_15( 7, D, E, A, B, C); + T_0_15( 8, C, D, E, A, B); + T_0_15( 9, B, C, D, E, A); + T_0_15(10, A, B, C, D, E); + T_0_15(11, E, A, B, C, D); + T_0_15(12, D, E, A, B, C); + T_0_15(13, C, D, E, A, B); + T_0_15(14, B, C, D, E, A); + T_0_15(15, A, B, C, D, E); /* Round 1 - tail. 
Input from 512-bit mixing array */ - T_16_19(16); T_16_19(17); T_16_19(18); T_16_19(19); + T_16_19(16, E, A, B, C, D); + T_16_19(17, D, E, A, B, C); + T_16_19(18, C, D, E, A, B); + T_16_19(19, B, C, D, E, A); /* Round 2 */ - T_20_39(20); T_20_39(21); T_20_39(22); T_20_39(23); T_20_39(24); - T_20_39(25); T_20_39(26); T_20_39(27); T_20_39(28); T_20_39(29); - T_20_39(30); T_20_39(31); T_20_39(32); T_20_39(33); T_20_39(34); - T_20_39(35); T_20_39(36); T_20_39(37); T_20_39(38); T_20_39(39); + T_20_39(20, A, B, C, D, E); + T_20_39(21, E, A, B, C, D); + T_20_39(22, D, E, A, B, C); + T_20_39(23, C, D, E, A, B); + T_20_39(24, B, C, D, E, A); + T_20_39(25, A, B, C, D, E); + T_20_39(26, E, A, B, C, D); + T_20_39(27, D, E, A, B, C); + T_20_39(28, C, D, E, A, B); + T_20_39(29, B, C, D, E, A); + T_20_39(30, A, B, C, D, E); + T_20_39(31, E, A, B, C, D); + T_20_39(32, D, E, A, B, C); + T_20_39(33, C, D, E, A, B); + T_20_39(34, B, C, D, E, A); + T_20_39(35, A, B, C, D, E); + T_20_39(36, E, A, B, C, D); + T_20_39(37, D, E, A, B, C); + T_20_39(38, C, D, E, A, B); + T_20_39(39, B, C, D, E, A); /* Round 3 */ - T_40_59(40); T_40_59(41); T_40_59(42); T_40_59(43); T_40_59(44); - T_40_59(45); T_40_59(46); T_40_59(47); T_40_59(48); T_40_59(49); - T_40_59(50); T_40_59(51); T_40_59(52); T_40_59(53); T_40_59(54); - T_40_59(55); T_40_59(56); T_40_59(57); T_40_59(58); T_40_59(59); + T_40_59(40, A, B, C, D, E); + T_40_59(41, E, A, B, C, D); + T_40_59(42, D, E, A, B, C); + T_40_59(43, C, D, E, A, B); + T_40_59(44, B, C, D, E, A); + T_40_59(45, A, B, C, D, E); + T_40_59(46, E, A, B, C, D); + T_40_59(47, D, E, A, B, C); + T_40_59(48, C, D, E, A, B); + T_40_59(49, B, C, D, E, A); + T_40_59(50, A, B, C, D, E); + T_40_59(51, E, A, B, C, D); + T_40_59(52, D, E, A, B, C); + T_40_59(53, C, D, E, A, B); + T_40_59(54, B, C, D, E, A); + T_40_59(55, A, B, C, D, E); + T_40_59(56, E, A, B, C, D); + T_40_59(57, D, E, A, B, C); + T_40_59(58, C, D, E, A, B); + T_40_59(59, B, C, D, E, A); /* Round 4 */ - T_60_79(60); T_60_79(61); T_60_79(62); T_60_79(63); T_60_79(64); - T_60_79(65); T_60_79(66); T_60_79(67); T_60_79(68); T_60_79(69); - T_60_79(70); T_60_79(71); T_60_79(72); T_60_79(73); T_60_79(74); - T_60_79(75); T_60_79(76); T_60_79(77); T_60_79(78); T_60_79(79); + T_60_79(60, A, B, C, D, E); + T_60_79(61, E, A, B, C, D); + T_60_79(62, D, E, A, B, C); + T_60_79(63, C, D, E, A, B); + T_60_79(64, B, C, D, E, A); + T_60_79(65, A, B, C, D, E); + T_60_79(66, E, A, B, C, D); + T_60_79(67, D, E, A, B, C); + T_60_79(68, C, D, E, A, B); + T_60_79(69, B, C, D, E, A); + T_60_79(70, A, B, C, D, E); + T_60_79(71, E, A, B, C, D); + T_60_79(72, D, E, A, B, C); + T_60_79(73, C, D, E, A, B); + T_60_79(74, B, C, D, E, A); + T_60_79(75, A, B, C, D, E); + T_60_79(76, E, A, B, C, D); + T_60_79(77, D, E, A, B, C); + T_60_79(78, C, D, E, A, B); + T_60_79(79, B, C, D, E, A); ctx->H[0] += A; ctx->H[1] += B; From 66c9c6c0fbba0894ebce3da572f62eb05162e547 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 7 Aug 2009 21:16:46 -0700 Subject: [PATCH 11/17] block-sha1: improved SHA1 hashing I think I have found a way to avoid the gcc crazyness. Lookie here: # TIME[s] SPEED[MB/s] rfc3174 5.094 119.8 rfc3174 5.098 119.7 linus 1.462 417.5 linusas 2.008 304 linusas2 1.878 325 mozilla 5.566 109.6 mozillaas 5.866 104.1 openssl 1.609 379.3 spelvin 1.675 364.5 spelvina 1.601 381.3 nettle 1.591 383.6 notice? I outperform all the hand-tuned asm on 32-bit too. By quite a margin, in fact. 
Now, I didn't try a P4, and it's possible that it won't do that there, but the 32-bit code generation sure looks impressive on my Nehalem box. The magic? I force the stores to the 512-bit hash bucket to be done in order. That seems to help a lot. The diff is trivial (on top of the "rename registers with cpp" patch), as appended. And it does seem to fix the P4 issues too, although I can obviously (once again) only test Prescott, and only in 64-bit mode: # TIME[s] SPEED[MB/s] rfc3174 1.662 36.73 rfc3174 1.64 37.22 linus 0.2523 241.9 linusas 0.4367 139.8 linusas2 0.4487 136 mozilla 0.9704 62.9 mozillaas 0.9399 64.94 that's some really impressive improvement. All from just saying "do the stores in the order I told you to, dammit!" to the compiler. Signed-off-by: Linus Torvalds Signed-off-by: Junio C Hamano --- block-sha1/sha1.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/block-sha1/sha1.c b/block-sha1/sha1.c index b715916675..886bcf25e2 100644 --- a/block-sha1/sha1.c +++ b/block-sha1/sha1.c @@ -93,6 +93,7 @@ void blk_SHA1_Final(unsigned char hashout[20], blk_SHA_CTX *ctx) /* This "rolls" over the 512-bit array */ #define W(x) (array[(x)&15]) +#define setW(x, val) (*(volatile unsigned int *)&W(x) = (val)) /* * Where do we get the source from? The first 16 iterations get it from @@ -102,9 +103,9 @@ void blk_SHA1_Final(unsigned char hashout[20], blk_SHA_CTX *ctx) #define SHA_MIX(t) SHA_ROL(W(t+13) ^ W(t+8) ^ W(t+2) ^ W(t), 1) #define SHA_ROUND(t, input, fn, constant, A, B, C, D, E) do { \ - unsigned int TEMP = input(t); W(t) = TEMP; \ - TEMP += E + SHA_ROL(A,5) + (fn) + (constant); \ - B = SHA_ROR(B, 2); E = TEMP; } while (0) + unsigned int TEMP = input(t); setW(t, TEMP); \ + E += TEMP + SHA_ROL(A,5) + (fn) + (constant); \ + B = SHA_ROR(B, 2); } while (0) #define T_0_15(t, A, B, C, D, E) SHA_ROUND(t, SHA_SRC, (((C^D)&B)^D) , 0x5a827999, A, B, C, D, E ) #define T_16_19(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (((C^D)&B)^D) , 0x5a827999, A, B, C, D, E ) From 926172c5e4808726244713ef70398cd38b055f1e Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 10 Aug 2009 16:52:07 -0700 Subject: [PATCH 12/17] block-sha1: improve code on large-register-set machines For x86 performance (especially in 32-bit mode) I added that hack to write the SHA1 internal temporary hash using a volatile pointer, in order to get gcc to not try to cache the array contents. Because gcc will do all the wrong things, and then spill things in insane random ways. But on architectures like PPC, where you have 32 registers, it's actually perfectly reasonable to put the whole temporary array[] into the register set, and gcc can do so. So make the 'volatile unsigned int *' cast be dependent on a SMALL_REGISTER_SET preprocessor symbol, and enable it (currently) on just x86 and x86-64. With that, the routine is fairly reasonable even when compared to the hand-scheduled PPC version. Ben Herrenschmidt reports on a G5: * Paulus asm version: about 3.67s * Yours with no change: about 5.74s * Yours without "volatile": about 3.78s so with this the C version is within about 3% of the asm one. And add a lot of commentary on what the heck is going on. 
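For reference, the store strategies in play, side by side (an illustrative summary; setW_opaque is the "+m" alternative Artur Skawina suggested, mentioned in the new comment below, and is not what the patch itself uses):

	#define W(x) (array[(x)&15])

	/* small register set (x86): force each store to memory so gcc
	 * stops trying to cache the whole array[] in registers */
	#define setW_volatile(x, val) (*(volatile unsigned int *)&W(x) = (val))

	/* alternative: a plain store plus an asm that makes the memory
	 * operand opaque, so gcc also can't "optimize away" later loads */
	#define setW_opaque(x, val) \
		do { W(x) = (val); __asm__("" : "+m" (W(x))); } while (0)

	/* large register set (e.g. PPC): let the compiler keep W in registers */
	#define setW_plain(x, val) (W(x) = (val))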
Signed-off-by: Linus Torvalds Signed-off-by: Junio C Hamano --- block-sha1/sha1.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/block-sha1/sha1.c b/block-sha1/sha1.c index 886bcf25e2..304cd0452d 100644 --- a/block-sha1/sha1.c +++ b/block-sha1/sha1.c @@ -82,6 +82,7 @@ void blk_SHA1_Final(unsigned char hashout[20], blk_SHA_CTX *ctx) #define SHA_ASM(op, x, n) ({ unsigned int __res; __asm__(op " %1,%0":"=r" (__res):"i" (n), "0" (x)); __res; }) #define SHA_ROL(x,n) SHA_ASM("rol", x, n) #define SHA_ROR(x,n) SHA_ASM("ror", x, n) +#define SMALL_REGISTER_SET #else @@ -93,7 +94,29 @@ void blk_SHA1_Final(unsigned char hashout[20], blk_SHA_CTX *ctx) /* This "rolls" over the 512-bit array */ #define W(x) (array[(x)&15]) -#define setW(x, val) (*(volatile unsigned int *)&W(x) = (val)) + +/* + * If you have 32 registers or more, the compiler can (and should) + * try to change the array[] accesses into registers. However, on + * machines with less than ~25 registers, that won't really work, + * and at least gcc will make an unholy mess of it. + * + * So to avoid that mess which just slows things down, we force + * the stores to memory to actually happen (we might be better off + * with a 'W(t)=(val);asm("":"+m" (W(t))' there instead, as + * suggested by Artur Skawina - that will also make gcc unable to + * try to do the silly "optimize away loads" part because it won't + * see what the value will be). + * + * Ben Herrenschmidt reports that on PPC, the C version comes close + * to the optimized asm with this (ie on PPC you don't want that + * 'volatile', since there are lots of registers). + */ +#ifdef SMALL_REGISTER_SET + #define setW(x, val) (*(volatile unsigned int *)&W(x) = (val)) +#else + #define setW(x, val) (W(x) = (val)) +#endif /* * Where do we get the source from? The first 16 iterations get it from From 30ba0de726d92ccfc93009eb60f2c30b0886f61b Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Wed, 12 Aug 2009 15:45:48 -0400 Subject: [PATCH 13/17] block-sha1: move code around Move the code around so specific architecture hacks are defined first. Also make one line comments actually one line. No code change. 
Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- block-sha1/sha1.c | 129 +++++++++++++++++++++------------------------- 1 file changed, 60 insertions(+), 69 deletions(-) diff --git a/block-sha1/sha1.c b/block-sha1/sha1.c index 304cd0452d..c3f1ae59b9 100644 --- a/block-sha1/sha1.c +++ b/block-sha1/sha1.c @@ -9,74 +9,6 @@ #include "sha1.h" -/* Hash one 64-byte block of data */ -static void blk_SHA1Block(blk_SHA_CTX *ctx, const unsigned int *data); - -void blk_SHA1_Init(blk_SHA_CTX *ctx) -{ - ctx->size = 0; - - /* Initialize H with the magic constants (see FIPS180 for constants) - */ - ctx->H[0] = 0x67452301; - ctx->H[1] = 0xefcdab89; - ctx->H[2] = 0x98badcfe; - ctx->H[3] = 0x10325476; - ctx->H[4] = 0xc3d2e1f0; -} - - -void blk_SHA1_Update(blk_SHA_CTX *ctx, const void *data, unsigned long len) -{ - int lenW = ctx->size & 63; - - ctx->size += len; - - /* Read the data into W and process blocks as they get full - */ - if (lenW) { - int left = 64 - lenW; - if (len < left) - left = len; - memcpy(lenW + (char *)ctx->W, data, left); - lenW = (lenW + left) & 63; - len -= left; - data += left; - if (lenW) - return; - blk_SHA1Block(ctx, ctx->W); - } - while (len >= 64) { - blk_SHA1Block(ctx, data); - data += 64; - len -= 64; - } - if (len) - memcpy(ctx->W, data, len); -} - - -void blk_SHA1_Final(unsigned char hashout[20], blk_SHA_CTX *ctx) -{ - static const unsigned char pad[64] = { 0x80 }; - unsigned int padlen[2]; - int i; - - /* Pad with a binary 1 (ie 0x80), then zeroes, then length - */ - padlen[0] = htonl(ctx->size >> 29); - padlen[1] = htonl(ctx->size << 3); - - i = ctx->size & 63; - blk_SHA1_Update(ctx, pad, 1+ (63 & (55 - i))); - blk_SHA1_Update(ctx, padlen, 8); - - /* Output hash - */ - for (i = 0; i < 5; i++) - ((unsigned int *)hashout)[i] = htonl(ctx->H[i]); -} - #if defined(__i386__) || defined(__x86_64__) #define SHA_ASM(op, x, n) ({ unsigned int __res; __asm__(op " %1,%0":"=r" (__res):"i" (n), "0" (x)); __res; }) @@ -136,7 +68,7 @@ void blk_SHA1_Final(unsigned char hashout[20], blk_SHA_CTX *ctx) #define T_40_59(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, ((B&C)+(D&(B^C))) , 0x8f1bbcdc, A, B, C, D, E ) #define T_60_79(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (B^C^D) , 0xca62c1d6, A, B, C, D, E ) -static void blk_SHA1Block(blk_SHA_CTX *ctx, const unsigned int *data) +static void blk_SHA1_Block(blk_SHA_CTX *ctx, const unsigned int *data) { unsigned int A,B,C,D,E; unsigned int array[16]; @@ -243,3 +175,62 @@ static void blk_SHA1Block(blk_SHA_CTX *ctx, const unsigned int *data) ctx->H[3] += D; ctx->H[4] += E; } + +void blk_SHA1_Init(blk_SHA_CTX *ctx) +{ + ctx->size = 0; + + /* Initialize H with the magic constants (see FIPS180 for constants) */ + ctx->H[0] = 0x67452301; + ctx->H[1] = 0xefcdab89; + ctx->H[2] = 0x98badcfe; + ctx->H[3] = 0x10325476; + ctx->H[4] = 0xc3d2e1f0; +} + +void blk_SHA1_Update(blk_SHA_CTX *ctx, const void *data, unsigned long len) +{ + int lenW = ctx->size & 63; + + ctx->size += len; + + /* Read the data into W and process blocks as they get full */ + if (lenW) { + int left = 64 - lenW; + if (len < left) + left = len; + memcpy(lenW + (char *)ctx->W, data, left); + lenW = (lenW + left) & 63; + len -= left; + data += left; + if (lenW) + return; + blk_SHA1_Block(ctx, ctx->W); + } + while (len >= 64) { + blk_SHA1_Block(ctx, data); + data += 64; + len -= 64; + } + if (len) + memcpy(ctx->W, data, len); +} + +void blk_SHA1_Final(unsigned char hashout[20], blk_SHA_CTX *ctx) +{ + static const unsigned char pad[64] = { 0x80 }; + unsigned int padlen[2]; + int i; + + /* 
Pad with a binary 1 (ie 0x80), then zeroes, then length */ + padlen[0] = htonl(ctx->size >> 29); + padlen[1] = htonl(ctx->size << 3); + + i = ctx->size & 63; + blk_SHA1_Update(ctx, pad, 1+ (63 & (55 - i))); + blk_SHA1_Update(ctx, padlen, 8); + + /* Output hash */ + for (i = 0; i < 5; i++) + ((unsigned int *)hashout)[i] = htonl(ctx->H[i]); +} From dc52fd29738c2af98f3e986691eca34addfd4914 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Wed, 12 Aug 2009 15:46:41 -0400 Subject: [PATCH 14/17] block-sha1: split the different "hacks" to be individually selected This is to make it easier for them to be selected individually depending on the architecture instead of the other way around i.e. having each architecture select a list of hacks up front. That makes for clearer documentation as well. Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- block-sha1/sha1.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/block-sha1/sha1.c b/block-sha1/sha1.c index c3f1ae59b9..67c9bd0723 100644 --- a/block-sha1/sha1.c +++ b/block-sha1/sha1.c @@ -11,10 +11,16 @@ #if defined(__i386__) || defined(__x86_64__) +/* + * Force usage of rol or ror by selecting the one with the smaller constant. + * It _can_ generate slightly smaller code (a constant of 1 is special), but + * perhaps more importantly it's possibly faster on any uarch that does a + * rotate with a loop. + */ + #define SHA_ASM(op, x, n) ({ unsigned int __res; __asm__(op " %1,%0":"=r" (__res):"i" (n), "0" (x)); __res; }) #define SHA_ROL(x,n) SHA_ASM("rol", x, n) #define SHA_ROR(x,n) SHA_ASM("ror", x, n) -#define SMALL_REGISTER_SET #else @@ -24,9 +30,6 @@ #endif -/* This "rolls" over the 512-bit array */ -#define W(x) (array[(x)&15]) - /* * If you have 32 registers or more, the compiler can (and should) * try to change the array[] accesses into registers. However, on @@ -43,13 +46,23 @@ * Ben Herrenschmidt reports that on PPC, the C version comes close * to the optimized asm with this (ie on PPC you don't want that * 'volatile', since there are lots of registers). + * + * On ARM we get the best code generation by forcing a full memory barrier + * between each SHA_ROUND, otherwise gcc happily get wild with spilling and + * the stack frame size simply explode and performance goes down the drain. */ -#ifdef SMALL_REGISTER_SET + +#if defined(__i386__) || defined(__x86_64__) #define setW(x, val) (*(volatile unsigned int *)&W(x) = (val)) +#elif defined(__arm__) + #define setW(x, val) do { W(x) = (val); __asm__("":::"memory"); } while (0) #else #define setW(x, val) (W(x) = (val)) #endif +/* This "rolls" over the 512-bit array */ +#define W(x) (array[(x)&15]) + /* * Where do we get the source from? The first 16 iterations get it from * the input data, the next mix it from the 512-bit array. From 660231aa9727d29c7d2c16319bc6a3fa8bed3e0e Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Wed, 12 Aug 2009 15:47:55 -0400 Subject: [PATCH 15/17] block-sha1: support for architectures with memory alignment restrictions This is needed on architectures with poor or non-existent unaligned memory support and/or no fast byte swap instruction (such as ARM) by using byte accesses to memory and shifting the result together. This also makes the code portable, therefore the byte access methods are the defaults. Any architecture that properly supports unaligned word accesses in hardware simply has to enable the alternative methods. 
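A stand-alone check (added for exposition, not part of the patch) that the portable byte-shift reader and the cast-plus-ntohl() reader agree; the cast version additionally assumes unaligned word loads are acceptable, which is exactly what the #if gates on.

	#include <arpa/inet.h>
	#include <assert.h>

	static unsigned int get_be32_bytes(const void *p)
	{
		const unsigned char *c = p;
		return (c[0] << 24) | (c[1] << 16) | (c[2] << 8) | (c[3] << 0);
	}

	int main(void)
	{
		/* the union keeps the bytes word-aligned for the cast version */
		union { unsigned int word; unsigned char byte[4]; } u =
			{ .byte = { 0x12, 0x34, 0x56, 0x78 } };

		assert(get_be32_bytes(u.byte) == ntohl(u.word));
		assert(get_be32_bytes(u.byte) == 0x12345678);
		return 0;
	}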
Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- block-sha1/sha1.c | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/block-sha1/sha1.c b/block-sha1/sha1.c index 67c9bd0723..d3121f7a02 100644 --- a/block-sha1/sha1.c +++ b/block-sha1/sha1.c @@ -60,6 +60,34 @@ #define setW(x, val) (W(x) = (val)) #endif +/* + * Performance might be improved if the CPU architecture is OK with + * unaligned 32-bit loads and a fast ntohl() is available. + * Otherwise fall back to byte loads and shifts which is portable, + * and is faster on architectures with memory alignment issues. + */ + +#if defined(__i386__) || defined(__x86_64__) + +#define get_be32(p) ntohl(*(unsigned int *)(p)) +#define put_be32(p, v) do { *(unsigned int *)(p) = htonl(v); } while (0) + +#else + +#define get_be32(p) ( \ + (*((unsigned char *)(p) + 0) << 24) | \ + (*((unsigned char *)(p) + 1) << 16) | \ + (*((unsigned char *)(p) + 2) << 8) | \ + (*((unsigned char *)(p) + 3) << 0) ) +#define put_be32(p, v) do { \ + unsigned int __v = (v); \ + *((unsigned char *)(p) + 0) = __v >> 24; \ + *((unsigned char *)(p) + 1) = __v >> 16; \ + *((unsigned char *)(p) + 2) = __v >> 8; \ + *((unsigned char *)(p) + 3) = __v >> 0; } while (0) + +#endif + /* This "rolls" over the 512-bit array */ #define W(x) (array[(x)&15]) @@ -67,7 +95,7 @@ * Where do we get the source from? The first 16 iterations get it from * the input data, the next mix it from the 512-bit array. */ -#define SHA_SRC(t) htonl(data[t]) +#define SHA_SRC(t) get_be32(data + t) #define SHA_MIX(t) SHA_ROL(W(t+13) ^ W(t+8) ^ W(t+2) ^ W(t), 1) #define SHA_ROUND(t, input, fn, constant, A, B, C, D, E) do { \ @@ -245,5 +273,5 @@ void blk_SHA1_Final(unsigned char hashout[20], blk_SHA_CTX *ctx) /* Output hash */ for (i = 0; i < 5; i++) - ((unsigned int *)hashout)[i] = htonl(ctx->H[i]); + put_be32(hashout + i*4, ctx->H[i]); } From ee7dc310af660f423732369e955651ef2f05011d Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Thu, 13 Aug 2009 00:29:14 -0400 Subject: [PATCH 16/17] block-sha1: more good unaligned memory access candidates In addition to X86, PowerPC and S390 are capable of unaligned memory accesses. Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- block-sha1/sha1.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/block-sha1/sha1.c b/block-sha1/sha1.c index d3121f7a02..e5a100754e 100644 --- a/block-sha1/sha1.c +++ b/block-sha1/sha1.c @@ -67,7 +67,10 @@ * and is faster on architectures with memory alignment issues. */ -#if defined(__i386__) || defined(__x86_64__) +#if defined(__i386__) || defined(__x86_64__) || \ + defined(__ppc__) || defined(__ppc64__) || \ + defined(__powerpc__) || defined(__powerpc64__) || \ + defined(__s390__) || defined(__s390x__) #define get_be32(p) ntohl(*(unsigned int *)(p)) #define put_be32(p, v) do { *(unsigned int *)(p) = htonl(v); } while (0) From a12218572f2875e91b6c3c12559b076c4949a675 Mon Sep 17 00:00:00 2001 From: Brandon Casey Date: Fri, 14 Aug 2009 17:52:15 -0500 Subject: [PATCH 17/17] block-sha1/sha1.c: silence compiler complaints by casting void * to char * Some compilers produce errors when arithmetic is attempted on pointers to void. We want computations done on byte addresses, so cast them to char * to work them around. 
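A short illustration of the issue (mine, not from the patch): ISO C defines no arithmetic on void * because sizeof(void) does not exist; gcc accepts it as an extension that steps by single bytes, but stricter compilers and -Wpointer-arith builds reject it. Casting through char * advances by bytes portably.

	#include <stddef.h>

	static const void *advance(const void *p, size_t nbytes)
	{
		return (const char *)p + nbytes;	/* portable: steps in bytes */
		/* return p + nbytes;  -- gcc extension only */
	}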
Signed-off-by: Brandon Casey Signed-off-by: Junio C Hamano --- block-sha1/sha1.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block-sha1/sha1.c b/block-sha1/sha1.c index e5a100754e..464cb258aa 100644 --- a/block-sha1/sha1.c +++ b/block-sha1/sha1.c @@ -246,14 +246,14 @@ void blk_SHA1_Update(blk_SHA_CTX *ctx, const void *data, unsigned long len) memcpy(lenW + (char *)ctx->W, data, left); lenW = (lenW + left) & 63; len -= left; - data += left; + data = ((const char *)data + left); if (lenW) return; blk_SHA1_Block(ctx, ctx->W); } while (len >= 64) { blk_SHA1_Block(ctx, data); - data += 64; + data = ((const char *)data + 64); len -= 64; } if (len)
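With the series applied, a quick smoke test of the finished routines against the FIPS 180 test vector; an illustrative harness, not part of the series (build with something like: cc test-sha1.c block-sha1/sha1.c).

	#include <stdio.h>
	#include <string.h>
	#include "block-sha1/sha1.h"

	int main(void)
	{
		/* SHA1("abc"), from the FIPS 180 appendix */
		static const char expect[] =
			"a9993e364706816aba3e25717850c26c9cd0d89d";
		unsigned char hash[20];
		char hex[41];
		blk_SHA_CTX ctx;
		int i;

		blk_SHA1_Init(&ctx);
		blk_SHA1_Update(&ctx, "abc", 3);
		blk_SHA1_Final(hash, &ctx);

		for (i = 0; i < 20; i++)
			sprintf(hex + 2 * i, "%02x", hash[i]);
		printf("%s %s\n", hex, strcmp(hex, expect) ? "FAIL" : "ok");
		return 0;
	}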