From d7c208a92e6b15cdcd159e30cd1fc0177fd967e9 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 5 Aug 2009 16:13:20 -0700 Subject: [PATCH 01/17] Add new optimized C 'block-sha1' routines Based on the mozilla SHA1 routine, but doing the input data accesses a word at a time and with 'htonl()' instead of loading bytes and shifting. It requires an architecture that is ok with unaligned 32-bit loads and a fast htonl(). Signed-off-by: Linus Torvalds Signed-off-by: Junio C Hamano --- Makefile | 9 +++ block-sha1/sha1.c | 145 ++++++++++++++++++++++++++++++++++++++++++++++ block-sha1/sha1.h | 21 +++++++ 3 files changed, 175 insertions(+) create mode 100644 block-sha1/sha1.c create mode 100644 block-sha1/sha1.h diff --git a/Makefile b/Makefile index daf4296706..e6df8ecde6 100644 --- a/Makefile +++ b/Makefile @@ -84,6 +84,10 @@ all:: # specify your own (or DarwinPort's) include directories and # library directories by defining CFLAGS and LDFLAGS appropriately. # +# Define BLK_SHA1 environment variable if you want the C version +# of the SHA1 that assumes you can do unaligned 32-bit loads and +# have a fast htonl() function. +# # Define PPC_SHA1 environment variable when running make to make use of # a bundled SHA1 routine optimized for PowerPC. # @@ -1167,6 +1171,10 @@ ifdef NO_DEFLATE_BOUND BASIC_CFLAGS += -DNO_DEFLATE_BOUND endif +ifdef BLK_SHA1 + SHA1_HEADER = "block-sha1/sha1.h" + LIB_OBJS += block-sha1/sha1.o +else ifdef PPC_SHA1 SHA1_HEADER = "ppc/sha1.h" LIB_OBJS += ppc/sha1.o ppc/sha1ppc.o @@ -1184,6 +1192,7 @@ else endif endif endif +endif ifdef NO_PERL_MAKEMAKER export NO_PERL_MAKEMAKER endif diff --git a/block-sha1/sha1.c b/block-sha1/sha1.c new file mode 100644 index 0000000000..50b2b42b03 --- /dev/null +++ b/block-sha1/sha1.c @@ -0,0 +1,145 @@ +/* + * Based on the Mozilla SHA1 (see mozilla-sha1/sha1.c), + * optimized to do word accesses rather than byte accesses, + * and to avoid unnecessary copies into the context array. 
+ */ + +#include <string.h> +#include <arpa/inet.h> + +#include "sha1.h" + +/* Hash one 64-byte block of data */ +static void blk_SHA1Block(blk_SHA_CTX *ctx, const unsigned int *data); + +void blk_SHA1_Init(blk_SHA_CTX *ctx) +{ + ctx->lenW = 0; + ctx->size = 0; + + /* Initialize H with the magic constants (see FIPS180 for constants) + */ + ctx->H[0] = 0x67452301; + ctx->H[1] = 0xefcdab89; + ctx->H[2] = 0x98badcfe; + ctx->H[3] = 0x10325476; + ctx->H[4] = 0xc3d2e1f0; +} + + +void blk_SHA1_Update(blk_SHA_CTX *ctx, const void *data, unsigned long len) +{ + int lenW = ctx->lenW; + + ctx->size += len; + + /* Read the data into W and process blocks as they get full + */ + if (lenW) { + int left = 64 - lenW; + if (len < left) + left = len; + memcpy(lenW + (char *)ctx->W, data, left); + lenW = (lenW + left) & 63; + len -= left; + data += left; + ctx->lenW = lenW; + if (lenW) + return; + blk_SHA1Block(ctx, ctx->W); + } + while (len >= 64) { + blk_SHA1Block(ctx, data); + data += 64; + len -= 64; + } + if (len) { + memcpy(ctx->W, data, len); + ctx->lenW = len; + } +} + + +void blk_SHA1_Final(unsigned char hashout[20], blk_SHA_CTX *ctx) +{ + static const unsigned char pad[64] = { 0x80 }; + unsigned int padlen[2]; + int i; + + /* Pad with a binary 1 (ie 0x80), then zeroes, then length + */ + padlen[0] = htonl(ctx->size >> (32 - 3)); + padlen[1] = htonl(ctx->size << 3); + + blk_SHA1_Update(ctx, pad, 1+ (63 & (55 - ctx->lenW))); + blk_SHA1_Update(ctx, padlen, 8); + + /* Output hash + */ + for (i = 0; i < 5; i++) + ((unsigned int *)hashout)[i] = htonl(ctx->H[i]); +} + +#define SHA_ROT(X,n) (((X) << (n)) | ((X) >> (32-(n)))) + +static void blk_SHA1Block(blk_SHA_CTX *ctx, const unsigned int *data) +{ + int t; + unsigned int A,B,C,D,E,TEMP; + unsigned int W[80]; + + for (t = 0; t < 16; t++) + W[t] = htonl(data[t]); + + /* Unroll it?
*/ + for (t = 16; t <= 79; t++) + W[t] = SHA_ROT(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1); + + A = ctx->H[0]; + B = ctx->H[1]; + C = ctx->H[2]; + D = ctx->H[3]; + E = ctx->H[4]; + +#define T_0_19(t) \ + TEMP = SHA_ROT(A,5) + (((C^D)&B)^D) + E + W[t] + 0x5a827999; \ + E = D; D = C; C = SHA_ROT(B, 30); B = A; A = TEMP; + + T_0_19( 0); T_0_19( 1); T_0_19( 2); T_0_19( 3); T_0_19( 4); + T_0_19( 5); T_0_19( 6); T_0_19( 7); T_0_19( 8); T_0_19( 9); + T_0_19(10); T_0_19(11); T_0_19(12); T_0_19(13); T_0_19(14); + T_0_19(15); T_0_19(16); T_0_19(17); T_0_19(18); T_0_19(19); + +#define T_20_39(t) \ + TEMP = SHA_ROT(A,5) + (B^C^D) + E + W[t] + 0x6ed9eba1; \ + E = D; D = C; C = SHA_ROT(B, 30); B = A; A = TEMP; + + T_20_39(20); T_20_39(21); T_20_39(22); T_20_39(23); T_20_39(24); + T_20_39(25); T_20_39(26); T_20_39(27); T_20_39(28); T_20_39(29); + T_20_39(30); T_20_39(31); T_20_39(32); T_20_39(33); T_20_39(34); + T_20_39(35); T_20_39(36); T_20_39(37); T_20_39(38); T_20_39(39); + +#define T_40_59(t) \ + TEMP = SHA_ROT(A,5) + ((B&C)|(D&(B|C))) + E + W[t] + 0x8f1bbcdc; \ + E = D; D = C; C = SHA_ROT(B, 30); B = A; A = TEMP; + + T_40_59(40); T_40_59(41); T_40_59(42); T_40_59(43); T_40_59(44); + T_40_59(45); T_40_59(46); T_40_59(47); T_40_59(48); T_40_59(49); + T_40_59(50); T_40_59(51); T_40_59(52); T_40_59(53); T_40_59(54); + T_40_59(55); T_40_59(56); T_40_59(57); T_40_59(58); T_40_59(59); + +#define T_60_79(t) \ + TEMP = SHA_ROT(A,5) + (B^C^D) + E + W[t] + 0xca62c1d6; \ + E = D; D = C; C = SHA_ROT(B, 30); B = A; A = TEMP; + + T_60_79(60); T_60_79(61); T_60_79(62); T_60_79(63); T_60_79(64); + T_60_79(65); T_60_79(66); T_60_79(67); T_60_79(68); T_60_79(69); + T_60_79(70); T_60_79(71); T_60_79(72); T_60_79(73); T_60_79(74); + T_60_79(75); T_60_79(76); T_60_79(77); T_60_79(78); T_60_79(79); + + ctx->H[0] += A; + ctx->H[1] += B; + ctx->H[2] += C; + ctx->H[3] += D; + ctx->H[4] += E; +} diff --git a/block-sha1/sha1.h b/block-sha1/sha1.h new file mode 100644 index 0000000000..7be2d93a2a --- /dev/null +++ b/block-sha1/sha1.h @@ -0,0 +1,21 @@ +/* + * Based on the Mozilla SHA1 (see mozilla-sha1/sha1.h), + * optimized to do word accesses rather than byte accesses, + * and to avoid unnecessary copies into the context array. + */ + +typedef struct { + unsigned int H[5]; + unsigned int W[16]; + int lenW; + unsigned long long size; +} blk_SHA_CTX; + +void blk_SHA1_Init(blk_SHA_CTX *ctx); +void blk_SHA1_Update(blk_SHA_CTX *ctx, const void *dataIn, unsigned long len); +void blk_SHA1_Final(unsigned char hashout[20], blk_SHA_CTX *ctx); + +#define git_SHA_CTX blk_SHA_CTX +#define git_SHA1_Init blk_SHA1_Init +#define git_SHA1_Update blk_SHA1_Update +#define git_SHA1_Final blk_SHA1_Final From b26a9d50899a5d65bafcb521c5495f03b2e2e0e9 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Thu, 6 Aug 2009 13:56:19 -0700 Subject: [PATCH 02/17] block-sha1: undo ctx->size change Undo the change I picked up from the mailing list discussion suggested by Nico, not because it is wrong, but it will be done at the end of the follow-up series. 
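A stand-alone illustration (added for exposition, not part of the patch): whether ctx->size counts bytes and is shifted by 3 at the end, or counts bits with the shift done on the way in, the two 32-bit words of the FIPS 180 length trailer come out the same. A minimal check, assuming nothing beyond standard C:

	#include <assert.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t nbytes = 0x123456789ULL;	/* ~4.5 GiB, so the high word is non-zero */
		uint64_t nbits = nbytes << 3;		/* counting bits up front, as this patch does */

		/* counting bytes, shifting at the end */
		uint32_t hi_bytes = (uint32_t)(nbytes >> 29);
		uint32_t lo_bytes = (uint32_t)(nbytes << 3);

		/* counting bits up front */
		uint32_t hi_bits = (uint32_t)(nbits >> 32);
		uint32_t lo_bits = (uint32_t)nbits;

		assert(hi_bytes == hi_bits && lo_bytes == lo_bits);
		return 0;
	}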
Signed-off-by: Junio C Hamano --- block-sha1/sha1.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/block-sha1/sha1.c b/block-sha1/sha1.c index 50b2b42b03..eef32f7859 100644 --- a/block-sha1/sha1.c +++ b/block-sha1/sha1.c @@ -31,7 +31,7 @@ void blk_SHA1_Update(blk_SHA_CTX *ctx, const void *data, unsigned long len) { int lenW = ctx->lenW; - ctx->size += len; + ctx->size += (unsigned long long) len << 3; /* Read the data into W and process blocks as they get full */ @@ -68,8 +68,8 @@ void blk_SHA1_Final(unsigned char hashout[20], blk_SHA_CTX *ctx) /* Pad with a binary 1 (ie 0x80), then zeroes, then length */ - padlen[0] = htonl(ctx->size >> (32 - 3)); - padlen[1] = htonl(ctx->size << 3); + padlen[0] = htonl(ctx->size >> 32); + padlen[1] = htonl(ctx->size); blk_SHA1_Update(ctx, pad, 1+ (63 & (55 - ctx->lenW))); blk_SHA1_Update(ctx, padlen, 8); From b8e48a89b8f581eaf95b57782bb8e620ca30e968 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 5 Aug 2009 19:42:15 -0700 Subject: [PATCH 03/17] block-sha1: try to use rol/ror appropriately Use the one with the smaller constant. It _can_ generate slightly smaller code (a constant of 1 is special), but perhaps more importantly it's possibly faster on any uarch that does a rotate with a loop. Signed-off-by: Linus Torvalds Signed-off-by: Junio C Hamano --- block-sha1/sha1.c | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/block-sha1/sha1.c b/block-sha1/sha1.c index eef32f7859..a45a3dec1e 100644 --- a/block-sha1/sha1.c +++ b/block-sha1/sha1.c @@ -80,7 +80,19 @@ void blk_SHA1_Final(unsigned char hashout[20], blk_SHA_CTX *ctx) ((unsigned int *)hashout)[i] = htonl(ctx->H[i]); } -#define SHA_ROT(X,n) (((X) << (n)) | ((X) >> (32-(n)))) +#if defined(__i386__) || defined(__x86_64__) + +#define SHA_ASM(op, x, n) ({ unsigned int __res; asm(op " %1,%0":"=r" (__res):"i" (n), "0" (x)); __res; }) +#define SHA_ROL(x,n) SHA_ASM("rol", x, n) +#define SHA_ROR(x,n) SHA_ASM("ror", x, n) + +#else + +#define SHA_ROT(X,n) (((X) << (l)) | ((X) >> (r))) +#define SHA_ROL(X,n) SHA_ROT(X,n,32-(n)) +#define SHA_ROR(X,n) SHA_ROT(X,32-(n),n) + +#endif static void blk_SHA1Block(blk_SHA_CTX *ctx, const unsigned int *data) { @@ -93,7 +105,7 @@ static void blk_SHA1Block(blk_SHA_CTX *ctx, const unsigned int *data) /* Unroll it? 
*/ for (t = 16; t <= 79; t++) - W[t] = SHA_ROT(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1); + W[t] = SHA_ROL(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1); A = ctx->H[0]; B = ctx->H[1]; @@ -102,8 +114,8 @@ static void blk_SHA1Block(blk_SHA_CTX *ctx, const unsigned int *data) E = ctx->H[4]; #define T_0_19(t) \ - TEMP = SHA_ROT(A,5) + (((C^D)&B)^D) + E + W[t] + 0x5a827999; \ - E = D; D = C; C = SHA_ROT(B, 30); B = A; A = TEMP; + TEMP = SHA_ROL(A,5) + (((C^D)&B)^D) + E + W[t] + 0x5a827999; \ + E = D; D = C; C = SHA_ROR(B, 2); B = A; A = TEMP; T_0_19( 0); T_0_19( 1); T_0_19( 2); T_0_19( 3); T_0_19( 4); T_0_19( 5); T_0_19( 6); T_0_19( 7); T_0_19( 8); T_0_19( 9); @@ -111,8 +123,8 @@ static void blk_SHA1Block(blk_SHA_CTX *ctx, const unsigned int *data) T_0_19(15); T_0_19(16); T_0_19(17); T_0_19(18); T_0_19(19); #define T_20_39(t) \ - TEMP = SHA_ROT(A,5) + (B^C^D) + E + W[t] + 0x6ed9eba1; \ - E = D; D = C; C = SHA_ROT(B, 30); B = A; A = TEMP; + TEMP = SHA_ROL(A,5) + (B^C^D) + E + W[t] + 0x6ed9eba1; \ + E = D; D = C; C = SHA_ROR(B, 2); B = A; A = TEMP; T_20_39(20); T_20_39(21); T_20_39(22); T_20_39(23); T_20_39(24); T_20_39(25); T_20_39(26); T_20_39(27); T_20_39(28); T_20_39(29); @@ -120,8 +132,8 @@ static void blk_SHA1Block(blk_SHA_CTX *ctx, const unsigned int *data) T_20_39(35); T_20_39(36); T_20_39(37); T_20_39(38); T_20_39(39); #define T_40_59(t) \ - TEMP = SHA_ROT(A,5) + ((B&C)|(D&(B|C))) + E + W[t] + 0x8f1bbcdc; \ - E = D; D = C; C = SHA_ROT(B, 30); B = A; A = TEMP; + TEMP = SHA_ROL(A,5) + ((B&C)|(D&(B|C))) + E + W[t] + 0x8f1bbcdc; \ + E = D; D = C; C = SHA_ROR(B, 2); B = A; A = TEMP; T_40_59(40); T_40_59(41); T_40_59(42); T_40_59(43); T_40_59(44); T_40_59(45); T_40_59(46); T_40_59(47); T_40_59(48); T_40_59(49); @@ -129,8 +141,8 @@ static void blk_SHA1Block(blk_SHA_CTX *ctx, const unsigned int *data) T_40_59(55); T_40_59(56); T_40_59(57); T_40_59(58); T_40_59(59); #define T_60_79(t) \ - TEMP = SHA_ROT(A,5) + (B^C^D) + E + W[t] + 0xca62c1d6; \ - E = D; D = C; C = SHA_ROT(B, 30); B = A; A = TEMP; + TEMP = SHA_ROL(A,5) + (B^C^D) + E + W[t] + 0xca62c1d6; \ + E = D; D = C; C = SHA_ROR(B, 2); B = A; A = TEMP; T_60_79(60); T_60_79(61); T_60_79(62); T_60_79(63); T_60_79(64); T_60_79(65); T_60_79(66); T_60_79(67); T_60_79(68); T_60_79(69); From fd536d3439fa2a06730884df31e2e98c9006c947 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Thu, 6 Aug 2009 13:52:58 -0700 Subject: [PATCH 04/17] block-sha1: minor fixups Bert Wesarg noticed non-x86 version of SHA_ROT() had a typo. Also spell in-line assembly as __asm__(), otherwise I seem to get error: implicit declaration of function 'asm' from my compiler. 
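For illustration, a stand-alone check (not from the patch) that the repaired fallback macros behave as rotates; gcc and clang typically recognize this shift-or idiom and emit a single rotate instruction even without the x86 asm path.

	#include <assert.h>

	#define SHA_ROT(X,l,r)	(((X) << (l)) | ((X) >> (r)))
	#define SHA_ROL(X,n)	SHA_ROT(X,n,32-(n))
	#define SHA_ROR(X,n)	SHA_ROT(X,32-(n),n)

	int main(void)
	{
		unsigned int x = 0x12345678;

		/* rotating left by n is the same as rotating right by 32-n */
		assert(SHA_ROL(x, 5) == SHA_ROR(x, 27));
		assert(SHA_ROL(0x80000000u, 1) == 1);
		assert(SHA_ROR(1u, 2) == 0x40000000u);
		return 0;
	}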
Signed-off-by: Junio C Hamano --- block-sha1/sha1.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block-sha1/sha1.c b/block-sha1/sha1.c index a45a3dec1e..698e435a39 100644 --- a/block-sha1/sha1.c +++ b/block-sha1/sha1.c @@ -82,13 +82,13 @@ void blk_SHA1_Final(unsigned char hashout[20], blk_SHA_CTX *ctx) #if defined(__i386__) || defined(__x86_64__) -#define SHA_ASM(op, x, n) ({ unsigned int __res; asm(op " %1,%0":"=r" (__res):"i" (n), "0" (x)); __res; }) +#define SHA_ASM(op, x, n) ({ unsigned int __res; __asm__(op " %1,%0":"=r" (__res):"i" (n), "0" (x)); __res; }) #define SHA_ROL(x,n) SHA_ASM("rol", x, n) #define SHA_ROR(x,n) SHA_ASM("ror", x, n) #else -#define SHA_ROT(X,n) (((X) << (l)) | ((X) >> (r))) +#define SHA_ROT(X,l,r) (((X) << (l)) | ((X) >> (r))) #define SHA_ROL(X,n) SHA_ROT(X,n,32-(n)) #define SHA_ROR(X,n) SHA_ROT(X,32-(n),n) From 139e3456ecf18fc03a75eda7a77441e8fec344b9 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 5 Aug 2009 20:28:07 -0700 Subject: [PATCH 05/17] block-sha1: make the 'ntohl()' part of the first SHA1 loop This helps a teeny bit. But what I -really- want to do is to avoid the whole 80-array loop, and do the xor updates as I go along.. Signed-off-by: Linus Torvalds Signed-off-by: Junio C Hamano --- block-sha1/sha1.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/block-sha1/sha1.c b/block-sha1/sha1.c index 698e435a39..13da511b78 100644 --- a/block-sha1/sha1.c +++ b/block-sha1/sha1.c @@ -100,27 +100,31 @@ static void blk_SHA1Block(blk_SHA_CTX *ctx, const unsigned int *data) unsigned int A,B,C,D,E,TEMP; unsigned int W[80]; - for (t = 0; t < 16; t++) - W[t] = htonl(data[t]); - - /* Unroll it? */ - for (t = 16; t <= 79; t++) - W[t] = SHA_ROL(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1); - A = ctx->H[0]; B = ctx->H[1]; C = ctx->H[2]; D = ctx->H[3]; E = ctx->H[4]; -#define T_0_19(t) \ +#define T_0_15(t) \ + TEMP = htonl(data[t]); W[t] = TEMP; \ + TEMP += SHA_ROL(A,5) + (((C^D)&B)^D) + E + 0x5a827999; \ + E = D; D = C; C = SHA_ROR(B, 2); B = A; A = TEMP; \ + + T_0_15( 0); T_0_15( 1); T_0_15( 2); T_0_15( 3); T_0_15( 4); + T_0_15( 5); T_0_15( 6); T_0_15( 7); T_0_15( 8); T_0_15( 9); + T_0_15(10); T_0_15(11); T_0_15(12); T_0_15(13); T_0_15(14); + T_0_15(15); + + /* Unroll it? */ + for (t = 16; t <= 79; t++) + W[t] = SHA_ROL(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1); + +#define T_16_19(t) \ TEMP = SHA_ROL(A,5) + (((C^D)&B)^D) + E + W[t] + 0x5a827999; \ E = D; D = C; C = SHA_ROR(B, 2); B = A; A = TEMP; - T_0_19( 0); T_0_19( 1); T_0_19( 2); T_0_19( 3); T_0_19( 4); - T_0_19( 5); T_0_19( 6); T_0_19( 7); T_0_19( 8); T_0_19( 9); - T_0_19(10); T_0_19(11); T_0_19(12); T_0_19(13); T_0_19(14); - T_0_19(15); T_0_19(16); T_0_19(17); T_0_19(18); T_0_19(19); + T_16_19(16); T_16_19(17); T_16_19(18); T_16_19(19); #define T_20_39(t) \ TEMP = SHA_ROL(A,5) + (B^C^D) + E + W[t] + 0x6ed9eba1; \ From 7b5075fcfb069fc36ba4cfe5567234974793ab58 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 5 Aug 2009 20:49:41 -0700 Subject: [PATCH 06/17] block-sha1: re-use the temporary array as we calculate the SHA1 The mozilla-SHA1 code did this 80-word array for the 80 iterations. But the SHA1 state is really just 512 bits, and you can actually keep it in a kind of "circular queue" of just 16 words instead. This requires us to do the xor updates as we go along (rather than as a pre-phase), but that's really what we want to do anyway. This gets me really close to the OpenSSL performance on my Nehalem. 
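As a stand-alone demonstration (added for exposition, not something from the patch), the claim that a 16-word circular queue suffices can be checked directly against the full 80-word expansion:

	#include <assert.h>

	#define ROL(x,n)	(((x) << (n)) | ((x) >> (32-(n))))
	#define W(x)	(array[(x)&15])

	int main(void)
	{
		unsigned int full[80], array[16];
		int t;

		for (t = 0; t < 16; t++)	/* arbitrary input block */
			full[t] = array[t] = t * 0x9e3779b9u;
		for (t = 16; t < 80; t++)	/* classic 80-word expansion */
			full[t] = ROL(full[t-3] ^ full[t-8] ^ full[t-14] ^ full[t-16], 1);
		for (t = 16; t < 80; t++) {
			/* in-place mixing over the 16-word window */
			W(t) = ROL(W(t+13) ^ W(t+8) ^ W(t+2) ^ W(t), 1);
			assert(W(t) == full[t]);
		}
		return 0;
	}

It works because by the time W(t) is recomputed only the previous 16 words are still live, and the taps (t-3, t-8, t-14, t-16) map to (t+13, t+8, t+2, t) modulo 16, which is exactly what the SHA_XOR() in the diff below uses.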
Look ma, all C code (ok, there's the rol/ror hack, but that one doesn't strictly even matter on my Nehalem, it's just a local optimization). Signed-off-by: Linus Torvalds Signed-off-by: Junio C Hamano --- block-sha1/sha1.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/block-sha1/sha1.c b/block-sha1/sha1.c index 13da511b78..8c4c216f93 100644 --- a/block-sha1/sha1.c +++ b/block-sha1/sha1.c @@ -96,9 +96,8 @@ void blk_SHA1_Final(unsigned char hashout[20], blk_SHA_CTX *ctx) static void blk_SHA1Block(blk_SHA_CTX *ctx, const unsigned int *data) { - int t; unsigned int A,B,C,D,E,TEMP; - unsigned int W[80]; + unsigned int array[16]; A = ctx->H[0]; B = ctx->H[1]; @@ -107,8 +106,8 @@ static void blk_SHA1Block(blk_SHA_CTX *ctx, const unsigned int *data) E = ctx->H[4]; #define T_0_15(t) \ - TEMP = htonl(data[t]); W[t] = TEMP; \ - TEMP += SHA_ROL(A,5) + (((C^D)&B)^D) + E + 0x5a827999; \ + TEMP = htonl(data[t]); array[t] = TEMP; \ + TEMP += SHA_ROL(A,5) + (((C^D)&B)^D) + E + 0x5a827999; \ E = D; D = C; C = SHA_ROR(B, 2); B = A; A = TEMP; \ T_0_15( 0); T_0_15( 1); T_0_15( 2); T_0_15( 3); T_0_15( 4); @@ -116,18 +115,21 @@ static void blk_SHA1Block(blk_SHA_CTX *ctx, const unsigned int *data) T_0_15(10); T_0_15(11); T_0_15(12); T_0_15(13); T_0_15(14); T_0_15(15); - /* Unroll it? */ - for (t = 16; t <= 79; t++) - W[t] = SHA_ROL(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1); +/* This "rolls" over the 512-bit array */ +#define W(x) (array[(x)&15]) +#define SHA_XOR(t) \ + TEMP = SHA_ROL(W(t+13) ^ W(t+8) ^ W(t+2) ^ W(t), 1); W(t) = TEMP; #define T_16_19(t) \ - TEMP = SHA_ROL(A,5) + (((C^D)&B)^D) + E + W[t] + 0x5a827999; \ - E = D; D = C; C = SHA_ROR(B, 2); B = A; A = TEMP; + SHA_XOR(t); \ + TEMP += SHA_ROL(A,5) + (((C^D)&B)^D) + E + 0x5a827999; \ + E = D; D = C; C = SHA_ROR(B, 2); B = A; A = TEMP; \ T_16_19(16); T_16_19(17); T_16_19(18); T_16_19(19); #define T_20_39(t) \ - TEMP = SHA_ROL(A,5) + (B^C^D) + E + W[t] + 0x6ed9eba1; \ + SHA_XOR(t); \ + TEMP += SHA_ROL(A,5) + (B^C^D) + E + 0x6ed9eba1; \ E = D; D = C; C = SHA_ROR(B, 2); B = A; A = TEMP; T_20_39(20); T_20_39(21); T_20_39(22); T_20_39(23); T_20_39(24); @@ -136,7 +138,8 @@ static void blk_SHA1Block(blk_SHA_CTX *ctx, const unsigned int *data) T_20_39(35); T_20_39(36); T_20_39(37); T_20_39(38); T_20_39(39); #define T_40_59(t) \ - TEMP = SHA_ROL(A,5) + ((B&C)|(D&(B|C))) + E + W[t] + 0x8f1bbcdc; \ + SHA_XOR(t); \ + TEMP += SHA_ROL(A,5) + ((B&C)|(D&(B|C))) + E + 0x8f1bbcdc; \ E = D; D = C; C = SHA_ROR(B, 2); B = A; A = TEMP; T_40_59(40); T_40_59(41); T_40_59(42); T_40_59(43); T_40_59(44); @@ -145,7 +148,8 @@ static void blk_SHA1Block(blk_SHA_CTX *ctx, const unsigned int *data) T_40_59(55); T_40_59(56); T_40_59(57); T_40_59(58); T_40_59(59); #define T_60_79(t) \ - TEMP = SHA_ROL(A,5) + (B^C^D) + E + W[t] + 0xca62c1d6; \ + SHA_XOR(t); \ + TEMP += SHA_ROL(A,5) + (B^C^D) + E + 0xca62c1d6; \ E = D; D = C; C = SHA_ROR(B, 2); B = A; A = TEMP; T_60_79(60); T_60_79(61); T_60_79(62); T_60_79(63); T_60_79(64); From ab14c823dfbf1245712c8179952b51822135d8a8 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 6 Aug 2009 07:20:54 -0700 Subject: [PATCH 07/17] block-sha1: macroize the rounds a bit further Avoid repeating the shared parts of the different rounds by adding a macro layer or two. It was already more cpp than C. 
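A sanity note on the boolean functions that the new SHA_ROUND macro takes as an argument (see the diff below): round 1's (((C^D)&B)^D) is the classic SHA-1 "choice" function (B&C)|(~B&D) with one operation shaved off. A stand-alone check of that identity (illustrative, not from the patch):

	#include <assert.h>

	int main(void)
	{
		static const unsigned int v[] =
			{ 0x00000000u, 0xffffffffu, 0x12345678u, 0x9abcdef0u };
		int i, j, k;

		for (i = 0; i < 4; i++)
			for (j = 0; j < 4; j++)
				for (k = 0; k < 4; k++) {
					unsigned int b = v[i], c = v[j], d = v[k];
					/* both compute the bitwise "b ? c : d" */
					assert((((c^d)&b)^d) == ((b&c)|(~b&d)));
				}
		return 0;
	}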
Signed-off-by: Linus Torvalds Signed-off-by: Junio C Hamano --- block-sha1/sha1.c | 56 ++++++++++++++++++++++------------------------- 1 file changed, 26 insertions(+), 30 deletions(-) diff --git a/block-sha1/sha1.c b/block-sha1/sha1.c index 8c4c216f93..53c93ba603 100644 --- a/block-sha1/sha1.c +++ b/block-sha1/sha1.c @@ -94,6 +94,27 @@ void blk_SHA1_Final(unsigned char hashout[20], blk_SHA_CTX *ctx) #endif +/* This "rolls" over the 512-bit array */ +#define W(x) (array[(x)&15]) + +/* + * Where do we get the source from? The first 16 iterations get it from + * the input data, the next mix it from the 512-bit array. + */ +#define SHA_SRC(t) htonl(data[t]) +#define SHA_MIX(t) SHA_ROL(W(t+13) ^ W(t+8) ^ W(t+2) ^ W(t), 1) + +#define SHA_ROUND(t, input, fn, constant) \ + TEMP = input(t); W(t) = TEMP; \ + TEMP += SHA_ROL(A,5) + (fn) + E + (constant); \ + E = D; D = C; C = SHA_ROR(B, 2); B = A; A = TEMP + +#define T_0_15(t) SHA_ROUND(t, SHA_SRC, (((C^D)&B)^D) , 0x5a827999 ) +#define T_16_19(t) SHA_ROUND(t, SHA_MIX, (((C^D)&B)^D) , 0x5a827999 ) +#define T_20_39(t) SHA_ROUND(t, SHA_MIX, (B^C^D) , 0x6ed9eba1 ) +#define T_40_59(t) SHA_ROUND(t, SHA_MIX, ((B&C)|(D&(B|C))) , 0x8f1bbcdc ) +#define T_60_79(t) SHA_ROUND(t, SHA_MIX, (B^C^D) , 0xca62c1d6 ) + static void blk_SHA1Block(blk_SHA_CTX *ctx, const unsigned int *data) { unsigned int A,B,C,D,E,TEMP; @@ -105,53 +126,28 @@ static void blk_SHA1Block(blk_SHA_CTX *ctx, const unsigned int *data) D = ctx->H[3]; E = ctx->H[4]; -#define T_0_15(t) \ - TEMP = htonl(data[t]); array[t] = TEMP; \ - TEMP += SHA_ROL(A,5) + (((C^D)&B)^D) + E + 0x5a827999; \ - E = D; D = C; C = SHA_ROR(B, 2); B = A; A = TEMP; \ - + /* Round 1 - iterations 0-16 take their input from 'data' */ T_0_15( 0); T_0_15( 1); T_0_15( 2); T_0_15( 3); T_0_15( 4); T_0_15( 5); T_0_15( 6); T_0_15( 7); T_0_15( 8); T_0_15( 9); T_0_15(10); T_0_15(11); T_0_15(12); T_0_15(13); T_0_15(14); T_0_15(15); -/* This "rolls" over the 512-bit array */ -#define W(x) (array[(x)&15]) -#define SHA_XOR(t) \ - TEMP = SHA_ROL(W(t+13) ^ W(t+8) ^ W(t+2) ^ W(t), 1); W(t) = TEMP; - -#define T_16_19(t) \ - SHA_XOR(t); \ - TEMP += SHA_ROL(A,5) + (((C^D)&B)^D) + E + 0x5a827999; \ - E = D; D = C; C = SHA_ROR(B, 2); B = A; A = TEMP; \ - + /* Round 1 - tail. 
Input from 512-bit mixing array */ T_16_19(16); T_16_19(17); T_16_19(18); T_16_19(19); -#define T_20_39(t) \ - SHA_XOR(t); \ - TEMP += SHA_ROL(A,5) + (B^C^D) + E + 0x6ed9eba1; \ - E = D; D = C; C = SHA_ROR(B, 2); B = A; A = TEMP; - + /* Round 2 */ T_20_39(20); T_20_39(21); T_20_39(22); T_20_39(23); T_20_39(24); T_20_39(25); T_20_39(26); T_20_39(27); T_20_39(28); T_20_39(29); T_20_39(30); T_20_39(31); T_20_39(32); T_20_39(33); T_20_39(34); T_20_39(35); T_20_39(36); T_20_39(37); T_20_39(38); T_20_39(39); -#define T_40_59(t) \ - SHA_XOR(t); \ - TEMP += SHA_ROL(A,5) + ((B&C)|(D&(B|C))) + E + 0x8f1bbcdc; \ - E = D; D = C; C = SHA_ROR(B, 2); B = A; A = TEMP; - + /* Round 3 */ T_40_59(40); T_40_59(41); T_40_59(42); T_40_59(43); T_40_59(44); T_40_59(45); T_40_59(46); T_40_59(47); T_40_59(48); T_40_59(49); T_40_59(50); T_40_59(51); T_40_59(52); T_40_59(53); T_40_59(54); T_40_59(55); T_40_59(56); T_40_59(57); T_40_59(58); T_40_59(59); -#define T_60_79(t) \ - SHA_XOR(t); \ - TEMP += SHA_ROL(A,5) + (B^C^D) + E + 0xca62c1d6; \ - E = D; D = C; C = SHA_ROR(B, 2); B = A; A = TEMP; - + /* Round 4 */ T_60_79(60); T_60_79(61); T_60_79(62); T_60_79(63); T_60_79(64); T_60_79(65); T_60_79(66); T_60_79(67); T_60_79(68); T_60_79(69); T_60_79(70); T_60_79(71); T_60_79(72); T_60_79(73); T_60_79(74); From e869e113c8f91999f9a433436e0b863fe2727b61 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 6 Aug 2009 07:27:57 -0700 Subject: [PATCH 08/17] block-sha1: Use '(B&C)+(D&(B^C))' instead of '(B&C)|(D&(B|C))' in round 3 It's an equivalent expression, but the '+' gives us some freedom in instruction selection (for example, we can use 'lea' rather than 'add'), and associates with the other additions around it to give some minor scheduling freedom. Suggested-by: linux@horizon.com Signed-off-by: Linus Torvalds Signed-off-by: Junio C Hamano --- block-sha1/sha1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block-sha1/sha1.c b/block-sha1/sha1.c index 53c93ba603..5bf1b36bd1 100644 --- a/block-sha1/sha1.c +++ b/block-sha1/sha1.c @@ -112,7 +112,7 @@ void blk_SHA1_Final(unsigned char hashout[20], blk_SHA_CTX *ctx) #define T_0_15(t) SHA_ROUND(t, SHA_SRC, (((C^D)&B)^D) , 0x5a827999 ) #define T_16_19(t) SHA_ROUND(t, SHA_MIX, (((C^D)&B)^D) , 0x5a827999 ) #define T_20_39(t) SHA_ROUND(t, SHA_MIX, (B^C^D) , 0x6ed9eba1 ) -#define T_40_59(t) SHA_ROUND(t, SHA_MIX, ((B&C)|(D&(B|C))) , 0x8f1bbcdc ) +#define T_40_59(t) SHA_ROUND(t, SHA_MIX, ((B&C)+(D&(B^C))) , 0x8f1bbcdc ) #define T_60_79(t) SHA_ROUND(t, SHA_MIX, (B^C^D) , 0xca62c1d6 ) static void blk_SHA1Block(blk_SHA_CTX *ctx, const unsigned int *data) From 5d5210c35aa83342163ab0ab80b8e6d6fa3ce931 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 6 Aug 2009 07:45:46 -0700 Subject: [PATCH 09/17] block-sha1: get rid of redundant 'lenW' context .. and simplify the ctx->size logic. We now count the size in bytes, which means that 'lenW' was always just the low 6 bits of the total size, so we don't carry it around separately any more. And we do the 'size in bits' shift at the end. Suggested by Nicolas Pitre and linux@horizon.com. 
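A quick stand-alone check (added for exposition) of the padding arithmetic that survives this cleanup: for any residue lenW, writing 1+ (63 & (55 - lenW)) pad bytes plus the 8-byte length field always brings the total to a 64-byte boundary.

	#include <assert.h>

	int main(void)
	{
		int lenW;

		for (lenW = 0; lenW < 64; lenW++) {
			int total = lenW + 1 + (63 & (55 - lenW)) + 8;
			assert(total % 64 == 0);
		}
		return 0;
	}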
Signed-off-by: Linus Torvalds Signed-off-by: Junio C Hamano --- block-sha1/sha1.c | 17 +++++++---------- block-sha1/sha1.h | 1 - 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/block-sha1/sha1.c b/block-sha1/sha1.c index 5bf1b36bd1..fdd400f22d 100644 --- a/block-sha1/sha1.c +++ b/block-sha1/sha1.c @@ -14,7 +14,6 @@ static void blk_SHA1Block(blk_SHA_CTX *ctx, const unsigned int *data); void blk_SHA1_Init(blk_SHA_CTX *ctx) { - ctx->lenW = 0; ctx->size = 0; /* Initialize H with the magic constants (see FIPS180 for constants) @@ -29,9 +28,9 @@ void blk_SHA1_Init(blk_SHA_CTX *ctx) void blk_SHA1_Update(blk_SHA_CTX *ctx, const void *data, unsigned long len) { - int lenW = ctx->lenW; + int lenW = ctx->size & 63; - ctx->size += (unsigned long long) len << 3; + ctx->size += len; /* Read the data into W and process blocks as they get full */ @@ -43,7 +42,6 @@ void blk_SHA1_Update(blk_SHA_CTX *ctx, const void *data, unsigned long len) lenW = (lenW + left) & 63; len -= left; data += left; - ctx->lenW = lenW; if (lenW) return; blk_SHA1Block(ctx, ctx->W); @@ -53,10 +51,8 @@ void blk_SHA1_Update(blk_SHA_CTX *ctx, const void *data, unsigned long len) data += 64; len -= 64; } - if (len) { + if (len) memcpy(ctx->W, data, len); - ctx->lenW = len; - } } @@ -68,10 +64,11 @@ void blk_SHA1_Update(blk_SHA_CTX *ctx, const void *data, unsigned long len) /* Pad with a binary 1 (ie 0x80), then zeroes, then length */ - padlen[0] = htonl(ctx->size >> 32); - padlen[1] = htonl(ctx->size); + padlen[0] = htonl(ctx->size >> 29); + padlen[1] = htonl(ctx->size << 3); - blk_SHA1_Update(ctx, pad, 1+ (63 & (55 - ctx->lenW))); + i = ctx->size & 63; + blk_SHA1_Update(ctx, pad, 1+ (63 & (55 - i))); blk_SHA1_Update(ctx, padlen, 8); /* Output hash diff --git a/block-sha1/sha1.h b/block-sha1/sha1.h index 7be2d93a2a..c1ae74d3da 100644 --- a/block-sha1/sha1.h +++ b/block-sha1/sha1.h @@ -7,7 +7,6 @@ typedef struct { unsigned int H[5]; unsigned int W[16]; - int lenW; unsigned long long size; } blk_SHA_CTX; From 30d12d4c16abc052e8961c07651f97bea2c061bd Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 6 Aug 2009 12:41:00 -0700 Subject: [PATCH 10/17] block-sha1: perform register rotation using cpp Instead of letting the compiler figure out the optimal way to rotate register usage, explicitly rotate the register names with cpp.
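A toy sketch of the idea (illustrative only; ROUND's body is a stand-in, not the real SHA-1 round): each call site renames which variable plays the A/B/C/D/E role, so the round body only ever writes two variables and the old five-way "E = D; D = C; ..." shuffle disappears.

	#include <stdio.h>

	/* stand-ins for the real round computation and B = SHA_ROR(B, 2) */
	#define ROUND(A, B, C, D, E) do { E += A; B = ~B; } while (0)

	int main(void)
	{
		unsigned int A = 1, B = 2, C = 3, D = 4, E = 5;

		ROUND(A, B, C, D, E);	/* the result accumulates in E... */
		ROUND(E, A, B, C, D);	/* ...which plays the "A" role here */
		ROUND(D, E, A, B, C);
		ROUND(C, D, E, A, B);
		ROUND(B, C, D, E, A);	/* after five rounds the names recycle */
		printf("%u %u %u %u %u\n", A, B, C, D, E);
		return 0;
	}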
Signed-off-by: Linus Torvalds Signed-off-by: Junio C Hamano --- block-sha1/sha1.c | 117 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 90 insertions(+), 27 deletions(-) diff --git a/block-sha1/sha1.c b/block-sha1/sha1.c index fdd400f22d..b715916675 100644 --- a/block-sha1/sha1.c +++ b/block-sha1/sha1.c @@ -101,20 +101,20 @@ void blk_SHA1_Final(unsigned char hashout[20], blk_SHA_CTX *ctx) #define SHA_SRC(t) htonl(data[t]) #define SHA_MIX(t) SHA_ROL(W(t+13) ^ W(t+8) ^ W(t+2) ^ W(t), 1) -#define SHA_ROUND(t, input, fn, constant) \ - TEMP = input(t); W(t) = TEMP; \ - TEMP += SHA_ROL(A,5) + (fn) + E + (constant); \ - E = D; D = C; C = SHA_ROR(B, 2); B = A; A = TEMP +#define SHA_ROUND(t, input, fn, constant, A, B, C, D, E) do { \ + unsigned int TEMP = input(t); W(t) = TEMP; \ + TEMP += E + SHA_ROL(A,5) + (fn) + (constant); \ + B = SHA_ROR(B, 2); E = TEMP; } while (0) -#define T_0_15(t) SHA_ROUND(t, SHA_SRC, (((C^D)&B)^D) , 0x5a827999 ) -#define T_16_19(t) SHA_ROUND(t, SHA_MIX, (((C^D)&B)^D) , 0x5a827999 ) -#define T_20_39(t) SHA_ROUND(t, SHA_MIX, (B^C^D) , 0x6ed9eba1 ) -#define T_40_59(t) SHA_ROUND(t, SHA_MIX, ((B&C)+(D&(B^C))) , 0x8f1bbcdc ) -#define T_60_79(t) SHA_ROUND(t, SHA_MIX, (B^C^D) , 0xca62c1d6 ) +#define T_0_15(t, A, B, C, D, E) SHA_ROUND(t, SHA_SRC, (((C^D)&B)^D) , 0x5a827999, A, B, C, D, E ) +#define T_16_19(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (((C^D)&B)^D) , 0x5a827999, A, B, C, D, E ) +#define T_20_39(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (B^C^D) , 0x6ed9eba1, A, B, C, D, E ) +#define T_40_59(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, ((B&C)+(D&(B^C))) , 0x8f1bbcdc, A, B, C, D, E ) +#define T_60_79(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (B^C^D) , 0xca62c1d6, A, B, C, D, E ) static void blk_SHA1Block(blk_SHA_CTX *ctx, const unsigned int *data) { - unsigned int A,B,C,D,E,TEMP; + unsigned int A,B,C,D,E; unsigned int array[16]; A = ctx->H[0]; @@ -124,31 +124,94 @@ static void blk_SHA1Block(blk_SHA_CTX *ctx, const unsigned int *data) E = ctx->H[4]; /* Round 1 - iterations 0-16 take their input from 'data' */ - T_0_15( 0); T_0_15( 1); T_0_15( 2); T_0_15( 3); T_0_15( 4); - T_0_15( 5); T_0_15( 6); T_0_15( 7); T_0_15( 8); T_0_15( 9); - T_0_15(10); T_0_15(11); T_0_15(12); T_0_15(13); T_0_15(14); - T_0_15(15); + T_0_15( 0, A, B, C, D, E); + T_0_15( 1, E, A, B, C, D); + T_0_15( 2, D, E, A, B, C); + T_0_15( 3, C, D, E, A, B); + T_0_15( 4, B, C, D, E, A); + T_0_15( 5, A, B, C, D, E); + T_0_15( 6, E, A, B, C, D); + T_0_15( 7, D, E, A, B, C); + T_0_15( 8, C, D, E, A, B); + T_0_15( 9, B, C, D, E, A); + T_0_15(10, A, B, C, D, E); + T_0_15(11, E, A, B, C, D); + T_0_15(12, D, E, A, B, C); + T_0_15(13, C, D, E, A, B); + T_0_15(14, B, C, D, E, A); + T_0_15(15, A, B, C, D, E); /* Round 1 - tail. 
Input from 512-bit mixing array */ - T_16_19(16); T_16_19(17); T_16_19(18); T_16_19(19); + T_16_19(16, E, A, B, C, D); + T_16_19(17, D, E, A, B, C); + T_16_19(18, C, D, E, A, B); + T_16_19(19, B, C, D, E, A); /* Round 2 */ - T_20_39(20); T_20_39(21); T_20_39(22); T_20_39(23); T_20_39(24); - T_20_39(25); T_20_39(26); T_20_39(27); T_20_39(28); T_20_39(29); - T_20_39(30); T_20_39(31); T_20_39(32); T_20_39(33); T_20_39(34); - T_20_39(35); T_20_39(36); T_20_39(37); T_20_39(38); T_20_39(39); + T_20_39(20, A, B, C, D, E); + T_20_39(21, E, A, B, C, D); + T_20_39(22, D, E, A, B, C); + T_20_39(23, C, D, E, A, B); + T_20_39(24, B, C, D, E, A); + T_20_39(25, A, B, C, D, E); + T_20_39(26, E, A, B, C, D); + T_20_39(27, D, E, A, B, C); + T_20_39(28, C, D, E, A, B); + T_20_39(29, B, C, D, E, A); + T_20_39(30, A, B, C, D, E); + T_20_39(31, E, A, B, C, D); + T_20_39(32, D, E, A, B, C); + T_20_39(33, C, D, E, A, B); + T_20_39(34, B, C, D, E, A); + T_20_39(35, A, B, C, D, E); + T_20_39(36, E, A, B, C, D); + T_20_39(37, D, E, A, B, C); + T_20_39(38, C, D, E, A, B); + T_20_39(39, B, C, D, E, A); /* Round 3 */ - T_40_59(40); T_40_59(41); T_40_59(42); T_40_59(43); T_40_59(44); - T_40_59(45); T_40_59(46); T_40_59(47); T_40_59(48); T_40_59(49); - T_40_59(50); T_40_59(51); T_40_59(52); T_40_59(53); T_40_59(54); - T_40_59(55); T_40_59(56); T_40_59(57); T_40_59(58); T_40_59(59); + T_40_59(40, A, B, C, D, E); + T_40_59(41, E, A, B, C, D); + T_40_59(42, D, E, A, B, C); + T_40_59(43, C, D, E, A, B); + T_40_59(44, B, C, D, E, A); + T_40_59(45, A, B, C, D, E); + T_40_59(46, E, A, B, C, D); + T_40_59(47, D, E, A, B, C); + T_40_59(48, C, D, E, A, B); + T_40_59(49, B, C, D, E, A); + T_40_59(50, A, B, C, D, E); + T_40_59(51, E, A, B, C, D); + T_40_59(52, D, E, A, B, C); + T_40_59(53, C, D, E, A, B); + T_40_59(54, B, C, D, E, A); + T_40_59(55, A, B, C, D, E); + T_40_59(56, E, A, B, C, D); + T_40_59(57, D, E, A, B, C); + T_40_59(58, C, D, E, A, B); + T_40_59(59, B, C, D, E, A); /* Round 4 */ - T_60_79(60); T_60_79(61); T_60_79(62); T_60_79(63); T_60_79(64); - T_60_79(65); T_60_79(66); T_60_79(67); T_60_79(68); T_60_79(69); - T_60_79(70); T_60_79(71); T_60_79(72); T_60_79(73); T_60_79(74); - T_60_79(75); T_60_79(76); T_60_79(77); T_60_79(78); T_60_79(79); + T_60_79(60, A, B, C, D, E); + T_60_79(61, E, A, B, C, D); + T_60_79(62, D, E, A, B, C); + T_60_79(63, C, D, E, A, B); + T_60_79(64, B, C, D, E, A); + T_60_79(65, A, B, C, D, E); + T_60_79(66, E, A, B, C, D); + T_60_79(67, D, E, A, B, C); + T_60_79(68, C, D, E, A, B); + T_60_79(69, B, C, D, E, A); + T_60_79(70, A, B, C, D, E); + T_60_79(71, E, A, B, C, D); + T_60_79(72, D, E, A, B, C); + T_60_79(73, C, D, E, A, B); + T_60_79(74, B, C, D, E, A); + T_60_79(75, A, B, C, D, E); + T_60_79(76, E, A, B, C, D); + T_60_79(77, D, E, A, B, C); + T_60_79(78, C, D, E, A, B); + T_60_79(79, B, C, D, E, A); ctx->H[0] += A; ctx->H[1] += B; From 66c9c6c0fbba0894ebce3da572f62eb05162e547 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 7 Aug 2009 21:16:46 -0700 Subject: [PATCH 11/17] block-sha1: improved SHA1 hashing I think I have found a way to avoid the gcc crazyness. Lookie here: # TIME[s] SPEED[MB/s] rfc3174 5.094 119.8 rfc3174 5.098 119.7 linus 1.462 417.5 linusas 2.008 304 linusas2 1.878 325 mozilla 5.566 109.6 mozillaas 5.866 104.1 openssl 1.609 379.3 spelvin 1.675 364.5 spelvina 1.601 381.3 nettle 1.591 383.6 notice? I outperform all the hand-tuned asm on 32-bit too. By quite a margin, in fact. 
Now, I didn't try a P4, and it's possible that it won't do that there, but the 32-bit code generation sure looks impressive on my Nehalem box. The magic? I force the stores to the 512-bit hash bucket to be done in order. That seems to help a lot. The diff is trivial (on top of the "rename registers with cpp" patch), as appended. And it does seem to fix the P4 issues too, although I can obviously (once again) only test Prescott, and only in 64-bit mode: # TIME[s] SPEED[MB/s] rfc3174 1.662 36.73 rfc3174 1.64 37.22 linus 0.2523 241.9 linusas 0.4367 139.8 linusas2 0.4487 136 mozilla 0.9704 62.9 mozillaas 0.9399 64.94 that's some really impressive improvement. All from just saying "do the stores in the order I told you to, dammit!" to the compiler. Signed-off-by: Linus Torvalds Signed-off-by: Junio C Hamano --- block-sha1/sha1.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/block-sha1/sha1.c b/block-sha1/sha1.c index b715916675..886bcf25e2 100644 --- a/block-sha1/sha1.c +++ b/block-sha1/sha1.c @@ -93,6 +93,7 @@ void blk_SHA1_Final(unsigned char hashout[20], blk_SHA_CTX *ctx) /* This "rolls" over the 512-bit array */ #define W(x) (array[(x)&15]) +#define setW(x, val) (*(volatile unsigned int *)&W(x) = (val)) /* * Where do we get the source from? The first 16 iterations get it from @@ -102,9 +103,9 @@ void blk_SHA1_Final(unsigned char hashout[20], blk_SHA_CTX *ctx) #define SHA_MIX(t) SHA_ROL(W(t+13) ^ W(t+8) ^ W(t+2) ^ W(t), 1) #define SHA_ROUND(t, input, fn, constant, A, B, C, D, E) do { \ - unsigned int TEMP = input(t); W(t) = TEMP; \ - TEMP += E + SHA_ROL(A,5) + (fn) + (constant); \ - B = SHA_ROR(B, 2); E = TEMP; } while (0) + unsigned int TEMP = input(t); setW(t, TEMP); \ + E += TEMP + SHA_ROL(A,5) + (fn) + (constant); \ + B = SHA_ROR(B, 2); } while (0) #define T_0_15(t, A, B, C, D, E) SHA_ROUND(t, SHA_SRC, (((C^D)&B)^D) , 0x5a827999, A, B, C, D, E ) #define T_16_19(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (((C^D)&B)^D) , 0x5a827999, A, B, C, D, E ) From 926172c5e4808726244713ef70398cd38b055f1e Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 10 Aug 2009 16:52:07 -0700 Subject: [PATCH 12/17] block-sha1: improve code on large-register-set machines For x86 performance (especially in 32-bit mode) I added that hack to write the SHA1 internal temporary hash using a volatile pointer, in order to get gcc to not try to cache the array contents. Because gcc will do all the wrong things, and then spill things in insane random ways. But on architectures like PPC, where you have 32 registers, it's actually perfectly reasonable to put the whole temporary array[] into the register set, and gcc can do so. So make the 'volatile unsigned int *' cast be dependent on a SMALL_REGISTER_SET preprocessor symbol, and enable it (currently) on just x86 and x86-64. With that, the routine is fairly reasonable even when compared to the hand-scheduled PPC version. Ben Herrenschmidt reports on a G5: * Paulus asm version: about 3.67s * Yours with no change: about 5.74s * Yours without "volatile": about 3.78s so with this the C version is within about 3% of the asm one. And add a lot of commentary on what the heck is going on. 
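For reference, the store strategies in play, side by side (an illustrative summary; setW_opaque is the "+m" alternative Artur Skawina suggested, mentioned in the new comment below, and is not what the patch itself uses):

	#define W(x) (array[(x)&15])

	/* small register set (x86): force each store to memory so gcc
	 * stops trying to cache the whole array[] in registers */
	#define setW_volatile(x, val) (*(volatile unsigned int *)&W(x) = (val))

	/* alternative: a plain store plus an asm that makes the memory
	 * operand opaque, so gcc also can't "optimize away" later loads */
	#define setW_opaque(x, val) \
		do { W(x) = (val); __asm__("" : "+m" (W(x))); } while (0)

	/* large register set (e.g. PPC): let the compiler keep W in registers */
	#define setW_plain(x, val) (W(x) = (val))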
Signed-off-by: Linus Torvalds Signed-off-by: Junio C Hamano --- block-sha1/sha1.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/block-sha1/sha1.c b/block-sha1/sha1.c index 886bcf25e2..304cd0452d 100644 --- a/block-sha1/sha1.c +++ b/block-sha1/sha1.c @@ -82,6 +82,7 @@ void blk_SHA1_Final(unsigned char hashout[20], blk_SHA_CTX *ctx) #define SHA_ASM(op, x, n) ({ unsigned int __res; __asm__(op " %1,%0":"=r" (__res):"i" (n), "0" (x)); __res; }) #define SHA_ROL(x,n) SHA_ASM("rol", x, n) #define SHA_ROR(x,n) SHA_ASM("ror", x, n) +#define SMALL_REGISTER_SET #else @@ -93,7 +94,29 @@ void blk_SHA1_Final(unsigned char hashout[20], blk_SHA_CTX *ctx) /* This "rolls" over the 512-bit array */ #define W(x) (array[(x)&15]) -#define setW(x, val) (*(volatile unsigned int *)&W(x) = (val)) + +/* + * If you have 32 registers or more, the compiler can (and should) + * try to change the array[] accesses into registers. However, on + * machines with less than ~25 registers, that won't really work, + * and at least gcc will make an unholy mess of it. + * + * So to avoid that mess which just slows things down, we force + * the stores to memory to actually happen (we might be better off + * with a 'W(t)=(val);asm("":"+m" (W(t))' there instead, as + * suggested by Artur Skawina - that will also make gcc unable to + * try to do the silly "optimize away loads" part because it won't + * see what the value will be). + * + * Ben Herrenschmidt reports that on PPC, the C version comes close + * to the optimized asm with this (ie on PPC you don't want that + * 'volatile', since there are lots of registers). + */ +#ifdef SMALL_REGISTER_SET + #define setW(x, val) (*(volatile unsigned int *)&W(x) = (val)) +#else + #define setW(x, val) (W(x) = (val)) +#endif /* * Where do we get the source from? The first 16 iterations get it from From 30ba0de726d92ccfc93009eb60f2c30b0886f61b Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Wed, 12 Aug 2009 15:45:48 -0400 Subject: [PATCH 13/17] block-sha1: move code around Move the code around so specific architecture hacks are defined first. Also make one line comments actually one line. No code change. 
Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- block-sha1/sha1.c | 129 +++++++++++++++++++++------------------------- 1 file changed, 60 insertions(+), 69 deletions(-) diff --git a/block-sha1/sha1.c b/block-sha1/sha1.c index 304cd0452d..c3f1ae59b9 100644 --- a/block-sha1/sha1.c +++ b/block-sha1/sha1.c @@ -9,74 +9,6 @@ #include "sha1.h" -/* Hash one 64-byte block of data */ -static void blk_SHA1Block(blk_SHA_CTX *ctx, const unsigned int *data); - -void blk_SHA1_Init(blk_SHA_CTX *ctx) -{ - ctx->size = 0; - - /* Initialize H with the magic constants (see FIPS180 for constants) - */ - ctx->H[0] = 0x67452301; - ctx->H[1] = 0xefcdab89; - ctx->H[2] = 0x98badcfe; - ctx->H[3] = 0x10325476; - ctx->H[4] = 0xc3d2e1f0; -} - - -void blk_SHA1_Update(blk_SHA_CTX *ctx, const void *data, unsigned long len) -{ - int lenW = ctx->size & 63; - - ctx->size += len; - - /* Read the data into W and process blocks as they get full - */ - if (lenW) { - int left = 64 - lenW; - if (len < left) - left = len; - memcpy(lenW + (char *)ctx->W, data, left); - lenW = (lenW + left) & 63; - len -= left; - data += left; - if (lenW) - return; - blk_SHA1Block(ctx, ctx->W); - } - while (len >= 64) { - blk_SHA1Block(ctx, data); - data += 64; - len -= 64; - } - if (len) - memcpy(ctx->W, data, len); -} - - -void blk_SHA1_Final(unsigned char hashout[20], blk_SHA_CTX *ctx) -{ - static const unsigned char pad[64] = { 0x80 }; - unsigned int padlen[2]; - int i; - - /* Pad with a binary 1 (ie 0x80), then zeroes, then length - */ - padlen[0] = htonl(ctx->size >> 29); - padlen[1] = htonl(ctx->size << 3); - - i = ctx->size & 63; - blk_SHA1_Update(ctx, pad, 1+ (63 & (55 - i))); - blk_SHA1_Update(ctx, padlen, 8); - - /* Output hash - */ - for (i = 0; i < 5; i++) - ((unsigned int *)hashout)[i] = htonl(ctx->H[i]); -} - #if defined(__i386__) || defined(__x86_64__) #define SHA_ASM(op, x, n) ({ unsigned int __res; __asm__(op " %1,%0":"=r" (__res):"i" (n), "0" (x)); __res; }) @@ -136,7 +68,7 @@ void blk_SHA1_Final(unsigned char hashout[20], blk_SHA_CTX *ctx) #define T_40_59(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, ((B&C)+(D&(B^C))) , 0x8f1bbcdc, A, B, C, D, E ) #define T_60_79(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (B^C^D) , 0xca62c1d6, A, B, C, D, E ) -static void blk_SHA1Block(blk_SHA_CTX *ctx, const unsigned int *data) +static void blk_SHA1_Block(blk_SHA_CTX *ctx, const unsigned int *data) { unsigned int A,B,C,D,E; unsigned int array[16]; @@ -243,3 +175,62 @@ static void blk_SHA1Block(blk_SHA_CTX *ctx, const unsigned int *data) ctx->H[3] += D; ctx->H[4] += E; } + +void blk_SHA1_Init(blk_SHA_CTX *ctx) +{ + ctx->size = 0; + + /* Initialize H with the magic constants (see FIPS180 for constants) */ + ctx->H[0] = 0x67452301; + ctx->H[1] = 0xefcdab89; + ctx->H[2] = 0x98badcfe; + ctx->H[3] = 0x10325476; + ctx->H[4] = 0xc3d2e1f0; +} + +void blk_SHA1_Update(blk_SHA_CTX *ctx, const void *data, unsigned long len) +{ + int lenW = ctx->size & 63; + + ctx->size += len; + + /* Read the data into W and process blocks as they get full */ + if (lenW) { + int left = 64 - lenW; + if (len < left) + left = len; + memcpy(lenW + (char *)ctx->W, data, left); + lenW = (lenW + left) & 63; + len -= left; + data += left; + if (lenW) + return; + blk_SHA1_Block(ctx, ctx->W); + } + while (len >= 64) { + blk_SHA1_Block(ctx, data); + data += 64; + len -= 64; + } + if (len) + memcpy(ctx->W, data, len); +} + +void blk_SHA1_Final(unsigned char hashout[20], blk_SHA_CTX *ctx) +{ + static const unsigned char pad[64] = { 0x80 }; + unsigned int padlen[2]; + int i; + + /* 
Pad with a binary 1 (ie 0x80), then zeroes, then length */ + padlen[0] = htonl(ctx->size >> 29); + padlen[1] = htonl(ctx->size << 3); + + i = ctx->size & 63; + blk_SHA1_Update(ctx, pad, 1+ (63 & (55 - i))); + blk_SHA1_Update(ctx, padlen, 8); + + /* Output hash */ + for (i = 0; i < 5; i++) + ((unsigned int *)hashout)[i] = htonl(ctx->H[i]); +} From dc52fd29738c2af98f3e986691eca34addfd4914 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Wed, 12 Aug 2009 15:46:41 -0400 Subject: [PATCH 14/17] block-sha1: split the different "hacks" to be individually selected This is to make it easier for them to be selected individually depending on the architecture instead of the other way around i.e. having each architecture select a list of hacks up front. That makes for clearer documentation as well. Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- block-sha1/sha1.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/block-sha1/sha1.c b/block-sha1/sha1.c index c3f1ae59b9..67c9bd0723 100644 --- a/block-sha1/sha1.c +++ b/block-sha1/sha1.c @@ -11,10 +11,16 @@ #if defined(__i386__) || defined(__x86_64__) +/* + * Force usage of rol or ror by selecting the one with the smaller constant. + * It _can_ generate slightly smaller code (a constant of 1 is special), but + * perhaps more importantly it's possibly faster on any uarch that does a + * rotate with a loop. + */ + #define SHA_ASM(op, x, n) ({ unsigned int __res; __asm__(op " %1,%0":"=r" (__res):"i" (n), "0" (x)); __res; }) #define SHA_ROL(x,n) SHA_ASM("rol", x, n) #define SHA_ROR(x,n) SHA_ASM("ror", x, n) -#define SMALL_REGISTER_SET #else @@ -24,9 +30,6 @@ #endif -/* This "rolls" over the 512-bit array */ -#define W(x) (array[(x)&15]) - /* * If you have 32 registers or more, the compiler can (and should) * try to change the array[] accesses into registers. However, on @@ -43,13 +46,23 @@ * Ben Herrenschmidt reports that on PPC, the C version comes close * to the optimized asm with this (ie on PPC you don't want that * 'volatile', since there are lots of registers). + * + * On ARM we get the best code generation by forcing a full memory barrier + * between each SHA_ROUND, otherwise gcc happily get wild with spilling and + * the stack frame size simply explode and performance goes down the drain. */ -#ifdef SMALL_REGISTER_SET + +#if defined(__i386__) || defined(__x86_64__) #define setW(x, val) (*(volatile unsigned int *)&W(x) = (val)) +#elif defined(__arm__) + #define setW(x, val) do { W(x) = (val); __asm__("":::"memory"); } while (0) #else #define setW(x, val) (W(x) = (val)) #endif +/* This "rolls" over the 512-bit array */ +#define W(x) (array[(x)&15]) + /* * Where do we get the source from? The first 16 iterations get it from * the input data, the next mix it from the 512-bit array. From 660231aa9727d29c7d2c16319bc6a3fa8bed3e0e Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Wed, 12 Aug 2009 15:47:55 -0400 Subject: [PATCH 15/17] block-sha1: support for architectures with memory alignment restrictions This is needed on architectures with poor or non-existent unaligned memory support and/or no fast byte swap instruction (such as ARM) by using byte accesses to memory and shifting the result together. This also makes the code portable, therefore the byte access methods are the defaults. Any architecture that properly supports unaligned word accesses in hardware simply has to enable the alternative methods. 
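A stand-alone check (added for exposition, not part of the patch) that the portable byte-shift reader and the cast-plus-ntohl() reader agree; the cast version additionally assumes unaligned word loads are acceptable, which is exactly what the #if gates on.

	#include <arpa/inet.h>
	#include <assert.h>

	static unsigned int get_be32_bytes(const void *p)
	{
		const unsigned char *c = p;
		return (c[0] << 24) | (c[1] << 16) | (c[2] << 8) | (c[3] << 0);
	}

	int main(void)
	{
		/* the union keeps the bytes word-aligned for the cast version */
		union { unsigned int word; unsigned char byte[4]; } u =
			{ .byte = { 0x12, 0x34, 0x56, 0x78 } };

		assert(get_be32_bytes(u.byte) == ntohl(u.word));
		assert(get_be32_bytes(u.byte) == 0x12345678);
		return 0;
	}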
Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- block-sha1/sha1.c | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/block-sha1/sha1.c b/block-sha1/sha1.c index 67c9bd0723..d3121f7a02 100644 --- a/block-sha1/sha1.c +++ b/block-sha1/sha1.c @@ -60,6 +60,34 @@ #define setW(x, val) (W(x) = (val)) #endif +/* + * Performance might be improved if the CPU architecture is OK with + * unaligned 32-bit loads and a fast ntohl() is available. + * Otherwise fall back to byte loads and shifts which is portable, + * and is faster on architectures with memory alignment issues. + */ + +#if defined(__i386__) || defined(__x86_64__) + +#define get_be32(p) ntohl(*(unsigned int *)(p)) +#define put_be32(p, v) do { *(unsigned int *)(p) = htonl(v); } while (0) + +#else + +#define get_be32(p) ( \ + (*((unsigned char *)(p) + 0) << 24) | \ + (*((unsigned char *)(p) + 1) << 16) | \ + (*((unsigned char *)(p) + 2) << 8) | \ + (*((unsigned char *)(p) + 3) << 0) ) +#define put_be32(p, v) do { \ + unsigned int __v = (v); \ + *((unsigned char *)(p) + 0) = __v >> 24; \ + *((unsigned char *)(p) + 1) = __v >> 16; \ + *((unsigned char *)(p) + 2) = __v >> 8; \ + *((unsigned char *)(p) + 3) = __v >> 0; } while (0) + +#endif + /* This "rolls" over the 512-bit array */ #define W(x) (array[(x)&15]) @@ -67,7 +95,7 @@ * Where do we get the source from? The first 16 iterations get it from * the input data, the next mix it from the 512-bit array. */ -#define SHA_SRC(t) htonl(data[t]) +#define SHA_SRC(t) get_be32(data + t) #define SHA_MIX(t) SHA_ROL(W(t+13) ^ W(t+8) ^ W(t+2) ^ W(t), 1) #define SHA_ROUND(t, input, fn, constant, A, B, C, D, E) do { \ @@ -245,5 +273,5 @@ void blk_SHA1_Final(unsigned char hashout[20], blk_SHA_CTX *ctx) /* Output hash */ for (i = 0; i < 5; i++) - ((unsigned int *)hashout)[i] = htonl(ctx->H[i]); + put_be32(hashout + i*4, ctx->H[i]); } From ee7dc310af660f423732369e955651ef2f05011d Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Thu, 13 Aug 2009 00:29:14 -0400 Subject: [PATCH 16/17] block-sha1: more good unaligned memory access candidates In addition to X86, PowerPC and S390 are capable of unaligned memory accesses. Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- block-sha1/sha1.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/block-sha1/sha1.c b/block-sha1/sha1.c index d3121f7a02..e5a100754e 100644 --- a/block-sha1/sha1.c +++ b/block-sha1/sha1.c @@ -67,7 +67,10 @@ * and is faster on architectures with memory alignment issues. */ -#if defined(__i386__) || defined(__x86_64__) +#if defined(__i386__) || defined(__x86_64__) || \ + defined(__ppc__) || defined(__ppc64__) || \ + defined(__powerpc__) || defined(__powerpc64__) || \ + defined(__s390__) || defined(__s390x__) #define get_be32(p) ntohl(*(unsigned int *)(p)) #define put_be32(p, v) do { *(unsigned int *)(p) = htonl(v); } while (0) From a12218572f2875e91b6c3c12559b076c4949a675 Mon Sep 17 00:00:00 2001 From: Brandon Casey Date: Fri, 14 Aug 2009 17:52:15 -0500 Subject: [PATCH 17/17] block-sha1/sha1.c: silence compiler complaints by casting void * to char * Some compilers produce errors when arithmetic is attempted on pointers to void. We want computations done on byte addresses, so cast them to char * to work them around. 
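A short illustration of the issue (mine, not from the patch): ISO C defines no arithmetic on void * because sizeof(void) does not exist; gcc accepts it as an extension that steps by single bytes, but stricter compilers and -Wpointer-arith builds reject it. Casting through char * advances by bytes portably.

	#include <stddef.h>

	static const void *advance(const void *p, size_t nbytes)
	{
		return (const char *)p + nbytes;	/* portable: steps in bytes */
		/* return p + nbytes;  -- gcc extension only */
	}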
Signed-off-by: Brandon Casey Signed-off-by: Junio C Hamano --- block-sha1/sha1.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block-sha1/sha1.c b/block-sha1/sha1.c index e5a100754e..464cb258aa 100644 --- a/block-sha1/sha1.c +++ b/block-sha1/sha1.c @@ -246,14 +246,14 @@ void blk_SHA1_Update(blk_SHA_CTX *ctx, const void *data, unsigned long len) memcpy(lenW + (char *)ctx->W, data, left); lenW = (lenW + left) & 63; len -= left; - data += left; + data = ((const char *)data + left); if (lenW) return; blk_SHA1_Block(ctx, ctx->W); } while (len >= 64) { blk_SHA1_Block(ctx, data); - data += 64; + data = ((const char *)data + 64); len -= 64; } if (len)
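With the series applied, a quick smoke test of the finished routines against the FIPS 180 test vector; an illustrative harness, not part of the series (build with something like: cc test-sha1.c block-sha1/sha1.c).

	#include <stdio.h>
	#include <string.h>
	#include "block-sha1/sha1.h"

	int main(void)
	{
		/* SHA1("abc"), from the FIPS 180 appendix */
		static const char expect[] =
			"a9993e364706816aba3e25717850c26c9cd0d89d";
		unsigned char hash[20];
		char hex[41];
		blk_SHA_CTX ctx;
		int i;

		blk_SHA1_Init(&ctx);
		blk_SHA1_Update(&ctx, "abc", 3);
		blk_SHA1_Final(hash, &ctx);

		for (i = 0; i < 20; i++)
			sprintf(hex + 2 * i, "%02x", hash[i]);
		printf("%s %s\n", hex, strcmp(hex, expect) ? "FAIL" : "ok");
		return 0;
	}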