From 6b7d25d97bdb8a26719f90d17ff5c9720be68762 Mon Sep 17 00:00:00 2001
From: Nicolas Pitre <nico@cam.org>
Date: Tue, 21 Feb 2006 20:45:36 -0500
Subject: [PATCH 01/16] diff-delta: produce optimal pack data

Indexing based on adler32 has a match precision based on the block size
(currently 16).  Lowering the block size would produce smaller deltas
but the indexing memory and computing cost increases significantly.

For optimal delta result the indexing block size should be 3 with an
increment of 1 (instead of 16 and 16).  With such low params the adler32
becomes a clear overhead increasing the time for git-repack by a factor
of 3.  And with such small blocks the adler 32 is not very useful as the
whole of the block bits can be used directly.

This patch replaces the adler32 with an open coded index value based on
3 characters directly.  This gives sufficient bits for hashing and
allows for optimal delta with reasonable CPU cycles.

The resulting packs are 6% smaller on average.  The increase in CPU time
is about 25%.  But this cost is now hidden by the delta reuse patch
while the saving on data transfers is always there.

Signed-off-by: Nicolas Pitre <nico@cam.org>
Signed-off-by: Junio C Hamano <junkio@cox.net>
---
 diff-delta.c | 77 ++++++++++++++++++++--------------------------------
 1 file changed, 30 insertions(+), 47 deletions(-)

diff --git a/diff-delta.c b/diff-delta.c
index 2ed5984b1c..27f83a0858 100644
--- a/diff-delta.c
+++ b/diff-delta.c
@@ -20,21 +20,11 @@
 
 #include <stdlib.h>
 #include <string.h>
-#include <zlib.h>
 #include "delta.h"
 
 
-/* block size: min = 16, max = 64k, power of 2 */
-#define BLK_SIZE 16
-
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-
-#define GR_PRIME 0x9e370001
-#define HASH(v, shift) (((unsigned int)(v) * GR_PRIME) >> (shift))
-
 struct index {
 	const unsigned char *ptr;
-	unsigned int val;
 	struct index *next;
 };
 
@@ -42,21 +32,21 @@ static struct index ** delta_index(const unsigned char *buf,
 				   unsigned long bufsize,
 				   unsigned int *hash_shift)
 {
-	unsigned int hsize, hshift, entries, blksize, i;
+	unsigned long hsize;
+	unsigned int hshift, i;
 	const unsigned char *data;
 	struct index *entry, **hash;
 	void *mem;
 
 	/* determine index hash size */
-	entries = (bufsize + BLK_SIZE - 1) / BLK_SIZE;
-	hsize = entries / 4;
-	for (i = 4; (1 << i) < hsize && i < 16; i++);
+	hsize = bufsize / 4;
+	for (i = 8; (1 << i) < hsize && i < 16; i++);
 	hsize = 1 << i;
-	hshift = 32 - i;
+	hshift = i - 8;
 	*hash_shift = hshift;
 
 	/* allocate lookup index */
-	mem = malloc(hsize * sizeof(*hash) + entries * sizeof(*entry));
+	mem = malloc(hsize * sizeof(*hash) + bufsize * sizeof(*entry));
 	if (!mem)
 		return NULL;
 	hash = mem;
@@ -64,17 +54,12 @@ static struct index ** delta_index(const unsigned char *buf,
 	memset(hash, 0, hsize * sizeof(*hash));
 
 	/* then populate it */
-	data = buf + entries * BLK_SIZE - BLK_SIZE;
-	blksize = bufsize - (data - buf);
-	while (data >= buf) {
-		unsigned int val = adler32(0, data, blksize);
-		i = HASH(val, hshift);
-		entry->ptr = data;
-		entry->val = val;
+	data = buf + bufsize - 2;
+	while (data > buf) {
+		entry->ptr = --data;
+		i = data[0] ^ data[1] ^ (data[2] << hshift);
 		entry->next = hash[i];
 		hash[i] = entry++;
-		blksize = BLK_SIZE;
-		data -= BLK_SIZE;
  	}
 
 	return hash;
@@ -141,29 +126,27 @@ void *diff_delta(void *from_buf, unsigned long from_size,
 
 	while (data < top) {
 		unsigned int moff = 0, msize = 0;
-		unsigned int blksize = MIN(top - data, BLK_SIZE);
-		unsigned int val = adler32(0, data, blksize);
-		i = HASH(val, hash_shift);
-		for (entry = hash[i]; entry; entry = entry->next) {
-			const unsigned char *ref = entry->ptr;
-			const unsigned char *src = data;
-			unsigned int ref_size = ref_top - ref;
-			if (entry->val != val)
-				continue;
-			if (ref_size > top - src)
-				ref_size = top - src;
-			while (ref_size && *src++ == *ref) {
-				ref++;
-				ref_size--;
-			}
-			ref_size = ref - entry->ptr;
-			if (ref_size > msize) {
-				/* this is our best match so far */
-				moff = entry->ptr - ref_data;
-				msize = ref_size;
-				if (msize >= 0x10000) {
-					msize = 0x10000;
+		if (data + 2 < top) {
+			i = data[0] ^ data[1] ^ (data[2] << hash_shift);
+			for (entry = hash[i]; entry; entry = entry->next) {
+				const unsigned char *ref = entry->ptr;
+				const unsigned char *src = data;
+				unsigned int ref_size = ref_top - ref;
+				if (ref_size > top - src)
+					ref_size = top - src;
+				if (ref_size > 0x10000)
+					ref_size = 0x10000;
+				if (ref_size <= msize)
 					break;
+				while (ref_size && *src++ == *ref) {
+					ref++;
+					ref_size--;
+				}
+				ref_size = ref - entry->ptr;
+				if (msize < ref - entry->ptr) {
+					/* this is our best match so far */
+					msize = ref - entry->ptr;
+					moff = entry->ptr - ref_data;
 				}
 			}
 		}

From 907380eeff27e9a07d6f1c03847c3d80f9e0e79a Mon Sep 17 00:00:00 2001
From: Junio C Hamano <junkio@cox.net>
Date: Thu, 23 Feb 2006 02:58:37 -0800
Subject: [PATCH 02/16] count-delta: tweak counting of copied source material.

With the finer grained delta algorithm, count-delta algorithm
started overcounting copied source material, since the new delta
output tends to reuse the same source range more than once and
more aggressively.  This broke an earlier assumption that the
number of bytes copied out from the source buffer is a good
approximation of how much source material actually remains in
the result.

This uses a fairly inefficient algorithm to keep track of ranges
of source material that are actually copied out to the
destination buffer.  With this tweak, the obvious rename/break
detection tests in the testsuite start to work again.

Signed-off-by: Junio C Hamano <junkio@cox.net>
---
 count-delta.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 71 insertions(+), 6 deletions(-)

diff --git a/count-delta.c b/count-delta.c
index 058a2aadb1..4e4d2f4fcc 100644
--- a/count-delta.c
+++ b/count-delta.c
@@ -3,11 +3,74 @@
  * The delta-parsing part is almost straight copy of patch-delta.c
  * which is (C) 2005 Nicolas Pitre <nico@cam.org>.
  */
+#include "cache.h"
+#include "delta.h"
+#include "count-delta.h"
 #include <stdlib.h>
 #include <string.h>
 #include <limits.h>
-#include "delta.h"
-#include "count-delta.h"
+
+struct span {
+	struct span *next;
+	unsigned long ofs;
+	unsigned long end;
+};
+
+static void touch_range(struct span **span,
+			unsigned long ofs, unsigned long end)
+{
+	struct span *e = *span;
+	struct span *p = NULL;
+
+	while (e && e->ofs <= ofs) {
+		again:
+		if (ofs < e->end) {
+			while (e->end < end) {
+				if (e->next) {
+					e->end = e->next->ofs;
+					e = e->next;
+				}
+				else {
+					e->end = end;
+					return;
+				}
+			}
+			return;
+		}
+		p = e;
+		e = e->next;
+	}
+	if (e && e->ofs <= end) {
+		e->ofs = ofs;
+		goto again;
+	}
+	else {
+		e = xmalloc(sizeof(*e));
+		e->ofs = ofs;
+		e->end = end;
+		if (p) {
+			e->next = p->next;
+			p->next = e;
+		}
+		else {
+			e->next = *span;
+			*span = e;
+		}
+	}
+}
+
+static unsigned long count_range(struct span *s)
+{
+	struct span *t;
+	unsigned long sz = 0;
+	while (s) {
+		t = s;
+		sz += s->end - s->ofs;
+		s = s->next;
+		free(t);
+	}
+	return sz;
+}
 
 /*
  * NOTE.  We do not _interpret_ delta fully.  As an approximation, we
@@ -21,10 +84,11 @@
 int count_delta(void *delta_buf, unsigned long delta_size,
 		unsigned long *src_copied, unsigned long *literal_added)
 {
-	unsigned long copied_from_source, added_literal;
+	unsigned long added_literal;
 	const unsigned char *data, *top;
 	unsigned char cmd;
 	unsigned long src_size, dst_size, out;
+	struct span *span = NULL;
 
 	if (delta_size < DELTA_SIZE_MIN)
 		return -1;
@@ -35,7 +99,7 @@ int count_delta(void *delta_buf, unsigned long delta_size,
 	src_size = get_delta_hdr_size(&data);
 	dst_size = get_delta_hdr_size(&data);
 
-	added_literal = copied_from_source = out = 0;
+	added_literal = out = 0;
 	while (data < top) {
 		cmd = *data++;
 		if (cmd & 0x80) {
@@ -49,7 +113,7 @@ int count_delta(void *delta_buf, unsigned long delta_size,
 			if (cmd & 0x40) cp_size |= (*data++ << 16);
 			if (cp_size == 0) cp_size = 0x10000;
 
-			copied_from_source += cp_size;
+			touch_range(&span, cp_off, cp_off+cp_size);
 			out += cp_size;
 		} else {
 			/* write literal into dst */
@@ -59,6 +123,8 @@ int count_delta(void *delta_buf, unsigned long delta_size,
 		}
 	}
 
+	*src_copied = count_range(span);
+
 	/* sanity check */
 	if (data != top || out != dst_size)
 		return -1;
@@ -66,7 +132,6 @@ int count_delta(void *delta_buf, unsigned long delta_size,
 	/* delete size is what was _not_ copied from source.
 	 * edit size is that and literal additions.
 	 */
-	*src_copied = copied_from_source;
 	*literal_added = added_literal;
 	return 0;
 }

From c86e8568d87aec483379f2cef0ab81580abd1af5 Mon Sep 17 00:00:00 2001
From: Junio C Hamano <junkio@cox.net>
Date: Thu, 23 Feb 2006 02:58:37 -0800
Subject: [PATCH 03/16] count-delta: fix counting of copied source.

The previous one wrongly coalesced a span with the next one
even though the span being added does not reach it.

Signed-off-by: Junio C Hamano <junkio@cox.net>
---
 count-delta.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/count-delta.c b/count-delta.c
index 4e4d2f4fcc..3ee3a0ccf1 100644
--- a/count-delta.c
+++ b/count-delta.c
@@ -26,7 +26,7 @@ static void touch_range(struct span **span,
 		again:
 		if (ofs < e->end) {
 			while (e->end < end) {
-				if (e->next) {
+				if (e->next && e->next->ofs <= end) {
 					e->end = e->next->ofs;
 					e = e->next;
 				}

From 581845f0b8ed97cb718fffe2bc9613b6186d84ee Mon Sep 17 00:00:00 2001
From: Junio C Hamano <junkio@cox.net>
Date: Thu, 23 Feb 2006 17:02:56 -0800
Subject: [PATCH 04/16] Tweak break/merge score to adjust to the new delta
 generation code.

This lowers the default merge threshold score to 75% from
earlier 80%.  The break threshold stays the same at 50% for now,
but we might want to revisit it (and the rename detection limit
as well).

 * break score: this much edit (both insertion of new material
   and deletion of old material) needs to be there in the file
   before we consider this _might_ be a rewrite and break the
   filepair.

 * merge score: after a filepair is broken by the above criteria
   and goes through rename detection, if their pieces did not
   match with other files as rename/copy, we merge them back
   into one as if nothing happened.  If the filepair had at
   least this much deletion of old material, however, we say
   this is completely rewritten with dissimilarity index X% when
   we do so.

The updated delta code by Nico is so good that what we earlier
thought to be a complete rewrite now reuses a lot more from the
source material (reducing the counted "delete"), so this
adjustment is needed to keep the perceived behaviour similar to
what we had earlier.

Signed-off-by: Junio C Hamano <junkio@cox.net>
---
 diffcore.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/diffcore.h b/diffcore.h
index 12cd816591..91d6c631e6 100644
--- a/diffcore.h
+++ b/diffcore.h
@@ -18,7 +18,7 @@
 #define MAX_SCORE 60000.0
 #define DEFAULT_RENAME_SCORE 30000 /* rename/copy similarity minimum (50%) */
 #define DEFAULT_BREAK_SCORE  30000 /* minimum for break to happen (50%)*/
-#define DEFAULT_MERGE_SCORE  48000 /* maximum for break-merge to happen (80%)*/
+#define DEFAULT_MERGE_SCORE  45000 /* maximum for break-merge to happen (75%)*/
 
 #define MINIMUM_BREAK_SIZE     400 /* do not break a file smaller than this */
 

From eae3fe5e509f3d3890bc99015cb02f9b67aa501c Mon Sep 17 00:00:00 2001
From: Junio C Hamano <junkio@cox.net>
Date: Fri, 24 Feb 2006 00:54:59 -0800
Subject: [PATCH 05/16] Revert "diff-delta: produce optimal pack data"

This reverts 6b7d25d97bdb8a26719f90d17ff5c9720be68762 commit.

It turns out that the new algorithm has a really bad corner
case that literally spends minutes on inputs that take less
than a quarter of a second to delta with the old algorithm.  The
resulting delta is 50% smaller which is admirable, but the
performance degradation is simply unacceptable for unconditional
use.

Some example cases are these blobs in Linux 2.6 repository:

    4917ec509720a42846d513addc11cbd25e0e3c4f
    9af06ba723df75fed49f7ccae5b6c9c34bc5115f
    dfc9cd58dc065d17030d875d3fea6e7862ede143

Signed-off-by: Junio C Hamano <junkio@cox.net>
---
 diff-delta.c | 77 ++++++++++++++++++++++++++++++++--------------------
 1 file changed, 47 insertions(+), 30 deletions(-)

diff --git a/diff-delta.c b/diff-delta.c
index 27f83a0858..2ed5984b1c 100644
--- a/diff-delta.c
+++ b/diff-delta.c
@@ -20,11 +20,21 @@
 
 #include <stdlib.h>
 #include <string.h>
+#include <zlib.h>
 #include "delta.h"
 
 
+/* block size: min = 16, max = 64k, power of 2 */
+#define BLK_SIZE 16
+
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+
+#define GR_PRIME 0x9e370001
+#define HASH(v, shift) (((unsigned int)(v) * GR_PRIME) >> (shift))
+
 struct index {
 	const unsigned char *ptr;
+	unsigned int val;
 	struct index *next;
 };
 
@@ -32,21 +42,21 @@ static struct index ** delta_index(const unsigned char *buf,
 				   unsigned long bufsize,
 				   unsigned int *hash_shift)
 {
-	unsigned long hsize;
-	unsigned int hshift, i;
+	unsigned int hsize, hshift, entries, blksize, i;
 	const unsigned char *data;
 	struct index *entry, **hash;
 	void *mem;
 
 	/* determine index hash size */
-	hsize = bufsize / 4;
-	for (i = 8; (1 << i) < hsize && i < 16; i++);
+	entries = (bufsize + BLK_SIZE - 1) / BLK_SIZE;
+	hsize = entries / 4;
+	for (i = 4; (1 << i) < hsize && i < 16; i++);
 	hsize = 1 << i;
-	hshift = i - 8;
+	hshift = 32 - i;
 	*hash_shift = hshift;
 
 	/* allocate lookup index */
-	mem = malloc(hsize * sizeof(*hash) + bufsize * sizeof(*entry));
+	mem = malloc(hsize * sizeof(*hash) + entries * sizeof(*entry));
 	if (!mem)
 		return NULL;
 	hash = mem;
@@ -54,12 +64,17 @@ static struct index ** delta_index(const unsigned char *buf,
 	memset(hash, 0, hsize * sizeof(*hash));
 
 	/* then populate it */
-	data = buf + bufsize - 2;
-	while (data > buf) {
-		entry->ptr = --data;
-		i = data[0] ^ data[1] ^ (data[2] << hshift);
+	data = buf + entries * BLK_SIZE - BLK_SIZE;
+	blksize = bufsize - (data - buf);
+	while (data >= buf) {
+		unsigned int val = adler32(0, data, blksize);
+		i = HASH(val, hshift);
+		entry->ptr = data;
+		entry->val = val;
 		entry->next = hash[i];
 		hash[i] = entry++;
+		blksize = BLK_SIZE;
+		data -= BLK_SIZE;
  	}
 
 	return hash;
@@ -126,27 +141,29 @@ void *diff_delta(void *from_buf, unsigned long from_size,
 
 	while (data < top) {
 		unsigned int moff = 0, msize = 0;
-		if (data + 2 < top) {
-			i = data[0] ^ data[1] ^ (data[2] << hash_shift);
-			for (entry = hash[i]; entry; entry = entry->next) {
-				const unsigned char *ref = entry->ptr;
-				const unsigned char *src = data;
-				unsigned int ref_size = ref_top - ref;
-				if (ref_size > top - src)
-					ref_size = top - src;
-				if (ref_size > 0x10000)
-					ref_size = 0x10000;
-				if (ref_size <= msize)
+		unsigned int blksize = MIN(top - data, BLK_SIZE);
+		unsigned int val = adler32(0, data, blksize);
+		i = HASH(val, hash_shift);
+		for (entry = hash[i]; entry; entry = entry->next) {
+			const unsigned char *ref = entry->ptr;
+			const unsigned char *src = data;
+			unsigned int ref_size = ref_top - ref;
+			if (entry->val != val)
+				continue;
+			if (ref_size > top - src)
+				ref_size = top - src;
+			while (ref_size && *src++ == *ref) {
+				ref++;
+				ref_size--;
+			}
+			ref_size = ref - entry->ptr;
+			if (ref_size > msize) {
+				/* this is our best match so far */
+				moff = entry->ptr - ref_data;
+				msize = ref_size;
+				if (msize >= 0x10000) {
+					msize = 0x10000;
 					break;
-				while (ref_size && *src++ == *ref) {
-					ref++;
-					ref_size--;
-				}
-				ref_size = ref - entry->ptr;
-				if (msize < ref - entry->ptr) {
-					/* this is our best match so far */
-					msize = ref - entry->ptr;
-					moff = entry->ptr - ref_data;
 				}
 			}
 		}

From bec2a69fe4c2dbb377d2742a4def7e3569b4c1d4 Mon Sep 17 00:00:00 2001
From: Junio C Hamano <junkio@cox.net>
Date: Mon, 27 Feb 2006 21:37:56 -0800
Subject: [PATCH 06/16] Revert "Revert "diff-delta: produce optimal pack data""

---
 diff-delta.c | 77 ++++++++++++++++++++--------------------------------
 1 file changed, 30 insertions(+), 47 deletions(-)

diff --git a/diff-delta.c b/diff-delta.c
index 2ed5984b1c..27f83a0858 100644
--- a/diff-delta.c
+++ b/diff-delta.c
@@ -20,21 +20,11 @@
 
 #include <stdlib.h>
 #include <string.h>
-#include <zlib.h>
 #include "delta.h"
 
 
-/* block size: min = 16, max = 64k, power of 2 */
-#define BLK_SIZE 16
-
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-
-#define GR_PRIME 0x9e370001
-#define HASH(v, shift) (((unsigned int)(v) * GR_PRIME) >> (shift))
-
 struct index {
 	const unsigned char *ptr;
-	unsigned int val;
 	struct index *next;
 };
 
@@ -42,21 +32,21 @@ static struct index ** delta_index(const unsigned char *buf,
 				   unsigned long bufsize,
 				   unsigned int *hash_shift)
 {
-	unsigned int hsize, hshift, entries, blksize, i;
+	unsigned long hsize;
+	unsigned int hshift, i;
 	const unsigned char *data;
 	struct index *entry, **hash;
 	void *mem;
 
 	/* determine index hash size */
-	entries = (bufsize + BLK_SIZE - 1) / BLK_SIZE;
-	hsize = entries / 4;
-	for (i = 4; (1 << i) < hsize && i < 16; i++);
+	hsize = bufsize / 4;
+	for (i = 8; (1 << i) < hsize && i < 16; i++);
 	hsize = 1 << i;
-	hshift = 32 - i;
+	hshift = i - 8;
 	*hash_shift = hshift;
 
 	/* allocate lookup index */
-	mem = malloc(hsize * sizeof(*hash) + entries * sizeof(*entry));
+	mem = malloc(hsize * sizeof(*hash) + bufsize * sizeof(*entry));
 	if (!mem)
 		return NULL;
 	hash = mem;
@@ -64,17 +54,12 @@ static struct index ** delta_index(const unsigned char *buf,
 	memset(hash, 0, hsize * sizeof(*hash));
 
 	/* then populate it */
-	data = buf + entries * BLK_SIZE - BLK_SIZE;
-	blksize = bufsize - (data - buf);
-	while (data >= buf) {
-		unsigned int val = adler32(0, data, blksize);
-		i = HASH(val, hshift);
-		entry->ptr = data;
-		entry->val = val;
+	data = buf + bufsize - 2;
+	while (data > buf) {
+		entry->ptr = --data;
+		i = data[0] ^ data[1] ^ (data[2] << hshift);
 		entry->next = hash[i];
 		hash[i] = entry++;
-		blksize = BLK_SIZE;
-		data -= BLK_SIZE;
  	}
 
 	return hash;
@@ -141,29 +126,27 @@ void *diff_delta(void *from_buf, unsigned long from_size,
 
 	while (data < top) {
 		unsigned int moff = 0, msize = 0;
-		unsigned int blksize = MIN(top - data, BLK_SIZE);
-		unsigned int val = adler32(0, data, blksize);
-		i = HASH(val, hash_shift);
-		for (entry = hash[i]; entry; entry = entry->next) {
-			const unsigned char *ref = entry->ptr;
-			const unsigned char *src = data;
-			unsigned int ref_size = ref_top - ref;
-			if (entry->val != val)
-				continue;
-			if (ref_size > top - src)
-				ref_size = top - src;
-			while (ref_size && *src++ == *ref) {
-				ref++;
-				ref_size--;
-			}
-			ref_size = ref - entry->ptr;
-			if (ref_size > msize) {
-				/* this is our best match so far */
-				moff = entry->ptr - ref_data;
-				msize = ref_size;
-				if (msize >= 0x10000) {
-					msize = 0x10000;
+		if (data + 2 < top) {
+			i = data[0] ^ data[1] ^ (data[2] << hash_shift);
+			for (entry = hash[i]; entry; entry = entry->next) {
+				const unsigned char *ref = entry->ptr;
+				const unsigned char *src = data;
+				unsigned int ref_size = ref_top - ref;
+				if (ref_size > top - src)
+					ref_size = top - src;
+				if (ref_size > 0x10000)
+					ref_size = 0x10000;
+				if (ref_size <= msize)
 					break;
+				while (ref_size && *src++ == *ref) {
+					ref++;
+					ref_size--;
+				}
+				ref_size = ref - entry->ptr;
+				if (msize < ref - entry->ptr) {
+					/* this is our best match so far */
+					msize = ref - entry->ptr;
+					moff = entry->ptr - ref_data;
 				}
 			}
 		}

From 2b8d9347aa1a11f1ac13591f89ca9f984d467c77 Mon Sep 17 00:00:00 2001
From: Nicolas Pitre <nico@cam.org>
Date: Mon, 27 Feb 2006 23:09:55 -0500
Subject: [PATCH 07/16] diff-delta: bound hash list length to avoid O(m*n)
 behavior

The diff-delta code can exhibit O(m*n) behavior with some pathological
data sets where most hash entries end up in the same hash bucket.

The latest code rework reduced the block size making it particularly
vulnerable to this issue, but the issue was always there and can be
triggered regardless of the block size.

This patch does two things:

1) the hashing has been reworked to offer a better distribution to
   attenuate the problem a bit, and

2) a limit is imposed to the number of entries that can exist in the
   same hash bucket.

Because of the above the code is a bit more expensive on average, but
the problematic samples used to diagnose the issue are now orders of
magnitude less expensive to process with only a slight loss in
compression.

Signed-off-by: Nicolas Pitre <nico@cam.org>
Signed-off-by: Junio C Hamano <junkio@cox.net>
---
 diff-delta.c | 69 ++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 56 insertions(+), 13 deletions(-)

diff --git a/diff-delta.c b/diff-delta.c
index 27f83a0858..0730b24df8 100644
--- a/diff-delta.c
+++ b/diff-delta.c
@@ -30,19 +30,20 @@ struct index {
 
 static struct index ** delta_index(const unsigned char *buf,
 				   unsigned long bufsize,
+				   unsigned long trg_bufsize,
 				   unsigned int *hash_shift)
 {
 	unsigned long hsize;
-	unsigned int hshift, i;
+	unsigned int i, hshift, hlimit, *hash_count;
 	const unsigned char *data;
 	struct index *entry, **hash;
 	void *mem;
 
 	/* determine index hash size */
 	hsize = bufsize / 4;
-	for (i = 8; (1 << i) < hsize && i < 16; i++);
+	for (i = 8; (1 << i) < hsize && i < 24; i += 2);
 	hsize = 1 << i;
-	hshift = i - 8;
+	hshift = (i - 8) / 2;
 	*hash_shift = hshift;
 
 	/* allocate lookup index */
@@ -53,15 +54,59 @@ static struct index ** delta_index(const unsigned char *buf,
 	entry = mem + hsize * sizeof(*hash);
 	memset(hash, 0, hsize * sizeof(*hash));
 
-	/* then populate it */
+	/* allocate an array to count hash entries */
+	hash_count = calloc(hsize, sizeof(*hash_count));
+	if (!hash_count) {
+		free(hash);
+		return NULL;
+	}
+
+	/* then populate the index */
 	data = buf + bufsize - 2;
 	while (data > buf) {
 		entry->ptr = --data;
-		i = data[0] ^ data[1] ^ (data[2] << hshift);
+		i = data[0] ^ ((data[1] ^ (data[2] << hshift)) << hshift);
 		entry->next = hash[i];
 		hash[i] = entry++;
+		hash_count[i]++;
  	}
 
+	/*
+	 * Determine a limit on the number of entries in the same hash
+	 * bucket.  This guard us against patological data sets causing
+	 * really bad hash distribution with most entries in the same hash
+	 * bucket that would bring us to O(m*n) computing costs (m and n
+	 * corresponding to reference and target buffer sizes).
+	 *
+	 * The more the target buffer is large, the more it is important to
+	 * have small entry lists for each hash buckets.  With such a limit
+	 * the cost is bounded to something more like O(m+n).
+	 */
+	hlimit = (1 << 26) / trg_bufsize;
+	if (hlimit < 16)
+		hlimit = 16;
+
+	/*
+	 * Now make sure none of the hash buckets has more entries than
+	 * we're willing to test.  Otherwise we short-circuit the entry
+	 * list uniformly to still preserve a good repartition across
+	 * the reference buffer.
+	 */
+	for (i = 0; i < hsize; i++) {
+		if (hash_count[i] < hlimit)
+			continue;
+		entry = hash[i];
+		do {
+			struct index *keep = entry;
+			int skip = hash_count[i] / hlimit / 2;
+			do {
+				entry = entry->next;
+			} while(--skip && entry);
+			keep->next = entry;
+		} while(entry);
+	}
+	free(hash_count);
+
 	return hash;
 }
 
@@ -85,7 +130,7 @@ void *diff_delta(void *from_buf, unsigned long from_size,
 
 	if (!from_size || !to_size)
 		return NULL;
-	hash = delta_index(from_buf, from_size, &hash_shift);
+	hash = delta_index(from_buf, from_size, to_size, &hash_shift);
 	if (!hash)
 		return NULL;
 
@@ -126,8 +171,8 @@ void *diff_delta(void *from_buf, unsigned long from_size,
 
 	while (data < top) {
 		unsigned int moff = 0, msize = 0;
-		if (data + 2 < top) {
-			i = data[0] ^ data[1] ^ (data[2] << hash_shift);
+		if (data + 3 <= top) {
+			i = data[0] ^ ((data[1] ^ (data[2] << hash_shift)) << hash_shift);
 			for (entry = hash[i]; entry; entry = entry->next) {
 				const unsigned char *ref = entry->ptr;
 				const unsigned char *src = data;
@@ -138,11 +183,9 @@ void *diff_delta(void *from_buf, unsigned long from_size,
 					ref_size = 0x10000;
 				if (ref_size <= msize)
 					break;
-				while (ref_size && *src++ == *ref) {
-					ref++;
-					ref_size--;
-				}
-				ref_size = ref - entry->ptr;
+				if (*ref != *src)
+					continue;
+				while (ref_size-- && *++src == *++ref);
 				if (msize < ref - entry->ptr) {
 					/* this is our best match so far */
 					msize = ref - entry->ptr;

From 49cc27bb46dc5dddc59c9802c1eed1ee6126537e Mon Sep 17 00:00:00 2001
From: Martin Langhoff <martin@catalyst.net.nz>
Date: Fri, 24 Feb 2006 12:20:51 +1300
Subject: [PATCH 08/16] cvsserver: add notes on how to get a checkout under
 Eclipse

---
 Documentation/git-cvsserver.txt | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/Documentation/git-cvsserver.txt b/Documentation/git-cvsserver.txt
index 88f07ff15d..fcc49b29c9 100644
--- a/Documentation/git-cvsserver.txt
+++ b/Documentation/git-cvsserver.txt
@@ -54,6 +54,30 @@ INSTALLATION
    of branches in git).
      $ cvs co -d mylocaldir master
 
+Eclipse CVS Client Notes
+------------------------
+
+To get a checkout with the Eclipse CVS client:
+
+1. Create a new project from CVS checkout, giving it repository and module 
+2. Context Menu->Team->Share Project...
+3. Enter the repository and module information again and click Finish
+4. The Synchronize view appears. Untick  "launch commit wizard" to avoid 
+committing the .project file, and select HEAD as the tag to synchronize to. 
+Update all incoming changes.
+
+Note that most versions of Eclipse ignore CVS_SERVER (which you can set in 
+the Preferences->Team->CVS->ExtConnection pane), so you may have to 
+rename, alias or symlink git-cvsserver to 'cvs' on the server.
+
+Clients known to work
+---------------------
+
+CVS 1.12.9 on Debian
+CVS 1.11.17 on MacOSX (from Fink package)
+Eclipse 3.0, 3.1.2 on MacOSX (see Eclipse CVS Client Notes)
+TortoiseCVS
+
 Operations supported
 --------------------
 

From 5793aa1cc08df7c4021509556637ade2d639083f Mon Sep 17 00:00:00 2001
From: Martin Langhoff <martin@catalyst.net.nz>
Date: Wed, 1 Mar 2006 19:30:35 +1300
Subject: [PATCH 09/16] cvsserver: Eclipse compat fixes - implement
 Questionable, alias rlog, add a space after the U

A few things to satisfy Eclipse's strange habits as a cvs client:

- Implement Questionable
- Aliased rlog to log, but more work may be needed
- Add a space after the U that indicates updated
---
 git-cvsserver.perl | 24 +++++++++++++++++++++---
 1 file changed, 21 insertions(+), 3 deletions(-)

diff --git a/git-cvsserver.perl b/git-cvsserver.perl
index d20d1a8c4b..33fcc1156b 100755
--- a/git-cvsserver.perl
+++ b/git-cvsserver.perl
@@ -53,6 +53,7 @@
     'Entry'           => \&req_Entry,
     'Modified'        => \&req_Modified,
     'Unchanged'       => \&req_Unchanged,
+    'Questionable'    => \&req_Questionable,
     'Argument'        => \&req_Argument,
     'Argumentx'       => \&req_Argument,
     'expand-modules'  => \&req_expandmodules,
@@ -63,6 +64,7 @@
     'ci'              => \&req_ci,
     'diff'            => \&req_diff,
     'log'             => \&req_log,
+    'rlog'            => \&req_log,
     'tag'             => \&req_CATCHALL,
     'status'          => \&req_status,
     'admin'           => \&req_CATCHALL,
@@ -459,6 +461,22 @@ sub req_Unchanged
     #$log->debug("req_Unchanged : $data");
 }
 
+# Questionable filename \n
+#     Response expected: no. Additional data: no. 
+#     Tell the server to check whether filename should be ignored,
+#     and if not, next time the server sends responses, send (in
+#     a M response) `?' followed by the directory and filename.
+#     filename must not contain `/'; it needs to be a file in the
+#     directory named by the most recent Directory request.
+sub req_Questionable
+{
+    my ( $cmd, $data ) = @_;
+
+    $state->{entries}{$state->{directory}.$data}{questionable} = 1;
+
+    #$log->debug("req_Questionable : $data");
+}
+
 # Argument text \n
 #     Response expected: no. Save argument for use in a subsequent command.
 #     Arguments accumulate until an argument-using command is given, at which
@@ -568,7 +586,7 @@ sub req_co
 
         # print some information to the client
         print "MT +updated\n";
-        print "MT text U\n";
+        print "MT text U \n";
         if ( defined ( $git->{dir} ) and $git->{dir} ne "./" )
         {
             print "MT fname $checkout_path/$git->{dir}$git->{name}\n";
@@ -579,9 +597,9 @@ sub req_co
         print "MT -updated\n";
 
         # instruct client we're sending a file to put in this path
-        print "Created $checkout_path/" . ( defined ( $git->{dir} ) ? $git->{dir} . "/" : "" ) . "\n";
+        print "Created $checkout_path/" . ( defined ( $git->{dir} ) and $git->{dir} ne "./" ? $git->{dir} . "/" : "" ) . "\n";
 
-        print $state->{CVSROOT} . "/$module/" . ( defined ( $git->{dir} ) ? $git->{dir} . "/" : "" ) . "$git->{name}\n";
+        print $state->{CVSROOT} . "/$module/" . ( defined ( $git->{dir} ) and $git->{dir} ne "./" ? $git->{dir} . "/" : "" ) . "$git->{name}\n";
 
         # this is an "entries" line
         print "/$git->{name}/1.$git->{revision}///\n";

From f0bcd511ee3a00b7fd3975a386aa1165c07a0721 Mon Sep 17 00:00:00 2001
From: Martin Langhoff <martin@catalyst.net.nz>
Date: Wed, 1 Mar 2006 20:03:58 +1300
Subject: [PATCH 10/16] cvsserver: Eclipse compat - browsing 'modules' (heads
 in our case) works

Eclipse CVS clients have an odd way of perusing the top level of
the repository, by calling update on module "". So reproduce cvs'
odd behaviour in the interest of compatibility.

It makes it much easier to get a checkout when using Eclipse.
---
 git-cvsserver.perl | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/git-cvsserver.perl b/git-cvsserver.perl
index 33fcc1156b..20a9baeb71 100755
--- a/git-cvsserver.perl
+++ b/git-cvsserver.perl
@@ -630,6 +630,26 @@ sub req_update
 
     argsplit("update");
 
+    #
+    # It may just be a client exploring the available heads/modukles
+    # in that case, list them as top level directories and leave it
+    # at that. Eclipse uses this technique to offer you a list of
+    # projects (heads in this case) to checkout.
+    #
+    if ($state->{module} eq '') {
+        print "E cvs update: Updating .\n";
+	opendir HEADS, $state->{CVSROOT} . '/refs/heads';
+	while (my $head = readdir(HEADS)) {
+	    if (-f $state->{CVSROOT} . '/refs/heads/' . $head) {
+	        print "E cvs update: New directory `$head'\n";
+	    }
+	}
+	closedir HEADS;
+	print "ok\n";
+	return 1;
+    }
+
+
     # Grab a handle to the SQLite db and do any necessary updates
     my $updater = GITCVS::updater->new($state->{CVSROOT}, $state->{module}, $log);
 

From c436eb8cf1efa3fe2c70100ae0cbc48f0feaf5af Mon Sep 17 00:00:00 2001
From: Junio C Hamano <junkio@cox.net>
Date: Mon, 27 Feb 2006 23:38:50 -0800
Subject: [PATCH 11/16] diff-delta: cull collided hash bucket more
 aggressively.

This tries to limit collided hash buckets by removing entries that
share an identical three-byte prefix from the same hash bucket.
---
 diff-delta.c | 37 +++++++++++++++++++++++++------------
 1 file changed, 25 insertions(+), 12 deletions(-)

diff --git a/diff-delta.c b/diff-delta.c
index 0730b24df8..b7190ea47a 100644
--- a/diff-delta.c
+++ b/diff-delta.c
@@ -88,22 +88,35 @@ static struct index ** delta_index(const unsigned char *buf,
 
 	/*
 	 * Now make sure none of the hash buckets has more entries than
-	 * we're willing to test.  Otherwise we short-circuit the entry
-	 * list uniformly to still preserve a good repartition across
-	 * the reference buffer.
+	 * we're willing to test.  Otherwise we cull the entry list to
+	 * limit identical three byte prefixes to still preserve a good
+	 * repartition across the reference buffer.
 	 */
 	for (i = 0; i < hsize; i++) {
+		struct index **list, *bucket, *remaining;
+		int cnt;
 		if (hash_count[i] < hlimit)
 			continue;
-		entry = hash[i];
-		do {
-			struct index *keep = entry;
-			int skip = hash_count[i] / hlimit / 2;
-			do {
-				entry = entry->next;
-			} while(--skip && entry);
-			keep->next = entry;
-		} while(entry);
+
+		bucket = NULL;
+		list = &bucket;
+		remaining = hash[i];
+		cnt = 0;
+		while (cnt < hlimit && remaining) {
+			struct index *this = remaining, *that;
+			remaining = remaining->next;
+			for (that = bucket; that; that = that->next) {
+				if (!memcmp(that->ptr, this->ptr, 3))
+					break;
+			}
+			if (that)
+				continue; /* discard */
+			cnt++;
+			*list = this;
+			list = &(this->next);
+			this->next = NULL;
+		}
+		hash[i] = bucket;
 	}
 	free(hash_count);
 

From cc5c59a30ccba8b9eac503271661af9b95edb0af Mon Sep 17 00:00:00 2001
From: Nicolas Pitre <nico@cam.org>
Date: Tue, 21 Feb 2006 20:45:36 -0500
Subject: [PATCH 12/16] diff-delta: produce optimal pack data

Indexing based on adler32 has a match precision based on the block size
(currently 16).  Lowering the block size would produce smaller deltas
but the indexing memory and computing cost increases significantly.

For optimal delta result the indexing block size should be 3 with an
increment of 1 (instead of 16 and 16).  With such low params the adler32
becomes a clear overhead increasing the time for git-repack by a factor
of 3.  And with such small blocks the adler32 is not very useful as the
whole of the block bits can be used directly.

This patch replaces the adler32 with an open coded index value based on
3 characters directly.  This gives sufficient bits for hashing and
allows for optimal delta with reasonable CPU cycles.

The resulting packs are 6% smaller on average.  The increase in CPU time
is about 25%.  But this cost is now hidden by the delta reuse patch
while the saving on data transfers is always there.

Signed-off-by: Nicolas Pitre <nico@cam.org>
Signed-off-by: Junio C Hamano <junkio@cox.net>
---
 diff-delta.c | 77 ++++++++++++++++++++--------------------------------
 1 file changed, 30 insertions(+), 47 deletions(-)

diff --git a/diff-delta.c b/diff-delta.c
index 2ed5984b1c..27f83a0858 100644
--- a/diff-delta.c
+++ b/diff-delta.c
@@ -20,21 +20,11 @@
 
 #include <stdlib.h>
 #include <string.h>
-#include <zlib.h>
 #include "delta.h"
 
 
-/* block size: min = 16, max = 64k, power of 2 */
-#define BLK_SIZE 16
-
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-
-#define GR_PRIME 0x9e370001
-#define HASH(v, shift) (((unsigned int)(v) * GR_PRIME) >> (shift))
-
 struct index {
 	const unsigned char *ptr;
-	unsigned int val;
 	struct index *next;
 };
 
@@ -42,21 +32,21 @@ static struct index ** delta_index(const unsigned char *buf,
 				   unsigned long bufsize,
 				   unsigned int *hash_shift)
 {
-	unsigned int hsize, hshift, entries, blksize, i;
+	unsigned long hsize;
+	unsigned int hshift, i;
 	const unsigned char *data;
 	struct index *entry, **hash;
 	void *mem;
 
 	/* determine index hash size */
-	entries = (bufsize + BLK_SIZE - 1) / BLK_SIZE;
-	hsize = entries / 4;
-	for (i = 4; (1 << i) < hsize && i < 16; i++);
+	hsize = bufsize / 4;
+	for (i = 8; (1 << i) < hsize && i < 16; i++);
 	hsize = 1 << i;
-	hshift = 32 - i;
+	hshift = i - 8;
 	*hash_shift = hshift;
 
 	/* allocate lookup index */
-	mem = malloc(hsize * sizeof(*hash) + entries * sizeof(*entry));
+	mem = malloc(hsize * sizeof(*hash) + bufsize * sizeof(*entry));
 	if (!mem)
 		return NULL;
 	hash = mem;
@@ -64,17 +54,12 @@ static struct index ** delta_index(const unsigned char *buf,
 	memset(hash, 0, hsize * sizeof(*hash));
 
 	/* then populate it */
-	data = buf + entries * BLK_SIZE - BLK_SIZE;
-	blksize = bufsize - (data - buf);
-	while (data >= buf) {
-		unsigned int val = adler32(0, data, blksize);
-		i = HASH(val, hshift);
-		entry->ptr = data;
-		entry->val = val;
+	data = buf + bufsize - 2;
+	while (data > buf) {
+		entry->ptr = --data;
+		i = data[0] ^ data[1] ^ (data[2] << hshift);
 		entry->next = hash[i];
 		hash[i] = entry++;
-		blksize = BLK_SIZE;
-		data -= BLK_SIZE;
  	}
 
 	return hash;
@@ -141,29 +126,27 @@ void *diff_delta(void *from_buf, unsigned long from_size,
 
 	while (data < top) {
 		unsigned int moff = 0, msize = 0;
-		unsigned int blksize = MIN(top - data, BLK_SIZE);
-		unsigned int val = adler32(0, data, blksize);
-		i = HASH(val, hash_shift);
-		for (entry = hash[i]; entry; entry = entry->next) {
-			const unsigned char *ref = entry->ptr;
-			const unsigned char *src = data;
-			unsigned int ref_size = ref_top - ref;
-			if (entry->val != val)
-				continue;
-			if (ref_size > top - src)
-				ref_size = top - src;
-			while (ref_size && *src++ == *ref) {
-				ref++;
-				ref_size--;
-			}
-			ref_size = ref - entry->ptr;
-			if (ref_size > msize) {
-				/* this is our best match so far */
-				moff = entry->ptr - ref_data;
-				msize = ref_size;
-				if (msize >= 0x10000) {
-					msize = 0x10000;
+		if (data + 2 < top) {
+			i = data[0] ^ data[1] ^ (data[2] << hash_shift);
+			for (entry = hash[i]; entry; entry = entry->next) {
+				const unsigned char *ref = entry->ptr;
+				const unsigned char *src = data;
+				unsigned int ref_size = ref_top - ref;
+				if (ref_size > top - src)
+					ref_size = top - src;
+				if (ref_size > 0x10000)
+					ref_size = 0x10000;
+				if (ref_size <= msize)
 					break;
+				while (ref_size && *src++ == *ref) {
+					ref++;
+					ref_size--;
+				}
+				ref_size = ref - entry->ptr;
+				if (msize < ref - entry->ptr) {
+					/* this is our best match so far */
+					msize = ref - entry->ptr;
+					moff = entry->ptr - ref_data;
 				}
 			}
 		}

From 5bb86b82ba18dd2eb736c4f5565f9c920f815b8f Mon Sep 17 00:00:00 2001
From: Nicolas Pitre <nico@cam.org>
Date: Mon, 27 Feb 2006 23:09:55 -0500
Subject: [PATCH 13/16] diff-delta: bound hash list length to avoid O(m*n)
 behavior

The diff-delta code can exhibit O(m*n) behavior with some pathological
data set where most hash entries end up in the same hash bucket.

The latest code rework reduced the block size making it particularly
vulnerable to this issue, but the issue was always there and can be
triggered regardless of the block size.

This patch does two things:

1) the hashing has been reworked to offer a better distribution to
   attenuate the problem a bit, and

2) a limit is imposed to the number of entries that can exist in the
   same hash bucket.

Because of the above the code is a bit more expensive on average, but
the problematic samples used to diagnose the issue are now orders of
magnitude less expensive to process with only a slight loss in
compression.

Signed-off-by: Nicolas Pitre <nico@cam.org>
Signed-off-by: Junio C Hamano <junkio@cox.net>
---
 diff-delta.c | 69 ++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 56 insertions(+), 13 deletions(-)

diff --git a/diff-delta.c b/diff-delta.c
index 27f83a0858..0730b24df8 100644
--- a/diff-delta.c
+++ b/diff-delta.c
@@ -30,19 +30,20 @@ struct index {
 
 static struct index ** delta_index(const unsigned char *buf,
 				   unsigned long bufsize,
+				   unsigned long trg_bufsize,
 				   unsigned int *hash_shift)
 {
 	unsigned long hsize;
-	unsigned int hshift, i;
+	unsigned int i, hshift, hlimit, *hash_count;
 	const unsigned char *data;
 	struct index *entry, **hash;
 	void *mem;
 
 	/* determine index hash size */
 	hsize = bufsize / 4;
-	for (i = 8; (1 << i) < hsize && i < 16; i++);
+	for (i = 8; (1 << i) < hsize && i < 24; i += 2);
 	hsize = 1 << i;
-	hshift = i - 8;
+	hshift = (i - 8) / 2;
 	*hash_shift = hshift;
 
 	/* allocate lookup index */
@@ -53,15 +54,59 @@ static struct index ** delta_index(const unsigned char *buf,
 	entry = mem + hsize * sizeof(*hash);
 	memset(hash, 0, hsize * sizeof(*hash));
 
-	/* then populate it */
+	/* allocate an array to count hash entries */
+	hash_count = calloc(hsize, sizeof(*hash_count));
+	if (!hash_count) {
+		free(hash);
+		return NULL;
+	}
+
+	/* then populate the index */
 	data = buf + bufsize - 2;
 	while (data > buf) {
 		entry->ptr = --data;
-		i = data[0] ^ data[1] ^ (data[2] << hshift);
+		i = data[0] ^ ((data[1] ^ (data[2] << hshift)) << hshift);
 		entry->next = hash[i];
 		hash[i] = entry++;
+		hash_count[i]++;
  	}
 
+	/*
+	 * Determine a limit on the number of entries in the same hash
+	 * bucket.  This guards us against pathological data sets causing
+	 * really bad hash distribution with most entries in the same hash
+	 * bucket that would bring us to O(m*n) computing costs (m and n
+	 * corresponding to reference and target buffer sizes).
+	 *
+	 * The more the target buffer is large, the more it is important to
+	 * have small entry lists for each hash buckets.  With such a limit
+	 * the cost is bounded to something more like O(m+n).
+	 */
+	hlimit = (1 << 26) / trg_bufsize;
+	if (hlimit < 16)
+		hlimit = 16;
+
+	/*
+	 * Now make sure none of the hash buckets has more entries than
+	 * we're willing to test.  Otherwise we short-circuit the entry
+	 * list uniformly to still preserve a good repartition across
+	 * the reference buffer.
+	 */
+	for (i = 0; i < hsize; i++) {
+		if (hash_count[i] < hlimit)
+			continue;
+		entry = hash[i];
+		do {
+			struct index *keep = entry;
+			int skip = hash_count[i] / hlimit / 2;
+			do {
+				entry = entry->next;
+			} while(--skip && entry);
+			keep->next = entry;
+		} while(entry);
+	}
+	free(hash_count);
+
 	return hash;
 }
 
@@ -85,7 +130,7 @@ void *diff_delta(void *from_buf, unsigned long from_size,
 
 	if (!from_size || !to_size)
 		return NULL;
-	hash = delta_index(from_buf, from_size, &hash_shift);
+	hash = delta_index(from_buf, from_size, to_size, &hash_shift);
 	if (!hash)
 		return NULL;
 
@@ -126,8 +171,8 @@ void *diff_delta(void *from_buf, unsigned long from_size,
 
 	while (data < top) {
 		unsigned int moff = 0, msize = 0;
-		if (data + 2 < top) {
-			i = data[0] ^ data[1] ^ (data[2] << hash_shift);
+		if (data + 3 <= top) {
+			i = data[0] ^ ((data[1] ^ (data[2] << hash_shift)) << hash_shift);
 			for (entry = hash[i]; entry; entry = entry->next) {
 				const unsigned char *ref = entry->ptr;
 				const unsigned char *src = data;
@@ -138,11 +183,9 @@ void *diff_delta(void *from_buf, unsigned long from_size,
 					ref_size = 0x10000;
 				if (ref_size <= msize)
 					break;
-				while (ref_size && *src++ == *ref) {
-					ref++;
-					ref_size--;
-				}
-				ref_size = ref - entry->ptr;
+				if (*ref != *src)
+					continue;
+				while (ref_size-- && *++src == *++ref);
 				if (msize < ref - entry->ptr) {
 					/* this is our best match so far */
 					msize = ref - entry->ptr;

From 38fd0721d0a2a1a723bc28fc0817e3571987b1ef Mon Sep 17 00:00:00 2001
From: Nicolas Pitre <nico@cam.org>
Date: Mon, 27 Feb 2006 23:38:28 -0500
Subject: [PATCH 14/16] diff-delta: allow reusing of the reference buffer index

When a reference buffer is used multiple times then its index can be
computed only once and reused multiple times.  This patch adds an extra
pointer to a pointer argument (from_index) to diff_delta() for this.

If from_index is NULL then everything is like before.

If from_index is non NULL and *from_index is NULL then the index is
created and its location stored to *from_index.  In this case the caller
has the responsibility to free the memory pointed to by *from_index.

If from_index and *from_index are non NULL then the index is reused as
is.

This currently saves about 10% of CPU time to repack the git archive.

Signed-off-by: Nicolas Pitre <nico@cam.org>
Signed-off-by: Junio C Hamano <junkio@cox.net>
---
 delta.h           |  3 ++-
 diff-delta.c      | 41 +++++++++++++++++++++++++++--------------
 diffcore-break.c  |  2 +-
 diffcore-rename.c |  2 +-
 pack-objects.c    | 11 ++++++++---
 test-delta.c      |  2 +-
 6 files changed, 40 insertions(+), 21 deletions(-)

diff --git a/delta.h b/delta.h
index a15350dabc..00fef0b8d7 100644
--- a/delta.h
+++ b/delta.h
@@ -4,7 +4,8 @@
 /* handling of delta buffers */
 extern void *diff_delta(void *from_buf, unsigned long from_size,
 			void *to_buf, unsigned long to_size,
-		        unsigned long *delta_size, unsigned long max_size);
+		        unsigned long *delta_size, unsigned long max_size,
+			void **from_index);
 extern void *patch_delta(void *src_buf, unsigned long src_size,
 			 void *delta_buf, unsigned long delta_size,
 			 unsigned long *dst_size);
diff --git a/diff-delta.c b/diff-delta.c
index 0730b24df8..dcd3f5572e 100644
--- a/diff-delta.c
+++ b/diff-delta.c
@@ -30,8 +30,7 @@ struct index {
 
 static struct index ** delta_index(const unsigned char *buf,
 				   unsigned long bufsize,
-				   unsigned long trg_bufsize,
-				   unsigned int *hash_shift)
+				   unsigned long trg_bufsize)
 {
 	unsigned long hsize;
 	unsigned int i, hshift, hlimit, *hash_count;
@@ -44,14 +43,17 @@ static struct index ** delta_index(const unsigned char *buf,
 	for (i = 8; (1 << i) < hsize && i < 24; i += 2);
 	hsize = 1 << i;
 	hshift = (i - 8) / 2;
-	*hash_shift = hshift;
 
-	/* allocate lookup index */
-	mem = malloc(hsize * sizeof(*hash) + bufsize * sizeof(*entry));
+	/*
+	 * Allocate lookup index.  Note the first hash pointer
+	 * is used to store the hash shift value.
+	 */
+	mem = malloc((1 + hsize) * sizeof(*hash) + bufsize * sizeof(*entry));
 	if (!mem)
 		return NULL;
 	hash = mem;
-	entry = mem + hsize * sizeof(*hash);
+	*hash++ = (void *)hshift;
+	entry = mem + (1 + hsize) * sizeof(*hash);
 	memset(hash, 0, hsize * sizeof(*hash));
 
 	/* allocate an array to count hash entries */
@@ -107,7 +109,7 @@ static struct index ** delta_index(const unsigned char *buf,
 	}
 	free(hash_count);
 
-	return hash;
+	return hash-1;
 }
 
 /* provide the size of the copy opcode given the block offset and size */
@@ -121,7 +123,8 @@ static struct index ** delta_index(const unsigned char *buf,
 void *diff_delta(void *from_buf, unsigned long from_size,
 		 void *to_buf, unsigned long to_size,
 		 unsigned long *delta_size,
-		 unsigned long max_size)
+		 unsigned long max_size,
+		 void **from_index)
 {
 	unsigned int i, outpos, outsize, inscnt, hash_shift;
 	const unsigned char *ref_data, *ref_top, *data, *top;
@@ -130,9 +133,16 @@ void *diff_delta(void *from_buf, unsigned long from_size,
 
 	if (!from_size || !to_size)
 		return NULL;
-	hash = delta_index(from_buf, from_size, to_size, &hash_shift);
-	if (!hash)
-		return NULL;
+	if (from_index && *from_index) {
+		hash = *from_index;
+	} else {
+		hash = delta_index(from_buf, from_size, to_size);
+		if (!hash)
+			return NULL;
+		if (from_index)
+			*from_index = hash;
+	}
+	hash_shift = (unsigned int)(*hash++);
 
 	outpos = 0;
 	outsize = 8192;
@@ -140,7 +150,8 @@ void *diff_delta(void *from_buf, unsigned long from_size,
 		outsize = max_size + MAX_OP_SIZE + 1;
 	out = malloc(outsize);
 	if (!out) {
-		free(hash);
+		if (!from_index)
+			free(hash-1);
 		return NULL;
 	}
 
@@ -241,7 +252,8 @@ void *diff_delta(void *from_buf, unsigned long from_size,
 				out = realloc(out, outsize);
 			if (!out) {
 				free(tmp);
-				free(hash);
+				if (!from_index)
+					free(hash-1);
 				return NULL;
 			}
 		}
@@ -250,7 +262,8 @@ void *diff_delta(void *from_buf, unsigned long from_size,
 	if (inscnt)
 		out[outpos - inscnt - 1] = inscnt;
 
-	free(hash);
+	if (!from_index)
+		free(hash-1);
 	*delta_size = outpos;
 	return out;
 }
diff --git a/diffcore-break.c b/diffcore-break.c
index 95b5eb492e..34f1ed0731 100644
--- a/diffcore-break.c
+++ b/diffcore-break.c
@@ -71,7 +71,7 @@ static int should_break(struct diff_filespec *src,
 
 	delta = diff_delta(src->data, src->size,
 			   dst->data, dst->size,
-			   &delta_size, 0);
+			   &delta_size, 0, NULL);
 	if (!delta)
 		return 0; /* error but caught downstream */
 
diff --git a/diffcore-rename.c b/diffcore-rename.c
index ffd126af0d..099d2a2367 100644
--- a/diffcore-rename.c
+++ b/diffcore-rename.c
@@ -168,7 +168,7 @@ static int estimate_similarity(struct diff_filespec *src,
 	delta_limit = base_size * (MAX_SCORE-minimum_score) / MAX_SCORE;
 	delta = diff_delta(src->data, src->size,
 			   dst->data, dst->size,
-			   &delta_size, delta_limit);
+			   &delta_size, delta_limit, NULL);
 	if (!delta)
 		/* If delta_limit is exceeded, we have too much differences */
 		return 0;
diff --git a/pack-objects.c b/pack-objects.c
index 136a7f5aad..d6a3463604 100644
--- a/pack-objects.c
+++ b/pack-objects.c
@@ -204,7 +204,7 @@ static void *delta_against(void *buf, unsigned long size, struct object_entry *e
 	if (!otherbuf)
 		die("unable to read %s", sha1_to_hex(entry->delta->sha1));
         delta_buf = diff_delta(otherbuf, othersize,
-			       buf, size, &delta_size, 0);
+			       buf, size, &delta_size, 0, NULL);
         if (!delta_buf || delta_size != entry->delta_size)
         	die("delta size changed");
         free(buf);
@@ -810,6 +810,7 @@ static int type_size_sort(const struct object_entry *a, const struct object_entr
 struct unpacked {
 	struct object_entry *entry;
 	void *data;
+	void **delta_index;
 };
 
 /*
@@ -891,7 +892,8 @@ static int try_delta(struct unpacked *cur, struct unpacked *old, unsigned max_de
 	if (sizediff >= max_size)
 		return -1;
 	delta_buf = diff_delta(old->data, oldsize,
-			       cur->data, size, &delta_size, max_size);
+			       cur->data, size, &delta_size,
+			       max_size, old->delta_index);
 	if (!delta_buf)
 		return 0;
 	cur_entry->delta = old_entry;
@@ -948,6 +950,7 @@ static void find_deltas(struct object_entry **list, int window, int depth)
 			 */
 			continue;
 
+		free(n->delta_index);
 		free(n->data);
 		n->entry = entry;
 		n->data = read_sha1_file(entry->sha1, type, &size);
@@ -974,8 +977,10 @@ static void find_deltas(struct object_entry **list, int window, int depth)
 	if (progress)
 		fputc('\n', stderr);
 
-	for (i = 0; i < window; ++i)
+	for (i = 0; i < window; ++i) {
+		free(array[i].delta_index);
 		free(array[i].data);
+	}
 	free(array);
 }
 
diff --git a/test-delta.c b/test-delta.c
index 1be8ee0c72..89eb68ed21 100644
--- a/test-delta.c
+++ b/test-delta.c
@@ -63,7 +63,7 @@ int main(int argc, char *argv[])
 	if (argv[1][1] == 'd')
 		out_buf = diff_delta(from_buf, from_size,
 				     data_buf, data_size,
-				     &out_size, 0);
+				     &out_size, 0, NULL);
 	else
 		out_buf = patch_delta(from_buf, from_size,
 				      data_buf, data_size,

From 5398fed96699b5321678fdace86b6d4c62a155fb Mon Sep 17 00:00:00 2001
From: Martin Langhoff <martin@catalyst.net.nz>
Date: Fri, 3 Mar 2006 16:38:03 +1300
Subject: [PATCH 15/16] cvsserver: checkout faster by sending files in a
 sensible order

Just by sending the files in an ordered fashion, clients can process them
much faster. And we can optimize our check of whether we created this
directory already -- faster.

Timings for a checkout on a commandline cvs client for a project with
~13K files totalling ~100MB:

Unsorted:
  603.12 real        16.89 user        42.88 sys

Sorted:
  298.19 real        26.37 user        42.42 sys
---
 git-cvsserver.perl | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/git-cvsserver.perl b/git-cvsserver.perl
index 7b3ba1b9de..f707bd9376 100755
--- a/git-cvsserver.perl
+++ b/git-cvsserver.perl
@@ -584,6 +584,7 @@ sub req_co
     print "E cvs checkout: Updating $checkout_path\n";
 
     my %seendirs = ();
+    my $lastdir ='';
 
     foreach my $git ( @{$updater->gethead} )
     {
@@ -603,7 +604,8 @@ sub req_co
             print "M U $checkout_path/$git->{name}\n";
         }
 
-	if (length($git->{dir}) && $git->{dir} ne './' && !exists($seendirs{$git->{dir}})) {
+	if (length($git->{dir}) && $git->{dir} ne './' 
+	    && $git->{dir} ne $lastdir && !exists($seendirs{$git->{dir}})) {
 
 	    # Eclipse seems to need the Clear-sticky command
 	    # to prepare the 'Entries' file for the new directory.
@@ -612,6 +614,7 @@ sub req_co
 	    print "Clear-static-directory $module/$git->{dir}\n";
 	    print $state->{CVSROOT} . "/$module/$git->{dir}\n";
 	    print "E cvs checkout: Updating /$module/$git->{dir}\n";
+	    $lastdir = $git->{dir};
 	    $seendirs{$git->{dir}} = 1;
 	}
 
@@ -2349,7 +2352,7 @@ sub gethead
 
     return $self->{gethead_cache} if ( defined ( $self->{gethead_cache} ) );
 
-    my $db_query = $self->{dbh}->prepare_cached("SELECT name, filehash, mode, revision, modified, commithash, author FROM head",{},1);
+    my $db_query = $self->{dbh}->prepare_cached("SELECT name, filehash, mode, revision, modified, commithash, author FROM head ORDER BY name ASC",{},1);
     $db_query->execute();
 
     my $tree = [];

From cfcbd3427e67056a00ec832645b057eaf33888d9 Mon Sep 17 00:00:00 2001
From: Martin Langhoff <martin@catalyst.net.nz>
Date: Fri, 3 Mar 2006 16:57:03 +1300
Subject: [PATCH 16/16] cvsserver: fix checkouts with -d <somedir>

A recent Eclipse compat fix broke checkouts with -d. Fix it so that the server
sends the correct module name instead of the destination directory name.
---
 git-cvsserver.perl | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/git-cvsserver.perl b/git-cvsserver.perl
index f707bd9376..abae4e7ee4 100755
--- a/git-cvsserver.perl
+++ b/git-cvsserver.perl
@@ -576,9 +576,9 @@ sub req_co
     # Eclipse seems to need the Clear-sticky command
     # to prepare the 'Entries' file for the new directory.
     print "Clear-sticky $checkout_path/\n";
-    print $state->{CVSROOT} . "/$checkout_path/\n";
+    print $state->{CVSROOT} . "/$module/\n";
     print "Clear-static-directory $checkout_path/\n";
-    print $state->{CVSROOT} . "/$checkout_path/\n";
+    print $state->{CVSROOT} . "/$module/\n";
 
     # instruct the client that we're checking out to $checkout_path
     print "E cvs checkout: Updating $checkout_path\n";
@@ -609,11 +609,11 @@ sub req_co
 
 	    # Eclipse seems to need the Clear-sticky command
 	    # to prepare the 'Entries' file for the new directory.
-	    print "Clear-sticky $module/$git->{dir}\n";
+	    print "Clear-sticky $checkout_path/$git->{dir}\n";
 	    print $state->{CVSROOT} . "/$module/$git->{dir}\n";
-	    print "Clear-static-directory $module/$git->{dir}\n";
+	    print "Clear-static-directory $checkout_path/$git->{dir}\n";
 	    print $state->{CVSROOT} . "/$module/$git->{dir}\n";
-	    print "E cvs checkout: Updating /$module/$git->{dir}\n";
+	    print "E cvs checkout: Updating /$checkout_path/$git->{dir}\n";
 	    $lastdir = $git->{dir};
 	    $seendirs{$git->{dir}} = 1;
 	}