git/diff-delta.c at 1bdbb57407ca28abce3fb731d01bfcbe74a9734a - mirrors/git - Incest Forge: Beyond sex. We incest.

mirrors/git

mirror of https://github.com/git/git.git synced 2024-11-05 08:47:56 +01:00

Nicolas Pitre 5a1fb2ca92 3% tighter packs for free

This patch makes for 3.4% smaller pack with the git repository, and
a bit more than 3% smaller pack with the kernel repository.

And so with _no_ measurable CPU difference.

Signed-off-by: Nicolas Pitre <nico@cam.org>
Signed-off-by: Junio C Hamano <junkio@cox.net>

2006-03-17 20:42:39 -08:00

286 lines

7 KiB

C

Raw Blame History

 /*
  * diff-delta.c: generate a delta between two buffers
  *
  *  Many parts of this file have been lifted from LibXDiff version 0.10.
  *  http://www.xmailserver.org/xdiff-lib.html
  *
  *  LibXDiff was written by Davide Libenzi <davidel@xmailserver.org>
  *  Copyright (C) 2003	Davide Libenzi
  *
  *  Many mods for GIT usage by Nicolas Pitre <nico@cam.org>, (C) 2005.
  *
  *  This file is free software; you can redistribute it and/or
  *  modify it under the terms of the GNU Lesser General Public
  *  License as published by the Free Software Foundation; either
  *  version 2.1 of the License, or (at your option) any later version.
  *
  *  Use of this within git automatically means that the LGPL
  *  licensing gets turned into GPLv2 within this project.
  */
 #include <stdlib.h>
 #include <string.h>
 #include <zlib.h>
 #include "delta.h"
 /* block size: min = 16, max = 64k, power of 2 */
 #define BLK_SIZE 16
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define GR_PRIME 0x9e370001
 #define HASH(v, shift) (((unsigned int)(v) * GR_PRIME) >> (shift))
 struct index {
 	const unsigned char *ptr;
 	unsigned int val;
 	struct index *next;
 };
 static struct index ** delta_index(const unsigned char *buf,
 				   unsigned long bufsize,
 				   unsigned long trg_bufsize,
 				   unsigned int *hash_shift)
 {
 	unsigned int i, hsize, hshift, hlimit, entries, *hash_count;
 	const unsigned char *data;
 	struct index *entry, **hash;
 	void *mem;
 	/* determine index hash size */
 	entries = bufsize  / BLK_SIZE;
 	hsize = entries / 4;
 	for (i = 4; (1 << i) < hsize && i < 31; i++);
 	hsize = 1 << i;
 	hshift = 32 - i;
 	*hash_shift = hshift;
 	/* allocate lookup index */
 	mem = malloc(hsize * sizeof(*hash) + entries * sizeof(*entry));
 	if (!mem)
 		return NULL;
 	hash = mem;
 	entry = mem + hsize * sizeof(*hash);
 	memset(hash, 0, hsize * sizeof(*hash));
 	/* allocate an array to count hash entries */
 	hash_count = calloc(hsize, sizeof(*hash_count));
 	if (!hash_count) {
 		free(hash);
 		return NULL;
 	}
 	/* then populate the index */
 	data = buf + entries * BLK_SIZE - BLK_SIZE;
 	while (data >= buf) {
 		unsigned int val = adler32(0, data, BLK_SIZE);
 		i = HASH(val, hshift);
 		entry->ptr = data;
 		entry->val = val;
 		entry->next = hash[i];
 		hash[i] = entry++;
 		hash_count[i]++;
 		data -= BLK_SIZE;
  	}
 	/*
 	 * Determine a limit on the number of entries in the same hash
 	 * bucket.  This guard us against patological data sets causing
 	 * really bad hash distribution with most entries in the same hash
 	 * bucket that would bring us to O(m*n) computing costs (m and n
 	 * corresponding to reference and target buffer sizes).
 	 *
 	 * The more the target buffer is large, the more it is important to
 	 * have small entry lists for each hash buckets.  With such a limit
 	 * the cost is bounded to something more like O(m+n).
 	 */
 	hlimit = (1 << 26) / trg_bufsize;
 	if (hlimit < 4*BLK_SIZE)
 		hlimit = 4*BLK_SIZE;
 	/*
 	 * Now make sure none of the hash buckets has more entries than
 	 * we're willing to test.  Otherwise we cull the entry list
 	 * uniformly to still preserve a good repartition across
 	 * the reference buffer.
 	 */
 	for (i = 0; i < hsize; i++) {
 		if (hash_count[i] < hlimit)
 			continue;
 		entry = hash[i];
 		do {
 			struct index *keep = entry;
 			int skip = hash_count[i] / hlimit / 2;
 			do {
 				entry = entry->next;
 			} while(--skip && entry);
 			keep->next = entry;
 		} while(entry);
 	}
 	free(hash_count);
 	return hash;
 }
 /* provide the size of the copy opcode given the block offset and size */
 #define COPYOP_SIZE(o, s) \
     (!!(o & 0xff) + !!(o & 0xff00) + !!(o & 0xff0000) + !!(o & 0xff000000) + \
      !!(s & 0xff) + !!(s & 0xff00) + 1)
 /* the maximum size for any opcode */
 #define MAX_OP_SIZE COPYOP_SIZE(0xffffffff, 0xffffffff)
 void *diff_delta(void *from_buf, unsigned long from_size,
 		 void *to_buf, unsigned long to_size,
 		 unsigned long *delta_size,
 		 unsigned long max_size)
 {
 	unsigned int i, outpos, outsize, hash_shift;
 	int inscnt;
 	const unsigned char *ref_data, *ref_top, *data, *top;
 	unsigned char *out;
 	struct index *entry, **hash;
 	if (!from_size || !to_size)
 		return NULL;
 	hash = delta_index(from_buf, from_size, to_size, &hash_shift);
 	if (!hash)
 		return NULL;
 	outpos = 0;
 	outsize = 8192;
 	if (max_size && outsize >= max_size)
 		outsize = max_size + MAX_OP_SIZE + 1;
 	out = malloc(outsize);
 	if (!out) {
 		free(hash);
 		return NULL;
 	}
 	ref_data = from_buf;
 	ref_top = from_buf + from_size;
 	data = to_buf;
 	top = to_buf + to_size;
 	/* store reference buffer size */
 	out[outpos++] = from_size;
 	from_size >>= 7;
 	while (from_size) {
 		out[outpos - 1] |= 0x80;
 		out[outpos++] = from_size;
 		from_size >>= 7;
 	}
 	/* store target buffer size */
 	out[outpos++] = to_size;
 	to_size >>= 7;
 	while (to_size) {
 		out[outpos - 1] |= 0x80;
 		out[outpos++] = to_size;
 		to_size >>= 7;
 	}
 	inscnt = 0;
 	while (data < top) {
 		unsigned int moff = 0, msize = 0;
 		if (data + BLK_SIZE <= top) {
 			unsigned int val = adler32(0, data, BLK_SIZE);
 			i = HASH(val, hash_shift);
 			for (entry = hash[i]; entry; entry = entry->next) {
 				const unsigned char *ref = entry->ptr;
 				const unsigned char *src = data;
 				unsigned int ref_size = ref_top - ref;
 				if (entry->val != val)
 					continue;
 				if (ref_size > top - src)
 					ref_size = top - src;
 				if (ref_size > 0x10000)
 					ref_size = 0x10000;
 				if (ref_size <= msize)
 					break;
 				while (ref_size-- && *src++ == *ref)
 					ref++;
 				if (msize < ref - entry->ptr) {
 					/* this is our best match so far */
 					msize = ref - entry->ptr;
 					moff = entry->ptr - ref_data;
 				}
 			}
 		}
 		if (!msize || msize < COPYOP_SIZE(moff, msize)) {
 			if (!inscnt)
 				outpos++;
 			out[outpos++] = *data++;
 			inscnt++;
 			if (inscnt == 0x7f) {
 				out[outpos - inscnt - 1] = inscnt;
 				inscnt = 0;
 			}
 		} else {
 			unsigned char *op;
 			if (inscnt) {
 				while (moff && ref_data[moff-1] == data[-1]) {
 					if (msize == 0x10000)
 						break;
 					/* we can match one byte back */
 					msize++;
 					moff--;
 					data--;
 					outpos--;
 					if (--inscnt)
 						continue;
 					outpos--;  /* remove count slot */
 					inscnt--;  /* make it -1 */
 					break;
 				}
 				out[outpos - inscnt - 1] = inscnt;
 				inscnt = 0;
 			}
 			data += msize;
 			op = out + outpos++;
 			i = 0x80;
 			if (moff & 0xff) { out[outpos++] = moff; i |= 0x01; }
 			moff >>= 8;
 			if (moff & 0xff) { out[outpos++] = moff; i |= 0x02; }
 			moff >>= 8;
 			if (moff & 0xff) { out[outpos++] = moff; i |= 0x04; }
 			moff >>= 8;
 			if (moff & 0xff) { out[outpos++] = moff; i |= 0x08; }
 			if (msize & 0xff) { out[outpos++] = msize; i |= 0x10; }
 			msize >>= 8;
 			if (msize & 0xff) { out[outpos++] = msize; i |= 0x20; }
 			*op = i;
 		}
 		if (outpos >= outsize - MAX_OP_SIZE) {
 			void *tmp = out;
 			outsize = outsize * 3 / 2;
 			if (max_size && outsize >= max_size)
 				outsize = max_size + MAX_OP_SIZE + 1;
 			if (max_size && outpos > max_size)
 				out = NULL;
 			else
 				out = realloc(out, outsize);
 			if (!out) {
 				free(tmp);
 				free(hash);
 				return NULL;
 			}
 		}
 	}
 	if (inscnt)
 		out[outpos - inscnt - 1] = inscnt;
 	free(hash);
 	*delta_size = outpos;
 	return out;
 }