From ed7d2f4770659be207c0d512bceb337aa18c5527 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Thu, 8 Aug 2024 16:06:19 +0200 Subject: [PATCH 1/9] reftable/stack: refactor function to gather table sizes Refactor the function that gathers table sizes to be more idiomatic. For one, use `REFTABLE_CALLOC_ARRAY()` instead of `reftable_calloc()`. Second, avoid using an integer to iterate through the tables in the reftable stack given that `stack_len` itself is a `size_t`. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- reftable/stack.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/reftable/stack.c b/reftable/stack.c index 737591125e..ba8234b486 100644 --- a/reftable/stack.c +++ b/reftable/stack.c @@ -1305,14 +1305,15 @@ struct segment suggest_compaction_segment(uint64_t *sizes, size_t n, static uint64_t *stack_table_sizes_for_compaction(struct reftable_stack *st) { - uint64_t *sizes = - reftable_calloc(st->merged->stack_len, sizeof(*sizes)); int version = (st->opts.hash_id == GIT_SHA1_FORMAT_ID) ? 1 : 2; int overhead = header_size(version) - 1; - int i = 0; - for (i = 0; i < st->merged->stack_len; i++) { + uint64_t *sizes; + + REFTABLE_CALLOC_ARRAY(sizes, st->merged->stack_len); + + for (size_t i = 0; i < st->merged->stack_len; i++) sizes[i] = st->readers[i]->size - overhead; - } + return sizes; } From 9a833ca35dfcefa0e431da5f7dcdc2e58f38469d Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Thu, 8 Aug 2024 16:06:24 +0200 Subject: [PATCH 2/9] reftable/stack: extract function to set up stack with N tables We're about to add two tests, and both of them will want to initialize the reftable stack with a set of N tables. Introduce a new function that handles this and refactor existing tests that use such a setup to use it. Note that this changes the exact records contained in the preexisting tests. This is fine though as we only care about the shape of the stack here, not the shape of each table. Furthermore, with this change we now start to disable auto compaction when writing the tables, as otherwise we might not end up with the expected number of new tables added. This also slightly changes the behaviour of these tests, but the properties we care about remain intact.
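With the helper in place, the test setup boils down to the following (mirroring the hunks below):

    err = reftable_new_stack(&st1, dir, &opts);
    EXPECT_ERR(err);
    write_n_ref_tables(st1, 3);
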
Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- reftable/stack_test.c | 64 +++++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/reftable/stack_test.c b/reftable/stack_test.c index e3c11e6a6e..0b110f6f02 100644 --- a/reftable/stack_test.c +++ b/reftable/stack_test.c @@ -109,6 +109,34 @@ static int write_test_ref(struct reftable_writer *wr, void *arg) return reftable_writer_add_ref(wr, ref); } +static void write_n_ref_tables(struct reftable_stack *st, + size_t n) +{ + struct strbuf buf = STRBUF_INIT; + int disable_auto_compact; + int err; + + disable_auto_compact = st->opts.disable_auto_compact; + st->opts.disable_auto_compact = 1; + + for (size_t i = 0; i < n; i++) { + struct reftable_ref_record ref = { + .update_index = reftable_stack_next_update_index(st), + .value_type = REFTABLE_REF_VAL1, + }; + + strbuf_addf(&buf, "refs/heads/branch-%04u", (unsigned) i); + ref.refname = buf.buf; + set_test_hash(ref.value.val1, i); + + err = reftable_stack_add(st, &write_test_ref, &ref); + EXPECT_ERR(err); + } + + st->opts.disable_auto_compact = disable_auto_compact; + strbuf_release(&buf); +} + struct write_log_arg { struct reftable_log_record *log; uint64_t update_index; @@ -916,25 +944,11 @@ static void test_reftable_stack_compaction_concurrent(void) struct reftable_write_options opts = { 0 }; struct reftable_stack *st1 = NULL, *st2 = NULL; char *dir = get_tmp_dir(__LINE__); - int err, i; - int N = 3; + int err; err = reftable_new_stack(&st1, dir, &opts); EXPECT_ERR(err); - - for (i = 0; i < N; i++) { - char name[100]; - struct reftable_ref_record ref = { - .refname = name, - .update_index = reftable_stack_next_update_index(st1), - .value_type = REFTABLE_REF_SYMREF, - .value.symref = (char *) "master", - }; - snprintf(name, sizeof(name), "branch%04d", i); - - err = reftable_stack_add(st1, &write_test_ref, &ref); - EXPECT_ERR(err); - } + write_n_ref_tables(st1, 3); err = reftable_new_stack(&st2, dir, &opts); EXPECT_ERR(err); @@ -965,25 +979,11 @@ static void test_reftable_stack_compaction_concurrent_clean(void) struct reftable_write_options opts = { 0 }; struct reftable_stack *st1 = NULL, *st2 = NULL, *st3 = NULL; char *dir = get_tmp_dir(__LINE__); - int err, i; - int N = 3; + int err; err = reftable_new_stack(&st1, dir, &opts); EXPECT_ERR(err); - - for (i = 0; i < N; i++) { - char name[100]; - struct reftable_ref_record ref = { - .refname = name, - .update_index = reftable_stack_next_update_index(st1), - .value_type = REFTABLE_REF_SYMREF, - .value.symref = (char *) "master", - }; - snprintf(name, sizeof(name), "branch%04d", i); - - err = reftable_stack_add(st1, &write_test_ref, &ref); - EXPECT_ERR(err); - } + write_n_ref_tables(st1, 3); err = reftable_new_stack(&st2, dir, &opts); EXPECT_ERR(err); From 8030100bdafc508eaa7900ebcfd67a2d6b02749e Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Thu, 8 Aug 2024 16:06:29 +0200 Subject: [PATCH 3/9] reftable/stack: test compaction with already-locked tables We're lacking test coverage for compacting tables when some of the tables that we are about to compact are locked. Add two tests that exercise this, one for auto-compaction and one for full compaction. 
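Both tests simulate the concurrent writer by creating the lock file of one of the tables by hand, along the lines of the hunks below:

    strbuf_addf(&buf, "%s/%s.lock", dir, st->readers[1]->name);
    write_file_buf(buf.buf, "", 0);
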
Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- reftable/stack_test.c | 77 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/reftable/stack_test.c b/reftable/stack_test.c index 0b110f6f02..1d109933d3 100644 --- a/reftable/stack_test.c +++ b/reftable/stack_test.c @@ -891,6 +891,45 @@ static void test_reftable_stack_auto_compaction(void) clear_dir(dir); } +static void test_reftable_stack_auto_compaction_with_locked_tables(void) +{ + struct reftable_write_options opts = { + .disable_auto_compact = 1, + }; + struct reftable_stack *st = NULL; + struct strbuf buf = STRBUF_INIT; + char *dir = get_tmp_dir(__LINE__); + int err; + + err = reftable_new_stack(&st, dir, &opts); + EXPECT_ERR(err); + + write_n_ref_tables(st, 5); + EXPECT(st->merged->stack_len == 5); + + /* + * Given that all tables we have written should be roughly the same + * size, we expect that auto-compaction will want to compact all of the + * tables. Locking any of the tables will keep it from doing so. + */ + strbuf_reset(&buf); + strbuf_addf(&buf, "%s/%s.lock", dir, st->readers[2]->name); + write_file_buf(buf.buf, "", 0); + + /* + * Ideally, we'd handle the situation where any of the tables is locked + * gracefully. We don't (yet) do this though and thus fail. + */ + err = reftable_stack_auto_compact(st); + EXPECT(err == REFTABLE_LOCK_ERROR); + EXPECT(st->stats.failures == 1); + EXPECT(st->merged->stack_len == 5); + + reftable_stack_destroy(st); + strbuf_release(&buf); + clear_dir(dir); +} + static void test_reftable_stack_add_performs_auto_compaction(void) { struct reftable_write_options opts = { 0 }; @@ -939,6 +978,42 @@ static void test_reftable_stack_add_performs_auto_compaction(void) clear_dir(dir); } +static void test_reftable_stack_compaction_with_locked_tables(void) +{ + struct reftable_write_options opts = { + .disable_auto_compact = 1, + }; + struct reftable_stack *st = NULL; + struct strbuf buf = STRBUF_INIT; + char *dir = get_tmp_dir(__LINE__); + int err; + + err = reftable_new_stack(&st, dir, &opts); + EXPECT_ERR(err); + + write_n_ref_tables(st, 3); + EXPECT(st->merged->stack_len == 3); + + /* Lock one of the tables that we're about to compact. */ + strbuf_reset(&buf); + strbuf_addf(&buf, "%s/%s.lock", dir, st->readers[1]->name); + write_file_buf(buf.buf, "", 0); + + /* + * Compaction is expected to fail given that we were not able to + * compact all tables. + */ + err = reftable_stack_compact_all(st, NULL); + EXPECT(err == REFTABLE_LOCK_ERROR); + /* TODO: this is wrong, we should get notified about the failure. 
*/ + EXPECT(st->stats.failures == 0); + EXPECT(st->merged->stack_len == 3); + + reftable_stack_destroy(st); + strbuf_release(&buf); + clear_dir(dir); +} + static void test_reftable_stack_compaction_concurrent(void) { struct reftable_write_options opts = { 0 }; @@ -1016,9 +1091,11 @@ int stack_test_main(int argc, const char *argv[]) RUN_TEST(test_reftable_stack_add); RUN_TEST(test_reftable_stack_add_one); RUN_TEST(test_reftable_stack_auto_compaction); + RUN_TEST(test_reftable_stack_auto_compaction_with_locked_tables); RUN_TEST(test_reftable_stack_add_performs_auto_compaction); RUN_TEST(test_reftable_stack_compaction_concurrent); RUN_TEST(test_reftable_stack_compaction_concurrent_clean); + RUN_TEST(test_reftable_stack_compaction_with_locked_tables); RUN_TEST(test_reftable_stack_hash_id); RUN_TEST(test_reftable_stack_lock_failure); RUN_TEST(test_reftable_stack_log_normalize); From 5f0ed603a1653f2394c468814bde4b0dca2cff45 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Thu, 8 Aug 2024 16:06:34 +0200 Subject: [PATCH 4/9] reftable/stack: update stats on failed full compaction When auto-compaction fails due to a locking error, we update the statistics to indicate this failure. We're not doing the same when performing a full compaction. Fix this inconsistency by using `stack_compact_range_stats()`, which handles the stat update for us. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- reftable/stack.c | 14 +++++++------- reftable/stack_test.c | 3 +-- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/reftable/stack.c b/reftable/stack.c index ba8234b486..e5959d2c76 100644 --- a/reftable/stack.c +++ b/reftable/stack.c @@ -1205,13 +1205,6 @@ static int stack_compact_range(struct reftable_stack *st, return err; } -int reftable_stack_compact_all(struct reftable_stack *st, - struct reftable_log_expiry_config *config) -{ - return stack_compact_range(st, 0, st->merged->stack_len ? - st->merged->stack_len - 1 : 0, config); -} - static int stack_compact_range_stats(struct reftable_stack *st, size_t first, size_t last, struct reftable_log_expiry_config *config) @@ -1222,6 +1215,13 @@ static int stack_compact_range_stats(struct reftable_stack *st, return err; } +int reftable_stack_compact_all(struct reftable_stack *st, + struct reftable_log_expiry_config *config) +{ + size_t last = st->merged->stack_len ? st->merged->stack_len - 1 : 0; + return stack_compact_range_stats(st, 0, last, config); +} + static int segment_size(struct segment *s) { return s->end - s->start; diff --git a/reftable/stack_test.c b/reftable/stack_test.c index 1d109933d3..3ed8e44924 100644 --- a/reftable/stack_test.c +++ b/reftable/stack_test.c @@ -1005,8 +1005,7 @@ static void test_reftable_stack_compaction_with_locked_tables(void) */ err = reftable_stack_compact_all(st, NULL); EXPECT(err == REFTABLE_LOCK_ERROR); - /* TODO: this is wrong, we should get notified about the failure. */ - EXPECT(st->stats.failures == 0); + EXPECT(st->stats.failures == 1); EXPECT(st->merged->stack_len == 3); reftable_stack_destroy(st); From 558f6fbeb1f194116231218b3e7496ceef4b9618 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Thu, 8 Aug 2024 16:06:39 +0200 Subject: [PATCH 5/9] reftable/stack: simplify tracking of table locks When compacting tables, we store the locks of all tables we are about to compact in the `table_locks` array. As we currently only ever compact all tables in the user-provided range or none, we simply track those locks via the indices of the respective tables in the merged stack. 
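In other words, each table's lock slot is currently addressed relative to the start of the range being compacted, as in the preexisting code:

    err = hold_lock_file_for_update(&table_locks[i - first],
                                    table_name.buf, LOCK_NO_DEREF);
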
This is about to change though, as we will introduce a mode where auto compaction gracefully handles the case of already-locked files. It may then happen that we only compact a subset of the user-supplied range of tables, in which case the table indices will not necessarily match the lock indices anymore. Refactor the code such that we track the number of locks via a separate variable. The resulting code is expected to perform the same, but will make it easier to perform the described change. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- reftable/stack.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/reftable/stack.c b/reftable/stack.c index e5959d2c76..07e7ffc6b9 100644 --- a/reftable/stack.c +++ b/reftable/stack.c @@ -1016,7 +1016,7 @@ static int stack_compact_range(struct reftable_stack *st, struct lock_file *table_locks = NULL; struct tempfile *new_table = NULL; int is_empty_table = 0, err = 0; - size_t i; + size_t i, nlocks = 0; if (first > last || (!expiry && first == last)) { err = 0; @@ -1051,7 +1051,7 @@ static int stack_compact_range(struct reftable_stack *st, for (i = first; i <= last; i++) { stack_filename(&table_name, st, reader_name(st->readers[i])); - err = hold_lock_file_for_update(&table_locks[i - first], + err = hold_lock_file_for_update(&table_locks[nlocks], table_name.buf, LOCK_NO_DEREF); if (err < 0) { if (errno == EEXIST) @@ -1066,7 +1066,7 @@ static int stack_compact_range(struct reftable_stack *st, * run into file descriptor exhaustion when we compress a lot * of tables. */ - err = close_lock_file_gently(&table_locks[i - first]); + err = close_lock_file_gently(&table_locks[nlocks++]); if (err < 0) { err = REFTABLE_IO_ERROR; goto done; @@ -1183,8 +1183,8 @@ static int stack_compact_range(struct reftable_stack *st, * Delete the old tables. They may still be in use by concurrent * readers, so it is expected that unlinking tables may fail. */ - for (i = first; i <= last; i++) { - struct lock_file *table_lock = &table_locks[i - first]; + for (i = 0; i < nlocks; i++) { + struct lock_file *table_lock = &table_locks[i]; char *table_path = get_locked_file_path(table_lock); unlink(table_path); free(table_path); @@ -1192,8 +1192,8 @@ static int stack_compact_range(struct reftable_stack *st, done: rollback_lock_file(&tables_list_lock); - for (i = first; table_locks && i <= last; i++) - rollback_lock_file(&table_locks[i - first]); + for (i = 0; table_locks && i < nlocks; i++) + rollback_lock_file(&table_locks[i]); reftable_free(table_locks); delete_tempfile(&new_table); From 7ee307da1bfe3867f91fbd9a053494bc5fe61675 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Thu, 8 Aug 2024 16:06:44 +0200 Subject: [PATCH 6/9] reftable/stack: do not die when fsyncing the lock file fails We use `fsync_component_or_die()` when committing an addition to the "tables.list" lock file, which unsurprisingly dies in case the fsync fails. Given that this is part of the reftable library, we should never die and instead let callers handle the error. Adapt accordingly and use `fsync_component()` instead.
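The call site then maps a failed fsync to `REFTABLE_IO_ERROR` and lets the caller deal with it, as in the hunk below:

    err = fsync_component(FSYNC_COMPONENT_REFERENCE, lock_file_fd);
    if (err < 0) {
        err = REFTABLE_IO_ERROR;
        goto done;
    }
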
Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- reftable/stack.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/reftable/stack.c b/reftable/stack.c index 07e7ffc6b9..9ca549294f 100644 --- a/reftable/stack.c +++ b/reftable/stack.c @@ -674,8 +674,11 @@ int reftable_addition_commit(struct reftable_addition *add) goto done; } - fsync_component_or_die(FSYNC_COMPONENT_REFERENCE, lock_file_fd, - get_tempfile_path(add->lock_file)); + err = fsync_component(FSYNC_COMPONENT_REFERENCE, lock_file_fd); + if (err < 0) { + err = REFTABLE_IO_ERROR; + goto done; + } err = rename_tempfile(&add->lock_file, add->stack->list_file); if (err < 0) { From 128b9aa3e9d3dc0417f8f65a240aad736db97959 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Thu, 8 Aug 2024 16:06:49 +0200 Subject: [PATCH 7/9] reftable/stack: use lock_file when adding table to "tables.list" When modifying "tables.list", we need to lock the list before updating it to ensure that no concurrent writers modify the list at the same point in time. While we do this via the `lock_file` subsystem when compacting the stack, we manually handle the lock when adding a new table to it. While not wrong, it is at least inconsistent. Refactor the code to consistently lock "tables.list" via the `lock_file` subsystem. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- reftable/stack.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/reftable/stack.c b/reftable/stack.c index 9ca549294f..54982e0f7d 100644 --- a/reftable/stack.c +++ b/reftable/stack.c @@ -567,7 +567,7 @@ static void format_name(struct strbuf *dest, uint64_t min, uint64_t max) } struct reftable_addition { - struct tempfile *lock_file; + struct lock_file tables_list_lock; struct reftable_stack *stack; char **new_tables; @@ -581,13 +581,13 @@ static int reftable_stack_init_addition(struct reftable_addition *add, struct reftable_stack *st) { struct strbuf lock_file_name = STRBUF_INIT; - int err = 0; + int err; + add->stack = st; - strbuf_addf(&lock_file_name, "%s.lock", st->list_file); - - add->lock_file = create_tempfile(lock_file_name.buf); - if (!add->lock_file) { + err = hold_lock_file_for_update(&add->tables_list_lock, st->list_file, + LOCK_NO_DEREF); + if (err < 0) { if (errno == EEXIST) { err = REFTABLE_LOCK_ERROR; } else { @@ -596,7 +596,8 @@ static int reftable_stack_init_addition(struct reftable_addition *add, goto done; } if (st->opts.default_permissions) { - if (chmod(add->lock_file->filename.buf, st->opts.default_permissions) < 0) { + if (chmod(get_lock_file_path(&add->tables_list_lock), + st->opts.default_permissions) < 0) { err = REFTABLE_IO_ERROR; goto done; } @@ -635,7 +636,7 @@ static void reftable_addition_close(struct reftable_addition *add) add->new_tables_len = 0; add->new_tables_cap = 0; - delete_tempfile(&add->lock_file); + rollback_lock_file(&add->tables_list_lock); strbuf_release(&nm); } @@ -651,7 +652,7 @@ void reftable_addition_destroy(struct reftable_addition *add) int reftable_addition_commit(struct reftable_addition *add) { struct strbuf table_list = STRBUF_INIT; - int lock_file_fd = get_tempfile_fd(add->lock_file); + int lock_file_fd = get_lock_file_fd(&add->tables_list_lock); int err = 0; size_t i; @@ -680,7 +681,7 @@ int reftable_addition_commit(struct reftable_addition *add) goto done; } - err = rename_tempfile(&add->lock_file, add->stack->list_file); + err = commit_lock_file(&add->tables_list_lock); if (err < 0) { err = REFTABLE_IO_ERROR; goto done; } From 
ed1ad6b44deaf5089c64e0fbcae43a37d5cf666a Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Thu, 8 Aug 2024 16:06:53 +0200 Subject: [PATCH 8/9] reftable/stack: fix corruption on concurrent compaction The locking employed by compaction uses the following schema: 1. Lock "tables.list" and verify that it matches the version we have loaded in core. 2. Lock each of the tables in the user-supplied range of tables that we are supposed to compact. These locks prohibit any concurrent process from compacting those tables while we are doing so. 3. Unlock "tables.list". This enables concurrent processes to add new tables to the stack, but also allows them to compact tables outside of the range of tables that we have locked. 4. Perform the compaction. 5. Lock "tables.list" again. 6. Move the compacted table into place. 7. Write the new order of tables, including the compacted table, into the lockfile. 8. Commit the lockfile into place. Letting concurrent processes modify the "tables.list" file while we are doing the compaction is very much part of the design and thus expected. After all, it may take some time to compact tables in the case where we are compacting a lot of very large tables. But there is a bug in the code. Suppose we have two processes which are compacting two slices of the stack. Given that we lock each of the tables before compacting them, we know that the slices must be disjoint from each other. But regardless of that, compaction performed by one process will always impact what the other process needs to write to the "tables.list" file. Right now, we do not check whether the "tables.list" has been changed after we have locked it for the second time in (5). This has the consequence that we will always commit the old, cached in-core tables to disk without respecting what the other process has written. This scenario would then lead to data loss and corruption. This can even happen in the simpler case of one compacting process and one writing process. The table newly appended by the writing process would get discarded by the compacting process because it never sees the new table. Fix this bug by re-checking whether our stack is still up to date after locking for the second time. If it isn't, then we adjust the indices of the tables to replace so that they match the updated stack. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- reftable/stack.c | 107 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 102 insertions(+), 5 deletions(-) diff --git a/reftable/stack.c b/reftable/stack.c index 54982e0f7d..3f13c3eb34 100644 --- a/reftable/stack.c +++ b/reftable/stack.c @@ -1020,7 +1020,9 @@ static int stack_compact_range(struct reftable_stack *st, struct lock_file *table_locks = NULL; struct tempfile *new_table = NULL; int is_empty_table = 0, err = 0; + size_t first_to_replace, last_to_replace; size_t i, nlocks = 0; + char **names = NULL; if (first > last || (!expiry && first == last)) { err = 0; @@ -1123,6 +1125,100 @@ static int stack_compact_range(struct reftable_stack *st, } } + /* + * As we have unlocked the stack while compacting our slice of tables + * it may have happened that a concurrently running process has updated + * the stack while we were compacting. In that case, we need to check + * whether the tables that we have just compacted still exist in the + * stack in the exact same order as we have compacted them. + * + * If they do exist, then it is fine to continue and replace those + * tables with our compacted version. If they don't, then we need to + * abort.
+ */ + err = stack_uptodate(st); + if (err < 0) + goto done; + if (err > 0) { + ssize_t new_offset = -1; + int fd; + + fd = open(st->list_file, O_RDONLY); + if (fd < 0) { + err = REFTABLE_IO_ERROR; + goto done; + } + + err = fd_read_lines(fd, &names); + close(fd); + if (err < 0) + goto done; + + /* + * Search for the offset of the first table that we have + * compacted in the updated "tables.list" file. + */ + for (size_t i = 0; names[i]; i++) { + if (strcmp(names[i], st->readers[first]->name)) + continue; + + /* + * We have found the first entry. Verify that all the + * subsequent tables we have compacted still exist in + * the modified stack in the exact same order as we + * have compacted them. + */ + for (size_t j = 1; j < last - first + 1; j++) { + const char *old = first + j < st->merged->stack_len ? + st->readers[first + j]->name : NULL; + const char *new = names[i + j]; + + /* + * If some entries are missing or in case the tables + * have changed then we need to bail out. Again, this + * shouldn't ever happen because we have locked the + * tables we are compacting. + */ + if (!old || !new || strcmp(old, new)) { + err = REFTABLE_OUTDATED_ERROR; + goto done; + } + } + + new_offset = i; + break; + } + + /* + * In case we didn't find our compacted tables in the stack we + * need to bail out. In theory, this should have never happened + * because we locked the tables we are compacting. + */ + if (new_offset < 0) { + err = REFTABLE_OUTDATED_ERROR; + goto done; + } + + /* + * We have found the new range that we want to replace, so + * let's update the range of tables that we want to replace. + */ + first_to_replace = new_offset; + last_to_replace = last + (new_offset - first); + } else { + /* + * `fd_read_lines()` uses a `NULL` sentinel to indicate that + * the array is at its end. As we use `free_names()` to free + * the array, we need to include this sentinel value here and + * thus have to allocate `stack_len + 1` many entries. + */ + REFTABLE_CALLOC_ARRAY(names, st->merged->stack_len + 1); + for (size_t i = 0; i < st->merged->stack_len; i++) + names[i] = xstrdup(st->readers[i]->name); + first_to_replace = first; + last_to_replace = last; + } + /* * If the resulting compacted table is not empty, then we need to move * it into place now. @@ -1145,12 +1241,12 @@ static int stack_compact_range(struct reftable_stack *st, * have just written. In case the compacted table became empty we * simply skip writing it. 
*/ - for (i = 0; i < first; i++) - strbuf_addf(&tables_list_buf, "%s\n", st->readers[i]->name); + for (i = 0; i < first_to_replace; i++) + strbuf_addf(&tables_list_buf, "%s\n", names[i]); if (!is_empty_table) strbuf_addf(&tables_list_buf, "%s\n", new_table_name.buf); - for (i = last + 1; i < st->merged->stack_len; i++) - strbuf_addf(&tables_list_buf, "%s\n", st->readers[i]->name); + for (i = last_to_replace + 1; names[i]; i++) + strbuf_addf(&tables_list_buf, "%s\n", names[i]); err = write_in_full(get_lock_file_fd(&tables_list_lock), tables_list_buf.buf, tables_list_buf.len); @@ -1203,9 +1299,10 @@ static int stack_compact_range(struct reftable_stack *st, delete_tempfile(&new_table); strbuf_release(&new_table_name); strbuf_release(&new_table_path); - strbuf_release(&tables_list_buf); strbuf_release(&table_name); + free_names(names); + return err; } From f234df07f66c8f67391cc8ded5ade43b8a4428b6 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Thu, 8 Aug 2024 16:06:58 +0200 Subject: [PATCH 9/9] reftable/stack: handle locked tables during auto-compaction When compacting tables, it may happen that we want to compact a set of tables which are already locked by a concurrent process that compacts them. In the case where we wanted to perform a full compaction of all tables it is sensible to bail out, as we cannot fulfill the requested action. But when performing auto-compaction it isn't necessarily in our best interest to abort the whole operation. For example, due to the geometric compacting schema that we use, it may be that process A takes a lot of time to compact the bulk of all tables whereas process B appends a bunch of new tables to the stack. B would in this case also notice that it has to compact the tables that process A is compacting already and thus also try to compact the same range, probably including the new tables it has appended. But because those tables are locked already, it will fail and thus abort the complete auto-compaction. The consequence is that the stack will grow longer and longer while A isn't yet done with compaction, which will lead to a growing performance impact. Instead of aborting auto-compaction altogether, let's gracefully handle this situation by instead compacting tables which aren't locked. To do so, instead of locking from the beginning of the slice-to-be-compacted, we start locking tables from the end of the slice. Once we hit the first table that is locked already, we abort. If we succeed in locking two or more tables, then we simply reduce the slice of tables that we're about to compact to those which we managed to lock. This ensures that we can at least make some progress for compaction in said scenario. It also helps in other scenarios, like for example when a process died and left a stale lockfile behind. In such a case we can at least ensure some compaction on a best-effort basis. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- reftable/stack.c | 59 +++++++++++++++++++++++++++++++------- reftable/stack_test.c | 12 ++++---- t/t0610-reftable-basics.sh | 21 +++++++++----- 3 files changed, 70 insertions(+), 22 deletions(-) diff --git a/reftable/stack.c b/reftable/stack.c index 3f13c3eb34..2071e428a8 100644 --- a/reftable/stack.c +++ b/reftable/stack.c @@ -999,6 +999,15 @@ static int stack_write_compact(struct reftable_stack *st, return err; } +enum stack_compact_range_flags { + /* + * Perform a best-effort compaction.
That is, even if we cannot lock + * all tables in the specified range, we will try to compact the + * remaining slice. + */ + STACK_COMPACT_RANGE_BEST_EFFORT = (1 << 0), +}; + /* * Compact all tables in the range `[first, last)` into a single new table. * @@ -1010,7 +1019,8 @@ static int stack_write_compact(struct reftable_stack *st, */ static int stack_compact_range(struct reftable_stack *st, size_t first, size_t last, - struct reftable_log_expiry_config *expiry) + struct reftable_log_expiry_config *expiry, + unsigned int flags) { struct strbuf tables_list_buf = STRBUF_INIT; struct strbuf new_table_name = STRBUF_INIT; @@ -1052,19 +1062,47 @@ static int stack_compact_range(struct reftable_stack *st, /* * Lock all tables in the user-provided range. This is the slice of our * stack which we'll compact. + * + * Note that we lock tables in reverse order from last to first. The + * intent behind this is to allow a newer process to perform best + * effort compaction of tables that it has added in the case where an + * older process is still busy compacting tables which are preexisting + * from the point of view of the newer process. */ REFTABLE_CALLOC_ARRAY(table_locks, last - first + 1); - for (i = first; i <= last; i++) { - stack_filename(&table_name, st, reader_name(st->readers[i])); + for (i = last + 1; i > first; i--) { + stack_filename(&table_name, st, reader_name(st->readers[i - 1])); err = hold_lock_file_for_update(&table_locks[nlocks], table_name.buf, LOCK_NO_DEREF); if (err < 0) { - if (errno == EEXIST) + /* + * When the table is locked already we may do a + * best-effort compaction and compact only the tables + * that we have managed to lock so far. This of course + * requires that we have been able to lock at least two + * tables, otherwise there would be nothing to compact. + * In that case, we return a lock error to our caller. + */ + if (errno == EEXIST && last - (i - 1) >= 2 && + flags & STACK_COMPACT_RANGE_BEST_EFFORT) { + err = 0; + /* + * The subtraction is to offset the index, the + * addition is to only compact up to the table + * of the preceding iteration. They obviously + * cancel each other out, but that may be + * non-obvious when it was omitted. + */ + first = (i - 1) + 1; + break; + } else if (errno == EEXIST) { err = REFTABLE_LOCK_ERROR; - else + goto done; + } else { err = REFTABLE_IO_ERROR; - goto done; + goto done; + } } /* @@ -1308,9 +1346,10 @@ static int stack_compact_range(struct reftable_stack *st, static int stack_compact_range_stats(struct reftable_stack *st, size_t first, size_t last, - struct reftable_log_expiry_config *config) + struct reftable_log_expiry_config *config, + unsigned int flags) { - int err = stack_compact_range(st, first, last, config); + int err = stack_compact_range(st, first, last, config, flags); if (err == REFTABLE_LOCK_ERROR) st->stats.failures++; return err; @@ -1320,7 +1359,7 @@ int reftable_stack_compact_all(struct reftable_stack *st, struct reftable_log_expiry_config *config) { size_t last = st->merged->stack_len ? 
st->merged->stack_len - 1 : 0; - return stack_compact_range_stats(st, 0, last, config); + return stack_compact_range_stats(st, 0, last, config, 0); } static int segment_size(struct segment *s) @@ -1427,7 +1466,7 @@ int reftable_stack_auto_compact(struct reftable_stack *st) reftable_free(sizes); if (segment_size(&seg) > 0) return stack_compact_range_stats(st, seg.start, seg.end - 1, - NULL); + NULL, STACK_COMPACT_RANGE_BEST_EFFORT); return 0; } diff --git a/reftable/stack_test.c b/reftable/stack_test.c index 3ed8e44924..8c36590ff0 100644 --- a/reftable/stack_test.c +++ b/reftable/stack_test.c @@ -917,13 +917,15 @@ static void test_reftable_stack_auto_compaction_with_locked_tables(void) write_file_buf(buf.buf, "", 0); /* - * Ideally, we'd handle the situation where any of the tables is locked - * gracefully. We don't (yet) do this though and thus fail. + * When parts of the stack are locked, then auto-compaction does a best + * effort compaction of those tables which aren't locked. So while this + * would in theory compact all tables, due to the preexisting lock we + * only compact the newest two tables. */ err = reftable_stack_auto_compact(st); - EXPECT(err == REFTABLE_LOCK_ERROR); - EXPECT(st->stats.failures == 1); - EXPECT(st->merged->stack_len == 5); + EXPECT_ERR(err); + EXPECT(st->stats.failures == 0); + EXPECT(st->merged->stack_len == 4); reftable_stack_destroy(st); strbuf_release(&buf); diff --git a/t/t0610-reftable-basics.sh b/t/t0610-reftable-basics.sh index b06c46999d..37510c2b2a 100755 --- a/t/t0610-reftable-basics.sh +++ b/t/t0610-reftable-basics.sh @@ -478,19 +478,26 @@ test_expect_success "$command: auto compaction" ' test_oid blob17_2 | git hash-object -w --stdin && - # Lock all tables write some refs. Auto-compaction will be - # unable to compact tables and thus fails gracefully, leaving - # the stack in a sub-optimal state. - ls .git/reftable/*.ref | + # Lock all tables, write some refs. Auto-compaction will be + # unable to compact tables and thus fails gracefully, + # compacting only those tables which are not locked. + ls .git/reftable/*.ref | sort | while read table do - touch "$table.lock" || exit 1 + touch "$table.lock" && + basename "$table" >>tables.expect || exit 1 done && + test_line_count = 2 .git/reftable/tables.list && git branch B && git branch C && - rm .git/reftable/*.lock && - test_line_count = 4 .git/reftable/tables.list && + # The new tables are auto-compacted, but the locked tables are + # left intact. + test_line_count = 3 .git/reftable/tables.list && + head -n 2 .git/reftable/tables.list >tables.head && + test_cmp tables.expect tables.head && + + rm .git/reftable/*.lock && git $command --auto && test_line_count = 1 .git/reftable/tables.list )