Index: linux/tests/atomic_test.c
===================================================================
--- /dev/null
+++ linux/tests/atomic_test.c
@@ -0,0 +1,695 @@
+/* test-atomic.c
+ *
+ * Test module for synthetic in-kernel testing of atomic operations.
+ *
+ * The tests are triggered by loading the module. Module initialization
+ * intentionally fails afterwards so that the module does not stay loaded.
+ *
+ * (C) 2023 Ampere Computing LLC, Christoph Lameter
+ */
+
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/kthread.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/percpu.h>
+#include <linux/atomic.h>
+#include "cycles.h"
+
+#define TEST_COUNT 30000
+
+#include <linux/sched.h>
+#include <linux/completion.h>
+#include <linux/cpumask.h>
+#include <linux/numa.h>
+#include <linux/topology.h>
+#include <linux/string.h>
+
+static int cpus;
+static bool validation_failed;
+
+static struct test_struct {
+	struct task_struct *task;
+	int cpu;
+	int count;
+	char **v;
+	void (*test_i)(struct test_struct *);
+	void (*test_p)(struct test_struct *);
+	void (*test_v)(struct test_struct *);
+	unsigned long start;
+	unsigned long stop;
+	bool unreliable;
+} test[NR_CPUS];
+
+
+/*
+ * Operations on a plain global variable
+ */
+
+static unsigned long x_normal;
+
+static void init_normal_increment(struct test_struct *t)
+{
+	x_normal = 0;
+}
+
+static void test_normal_increment(struct test_struct *t)
+{
+	int i;
+
+	t->start = CYCLES;
+	for (i = 0; i < TEST_COUNT; i++) {
+		barrier();	/* The compiler optimizes the loop away if we don't stop it */
+		x_normal++;
+	}
+	t->stop = CYCLES;
+}
+
+static void validate_normal_increment(struct test_struct *t)
+{
+	if (x_normal != cpus * TEST_COUNT) {
+		printk(KERN_ERR "x++ %d times on %d cpus must result in %d and not %lu\n",
+			TEST_COUNT, cpus, cpus * TEST_COUNT, x_normal);
+		validation_failed = true;
+	}
+}
+
+/*
+ * Operations on an unsigned long that is per cpu
+ */
+
+static DEFINE_PER_CPU_ALIGNED(unsigned long, x_percpu);
+
+static void init_this_cpu(struct test_struct *t)
+{
+	this_cpu_write(x_percpu, 0);
+}
+
+static void test_this_cpu_inc(struct test_struct *t)
+{
+	int i;
+
+	t->start = CYCLES;
+	for (i = 0; i < TEST_COUNT; i++) {
+		this_cpu_inc(x_percpu);
+	}
+	t->stop = CYCLES;
+}
+
+static void test_this_cpu_inc_return(struct test_struct *t)
+{
+	int i;
+
+	t->start = CYCLES;
+	for (i = 0; i < TEST_COUNT; i++) {
+		if (this_cpu_inc_return(x_percpu) != i + 1)
+			validation_failed = true;
+	}
+	t->stop = CYCLES;
+}
+
+static void test_this_cpu_xchg(struct test_struct *t)
+{
+	int i;
+
+	t->start = CYCLES;
+	for (i = 0; i < TEST_COUNT; i++) {
+		if (this_cpu_xchg(x_percpu, i + 1) != i)
+			validation_failed = true;
+	}
+	t->stop = CYCLES;
+}
+
+static void test_this_cpu_cmpxchg(struct test_struct *t)
+{
+	int i;
+
+	t->start = CYCLES;
+	for (i = 0; i < TEST_COUNT; i++) {
+		if (this_cpu_cmpxchg(x_percpu, i, i + 1) != i)
+			validation_failed = true;
+	}
+	t->stop = CYCLES;
+}
+
+static void validate_this_cpu(struct test_struct *t)
+{
+	if (this_cpu_read(x_percpu) != TEST_COUNT)
+		validation_failed = true;
+}
+
+static DEFINE_PER_CPU_ALIGNED(u128, x128_percpu);
+
+static void init_this_cpu128(struct test_struct *t)
+{
+	/* This should be this_cpu_write(x128_percpu, 0) but there is no u128 support */
+	memset(this_cpu_ptr(&x128_percpu), 0, sizeof(u128));
+}
+
+static void test_this_cpu_cmpxchg128(struct test_struct *t)
+{
+	int i;
+
+	t->start = CYCLES;
+	for (i = 0; i < TEST_COUNT; i++) {
+		this_cpu_cmpxchg128(x128_percpu, i, i + 1);
+	}
+	t->stop = CYCLES;
+}
+
+static void validate_this_cpu128(struct test_struct *t)
+{
+	/* This should be this_cpu_read(x128_percpu) but there is no u128 support */
+	u128 *p = this_cpu_ptr(&x128_percpu);
+
+	if (*p != TEST_COUNT)
+		validation_failed = true;
+}
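+/*
+ * Sketch only, assuming an API that does not exist yet: if the generic
+ * this_cpu_read()/this_cpu_write() accessors grew u128 support, the two
+ * workaround helpers above would reduce to:
+ *
+ *	static void init_this_cpu128(struct test_struct *t)
+ *	{
+ *		this_cpu_write(x128_percpu, (u128)0);
+ *	}
+ *
+ *	static void validate_this_cpu128(struct test_struct *t)
+ *	{
+ *		if (this_cpu_read(x128_percpu) != (u128)TEST_COUNT)
+ *			validation_failed = true;
+ *	}
+ */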
+
+/* Atomics */
+
+static atomic_long_t x_atomic;
+
+static void init_atomic(struct test_struct *t)
+{
+	atomic_long_set(&x_atomic, 0);
+}
+
+static void init_atomic_1(struct test_struct *t)
+{
+	atomic_long_set(&x_atomic, 1);
+}
+
+static void test_atomic_inc(struct test_struct *t)
+{
+	int i;
+
+	t->start = CYCLES;
+	for (i = 0; i < TEST_COUNT; i++) {
+		atomic_long_inc(&x_atomic);
+	}
+	t->stop = CYCLES;
+}
+
+static void test_atomic_inc_return(struct test_struct *t)
+{
+	int i;
+
+	t->start = CYCLES;
+	for (i = 0; i < TEST_COUNT; i++) {
+		/* The return value check only holds when a single cpu runs the test */
+		if (atomic_long_inc_return(&x_atomic) != i + 1 && cpus == 1)
+			validation_failed = true;
+	}
+	t->stop = CYCLES;
+}
+
+static void test_atomic_fetchadd(struct test_struct *t)
+{
+	int i;
+
+	t->start = CYCLES;
+	for (i = 0; i < TEST_COUNT; i++) {
+		/* The return value check only holds when a single cpu runs the test */
+		if (atomic_long_fetch_add(1, &x_atomic) != i && cpus == 1)
+			validation_failed = true;
+	}
+	t->stop = CYCLES;
+}
+
+static void test_atomic_inc_not_zero(struct test_struct *t)
+{
+	int i;
+
+	t->start = CYCLES;
+	for (i = 0; i < TEST_COUNT; i++) {
+		/* The counter starts at 1 and only grows, so this must always succeed */
+		if (!atomic_long_inc_not_zero(&x_atomic))
+			validation_failed = true;
+	}
+	t->stop = CYCLES;
+}
+
+static void test_atomic_sub_test(struct test_struct *t)
+{
+	int i;
+
+	t->start = CYCLES;
+	for (i = 0; i < TEST_COUNT; i++) {
+		/* Starting from zero the counter can never reach zero again */
+		if (atomic_long_sub_and_test(1, &x_atomic))
+			validation_failed = true;
+	}
+	t->stop = CYCLES;
+}
+
+static void validate_atomic(struct test_struct *t)
+{
+	if (atomic_long_read(&x_atomic) != cpus * TEST_COUNT) {
+		validation_failed = true;
+	}
+}
+
+static void test_xchg(struct test_struct *t)
+{
+	int i;
+
+	t->start = CYCLES;
+	for (i = 0; i < TEST_COUNT; i++) {
+		atomic_long_xchg(&x_atomic, i + 1);
+		/* The return value cannot be checked reliably here */
+	}
+	t->stop = CYCLES;
+}
+
+static void test_cmpxchg(struct test_struct *t)
+{
+	int i;
+
+	t->start = CYCLES;
+	for (i = 0; i < TEST_COUNT; i++) {
+		if (atomic_long_cmpxchg(&x_atomic, i, i + 1) != i)
+			validation_failed = true;
+	}
+	t->stop = CYCLES;
+}
+
+static void validate_cmpxchg(struct test_struct *t)
+{
+	if (atomic_long_read(&x_atomic) != TEST_COUNT) {
+		printk(KERN_ERR "cpu %d: atomic_long op %d times must result in %d and not %ld\n",
+			t->cpu, TEST_COUNT, TEST_COUNT, atomic_long_read(&x_atomic));
+		validation_failed = true;
+	}
+}
+
+static u128 x128;
+
+static void init_atomic128(struct test_struct *t)
+{
+	x128 = 0;
+}
+
+static void test_cmpxchg128(struct test_struct *t)
+{
+	int i;
+
+	t->start = CYCLES;
+	for (i = 0; i < TEST_COUNT; i++) {
+		if (cmpxchg128(&x128, i, i + 1) != i)
+			validation_failed = true;
+	}
+	t->stop = CYCLES;
+}
+
+static void validate_atomic128(struct test_struct *t)
+{
+	if (x128 != (u128)cpus * TEST_COUNT)
+		validation_failed = true;
+}
+
+
+static atomic_t tests_running;
+static atomic_t phase1_complete;
+static DECLARE_COMPLETION(completion1);
+static DECLARE_COMPLETION(completion2);
+static DECLARE_COMPLETION(completion3);
+
+static int test_func(void *private)
+{
+	struct test_struct *t = private;
+	struct cpumask newmask = CPU_MASK_NONE;
+
+	/* Bind this thread to the cpu it is testing */
+	cpumask_set_cpu(t->cpu, &newmask);
+	set_cpus_allowed_ptr(current, &newmask);
+	t->v = kzalloc(t->count * sizeof(void *), GFP_KERNEL);
+
+	t->test_i(t);
+
+	CYCLES_ENABLE;
+	atomic_inc(&tests_running);
+	wait_for_completion(&completion1);
+
+	t->test_p(t);
+	atomic_inc(&phase1_complete);
+	wait_for_completion(&completion2);
+
+	atomic_dec(&tests_running);
+	wait_for_completion(&completion3);
+
+	msleep(100);
+	if (t->test_v && !validation_failed && !t->unreliable)
+		t->test_v(t);
+
+	kfree(t->v);
+	t->v = NULL;
+
+	CYCLES_DISABLE;
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	schedule();
+	return 0;
+}
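+/*
+ * Rendezvous protocol between do_concurrent_test() below and the
+ * test_func() worker threads above:
+ *
+ * Phase 0: each worker binds to its cpu, runs the init callback and bumps
+ *	    tests_running. The driver waits until every worker has checked
+ *	    in and then fires completion1.
+ * Phase 1: the timed test body runs on all cpus concurrently. Each worker
+ *	    bumps phase1_complete; the driver then fires completion2.
+ * Phase 2: the workers drop tests_running. When it reaches zero the
+ *	    driver fires completion3 and the workers run the validation
+ *	    callback before parking themselves for kthread_stop().
+ */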
+static unsigned do_concurrent_test(
+	void (*init)(struct test_struct *),
+	void (*p)(struct test_struct *),
+	void (*validate)(struct test_struct *),
+	struct cpumask *mask, const char *name, const char *subtext,
+	bool unreliable)
+{
+	int cpu;
+	unsigned long time;
+	unsigned long sum = 0;
+	unsigned long cycles;
+
+	validation_failed = false;
+
+	atomic_set(&tests_running, 0);
+	atomic_set(&phase1_complete, 0);
+	init_completion(&completion1);
+	init_completion(&completion2);
+	init_completion(&completion3);
+
+	cpus = 0;
+	for_each_cpu(cpu, mask) {
+		struct test_struct *t = test + cpu;
+
+		t->cpu = cpu;
+		t->count = TEST_COUNT;
+		t->test_i = init;
+		t->test_p = p;
+		t->test_v = validate;
+		t->unreliable = unreliable;
+		t->task = kthread_run(test_func, t, "test%d", cpu);
+		if (IS_ERR(t->task)) {
+			printk(KERN_ERR "Failed to start test thread on cpu %d\n", cpu);
+			return 0;
+		} else
+			cpus++;
+	}
+
+	/* Wait till all threads are running */
+	while (atomic_read(&tests_running) < cpus) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		schedule_timeout(10);
+	}
+	complete_all(&completion1);
+
+	/* Wait till all threads have completed phase 1 */
+	while (atomic_read(&phase1_complete) < cpus) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		schedule_timeout(10);
+	}
+	complete_all(&completion2);
+
+	while (atomic_read(&tests_running)) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		schedule_timeout(10);
+	}
+
+	complete_all(&completion3);
+
+	msleep(1000);
+
+	for_each_cpu(cpu, mask) {
+		kthread_stop(test[cpu].task);
+	}
+	for_each_cpu(cpu, mask) {
+		struct test_struct *t = &test[cpu];
+
+		time = t->stop - t->start;
+		sum += time;
+	}
+
+	printk(KERN_INFO "%s\t:", name);
+	/* Cycles per operation, rounded to the nearest integer */
+	cycles = (sum / cpus + TEST_COUNT / 2) / TEST_COUNT;
+
+	printk(KERN_CONT " Average=%lu", cycles);
+	if (!unreliable && validation_failed)
+		printk(KERN_CONT " ** Counter corrupted **");
+
+	/* Limit the per cpu output for systems with large cpu counts */
+	if (cpus < 8) {
+		for_each_cpu(cpu, mask) {
+			struct test_struct *t = &test[cpu];
+
+			time = t->stop - t->start;
+			printk(KERN_CONT " cpu%d=%lu", cpu,
+				(time + TEST_COUNT / 2) / TEST_COUNT);
+		}
+	}
+
+	printk(KERN_CONT "\n");
+	msleep(200);
+	return cycles;
+}
+
+static unsigned do_single_test(
+	void (*init)(struct test_struct *),
+	void (*p)(struct test_struct *),
+	void (*validate)(struct test_struct *),
+	const char *name, const char *subtext)
+{
+	unsigned long time;
+	unsigned long cycles;
+	struct test_struct *t = test;
+
+	cpus = 1;
+	validation_failed = false;
+
+	CYCLES_ENABLE;
+
+	t->count = TEST_COUNT;
+	init(t);
+	p(t);
+	if (validate && !validation_failed)
+		validate(t);
+
+	CYCLES_DISABLE;
+
+	time = t->stop - t->start;
+
+	printk(KERN_INFO "%s\t:", name);
+	cycles = (time + TEST_COUNT / 2) / TEST_COUNT;
+	printk(KERN_CONT " Cycles=%lu", cycles);
+	if (validation_failed)
+		printk(KERN_CONT " ** Counter corrupted **");
+	printk(KERN_CONT "\n");
+	msleep(200);
+	return cycles;
+}
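+/*
+ * Adding a benchmark is a matter of supplying an init/test/validate
+ * triple plus a row in the tests[] table below. A hypothetical example
+ * (not part of this patch) for this_cpu_add():
+ *
+ *	static void test_this_cpu_add(struct test_struct *t)
+ *	{
+ *		int i;
+ *
+ *		t->start = CYCLES;
+ *		for (i = 0; i < TEST_COUNT; i++)
+ *			this_cpu_add(x_percpu, 1);
+ *		t->stop = CYCLES;
+ *	}
+ *
+ * together with the table row
+ *
+ *	{ init_this_cpu, test_this_cpu_add, validate_this_cpu,
+ *	  "this_cpu_add(x)\t\t", false },
+ *
+ * and an incremented NR_TESTS.
+ */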
+#define NR_TESTS 14
+
+static struct test_info {
+	void (*init)(struct test_struct *);
+	void (*test)(struct test_struct *);
+	void (*validate)(struct test_struct *);
+	const char *text;
+	bool multithreaded_unreliable;
+} tests[NR_TESTS] = {
+	{ init_normal_increment, test_normal_increment, validate_normal_increment, "x++\t\t\t", true },
+	{ init_this_cpu, test_this_cpu_inc, validate_this_cpu, "this_cpu_inc(x)\t\t", false },
+	{ init_this_cpu, test_this_cpu_inc_return, validate_this_cpu, "this_cpu_inc_return(x)\t", false },
+	{ init_this_cpu, test_this_cpu_xchg, validate_this_cpu, "this_cpu_xchg(x)\t\t", false },
+	{ init_this_cpu, test_this_cpu_cmpxchg, validate_this_cpu, "this_cpu_cmpxchg(x)\t", false },
+	{ init_this_cpu128, test_this_cpu_cmpxchg128, validate_this_cpu128, "this_cpu_cmpxchg128(x)\t", false },
+	{ init_atomic, test_atomic_inc, validate_atomic, "atomic_long_inc(x)\t", false },
+	{ init_atomic, test_atomic_inc_return, validate_atomic, "atomic_long_inc_return(x)", false },
+	{ init_atomic, test_atomic_fetchadd, validate_atomic, "atomic_fetchadd(x)\t", false },
+	{ init_atomic_1, test_atomic_inc_not_zero, NULL, "atomic_inc_not_zero(x)\t", false },
+	{ init_atomic, test_atomic_sub_test, NULL, "atomic_sub_test(x)\t", false },
+	{ init_atomic, test_xchg, validate_cmpxchg, "atomic_xchg(x)\t\t", true },
+	{ init_atomic, test_cmpxchg, validate_cmpxchg, "atomic_cmpxchg(x)\t", true },
+	{ init_atomic128, test_cmpxchg128, validate_atomic128, "atomic_cmpxchg128(x)\t", true }
+};
+
+/* Matrix for the scaling of the operations */
+#define NR_COLS 20
+
+static const char *coltext[NR_COLS];
+
+static unsigned cycle_matrix[NR_TESTS][NR_COLS];
+
+
+static void run_atomic_tests(struct cpumask *mask, const char *subtext, int index)
+{
+	int i;
+
+	for (i = 0; i < NR_TESTS; i++)
+		cycle_matrix[i][index] = do_concurrent_test(tests[i].init,
+			tests[i].test, tests[i].validate, mask, tests[i].text,
+			subtext, tests[i].multithreaded_unreliable);
+}
+
+static void run_atomic_tests_single(const char *subtext)
+{
+	int i;
+
+	for (i = 0; i < NR_TESTS; i++)
+		cycle_matrix[i][0] = do_single_test(tests[i].init,
+			tests[i].test, tests[i].validate, tests[i].text,
+			subtext);
+}
+
+static void __pick_cpus(struct cpumask *mask, int cpus, int node)
+{
+	int cpu = 0;
+	int start = 4;	/* Skip cpus 0-3, presumably to avoid housekeeping cpus */
+	int found_cpus = 0;
+
+	while (found_cpus < cpus) {
+		int picked = cpu + start;
+
+		if (picked >= nr_cpu_ids) {
+			printk(KERN_ERR "Not enough cpus on node %d\n", node);
+			return;
+		}
+		if (cpu_to_node(picked) == node) {
+			cpumask_set_cpu(picked, mask);
+			found_cpus++;
+		}
+		cpu++;
+	}
+}
+
+static void pick_cpus(struct cpumask *mask, int cpus, int node)
+{
+	int i;
+	int cpus_per_node;
+
+	*mask = CPU_MASK_NONE;
+
+	if (node != NUMA_NO_NODE) {
+		__pick_cpus(mask, cpus, node);
+		return;
+	}
+
+	/* NUMA_NO_NODE: spread the cpus evenly over all nodes */
+	cpus_per_node = cpus / nr_node_ids;
+
+	for (i = 0; i < nr_node_ids; i++)
+		__pick_cpus(mask, cpus_per_node, i);
+}
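+/*
+ * Example with illustrative numbers: on a two node machine with 80 cpus,
+ * cpus 0-39 on node 0 and cpus 40-79 on node 1,
+ * pick_cpus(&mask, 8, NUMA_NO_NODE) selects cpus 4-7 from node 0 and
+ * cpus 40-43 from node 1, while pick_cpus(&mask, 8, 0) selects cpus 4-11
+ * from node 0 only.
+ */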
+static int atomic_test_init(void)
+{
+	int i;
+	struct cpumask mask;
+	static char buf[PAGE_SIZE];	/* Too large for the kernel stack; init runs once */
+	int col = 0;
+
+	printk(KERN_INFO "\nUncontended single thread on single core\n");
+	printk(KERN_INFO "===========================================================\n");
+	run_atomic_tests_single("Single thread");
+	coltext[col++] = "Single";
+
+	for (i = 1; i < 7; i++) {
+		int x = 1 << i;
+
+		pick_cpus(&mask, x, 0);
+		cpumap_print_to_pagebuf(true, buf, &mask);
+		buf[strlen(buf) - 1] = 0;	/* Strip the trailing newline */
+
+		printk(KERN_INFO "\nConcurrent operations on %d Cores (%s)\n", x, buf);
+		printk(KERN_INFO "==========================================\n");
+
+		run_atomic_tests(&mask, "N cpus", i);
+		coltext[col++] = kasprintf(GFP_KERNEL, "%d CPU", x);
+	}
+
+	if (nr_node_ids > 1) {
+
+		for (i = 1; i < 7; i++) {
+			int x = 1 << i;
+
+			pick_cpus(&mask, x, NUMA_NO_NODE);
+			cpumap_print_to_pagebuf(true, buf, &mask);
+			buf[strlen(buf) - 1] = 0;	/* Strip the trailing newline */
+
+			printk(KERN_INFO "\nConcurrent operations on %d Cores (%s) on all NUMA NODES\n", x, buf);
+			printk(KERN_INFO "==============================================================\n");
+
+			run_atomic_tests(&mask, "N cpus", i + 6);
+			coltext[col++] = kasprintf(GFP_KERNEL, "%d 2P", x);
+		}
+
+		printk(KERN_INFO "\nConcurrent operations on a NUMA node\n");
+		printk(KERN_INFO "==========================================\n");
+
+		cpumask_copy(&mask, cpumask_of_node(0));
+
+		run_atomic_tests(&mask, "Node 0 cpus", col);
+		coltext[col++] = "NODE";
+	}
+
+	printk(KERN_INFO "\nConcurrent operations on all cpus\n");
+	printk(KERN_INFO "==========================================\n");
+
+	mask = CPU_MASK_ALL;
+	run_atomic_tests(&mask, "All cpus", col);
+	coltext[col++] = "ALL";
+
+	if (col > NR_COLS)
+		printk(KERN_ERR "atomic_test: Too many columns\n");
+
+	/* Compact version: test matrix */
+	printk(KERN_INFO "\nTestMatrix\n");
+	printk(KERN_INFO "---------------------------\n");
+	printk(KERN_INFO "Test\t\t\t\t");
+
+	for (i = 0; i < col; i++)
+		printk(KERN_CONT "\t%s", coltext[i]);
+
+	printk(KERN_CONT "\n");
+
+	for (i = 0; i < NR_TESTS; i++) {
+		int j;
+
+		printk(KERN_INFO "%s\t:", tests[i].text);
+
+		for (j = 0; j < col; j++)
+			printk(KERN_CONT "\t%u", cycle_matrix[i][j]);
+
+		printk(KERN_CONT "\n");
+	}
+
+	/* Failing the init unloads the module right away */
+	return -EAGAIN;
+}
+
+static void atomic_test_exit(void)
+{
+	printk(KERN_INFO "test exit\n");
+}
+
+module_init(atomic_test_init);
+module_exit(atomic_test_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Christoph Lameter");
+MODULE_DESCRIPTION("Atomic operations benchmark");
Index: linux/tests/Kconfig
===================================================================
--- linux.orig/tests/Kconfig
+++ linux/tests/Kconfig
@@ -15,5 +15,12 @@ config BENCHMARK_SLAB
 	help
 	  A benchmark that measures slab allocator performance.
 
+config BENCHMARK_ATOMIC
+	tristate "Atomic Operations Benchmark"
+	depends on m
+	default m
+	help
+	  A benchmark that measures the cycle counts of atomic operations.
+
 endif # BENCHMARKS
Index: linux/tests/Makefile
===================================================================
--- linux.orig/tests/Makefile
+++ linux/tests/Makefile
@@ -2,4 +2,6 @@
 obj-$(CONFIG_BENCHMARK_SLAB) += slab_test.o
 
-#
+obj-$(CONFIG_BENCHMARK_ATOMIC) += atomic_test.o
+