/*
 * Copyright (C) 2013-2017 Canonical, Ltd.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 *
 * This code is a complete clean re-write of the stress tool by
 * Colin Ian King <colin.king@canonical.com> and attempts to be
 * backwardly compatible with the stress tool by Amos Waterland
 * <apw@rossby.metr.ou.edu> but has more stress tests and more
 * functionality.
 *
 */
#include "stress-ng.h"

#if defined(__linux__) &&		\
    defined(__NR_get_mempolicy) &&	\
    defined(__NR_mbind) &&		\
    defined(__NR_migrate_pages) &&	\
    defined(__NR_move_pages) &&		\
    defined(__NR_set_mempolicy)

/* Number of bits held in one unsigned long element of a node bit mask */
#define BITS_PER_BYTE		(8)
#define NUMA_LONG_BITS		(sizeof(unsigned long) * BITS_PER_BYTE)

/*
 *  Memory policy modes, defined locally rather than taken from a
 *  system header; passed as the mode to set_mempolicy and mbind below
 */
#define MPOL_DEFAULT		(0)
#define MPOL_PREFERRED		(1)
#define MPOL_BIND		(2)
#define MPOL_INTERLEAVE		(3)
#define MPOL_LOCAL		(4)

/* MPOL_F_* flags; MPOL_F_ADDR is the one passed to get_mempolicy below */
#define MPOL_F_NODE		(1 << 0)
#define MPOL_F_ADDR		(1 << 1)
#define MPOL_F_MEMS_ALLOWED	(1 << 2)

/* MPOL_MF_* flags; used with mbind (STRICT) and move_pages (MOVE) below */
#define MPOL_MF_STRICT		(1 << 0)
#define MPOL_MF_MOVE		(1 << 1)
#define MPOL_MF_MOVE_ALL	(1 << 2)

/* Size in bytes of the anonymous mapping that stress_numa() migrates */
#define MMAP_SZ			(4 * MB)

/*
 *  node_t
 *	one NUMA memory node; stress_numa_get_mem_nodes() links these
 *	into a circular list so callers can keep following ->next
 */
typedef struct node {
	uint32_t	node_id;	/* NUMA node number (bit position in Mems_allowed) */
	struct node	*next;		/* next node in the circular list */
} node_t;

/*
 *  stress_numa_get_max_nodes()
 *	probe for maximum number of nodes
 *
 *	get_mempolicy is retried with a node mask size that doubles on
 *	each EINVAL, starting at 8 bits and giving up at 0x100000 bits.
 *
 *	Returns the first mask size (in bits) that the call accepted,
 *	or 0 if the probe failed (allocation failure, an error other
 *	than EINVAL, or the size limit was reached).
 */
static unsigned long stress_numa_get_max_nodes(void)
{
	const unsigned long long_bits = sizeof(unsigned long) * BITS_PER_BYTE;
	unsigned long sz = BITS_PER_BYTE, *mask = NULL;

	do {
		int mode = 0;
		/*
		 *  The node mask is transferred in whole unsigned longs,
		 *  so the buffer must be rounded up to that granularity;
		 *  allocating just sz / 8 bytes is too small when sz is
		 *  less than the number of bits in an unsigned long.
		 */
		const size_t n_longs = (sz + long_bits - 1) / long_bits;
		unsigned long *newmask = realloc(mask, n_longs * sizeof(*newmask));

		if (!newmask)
			break;
		mask = newmask;
		if (shim_get_mempolicy(&mode, mask, sz, 0, 0) == 0)
			goto done;
		sz <<= 1;
	} while (sz < 0x100000 && errno == EINVAL);

	/* Failed */
	sz = 0;
done:
	free(mask);

	return sz;
}

/*
 *  stress_numa_free_nodes()
 *	free circular list of node info
 *
 *	nodes may be NULL (nothing to free). A list that ends in NULL
 *	rather than looping back to nodes is freed in full as well.
 */
static void stress_numa_free_nodes(node_t *nodes)
{
	node_t *n;

	if (!nodes)
		return;

	/*
	 *  Free every node after the head first and the head last, so
	 *  that the "wrapped back to the start" comparison is never made
	 *  against a pointer that has already been freed.
	 */
	n = nodes->next;
	while (n && n != nodes) {
		node_t *next = n->next;

		free(n);
		n = next;
	}
	free(nodes);
}

/*
 *  hex_to_int()
 *	convert ASCII hex digit to integer
 *
 *	Returns 0..15 for '0'..'9', 'a'..'f' and 'A'..'F',
 *	or -1 if ch is not a hexadecimal digit.
 */
static inline int hex_to_int(const char ch)
{
	if (ch >= '0' && ch <= '9')
		return ch - '0';
	if (ch >= 'a' && ch <= 'f')
		return ch - 'a' + 10;
	/* upper case digits are offset from 'A', just as lower case are from 'a' */
	if (ch >= 'A' && ch <= 'F')
		return ch - 'A' + 10;
	return -1;
}

/*
 *  stress_numa_get_mem_nodes()
 *	read the Mems_allowed hex bit mask from /proc/self/status and
 *	build a circular linked list holding one node_t per set bit.
 *
 *	*node_ptr is set to the list (NULL if no nodes were added).
 *	Returns the number of nodes added, or -1 on failure; on a
 *	failure part way through parsing *node_ptr may still refer to
 *	the nodes added so far, which the caller has to free.
 */
static int stress_numa_get_mem_nodes(node_t **node_ptr)
{
	char line[8192];
	char *mask_start = NULL;
	char *cursor;
	node_t *last = NULL;
	unsigned long count = 0, id = 0;
	FILE *status;

	*node_ptr = NULL;

	status = fopen("/proc/self/status", "r");
	if (status == NULL)
		return -1;

	/* Look for the Mems_allowed line, keeping it in line[] when found */
	while (fgets(line, sizeof(line), status) != NULL) {
		if (strncmp(line, "Mems_allowed:", 13) == 0) {
			mask_start = line + 13;
			break;
		}
	}
	(void)fclose(status);

	if (mask_start == NULL)
		return -1;

	/*
	 *  The lowest numbered nodes are in the right-most hex digit,
	 *  so walk from the character before the trailing newline
	 *  back towards the start of the mask.
	 */
	for (cursor = line + strlen(line) - 2;
	     cursor > mask_start && *cursor != ' ';
	     cursor--) {
		int nibble, bit;

		/* Commas only separate groups of digits */
		if (*cursor == ',')
			continue;

		nibble = hex_to_int(*cursor);
		if (nibble < 0)
			return -1;

		/* One hex digit covers four consecutive node ids */
		for (bit = 0; bit < 4; bit++, id++) {
			node_t *entry;

			if (!(nibble & (1 << bit)))
				continue;

			entry = calloc(1, sizeof(*entry));
			if (!entry)
				return -1;

			/* Insert at the head and keep the tail pointing at it */
			entry->node_id = id;
			entry->next = *node_ptr;
			*node_ptr = entry;
			if (!last)
				last = entry;
			last->next = entry;
			count++;
		}
	}

	return count;
}

/*
 *  stress_numa()
 *	stress the Linux NUMA interfaces
 *
 *	counter:  bogo-op counter, incremented once per completed loop
 *	instance: stressor instance number, only instance 0 reports
 *		  the node counts
 *	max_ops:  stop once *counter reaches this, 0 means no limit
 *	name:	  stressor name used in messages
 *
 *	Each loop iteration calls get_mempolicy, set_mempolicy, getcpu,
 *	mbind (twice), migrate_pages and then 16 rounds of move_pages
 *	on a MMAP_SZ anonymous mapping, stepping through the list of
 *	NUMA memory nodes as it goes.
 *
 *	Returns EXIT_SUCCESS on a clean run, EXIT_NO_RESOURCE if the
 *	NUMA nodes cannot be determined, the exit_status() of errno
 *	if the mmap fails and EXIT_FAILURE if a NUMA call fails.
 */
int stress_numa(
	uint64_t *const counter,
	const uint32_t instance,
	const uint64_t max_ops,
	const char *name)
{
	long numa_nodes;
	unsigned long max_nodes;
	const unsigned long lbits = NUMA_LONG_BITS;
	const unsigned long page_sz = stress_get_pagesize();
	const unsigned long num_pages = MMAP_SZ / page_sz;
	uint8_t *buf;
	node_t *n;
	const pid_t mypid = getpid();
	int rc = EXIT_FAILURE;

	numa_nodes = stress_numa_get_mem_nodes(&n);
	if (numa_nodes < 1) {
		pr_inf(stdout, "%s: no NUMA nodes found, "
			"aborting test\n", name);
		rc = EXIT_NO_RESOURCE;
		goto numa_free;
	}
	max_nodes = stress_numa_get_max_nodes();
	if (max_nodes == 0) {
		pr_inf(stderr, "%s: cannot determine maximum number "
			"of NUMA nodes, aborting test\n", name);
		rc = EXIT_NO_RESOURCE;
		goto numa_free;
	}
	/*
	 *  The node masks used below are arrays of lbits unsigned longs,
	 *  i.e. lbits * lbits bits; max_nodes is handed to the NUMA calls
	 *  as the mask size, so it must not exceed that capacity.
	 */
	if (max_nodes > lbits * lbits) {
		pr_inf(stderr, "%s: maximum of %lu NUMA nodes exceeds the "
			"%lu supported, aborting test\n",
			name, max_nodes, lbits * lbits);
		rc = EXIT_NO_RESOURCE;
		goto numa_free;
	}
	if (!instance) {
		pr_inf(stdout, "%s: system has %ld of a maximum %lu memory NUMA nodes\n",
			name, numa_nodes, max_nodes);
	}

	/*
	 *  We need a buffer to migrate around NUMA nodes. The fd is
	 *  -1 as this is an anonymous mapping with no backing file.
	 */
	buf = mmap(NULL, MMAP_SZ, PROT_READ | PROT_WRITE,
		MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
	if (buf == MAP_FAILED) {
		rc = exit_status(errno);
		pr_fail(stderr, "%s: mmap'd region of %zu bytes failed\n",
			name, (size_t)MMAP_SZ);
		goto numa_free;
	}

	do {
		int j, mode, ret, status[num_pages], dest_nodes[num_pages];
		unsigned long i, node_mask[lbits], old_node_mask[lbits];
		void *pages[num_pages];
		uint8_t *ptr;
		node_t *n_tmp;
		unsigned cpu, curr_node;

		/*
		 *  Fetch memory policy
		 */
		ret = shim_get_mempolicy(&mode, node_mask, max_nodes,
			(unsigned long)buf, MPOL_F_ADDR);
		if (ret < 0) {
			pr_fail_err(name, "get_mempolicy");
			goto err;
		}
		if (!opt_do_run)
			break;

		ret = shim_set_mempolicy(MPOL_PREFERRED, NULL, max_nodes);
		if (ret < 0) {
			pr_fail_err(name, "set_mempolicy");
			goto err;
		}
		memset(buf, 0xff, MMAP_SZ);
		if (!opt_do_run)
			break;

		/*
		 *  Fetch CPU and node, we just waste some cycles
		 *  doing this for stress reasons only
		 */
		(void)shim_getcpu(&cpu, &curr_node, NULL);

		/*
		 *  mbind the buffer to the current node, first with
		 *  MPOL_MF_STRICT which may fail with EIO; that
		 *  failure is tolerated
		 */
		memset(node_mask, 0, sizeof(node_mask));
		STRESS_SETBIT(node_mask, n->node_id);
		ret = shim_mbind(buf, MMAP_SZ, MPOL_BIND, node_mask,
			max_nodes, MPOL_MF_STRICT);
		if (ret < 0) {
			if (errno != EIO) {
				pr_fail_err(name, "mbind");
				goto err;
			}
		} else {
			memset(buf, 0xaa, MMAP_SZ);
		}
		if (!opt_do_run)
			break;

		/*
		 *  mbind the buffer again, still MPOL_BIND but now with
		 *  no flags set (MPOL_DEFAULT is 0 when used as flags)
		 */
		memset(node_mask, 0, sizeof(node_mask));
		STRESS_SETBIT(node_mask, n->node_id);
		ret = shim_mbind(buf, MMAP_SZ, MPOL_BIND, node_mask,
			max_nodes, MPOL_DEFAULT);
		if (ret < 0) {
			if (errno != EIO) {
				pr_fail_err(name, "mbind");
				goto err;
			}
		} else {
			memset(buf, 0x5c, MMAP_SZ);
		}
		if (!opt_do_run)
			break;

		/* Move to next node */
		n = n->next;

		/*
		 *  Migrate all of this process's pages from any node
		 *  (old mask is all ones) to the new current node
		 */
		memset(old_node_mask, 0xff, sizeof(old_node_mask));
		memset(node_mask, 0, sizeof(node_mask));
		STRESS_SETBIT(node_mask, n->node_id);
		ret = shim_migrate_pages(mypid, max_nodes,
			old_node_mask, node_mask);
		if (ret < 0) {
			pr_fail_err(name, "migrate_pages");
			goto err;
		}
		if (!opt_do_run)
			break;

		/*
		 *  n_tmp keeps advancing around the circular node list
		 *  across all 16 rounds, it is not reset per round
		 */
		n_tmp = n;
		for (j = 0; j < 16; j++) {
			/*
			 *  Now move pages to lots of different numa nodes,
			 *  one destination node per page of the buffer
			 */
			for (ptr = buf, i = 0; i < num_pages; i++, ptr += page_sz, n_tmp = n_tmp->next) {
				pages[i] = ptr;
				dest_nodes[i] = n_tmp->node_id;
			}
			memset(status, 0, sizeof(status));
			ret = shim_move_pages(mypid, num_pages, pages,
				dest_nodes, status, MPOL_MF_MOVE);
			if (ret < 0) {
				pr_fail_err(name, "move_pages");
				goto err;
			}
			/* Touch the moved pages */
			memset(buf, j, MMAP_SZ);
			if (!opt_do_run)
				break;
		}
		(*counter)++;
	} while (opt_do_run && (!max_ops || *counter < max_ops));

	rc = EXIT_SUCCESS;
err:
	(void)munmap(buf, MMAP_SZ);
numa_free:
	stress_numa_free_nodes(n);

	return rc;
}
#else
/*
 *  stress_numa()
 *	fallback built when the target is not Linux or lacks any of the
 *	NUMA system call numbers tested by the #if guarding the real
 *	stressor; passes all arguments on to stress_not_implemented()
 *	and returns its result
 */
int stress_numa(
	uint64_t *const counter,
	const uint32_t instance,
	const uint64_t max_ops,
	const char *name)
{
	return stress_not_implemented(counter, instance, max_ops, name);
}
#endif
