/*	$NetBSD: uvm_pgflcache.c,v 1.6 2020/10/18 18:31:31 chs Exp $	*/

/*-
 * Copyright (c) 2019 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * uvm_pgflcache.c: page freelist cache.
 *
 * This implements a tiny per-CPU cache of pages that sits between the main
 * page allocator and the freelists.  By allocating and freeing pages in
 * batch, it reduces freelist contention by an order of magnitude.
 *
 * The cache can be paused & resumed at runtime so that UVM_HOTPLUG,
 * uvm_pglistalloc() and uvm_page_redim() can have a consistent view of the
 * world.  On system with one CPU per physical package (e.g. a uniprocessor)
 * the cache is not enabled.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_pgflcache.c,v 1.6 2020/10/18 18:31:31 chs Exp $");

#include "opt_uvm.h"
#include "opt_multiprocessor.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sched.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/xcall.h>

#include <uvm/uvm.h>
#include <uvm/uvm_pglist.h>
#include <uvm/uvm_pgflcache.h>

/* There is no point doing any of this on a uniprocessor. */
#ifdef MULTIPROCESSOR

/*
 * MAXPGS - maximum pages per color, per bucket.
 * FILLPGS - number of pages to allocate at once, per color, per bucket.
 *
 * Why the chosen values:
 *
 * (1) In 2019, an average Intel system has 4kB pages and 8x L2 cache
 * colors.  We make the assumption that most of the time allocation activity
 * will be centered around one UVM freelist, so most of the time there will
 * be no more than 224kB worth of cached pages per-CPU.  That's tiny, but
 * enough to hugely reduce contention on the freelist locks, and give us a
 * small pool of pages which if we're very lucky may have some L1/L2 cache
 * locality, and do so without subtracting too much from the L2/L3 cache
 * benefits of having per-package free lists in the page allocator.
 *
 * (2) With the chosen values on _LP64, the data structure for each color
 * takes up a single cache line (64 bytes) giving this very low overhead
 * even in the "miss" case.
 *
 * (3) We don't want to cause too much pressure by hiding away memory that
 * could otherwise be put to good use.
 */
#define	MAXPGS		7
#define	FILLPGS		6

/* Variable size, according to # colors. */
struct pgflcache {
	struct pccolor {
		intptr_t	count;
		struct vm_page	*pages[MAXPGS];
	} color[1];
};

static kmutex_t		uvm_pgflcache_lock;
static int		uvm_pgflcache_sem;

/*
 * uvm_pgflcache_fill: fill specified freelist/color from global list
 *
 * => must be called at IPL_VM
 * => must be called with given bucket lock held
 * => must only fill from the correct bucket for this CPU
 */

void
uvm_pgflcache_fill(struct uvm_cpu *ucpu, int fl, int b, int c)
{
	struct pgflbucket *pgb;
	struct pgflcache *pc;
	struct pccolor *pcc;
	struct pgflist *head;
	struct vm_page *pg;
	int count;

	KASSERT(mutex_owned(&uvm_freelist_locks[b].lock));
	KASSERT(ucpu->pgflbucket == b);

	/* If caching is off, then bail out. */
	if (__predict_false((pc = ucpu->pgflcache[fl]) == NULL)) {
		return;
	}

	/* Fill only to the limit. */
	pcc = &pc->color[c];
	pgb = uvm.page_free[fl].pgfl_buckets[b];
	head = &pgb->pgb_colors[c];
	if (pcc->count >= FILLPGS) {
		return;
	}

	/* Pull pages from the bucket until it's empty, or we are full. */
	count = pcc->count;
	pg = LIST_FIRST(head);
	while (__predict_true(pg != NULL && count < FILLPGS)) {
		KASSERT(pg->flags & PG_FREE);
		KASSERT(uvm_page_get_bucket(pg) == b);
		pcc->pages[count++] = pg;
		pg = LIST_NEXT(pg, pageq.list);
	}

	/* Violate LIST abstraction to remove all pages at once. */
	head->lh_first = pg;
	if (__predict_true(pg != NULL)) {
		pg->pageq.list.le_prev = &head->lh_first;
	}
	pgb->pgb_nfree -= (count - pcc->count);
	CPU_COUNT(CPU_COUNT_FREEPAGES, -(count - pcc->count));
	pcc->count = count;
}

/*
 * uvm_pgflcache_spill: spill specified freelist/color to global list
 *
 * => must be called at IPL_VM
 * => mark __noinline so we don't pull it into uvm_pgflcache_free()
 */

static void __noinline
uvm_pgflcache_spill(struct uvm_cpu *ucpu, int fl, int c)
{
	struct pgflbucket *pgb;
	struct pgfreelist *pgfl;
	struct pgflcache *pc;
	struct pccolor *pcc;
	struct pgflist *head;
	kmutex_t *lock;
	int b, adj;

	pc = ucpu->pgflcache[fl];
	pcc = &pc->color[c];
	pgfl = &uvm.page_free[fl];
	b = ucpu->pgflbucket;
	pgb = pgfl->pgfl_buckets[b];
	head = &pgb->pgb_colors[c];
	lock = &uvm_freelist_locks[b].lock;

	mutex_spin_enter(lock);
	for (adj = pcc->count; pcc->count != 0;) {
		pcc->count--;
		KASSERT(pcc->pages[pcc->count] != NULL);
		KASSERT(pcc->pages[pcc->count]->flags & PG_FREE);
		LIST_INSERT_HEAD(head, pcc->pages[pcc->count], pageq.list);
	}
	pgb->pgb_nfree += adj;
	CPU_COUNT(CPU_COUNT_FREEPAGES, adj);
	mutex_spin_exit(lock);
}

/*
 * uvm_pgflcache_alloc: try to allocate a cached page.
 *
 * => must be called at IPL_VM
 * => allocate only from the given freelist and given page color
 */

struct vm_page *
uvm_pgflcache_alloc(struct uvm_cpu *ucpu, int fl, int c)
{
	struct pgflcache *pc;
	struct pccolor *pcc;
	struct vm_page *pg;

	/* If caching is off, then bail out. */
	if (__predict_false((pc = ucpu->pgflcache[fl]) == NULL)) {
		return NULL;
	}

	/* Very simple: if we have a page then return it. */
	pcc = &pc->color[c];
	if (__predict_false(pcc->count == 0)) {
		return NULL;
	}
	pg = pcc->pages[--(pcc->count)];
	KASSERT(pg != NULL);
	KASSERT(pg->flags == PG_FREE);
	KASSERT(uvm_page_get_freelist(pg) == fl);
	KASSERT(uvm_page_get_bucket(pg) == ucpu->pgflbucket);
	pg->flags = PG_BUSY | PG_CLEAN | PG_FAKE;
	return pg;
}

/*
 * uvm_pgflcache_free: cache a page, if possible.
 *
 * => must be called at IPL_VM
 * => must only send pages for the correct bucket for this CPU
 */

bool
uvm_pgflcache_free(struct uvm_cpu *ucpu, struct vm_page *pg)
{
	struct pgflcache *pc;
	struct pccolor *pcc;
	int fl, c;

	KASSERT(uvm_page_get_bucket(pg) == ucpu->pgflbucket);

	/* If caching is off, then bail out. */
 	fl = uvm_page_get_freelist(pg);
	if (__predict_false((pc = ucpu->pgflcache[fl]) == NULL)) {
		return false;
	}

	/* If the array is full spill it first, then add page to array. */
	c = VM_PGCOLOR(pg);
	pcc = &pc->color[c];
	KASSERT((pg->flags & PG_FREE) == 0);
	if (__predict_false(pcc->count == MAXPGS)) {
		uvm_pgflcache_spill(ucpu, fl, c);
	}
	pg->flags = PG_FREE;
	pcc->pages[pcc->count] = pg;
	pcc->count++;
	return true;
}

/*
 * uvm_pgflcache_init: allocate and initialize per-CPU data structures for
 * the free page cache.  Don't set anything in motion - that's taken care
 * of by uvm_pgflcache_resume().
 */

static void
uvm_pgflcache_init_cpu(struct cpu_info *ci)
{
	struct uvm_cpu *ucpu;
	size_t sz;

	ucpu = ci->ci_data.cpu_uvm;
	KASSERT(ucpu->pgflcachemem == NULL);
	KASSERT(ucpu->pgflcache[0] == NULL);

	sz = offsetof(struct pgflcache, color[uvmexp.ncolors]);
	ucpu->pgflcachememsz =
	    (roundup2(sz * VM_NFREELIST, coherency_unit) + coherency_unit - 1);
	ucpu->pgflcachemem = kmem_zalloc(ucpu->pgflcachememsz, KM_SLEEP);
}

/*
 * uvm_pgflcache_fini_cpu: dump all cached pages back to global free list
 * and shut down caching on the CPU.  Called on each CPU in the system via
 * xcall.
 */

static void
uvm_pgflcache_fini_cpu(void *arg1 __unused, void *arg2 __unused)
{
	struct uvm_cpu *ucpu;
	int fl, color, s;

	ucpu = curcpu()->ci_data.cpu_uvm;
	for (fl = 0; fl < VM_NFREELIST; fl++) {
		s = splvm();
		for (color = 0; color < uvmexp.ncolors; color++) {
			uvm_pgflcache_spill(ucpu, fl, color);
		}
		ucpu->pgflcache[fl] = NULL;
		splx(s);
	}
}

/*
 * uvm_pgflcache_pause: pause operation of the caches
 */

void
uvm_pgflcache_pause(void)
{
	uint64_t where;

	/* First one in starts draining.  Everyone else waits. */
	mutex_enter(&uvm_pgflcache_lock);
	if (uvm_pgflcache_sem++ == 0) {
		where = xc_broadcast(XC_HIGHPRI, uvm_pgflcache_fini_cpu,
		    (void *)1, NULL);
		xc_wait(where);
	}
	mutex_exit(&uvm_pgflcache_lock);
}

/*
 * uvm_pgflcache_resume: resume operation of the caches
 */

void
uvm_pgflcache_resume(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	struct uvm_cpu *ucpu;
	uintptr_t addr;
	size_t sz;
	int fl;

	/* Last guy out takes care of business. */
	mutex_enter(&uvm_pgflcache_lock);
	KASSERT(uvm_pgflcache_sem > 0);
	if (uvm_pgflcache_sem-- > 1) {
		mutex_exit(&uvm_pgflcache_lock);
		return;
	}

	/*
	 * Make sure dependant data structure updates are remotely visible.
	 * Essentially this functions as a global memory barrier.
	 */
	xc_barrier(XC_HIGHPRI);

	/*
	 * Then set all of the pointers in place on each CPU.  As soon as
	 * each pointer is set, caching is operational in that dimension.
	 */
	sz = offsetof(struct pgflcache, color[uvmexp.ncolors]);
	for (CPU_INFO_FOREACH(cii, ci)) {
		ucpu = ci->ci_data.cpu_uvm;
		addr = roundup2((uintptr_t)ucpu->pgflcachemem, coherency_unit);
		for (fl = 0; fl < VM_NFREELIST; fl++) {
			ucpu->pgflcache[fl] = (struct pgflcache *)addr;
			addr += sz;
		}
	}
	mutex_exit(&uvm_pgflcache_lock);
}

/*
 * uvm_pgflcache_start: start operation of the cache.
 *
 * => called once only, when init(8) is about to be started
 */

void
uvm_pgflcache_start(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;

	KASSERT(uvm_pgflcache_sem > 0);

	/*
	 * There's not much point doing this if every CPU has its own
	 * bucket (and that includes the uniprocessor case).
	 */
	if (ncpu == uvm.bucketcount) {
		return;
	}

	/* Create data structures for each CPU. */
	for (CPU_INFO_FOREACH(cii, ci)) {
		uvm_pgflcache_init_cpu(ci);
	}

	/* Kick it into action. */
	uvm_pgflcache_resume();
}

/*
 * uvm_pgflcache_init: set up data structures for the free page cache.
 */

void
uvm_pgflcache_init(void)
{

	uvm_pgflcache_sem = 1;
	mutex_init(&uvm_pgflcache_lock, MUTEX_DEFAULT, IPL_NONE);
}

#else	/* MULTIPROCESSOR */

struct vm_page *
uvm_pgflcache_alloc(struct uvm_cpu *ucpu, int fl, int c)
{

	return NULL;
}

bool
uvm_pgflcache_free(struct uvm_cpu *ucpu, struct vm_page *pg)
{

	return false;
}

void
uvm_pgflcache_fill(struct uvm_cpu *ucpu, int fl, int b, int c)
{

}

void
uvm_pgflcache_pause(void)
{

}

void
uvm_pgflcache_resume(void)
{

}

void
uvm_pgflcache_start(void)
{

}

void
uvm_pgflcache_init(void)
{

}

#endif	/* MULTIPROCESSOR */