linux/mm/mmu_gather.c

ef24e0aaSTim Bird// SPDX-License-Identifier: GPL-2.0
196d9d8bSPeter Zijlstra#include <linux/gfp.h>
196d9d8bSPeter Zijlstra#include <linux/highmem.h>
196d9d8bSPeter Zijlstra#include <linux/kernel.h>
196d9d8bSPeter Zijlstra#include <linux/mmdebug.h>
196d9d8bSPeter Zijlstra#include <linux/mm_types.h>
36090defSArnd Bergmann#include <linux/mm_inline.h>
196d9d8bSPeter Zijlstra#include <linux/pagemap.h>
196d9d8bSPeter Zijlstra#include <linux/rcupdate.h>
196d9d8bSPeter Zijlstra#include <linux/smp.h>
196d9d8bSPeter Zijlstra#include <linux/swap.h>
5df397deSLinus Torvalds#include <linux/rmap.h>
ad8b2e09SHarry Yoo#include <linux/pgalloc.h>
8ce720d5SDavid Hildenbrand (Red Hat)#include <linux/hugetlb.h>
196d9d8bSPeter Zijlstra
196d9d8bSPeter Zijlstra#include <asm/tlb.h>
196d9d8bSPeter Zijlstra
580a586cSPeter Zijlstra#ifndef CONFIG_MMU_GATHER_NO_GATHER
952a31c9SMartin Schwidefsky
196d9d8bSPeter Zijlstrastatic bool tlb_next_batch(struct mmu_gather *tlb)
196d9d8bSPeter Zijlstra{
196d9d8bSPeter Zijlstra	struct mmu_gather_batch *batch;
196d9d8bSPeter Zijlstra
c4745482SLinus Torvalds	/* Limit batching if we have delayed rmaps pending */
c4745482SLinus Torvalds	if (tlb->delayed_rmap && tlb->active != &tlb->local)
5df397deSLinus Torvalds		return false;
5df397deSLinus Torvalds
196d9d8bSPeter Zijlstra	batch = tlb->active;
196d9d8bSPeter Zijlstra	if (batch->next) {
196d9d8bSPeter Zijlstra		tlb->active = batch->next;
196d9d8bSPeter Zijlstra		return true;
196d9d8bSPeter Zijlstra	}
196d9d8bSPeter Zijlstra
196d9d8bSPeter Zijlstra	if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
196d9d8bSPeter Zijlstra		return false;
196d9d8bSPeter Zijlstra
adf085ffSQianfeng Rong	batch = (void *)__get_free_page(GFP_NOWAIT);
196d9d8bSPeter Zijlstra	if (!batch)
196d9d8bSPeter Zijlstra		return false;
196d9d8bSPeter Zijlstra
196d9d8bSPeter Zijlstra	tlb->batch_count++;
196d9d8bSPeter Zijlstra	batch->next = NULL;
196d9d8bSPeter Zijlstra	batch->nr   = 0;
196d9d8bSPeter Zijlstra	batch->max  = MAX_GATHER_BATCH;
196d9d8bSPeter Zijlstra
196d9d8bSPeter Zijlstra	tlb->active->next = batch;
196d9d8bSPeter Zijlstra	tlb->active = batch;
196d9d8bSPeter Zijlstra
196d9d8bSPeter Zijlstra	return true;
196d9d8bSPeter Zijlstra}
196d9d8bSPeter Zijlstra
5df397deSLinus Torvalds#ifdef CONFIG_SMP
c4745482SLinus Torvaldsstatic void tlb_flush_rmap_batch(struct mmu_gather_batch *batch, struct vm_area_struct *vma)
5df397deSLinus Torvalds{
d7f861b9SDavid Hildenbrand	struct encoded_page **pages = batch->encoded_pages;
d7f861b9SDavid Hildenbrand
5df397deSLinus Torvalds	for (int i = 0; i < batch->nr; i++) {
d7f861b9SDavid Hildenbrand		struct encoded_page *enc = pages[i];
5df397deSLinus Torvalds
da510964SDavid Hildenbrand		if (encoded_page_flags(enc) & ENCODED_PAGE_BIT_DELAY_RMAP) {
5df397deSLinus Torvalds			struct page *page = encoded_page_ptr(enc);
d7f861b9SDavid Hildenbrand			unsigned int nr_pages = 1;
d7f861b9SDavid Hildenbrand
d7f861b9SDavid Hildenbrand			if (unlikely(encoded_page_flags(enc) &
d7f861b9SDavid Hildenbrand				     ENCODED_PAGE_BIT_NR_PAGES_NEXT))
d7f861b9SDavid Hildenbrand				nr_pages = encoded_nr_pages(pages[++i]);
d7f861b9SDavid Hildenbrand
d7f861b9SDavid Hildenbrand			folio_remove_rmap_ptes(page_folio(page), page, nr_pages,
d7f861b9SDavid Hildenbrand					       vma);
5df397deSLinus Torvalds		}
5df397deSLinus Torvalds	}
c4745482SLinus Torvalds}
5df397deSLinus Torvalds
c4745482SLinus Torvalds/**
c4745482SLinus Torvalds * tlb_flush_rmaps - do pending rmap removals after we have flushed the TLB
c4745482SLinus Torvalds * @tlb: the current mmu_gather
19134bc2SMatthew Wilcox (Oracle) * @vma: The memory area from which the pages are being removed.
c4745482SLinus Torvalds *
c4745482SLinus Torvalds * Note that because of how tlb_next_batch() above works, we will
c4745482SLinus Torvalds * never start multiple new batches with pending delayed rmaps, so
c4745482SLinus Torvalds * we only need to walk through the current active batch and the
c4745482SLinus Torvalds * original local one.
c4745482SLinus Torvalds */
c4745482SLinus Torvaldsvoid tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma)
c4745482SLinus Torvalds{
c4745482SLinus Torvalds	if (!tlb->delayed_rmap)
c4745482SLinus Torvalds		return;
c4745482SLinus Torvalds
c4745482SLinus Torvalds	tlb_flush_rmap_batch(&tlb->local, vma);
c4745482SLinus Torvalds	if (tlb->active != &tlb->local)
c4745482SLinus Torvalds		tlb_flush_rmap_batch(tlb->active, vma);
5df397deSLinus Torvalds	tlb->delayed_rmap = 0;
5df397deSLinus Torvalds}
5df397deSLinus Torvalds#endif
5df397deSLinus Torvalds
e61abd44SDavid Hildenbrand/*
e61abd44SDavid Hildenbrand * We might end up freeing a lot of pages. Reschedule on a regular
e61abd44SDavid Hildenbrand * basis to avoid soft lockups in configurations without full
e61abd44SDavid Hildenbrand * preemption enabled. The magic number of 512 folios seems to work.
e61abd44SDavid Hildenbrand */
e61abd44SDavid Hildenbrand#define MAX_NR_FOLIOS_PER_FREE		512
196d9d8bSPeter Zijlstra
e61abd44SDavid Hildenbrandstatic void __tlb_batch_free_encoded_pages(struct mmu_gather_batch *batch)
e61abd44SDavid Hildenbrand{
7cc8f9c7SLinus Torvalds	struct encoded_page **pages = batch->encoded_pages;
e61abd44SDavid Hildenbrand	unsigned int nr, nr_pages;
b191c9bcSJianxing Wang
d7f861b9SDavid Hildenbrand	while (batch->nr) {
e61abd44SDavid Hildenbrand		if (!page_poisoning_enabled_static() && !want_init_on_free()) {
e61abd44SDavid Hildenbrand			nr = min(MAX_NR_FOLIOS_PER_FREE, batch->nr);
b191c9bcSJianxing Wang
d7f861b9SDavid Hildenbrand			/*
d7f861b9SDavid Hildenbrand			 * Make sure we cover page + nr_pages, and don't leave
d7f861b9SDavid Hildenbrand			 * nr_pages behind when capping the number of entries.
d7f861b9SDavid Hildenbrand			 */
d7f861b9SDavid Hildenbrand			if (unlikely(encoded_page_flags(pages[nr - 1]) &
d7f861b9SDavid Hildenbrand				     ENCODED_PAGE_BIT_NR_PAGES_NEXT))
d7f861b9SDavid Hildenbrand				nr++;
e61abd44SDavid Hildenbrand		} else {
e61abd44SDavid Hildenbrand			/*
e61abd44SDavid Hildenbrand			 * With page poisoning and init_on_free, the time it
e61abd44SDavid Hildenbrand			 * takes to free memory grows proportionally with the
e61abd44SDavid Hildenbrand			 * actual memory size. Therefore, limit based on the
e61abd44SDavid Hildenbrand			 * actual memory size and not the number of involved
e61abd44SDavid Hildenbrand			 * folios.
e61abd44SDavid Hildenbrand			 */
e61abd44SDavid Hildenbrand			for (nr = 0, nr_pages = 0;
e61abd44SDavid Hildenbrand			     nr < batch->nr && nr_pages < MAX_NR_FOLIOS_PER_FREE;
e61abd44SDavid Hildenbrand			     nr++) {
e61abd44SDavid Hildenbrand				if (unlikely(encoded_page_flags(pages[nr]) &
e61abd44SDavid Hildenbrand					     ENCODED_PAGE_BIT_NR_PAGES_NEXT))
e61abd44SDavid Hildenbrand					nr_pages += encoded_nr_pages(pages[++nr]);
e61abd44SDavid Hildenbrand				else
e61abd44SDavid Hildenbrand					nr_pages++;
e61abd44SDavid Hildenbrand			}
e61abd44SDavid Hildenbrand		}
d7f861b9SDavid Hildenbrand
b191c9bcSJianxing Wang		free_pages_and_swap_cache(pages, nr);
b191c9bcSJianxing Wang		pages += nr;
b191c9bcSJianxing Wang		batch->nr -= nr;
b191c9bcSJianxing Wang
b191c9bcSJianxing Wang		cond_resched();
d7f861b9SDavid Hildenbrand	}
196d9d8bSPeter Zijlstra}
e61abd44SDavid Hildenbrand
e61abd44SDavid Hildenbrandstatic void tlb_batch_pages_flush(struct mmu_gather *tlb)
e61abd44SDavid Hildenbrand{
e61abd44SDavid Hildenbrand	struct mmu_gather_batch *batch;
e61abd44SDavid Hildenbrand
e61abd44SDavid Hildenbrand	for (batch = &tlb->local; batch && batch->nr; batch = batch->next)
e61abd44SDavid Hildenbrand		__tlb_batch_free_encoded_pages(batch);
196d9d8bSPeter Zijlstra	tlb->active = &tlb->local;
196d9d8bSPeter Zijlstra}
196d9d8bSPeter Zijlstra
952a31c9SMartin Schwidefskystatic void tlb_batch_list_free(struct mmu_gather *tlb)
196d9d8bSPeter Zijlstra{
196d9d8bSPeter Zijlstra	struct mmu_gather_batch *batch, *next;
196d9d8bSPeter Zijlstra
196d9d8bSPeter Zijlstra	for (batch = tlb->local.next; batch; batch = next) {
196d9d8bSPeter Zijlstra		next = batch->next;
196d9d8bSPeter Zijlstra		free_pages((unsigned long)batch, 0);
196d9d8bSPeter Zijlstra	}
196d9d8bSPeter Zijlstra	tlb->local.next = NULL;
196d9d8bSPeter Zijlstra}
196d9d8bSPeter Zijlstra
d7f861b9SDavid Hildenbrandstatic bool __tlb_remove_folio_pages_size(struct mmu_gather *tlb,
d7f861b9SDavid Hildenbrand		struct page *page, unsigned int nr_pages, bool delay_rmap,
d7f861b9SDavid Hildenbrand		int page_size)
196d9d8bSPeter Zijlstra{
da510964SDavid Hildenbrand	int flags = delay_rmap ? ENCODED_PAGE_BIT_DELAY_RMAP : 0;
196d9d8bSPeter Zijlstra	struct mmu_gather_batch *batch;
196d9d8bSPeter Zijlstra
196d9d8bSPeter Zijlstra	VM_BUG_ON(!tlb->end);
ed6a7935SPeter Zijlstra
3af4bd03SPeter Zijlstra#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
196d9d8bSPeter Zijlstra	VM_WARN_ON(tlb->page_size != page_size);
d7f861b9SDavid Hildenbrand	VM_WARN_ON_ONCE(nr_pages != 1 && page_size != PAGE_SIZE);
d7f861b9SDavid Hildenbrand	VM_WARN_ON_ONCE(page_folio(page) != page_folio(page + nr_pages - 1));
ed6a7935SPeter Zijlstra#endif
196d9d8bSPeter Zijlstra
196d9d8bSPeter Zijlstra	batch = tlb->active;
196d9d8bSPeter Zijlstra	/*
196d9d8bSPeter Zijlstra	 * Add the page and check if we are full. If so
196d9d8bSPeter Zijlstra	 * force a flush.
196d9d8bSPeter Zijlstra	 */
d7f861b9SDavid Hildenbrand	if (likely(nr_pages == 1)) {
da510964SDavid Hildenbrand		batch->encoded_pages[batch->nr++] = encode_page(page, flags);
d7f861b9SDavid Hildenbrand	} else {
d7f861b9SDavid Hildenbrand		flags |= ENCODED_PAGE_BIT_NR_PAGES_NEXT;
d7f861b9SDavid Hildenbrand		batch->encoded_pages[batch->nr++] = encode_page(page, flags);
d7f861b9SDavid Hildenbrand		batch->encoded_pages[batch->nr++] = encode_nr_pages(nr_pages);
d7f861b9SDavid Hildenbrand	}
d7f861b9SDavid Hildenbrand	/*
d7f861b9SDavid Hildenbrand	 * Make sure that we can always add another "page" + "nr_pages",
d7f861b9SDavid Hildenbrand	 * requiring two entries instead of only a single one.
d7f861b9SDavid Hildenbrand	 */
d7f861b9SDavid Hildenbrand	if (batch->nr >= batch->max - 1) {
196d9d8bSPeter Zijlstra		if (!tlb_next_batch(tlb))
196d9d8bSPeter Zijlstra			return true;
196d9d8bSPeter Zijlstra		batch = tlb->active;
196d9d8bSPeter Zijlstra	}
d7f861b9SDavid Hildenbrand	VM_BUG_ON_PAGE(batch->nr > batch->max - 1, page);
196d9d8bSPeter Zijlstra
196d9d8bSPeter Zijlstra	return false;
196d9d8bSPeter Zijlstra}
196d9d8bSPeter Zijlstra
d7f861b9SDavid Hildenbrandbool __tlb_remove_folio_pages(struct mmu_gather *tlb, struct page *page,
d7f861b9SDavid Hildenbrand		unsigned int nr_pages, bool delay_rmap)
d7f861b9SDavid Hildenbrand{
d7f861b9SDavid Hildenbrand	return __tlb_remove_folio_pages_size(tlb, page, nr_pages, delay_rmap,
d7f861b9SDavid Hildenbrand					     PAGE_SIZE);
d7f861b9SDavid Hildenbrand}
d7f861b9SDavid Hildenbrand
/*
 * Queue a single page of size @page_size, without delayed rmap removal.
 * Returns the result of __tlb_remove_folio_pages_size().
 */
bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size)
{
	bool ret;

	ret = __tlb_remove_folio_pages_size(tlb, page, 1, false, page_size);
	return ret;
}
d7f861b9SDavid Hildenbrand
580a586cSPeter Zijlstra#endif /* MMU_GATHER_NO_GATHER */
952a31c9SMartin Schwidefsky
0d6e24d4SPeter Zijlstra#ifdef CONFIG_MMU_GATHER_TABLE_FREE
0d6e24d4SPeter Zijlstra
0d6e24d4SPeter Zijlstrastatic void __tlb_remove_table_free(struct mmu_table_batch *batch)
0d6e24d4SPeter Zijlstra{
0d6e24d4SPeter Zijlstra	int i;
0d6e24d4SPeter Zijlstra
0d6e24d4SPeter Zijlstra	for (i = 0; i < batch->nr; i++)
0d6e24d4SPeter Zijlstra		__tlb_remove_table(batch->tables[i]);
0d6e24d4SPeter Zijlstra
0d6e24d4SPeter Zijlstra	free_page((unsigned long)batch);
0d6e24d4SPeter Zijlstra}
0d6e24d4SPeter Zijlstra
ff2e6d72SPeter Zijlstra#ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE
196d9d8bSPeter Zijlstra
196d9d8bSPeter Zijlstra/*
0d6e24d4SPeter Zijlstra * Semi RCU freeing of the page directories.
0d6e24d4SPeter Zijlstra *
0d6e24d4SPeter Zijlstra * This is needed by some architectures to implement software pagetable walkers.
0d6e24d4SPeter Zijlstra *
 * gup_fast() and other software pagetable walkers do a lockless page-table
 * walk and therefore need some synchronization with the freeing of the page
 * directories. The chosen means to accomplish that is by disabling IRQs over
 * the walk.
0d6e24d4SPeter Zijlstra *
0d6e24d4SPeter Zijlstra * Architectures that use IPIs to flush TLBs will then automagically DTRT,
0d6e24d4SPeter Zijlstra * since we unlink the page, flush TLBs, free the page. Since the disabling of
0d6e24d4SPeter Zijlstra * IRQs delays the completion of the TLB flush we can never observe an already
0d6e24d4SPeter Zijlstra * freed page.
0d6e24d4SPeter Zijlstra *
026e8b55SBrendan Jackman * Not all systems IPI every CPU for this purpose:
026e8b55SBrendan Jackman *
026e8b55SBrendan Jackman * - Some architectures have HW support for cross-CPU synchronisation of TLB
026e8b55SBrendan Jackman *   flushes, so there's no IPI at all.
026e8b55SBrendan Jackman *
026e8b55SBrendan Jackman * - Paravirt guests can do this TLB flushing in the hypervisor, or coordinate
026e8b55SBrendan Jackman *   with the hypervisor to defer flushing on preempted vCPUs.
026e8b55SBrendan Jackman *
 * Such systems need to delay the freeing by some other means; this is that
 * means.
0d6e24d4SPeter Zijlstra *
0d6e24d4SPeter Zijlstra * What we do is batch the freed directory pages (tables) and RCU free them.
0d6e24d4SPeter Zijlstra * We use the sched RCU variant, as that guarantees that IRQ/preempt disabling
0d6e24d4SPeter Zijlstra * holds off grace periods.
0d6e24d4SPeter Zijlstra *
0d6e24d4SPeter Zijlstra * However, in order to batch these pages we need to allocate storage, this
0d6e24d4SPeter Zijlstra * allocation is deep inside the MM code and can thus easily fail on memory
0d6e24d4SPeter Zijlstra * pressure. To guarantee progress we fall back to single table freeing, see
0d6e24d4SPeter Zijlstra * the implementation of tlb_remove_table_one().
0d6e24d4SPeter Zijlstra *
196d9d8bSPeter Zijlstra */
196d9d8bSPeter Zijlstra
/*
 * Deliberately empty callback for smp_call_function(): @arg is unused and
 * no work is done here.
 */
static void tlb_remove_table_smp_sync(void *arg)
{
	/* Nothing to do, delivering the interrupt is all that is wanted. */
}
0d6e24d4SPeter Zijlstra
2ba99c5eSJann Hornvoid tlb_remove_table_sync_one(void)
0d6e24d4SPeter Zijlstra{
0d6e24d4SPeter Zijlstra	/*
0d6e24d4SPeter Zijlstra	 * This isn't an RCU grace period and hence the page-tables cannot be
0d6e24d4SPeter Zijlstra	 * assumed to be actually RCU-freed.
0d6e24d4SPeter Zijlstra	 *
0d6e24d4SPeter Zijlstra	 * It is however sufficient for software page-table walkers that rely on
0d6e24d4SPeter Zijlstra	 * IRQ disabling.
0d6e24d4SPeter Zijlstra	 */
0d6e24d4SPeter Zijlstra	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
0d6e24d4SPeter Zijlstra}
0d6e24d4SPeter Zijlstra
0d6e24d4SPeter Zijlstrastatic void tlb_remove_table_rcu(struct rcu_head *head)
0d6e24d4SPeter Zijlstra{
0d6e24d4SPeter Zijlstra	__tlb_remove_table_free(container_of(head, struct mmu_table_batch, rcu));
0d6e24d4SPeter Zijlstra}
0d6e24d4SPeter Zijlstra
0d6e24d4SPeter Zijlstrastatic void tlb_remove_table_free(struct mmu_table_batch *batch)
0d6e24d4SPeter Zijlstra{
0d6e24d4SPeter Zijlstra	call_rcu(&batch->rcu, tlb_remove_table_rcu);
0d6e24d4SPeter Zijlstra}
0d6e24d4SPeter Zijlstra
/**
 * tlb_remove_table_sync_rcu - synchronize with software page-table walkers
 *
 * Counterpart of tlb_remove_table_sync_one() that waits for an RCU grace
 * period instead of broadcasting IPIs. Only use it in slow paths where
 * sleeping is acceptable.
 *
 * Software/lockless page-table walkers run under local_irq_disable(), which
 * is also an RCU read-side critical section. synchronize_rcu() waits for all
 * such sections, so it gives the same guarantee as
 * tlb_remove_table_sync_one() without disrupting all CPUs with IPIs.
 *
 * Do not use this for freeing memory; use RCU callbacks instead to avoid
 * latency spikes.
 */
void tlb_remove_table_sync_rcu(void)
{
	synchronize_rcu();
}
*1fb3d8c2SLance Yang
0d6e24d4SPeter Zijlstra#else /* !CONFIG_MMU_GATHER_RCU_TABLE_FREE */
0d6e24d4SPeter Zijlstra
/*
 * Without CONFIG_MMU_GATHER_RCU_TABLE_FREE there is no deferral: the batch
 * and the tables it records are freed right away.
 */
static void tlb_remove_table_free(struct mmu_table_batch *batch)
{
	__tlb_remove_table_free(batch);
}
0d6e24d4SPeter Zijlstra
0d6e24d4SPeter Zijlstra#endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */
0d6e24d4SPeter Zijlstra
/*
 * Flush the TLB before page tables are freed, but only when
 * tlb_needs_table_invalidate() says tlb_remove_table() has to imply a
 * TLB invalidate.
 */
static inline void tlb_table_invalidate(struct mmu_gather *tlb)
{
	if (!tlb_needs_table_invalidate())
		return;

	/*
	 * Invalidate page-table caches used by hardware walkers. Then
	 * we still need to RCU-sched wait while freeing the pages
	 * because software walkers can still be in-flight.
	 */
	tlb_flush_mmu_tlbonly(tlb);
}
196d9d8bSPeter Zijlstra
e74e1731SQi Zheng#ifdef CONFIG_PT_RECLAIM
e74e1731SQi Zhengstatic inline void __tlb_remove_table_one_rcu(struct rcu_head *head)
e74e1731SQi Zheng{
e74e1731SQi Zheng	struct ptdesc *ptdesc;
e74e1731SQi Zheng
e74e1731SQi Zheng	ptdesc = container_of(head, struct ptdesc, pt_rcu_head);
e74e1731SQi Zheng	__tlb_remove_table(ptdesc);
e74e1731SQi Zheng}
e74e1731SQi Zheng
e74e1731SQi Zhengstatic inline void __tlb_remove_table_one(void *table)
e74e1731SQi Zheng{
e74e1731SQi Zheng	struct ptdesc *ptdesc;
e74e1731SQi Zheng
e74e1731SQi Zheng	ptdesc = table;
e74e1731SQi Zheng	call_rcu(&ptdesc->pt_rcu_head, __tlb_remove_table_one_rcu);
e74e1731SQi Zheng}
e74e1731SQi Zheng#else
/*
 * Free a single table synchronously: call tlb_remove_table_sync_rcu()
 * first, and only then release the table.
 */
static inline void __tlb_remove_table_one(void *table)
{
	/* Must complete before the table is handed to __tlb_remove_table(). */
	tlb_remove_table_sync_rcu();

	__tlb_remove_table(table);
}
e74e1731SQi Zheng#endif /* CONFIG_PT_RECLAIM */
718b1386SQi Zheng
/*
 * Free one table without batching, via whichever __tlb_remove_table_one()
 * variant was selected by CONFIG_PT_RECLAIM.
 */
static void tlb_remove_table_one(void *table)
{
	__tlb_remove_table_one(table);
}
196d9d8bSPeter Zijlstra
0a8caf21SPeter Zijlstrastatic void tlb_table_flush(struct mmu_gather *tlb)
196d9d8bSPeter Zijlstra{
196d9d8bSPeter Zijlstra	struct mmu_table_batch **batch = &tlb->batch;
196d9d8bSPeter Zijlstra
196d9d8bSPeter Zijlstra	if (*batch) {
196d9d8bSPeter Zijlstra		tlb_table_invalidate(tlb);
0d6e24d4SPeter Zijlstra		tlb_remove_table_free(*batch);
196d9d8bSPeter Zijlstra		*batch = NULL;
196d9d8bSPeter Zijlstra	}
196d9d8bSPeter Zijlstra}
196d9d8bSPeter Zijlstra
196d9d8bSPeter Zijlstravoid tlb_remove_table(struct mmu_gather *tlb, void *table)
196d9d8bSPeter Zijlstra{
196d9d8bSPeter Zijlstra	struct mmu_table_batch **batch = &tlb->batch;
196d9d8bSPeter Zijlstra
196d9d8bSPeter Zijlstra	if (*batch == NULL) {
adf085ffSQianfeng Rong		*batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT);
196d9d8bSPeter Zijlstra		if (*batch == NULL) {
196d9d8bSPeter Zijlstra			tlb_table_invalidate(tlb);
196d9d8bSPeter Zijlstra			tlb_remove_table_one(table);
196d9d8bSPeter Zijlstra			return;
196d9d8bSPeter Zijlstra		}
196d9d8bSPeter Zijlstra		(*batch)->nr = 0;
196d9d8bSPeter Zijlstra	}
196d9d8bSPeter Zijlstra
196d9d8bSPeter Zijlstra	(*batch)->tables[(*batch)->nr++] = table;
196d9d8bSPeter Zijlstra	if ((*batch)->nr == MAX_TABLE_BATCH)
196d9d8bSPeter Zijlstra		tlb_table_flush(tlb);
196d9d8bSPeter Zijlstra}
196d9d8bSPeter Zijlstra
0d6e24d4SPeter Zijlstrastatic inline void tlb_table_init(struct mmu_gather *tlb)
0d6e24d4SPeter Zijlstra{
0d6e24d4SPeter Zijlstra	tlb->batch = NULL;
0d6e24d4SPeter Zijlstra}
0d6e24d4SPeter Zijlstra
0d6e24d4SPeter Zijlstra#else /* !CONFIG_MMU_GATHER_TABLE_FREE */
0d6e24d4SPeter Zijlstra
/* No table batching without CONFIG_MMU_GATHER_TABLE_FREE: nothing to flush. */
static inline void tlb_table_flush(struct mmu_gather *tlb)
{
}
/* No table batching without CONFIG_MMU_GATHER_TABLE_FREE: nothing to set up. */
static inline void tlb_table_init(struct mmu_gather *tlb)
{
}
0d6e24d4SPeter Zijlstra
0d6e24d4SPeter Zijlstra#endif /* CONFIG_MMU_GATHER_TABLE_FREE */
196d9d8bSPeter Zijlstra
/*
 * Release what has been gathered so far: the pending page-table batch
 * first, then (unless CONFIG_MMU_GATHER_NO_GATHER is set) the batched pages.
 */
static void tlb_flush_mmu_free(struct mmu_gather *tlb)
{
	tlb_table_flush(tlb);

#ifndef CONFIG_MMU_GATHER_NO_GATHER
	tlb_batch_pages_flush(tlb);
#endif
}
0a8caf21SPeter Zijlstra
/*
 * Flush the TLB for what has been gathered, then free the gathered page
 * tables and pages. The TLB flush always comes before the freeing.
 */
void tlb_flush_mmu(struct mmu_gather *tlb)
{
	tlb_flush_mmu_tlbonly(tlb);

	tlb_flush_mmu_free(tlb);
}
0a8caf21SPeter Zijlstra
d8b45053SWill Deaconstatic void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
a72afd87SWill Deacon			     bool fullmm)
196d9d8bSPeter Zijlstra{
1808d65bSPeter Zijlstra	tlb->mm = mm;
a72afd87SWill Deacon	tlb->fullmm = fullmm;
1808d65bSPeter Zijlstra
580a586cSPeter Zijlstra#ifndef CONFIG_MMU_GATHER_NO_GATHER
1808d65bSPeter Zijlstra	tlb->need_flush_all = 0;
1808d65bSPeter Zijlstra	tlb->local.next = NULL;
1808d65bSPeter Zijlstra	tlb->local.nr   = 0;
1808d65bSPeter Zijlstra	tlb->local.max  = ARRAY_SIZE(tlb->__pages);
1808d65bSPeter Zijlstra	tlb->active     = &tlb->local;
1808d65bSPeter Zijlstra	tlb->batch_count = 0;
1808d65bSPeter Zijlstra#endif
5df397deSLinus Torvalds	tlb->delayed_rmap = 0;
1808d65bSPeter Zijlstra
0d6e24d4SPeter Zijlstra	tlb_table_init(tlb);
3af4bd03SPeter Zijlstra#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
1808d65bSPeter Zijlstra	tlb->page_size = 0;
1808d65bSPeter Zijlstra#endif
bfe125f1SRoman Gushchin	tlb->vma_pfn = 0;
1808d65bSPeter Zijlstra
8ce720d5SDavid Hildenbrand (Red Hat)	tlb->fully_unshared_tables = 0;
1808d65bSPeter Zijlstra	__tlb_reset_range(tlb);
196d9d8bSPeter Zijlstra	inc_tlb_flush_pending(tlb->mm);
196d9d8bSPeter Zijlstra}
196d9d8bSPeter Zijlstra
/**
 * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down
 * @tlb: the mmu_gather structure to initialize
 * @mm: the mm_struct of the target address space
 *
 * Sets up an (on-stack) mmu_gather structure for tearing down page tables
 * of @mm. The structure is initialized with fullmm cleared.
 */
void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm)
{
	const bool fullmm = false;

	__tlb_gather_mmu(tlb, mm, fullmm);
}
d8b45053SWill Deacon
/**
 * tlb_gather_mmu_fullmm - initialize an mmu_gather structure for page-table tear-down
 * @tlb: the mmu_gather structure to initialize
 * @mm: the mm_struct of the target address space
 *
 * Variant for the case where @mm is without users and the full address
 * space is going to be destroyed (exit/execve).
 *
 * Sets up an (on-stack) mmu_gather structure for tearing down page tables
 * of @mm. The structure is initialized with fullmm set.
 */
void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm)
{
	const bool fullmm = true;

	__tlb_gather_mmu(tlb, mm, fullmm);
}
d8b45053SWill Deacon
1808d65bSPeter Zijlstra/**
8ce720d5SDavid Hildenbrand (Red Hat) * tlb_gather_mmu_vma - initialize an mmu_gather structure for operating on a
8ce720d5SDavid Hildenbrand (Red Hat) *			single VMA
8ce720d5SDavid Hildenbrand (Red Hat) * @tlb: the mmu_gather structure to initialize
8ce720d5SDavid Hildenbrand (Red Hat) * @vma: the vm_area_struct
8ce720d5SDavid Hildenbrand (Red Hat) *
8ce720d5SDavid Hildenbrand (Red Hat) * Called to initialize an (on-stack) mmu_gather structure for operating on
8ce720d5SDavid Hildenbrand (Red Hat) * a single VMA. In contrast to tlb_gather_mmu(), calling this function will
8ce720d5SDavid Hildenbrand (Red Hat) * not require another call to tlb_start_vma(). In contrast to tlb_start_vma(),
8ce720d5SDavid Hildenbrand (Red Hat) * this function will *not* call flush_cache_range().
8ce720d5SDavid Hildenbrand (Red Hat) *
8ce720d5SDavid Hildenbrand (Red Hat) * For hugetlb VMAs, this function will also initialize the mmu_gather
8ce720d5SDavid Hildenbrand (Red Hat) * page_size accordingly, not requiring a separate call to
8ce720d5SDavid Hildenbrand (Red Hat) * tlb_change_page_size().
8ce720d5SDavid Hildenbrand (Red Hat) *
8ce720d5SDavid Hildenbrand (Red Hat) */
8ce720d5SDavid Hildenbrand (Red Hat)void tlb_gather_mmu_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
8ce720d5SDavid Hildenbrand (Red Hat){
8ce720d5SDavid Hildenbrand (Red Hat)	tlb_gather_mmu(tlb, vma->vm_mm);
8ce720d5SDavid Hildenbrand (Red Hat)	tlb_update_vma_flags(tlb, vma);
8ce720d5SDavid Hildenbrand (Red Hat)	if (is_vm_hugetlb_page(vma))
8ce720d5SDavid Hildenbrand (Red Hat)		/* All entries have the same size. */
8ce720d5SDavid Hildenbrand (Red Hat)		tlb_change_page_size(tlb, huge_page_size(hstate_vma(vma)));
8ce720d5SDavid Hildenbrand (Red Hat)}
8ce720d5SDavid Hildenbrand (Red Hat)
8ce720d5SDavid Hildenbrand (Red Hat)/**
1808d65bSPeter Zijlstra * tlb_finish_mmu - finish an mmu_gather structure
1808d65bSPeter Zijlstra * @tlb: the mmu_gather structure to finish
1808d65bSPeter Zijlstra *
1808d65bSPeter Zijlstra * Called at the end of the shootdown operation to free up any resources that
1808d65bSPeter Zijlstra * were required.
1808d65bSPeter Zijlstra */
ae8eba8bSWill Deaconvoid tlb_finish_mmu(struct mmu_gather *tlb)
196d9d8bSPeter Zijlstra{
196d9d8bSPeter Zijlstra	/*
8ce720d5SDavid Hildenbrand (Red Hat)	 * We expect an earlier huge_pmd_unshare_flush() call to sort this out,
8ce720d5SDavid Hildenbrand (Red Hat)	 * due to complicated locking requirements with page table unsharing.
8ce720d5SDavid Hildenbrand (Red Hat)	 */
8ce720d5SDavid Hildenbrand (Red Hat)	VM_WARN_ON_ONCE(tlb->fully_unshared_tables);
8ce720d5SDavid Hildenbrand (Red Hat)
8ce720d5SDavid Hildenbrand (Red Hat)	/*
196d9d8bSPeter Zijlstra	 * If there are parallel threads are doing PTE changes on same range
c1e8d7c6SMichel Lespinasse	 * under non-exclusive lock (e.g., mmap_lock read-side) but defer TLB
7a30df49SYang Shi	 * flush by batching, one thread may end up seeing inconsistent PTEs
7a30df49SYang Shi	 * and result in having stale TLB entries.  So flush TLB forcefully
7a30df49SYang Shi	 * if we detect parallel PTE batching threads.
7a30df49SYang Shi	 *
7a30df49SYang Shi	 * However, some syscalls, e.g. munmap(), may free page tables, this
7a30df49SYang Shi	 * needs force flush everything in the given range. Otherwise this
7a30df49SYang Shi	 * may result in having stale TLB entries for some architectures,
7a30df49SYang Shi	 * e.g. aarch64, that could specify flush what level TLB.
196d9d8bSPeter Zijlstra	 */
1808d65bSPeter Zijlstra	if (mm_tlb_flush_nested(tlb->mm)) {
7a30df49SYang Shi		/*
7a30df49SYang Shi		 * The aarch64 yields better performance with fullmm by
7a30df49SYang Shi		 * avoiding multiple CPUs spamming TLBI messages at the
7a30df49SYang Shi		 * same time.
7a30df49SYang Shi		 *
7a30df49SYang Shi		 * On x86 non-fullmm doesn't yield significant difference
7a30df49SYang Shi		 * against fullmm.
7a30df49SYang Shi		 */
7a30df49SYang Shi		tlb->fullmm = 1;
1808d65bSPeter Zijlstra		__tlb_reset_range(tlb);
7a30df49SYang Shi		tlb->freed_tables = 1;
1808d65bSPeter Zijlstra	}
196d9d8bSPeter Zijlstra
1808d65bSPeter Zijlstra	tlb_flush_mmu(tlb);
1808d65bSPeter Zijlstra
580a586cSPeter Zijlstra#ifndef CONFIG_MMU_GATHER_NO_GATHER
1808d65bSPeter Zijlstra	tlb_batch_list_free(tlb);
1808d65bSPeter Zijlstra#endif
196d9d8bSPeter Zijlstra	dec_tlb_flush_pending(tlb->mm);
196d9d8bSPeter Zijlstra}