/*******************************************************************************
    Copyright (c) 2018-2025 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
    deal in the Software without restriction, including without limitation the
    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
    sell copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

        The above copyright notice and this permission notice shall be
        included in all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    DEALINGS IN THE SOFTWARE.

*******************************************************************************/
#ifndef __UVM_MIGRATE_PAGEABLE_H__
#define __UVM_MIGRATE_PAGEABLE_H__
#include "uvm_common.h"
#include "uvm_linux.h"
#include "uvm_populate_pageable.h"
#include "uvm_forward_decl.h"
#include "uvm_processors.h"
#include "uvm_va_block_types.h"
#include <linux/nodemask.h>
typedef struct
{
    uvm_va_space_t *va_space;
    struct mm_struct *mm;
    const unsigned long start;
    const unsigned long length;
    uvm_processor_id_t dst_id;

    // dst_node_id may be clobbered by uvm_migrate_pageable().
    int dst_node_id;

    uvm_populate_permissions_t populate_permissions;
    NvU32 populate_flags;
    bool skip_mapped : 1;
    uvm_make_resident_cause_t cause;
    bool populate_on_cpu_alloc_failures : 1;
    bool populate_on_migrate_vma_failures : 1;

    // access_counters_buffer_index is only valid if cause is
    // UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER.
    NvU32 access_counters_buffer_index;

    NvU64 *user_space_start;
    NvU64 *user_space_length;
    uvm_processor_mask_t *gpus_to_check_for_nvlink_errors;
    bool fail_on_unresolved_sto_errors;
} uvm_migrate_args_t;
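
// Illustrative sketch of filling in uvm_migrate_args_t (not part of this
// header; the variable names below are hypothetical and fields not shown are
// left at their zero defaults). user_space_start and user_space_length must
// point at caller-owned output storage, and dst_node_id may be clobbered by
// uvm_migrate_pageable():
//
//     NvU64 out_start = 0;
//     NvU64 out_length = 0;
//
//     uvm_migrate_args_t args =
//     {
//         .va_space          = va_space,
//         .mm                = mm,
//         .start             = start,
//         .length            = length,
//         .dst_id            = dest_id,
//         .dst_node_id       = dest_node_id,
//         .cause             = UVM_MAKE_RESIDENT_CAUSE_API_MIGRATE,
//         .user_space_start  = &out_start,
//         .user_space_length = &out_length,
//     };
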
#if defined(CONFIG_MIGRATE_VMA_HELPER)
#define UVM_MIGRATE_VMA_SUPPORTED 1
#else
#if NV_IS_EXPORT_SYMBOL_PRESENT_migrate_vma_setup
#define UVM_MIGRATE_VMA_SUPPORTED 1
#endif
#endif
#ifdef UVM_MIGRATE_VMA_SUPPORTED
#include <linux/migrate.h>
// The calls to migrate_vma are capped at 512 pages to set an upper bound on
// the amount of metadata that needs to be allocated for the operation. This
// number was chosen because performance seems to plateau at this size on
// kernels with 64K pages. On kernels with PAGE_SIZE == 4K, 512 pages
// correspond to a 2MB VA block, which is also a standard size for batch
// operations.
#define UVM_MIGRATE_VMA_MAX_PAGES (512UL)
#define UVM_MIGRATE_VMA_MAX_SIZE (UVM_MIGRATE_VMA_MAX_PAGES * PAGE_SIZE)
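
// For reference: with PAGE_SIZE == 4K, UVM_MIGRATE_VMA_MAX_SIZE works out to
// 512 * 4KiB = 2MiB; with PAGE_SIZE == 64K it is 512 * 64KiB = 32MiB.
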
typedef struct
{
    DECLARE_BITMAP(page_mask, UVM_MIGRATE_VMA_MAX_PAGES);
} uvm_migrate_vma_page_mask_t;
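
// The page mask is a regular Linux bitmap, so it can be manipulated with the
// standard bitmap helpers. Illustrative sketch (not part of this header;
// page_index is hypothetical):
//
//     uvm_migrate_vma_page_mask_t mask;
//
//     bitmap_zero(mask.page_mask, UVM_MIGRATE_VMA_MAX_PAGES);
//     __set_bit(page_index, mask.page_mask);
//
//     if (!bitmap_empty(mask.page_mask, UVM_MIGRATE_VMA_MAX_PAGES)) {
//         // At least one page still needs processing
//     }
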
typedef struct
{
    // Scatter list managing the copying GPU's IOMMU mappings used to copy
    // pages from a source processor.
    struct sg_table sgt_from;

    // The copying GPU that performs the copy from the source processor and
    // for which the IOMMU mappings are created.
    uvm_gpu_t *sgt_from_gpu;

    // Number of DMA-mapped pages in this scatterlist.
    unsigned long dma_count;
} uvm_sgt_t;
typedef struct
{
    // Input parameters
    uvm_migrate_args_t *uvm_migrate_args;

    // Output parameters
    //
    // Error code. This only signals errors in internal UVM operations.
    // Pages that failed allocation or could not be populated are communicated
    // using the fields below.
    NV_STATUS status;

    // Mask of pages that couldn't be made resident on the destination
    // because:
    // (a) they are backed with data but the pages are not populated (e.g.
    //     they are in swap),
    // (b) they are not backed with any data yet and were not populated
    //     because the vma is not READ_WRITE, as populating them would not
    //     charge the pages to the process properly,
    // (c) they are already mapped and such pages were requested not to be
    //     migrated via skip_mapped, or
    // (d) they couldn't be migrated by the kernel.
    uvm_migrate_vma_page_mask_t populate_pages_mask;

    // Mask of pages that failed allocation on the destination.
    uvm_migrate_vma_page_mask_t allocation_failed_mask;

    // Mask of pages which are already resident at the destination.
    uvm_migrate_vma_page_mask_t dst_resident_pages_mask;
    // Global state managed by the caller
    //
    // These are scratch masks that can be used by the migrate_vma caller to
    // save output page masks and orchestrate the migrate_vma
    // retries/population calls if needed.
    uvm_migrate_vma_page_mask_t scratch1_mask;
    uvm_migrate_vma_page_mask_t scratch2_mask;

    // Arrays used by migrate_vma to store the src/dst pfns.
    unsigned long dst_pfn_array[UVM_MIGRATE_VMA_MAX_PAGES];
    unsigned long src_pfn_array[UVM_MIGRATE_VMA_MAX_PAGES];

    // Internal state
    //
    uvm_tracker_t tracker;
    struct {
        // Scatter table managing the IOMMU mappings for anonymous pages.
        uvm_sgt_t anon_sgt;

        // Scatter tables managing the IOMMU mappings for pages resident on
        // GPUs.
        uvm_sgt_t sgt_gpu[UVM_ID_MAX_GPUS];

        // Scatter tables managing the IOMMU mappings for pages resident on
        // CPU NUMA nodes. The array size is num_possible_nodes() and it is
        // indexed by the position of the CPU node ID relative to the other
        // set bits in the node_possible_map nodemask (see the indexing
        // sketch after this struct).
        uvm_sgt_t *sgt_cpu;
    } dma;
    // GPU processors where pages are resident before calling migrate_vma.
    uvm_processor_mask_t src_gpus;

    // CPU NUMA nodes where pages are resident before calling migrate_vma.
    nodemask_t src_cpu_nodemask;

    // Array of per-processor page masks with the pages that are resident
    // before calling migrate_vma.
    struct {
        uvm_migrate_vma_page_mask_t proc_page_mask;
    } processors[UVM_ID_MAX_PROCESSORS];

    // Mask of pages that are resident on the CPU before calling migrate_vma.
    uvm_migrate_vma_page_mask_t *cpu_page_mask;

    // Number of pages in the migrate_vma call.
    unsigned long num_pages;

    // Number of pages that are directly populated on the destination.
    unsigned long num_populate_anon_pages;

    // Tracks whether an OOM condition was encountered.
    bool out_of_memory;
} migrate_vma_state_t;
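
// Illustrative sketch of the dma.sgt_cpu indexing described above (not part
// of this header; the helper below is hypothetical). A node's index is its
// position among the set bits of node_possible_map:
//
//     static unsigned int example_sgt_cpu_index(int nid)
//     {
//         unsigned int index = 0;
//         int node;
//
//         // for_each_node() iterates over node_possible_map in ascending
//         // node ID order.
//         for_each_node(node) {
//             if (node == nid)
//                 break;
//             index++;
//         }
//
//         return index;
//     }
//
// For example, if the possible nodes are {0, 2, 5}, pages resident on node 5
// use dma.sgt_cpu[2].
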
#if defined(CONFIG_MIGRATE_VMA_HELPER)
struct migrate_vma {
    struct vm_area_struct *vma;
    unsigned long *dst;
    unsigned long *src;
    unsigned long start;
    unsigned long end;
};
#endif // CONFIG_MIGRATE_VMA_HELPER
// Populates the given VA range and tries to migrate all the pages to dst_id.
// If the destination processor is the CPU, the NUMA node in dst_node_id is
// used. The input VA range must be fully backed by VMAs. This function relies
// on migrate_vma, which was added in Linux 4.14. If skip_mapped is set to
// true, already mapped pages will not be migrated.
//
// For kernels that do not provide migrate_vma, this function populates the
// memory using get_user_pages and returns NV_WARN_NOTHING_TO_DO so that API
// calls made from userspace can complete the migration in user space. Kernel
// callers are expected to handle this error according to their respective use
// cases. user_space_start and user_space_length will contain the full input
// range. skip_mapped is ignored for such kernels.
//
// If the destination is the CPU and dst_node_id is full,
// NV_ERR_MORE_PROCESSING_REQUIRED is returned and user space is expected to
// call UVM_MIGRATE again with the next preferred CPU node (if more are
// available), starting at the address specified by user_space_start.
//
// If the destination is a GPU and a page could not be populated,
// NV_ERR_NO_MEMORY is returned. Otherwise, NV_OK is returned. This is fine
// because UvmMigrate/UvmMigrateAsync only guarantee that the memory is
// populated somewhere in the system, not that the pages moved to the
// requested processor.
//
// migrate_vma does not support file-backed vmas yet. If a file-backed vma is
// found, the pages corresponding to the vma are populated with
// get_user_pages() and NV_WARN_NOTHING_TO_DO is returned. The caller is
// expected to handle this error: API calls fall back to user mode to complete
// the migration, and kernel callers are expected to handle the error
// according to their respective use cases. If NV_WARN_NOTHING_TO_DO is
// returned, user_space_start and user_space_length will contain the
// intersection of the vma address range and [start:start + length].
//
// If a user-mode fallback is required but current->mm != uvm_migrate_args->mm,
// NV_ERR_NOT_SUPPORTED is returned since user mode can't perform such a
// migration. If the caller is a kernel thread, such as the GPU BH, this check
// is ignored since the caller is not expected to take such a fallback.
//
// Also, if no GPUs have been registered in the VA space, the pages
// corresponding to the first vma in the requested region are populated using
// get_user_pages() and NV_WARN_NOTHING_TO_DO is returned so that userspace
// API callers can complete the whole migration in user space. Kernel callers
// are expected to handle this error according to their respective use cases.
//
// Locking: mmap_lock must be held in read or write mode
NV_STATUS uvm_migrate_pageable(uvm_migrate_args_t *uvm_migrate_args);
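
// Illustrative sketch of handling the return codes documented above (not part
// of this header; args is the hypothetical uvm_migrate_args_t from the
// earlier sketch and error handling is elided):
//
//     status = uvm_migrate_pageable(&args);
//     if (status == NV_WARN_NOTHING_TO_DO) {
//         // Pages were only populated. Userspace API callers complete the
//         // migration in user space for the range reported in
//         // [*args.user_space_start,
//         //  *args.user_space_start + *args.user_space_length).
//     }
//     else if (status == NV_ERR_MORE_PROCESSING_REQUIRED) {
//         // The destination CPU node is full. Userspace retries UVM_MIGRATE
//         // with the next preferred CPU node (if any), starting at
//         // *args.user_space_start.
//     }
//     else if (status != NV_OK) {
//         // For example, NV_ERR_NO_MEMORY when a page could not be populated
//         // on a GPU destination.
//     }
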
NV_STATUS uvm_migrate_pageable_init(void);
void uvm_migrate_pageable_exit(void);
NV_STATUS uvm_test_skip_migrate_vma(UVM_TEST_SKIP_MIGRATE_VMA_PARAMS *params, struct file *filp);
#else // UVM_MIGRATE_VMA_SUPPORTED
static NV_STATUS uvm_migrate_pageable(uvm_migrate_args_t *uvm_migrate_args)
{
    NV_STATUS status;

    if (current->mm != uvm_migrate_args->mm && !(current->flags & PF_KTHREAD))
        return NV_ERR_NOT_SUPPORTED;

    status = uvm_populate_pageable(uvm_migrate_args->mm,
                                   uvm_migrate_args->start,
                                   uvm_migrate_args->length,
                                   uvm_migrate_args->populate_permissions,
                                   uvm_migrate_args->populate_flags);
    if (status != NV_OK)
        return status;

    *(uvm_migrate_args->user_space_start) = uvm_migrate_args->start;
    *(uvm_migrate_args->user_space_length) = uvm_migrate_args->length;

    return NV_WARN_NOTHING_TO_DO;
}
static NV_STATUS uvm_migrate_pageable_init(void)
{
    return NV_OK;
}
static void uvm_migrate_pageable_exit(void)
{
}
static inline NV_STATUS uvm_test_skip_migrate_vma(UVM_TEST_SKIP_MIGRATE_VMA_PARAMS *params, struct file *filp)
{
    return NV_OK;
}
#endif // UVM_MIGRATE_VMA_SUPPORTED
#endif // __UVM_MIGRATE_PAGEABLE_H__