515.43.04

This commit is contained in:
Andy Ritger
2022-05-09 13:18:59 -07:00
commit 1739a20efc
2519 changed files with 1060036 additions and 0 deletions

View File

@@ -0,0 +1,427 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2011-2016 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef _NV_P2P_H_
#define _NV_P2P_H_
/*
* NVIDIA P2P Structure Versioning
*
* For the nvidia_p2p_*_t structures allocated by the NVIDIA driver, it will
* set the version field of the structure according to the definition used by
* the NVIDIA driver. The "major" field of the version is defined as the upper
* 16 bits, and the "minor" field of the version is defined as the lower 16
* bits. The version field will always be the first 4 bytes of the structure,
* and third-party drivers should check the value of this field in structures
* allocated by the NVIDIA driver to ensure runtime compatibility.
*
* In general, version numbers will be incremented as follows:
* - When a backwards-compatible change is made to the structure layout, the
* minor version for that structure will be incremented. Third-party drivers
* built against an older minor version will continue to work with the newer
* minor version used by the NVIDIA driver, without recompilation.
* - When a breaking change is made to the structure layout, the major version
* will be incremented. Third-party drivers built against an older major
* version require at least recompilation and potentially additional updates
* to use the new API.
*/
/* Version encoding: major (breaking) in the upper 16 bits, minor
 * (backwards-compatible) in the lower 16 bits. See the versioning notes
 * above. */
#define NVIDIA_P2P_MAJOR_VERSION_MASK 0xffff0000
#define NVIDIA_P2P_MINOR_VERSION_MASK 0x0000ffff
/* Extract the major/minor fields from a packed version word. */
#define NVIDIA_P2P_MAJOR_VERSION(v) \
    (((v) & NVIDIA_P2P_MAJOR_VERSION_MASK) >> 16)
#define NVIDIA_P2P_MINOR_VERSION(v) \
    (((v) & NVIDIA_P2P_MINOR_VERSION_MASK))
/* True when structure p was produced under the same major version as v. */
#define NVIDIA_P2P_MAJOR_VERSION_MATCHES(p, v) \
    (NVIDIA_P2P_MAJOR_VERSION((p)->version) == NVIDIA_P2P_MAJOR_VERSION(v))
/* True when p is usable by code compiled against version v: same major,
 * and p's minor is at least v's minor. */
#define NVIDIA_P2P_VERSION_COMPATIBLE(p, v) \
    (NVIDIA_P2P_MAJOR_VERSION_MATCHES(p, v) && \
    (NVIDIA_P2P_MINOR_VERSION((p)->version) >= (NVIDIA_P2P_MINOR_VERSION(v))))
/* GPU architecture identifiers used in nvidia_p2p_params.architecture. */
enum {
    NVIDIA_P2P_ARCHITECTURE_TESLA = 0,
    NVIDIA_P2P_ARCHITECTURE_FERMI,
    NVIDIA_P2P_ARCHITECTURE_CURRENT = NVIDIA_P2P_ARCHITECTURE_FERMI
};
#define NVIDIA_P2P_PARAMS_VERSION 0x00010001
/* Index into nvidia_p2p_params.addresses[]: one entry for the GPU side,
 * one for the third-party device side. */
enum {
    NVIDIA_P2P_PARAMS_ADDRESS_INDEX_GPU = 0,
    NVIDIA_P2P_PARAMS_ADDRESS_INDEX_THIRD_PARTY_DEVICE,
    NVIDIA_P2P_PARAMS_ADDRESS_INDEX_MAX = \
        NVIDIA_P2P_PARAMS_ADDRESS_INDEX_THIRD_PARTY_DEVICE
};
#define NVIDIA_P2P_GPU_UUID_LEN 16
/* Parameters describing a P2P mapping (used by the unsupported
 * nvidia_p2p_init_mapping() API below). The version field is always the
 * first 4 bytes; check it with NVIDIA_P2P_VERSION_COMPATIBLE(). */
typedef
struct nvidia_p2p_params {
    uint32_t version;      /* NVIDIA_P2P_PARAMS_VERSION encoding */
    uint32_t architecture; /* NVIDIA_P2P_ARCHITECTURE_* value */
    /* Mailbox register addresses, laid out per architecture. */
    union nvidia_p2p_mailbox_addresses {
        struct {
            uint64_t wmb_addr;   /* write mailbox address */
            uint64_t wmb_data;   /* write mailbox data */
            uint64_t rreq_addr;  /* read request address */
            uint64_t rcomp_addr; /* read completion address */
            uint64_t reserved[2];
        } fermi;
    } addresses[NVIDIA_P2P_PARAMS_ADDRESS_INDEX_MAX+1];
} nvidia_p2p_params_t;
/*
* Capability flag for users to detect
* driver support for persistent pages.
*/
extern int nvidia_p2p_cap_persistent_pages;
#define NVIDIA_P2P_CAP_PERSISTENT_PAGES
/*
* This API is not supported.
*/
int nvidia_p2p_init_mapping(uint64_t p2p_token,
struct nvidia_p2p_params *params,
void (*destroy_callback)(void *data),
void *data);
/*
* This API is not supported.
*/
int nvidia_p2p_destroy_mapping(uint64_t p2p_token);
/* Page sizes that may appear in nvidia_p2p_page_table.page_size. */
enum nvidia_p2p_page_size_type {
    NVIDIA_P2P_PAGE_SIZE_4KB = 0,
    NVIDIA_P2P_PAGE_SIZE_64KB,
    NVIDIA_P2P_PAGE_SIZE_128KB,
    NVIDIA_P2P_PAGE_SIZE_COUNT
};
/* A single GPU page entry in a P2P page table. */
typedef
struct nvidia_p2p_page {
    uint64_t physical_address; /* physical address of the page */
    /* Per-architecture request register values for this page. */
    union nvidia_p2p_request_registers {
        struct {
            uint32_t wreqmb_h;
            uint32_t rreqmb_h;
            uint32_t rreqmb_0;
            uint32_t reserved[3];
        } fermi;
    } registers;
} nvidia_p2p_page_t;
#define NVIDIA_P2P_PAGE_TABLE_VERSION 0x00010002
#define NVIDIA_P2P_PAGE_TABLE_VERSION_COMPATIBLE(p) \
    NVIDIA_P2P_VERSION_COMPATIBLE(p, NVIDIA_P2P_PAGE_TABLE_VERSION)
/* Page table returned by nvidia_p2p_get_pages(); allocated and owned by
 * the NVIDIA driver, released via nvidia_p2p_put_pages() /
 * nvidia_p2p_free_page_table(). Check version before use. */
typedef
struct nvidia_p2p_page_table {
    uint32_t version;   /* NVIDIA_P2P_PAGE_TABLE_VERSION encoding */
    uint32_t page_size; /* enum nvidia_p2p_page_size_type */
    struct nvidia_p2p_page **pages; /* array of 'entries' page pointers */
    uint32_t entries;   /* number of valid entries in pages[] */
    uint8_t *gpu_uuid;  /* UUID of the owning GPU (NVIDIA_P2P_GPU_UUID_LEN bytes) */
} nvidia_p2p_page_table_t;
/*
* @brief
* Make the pages underlying a range of GPU virtual memory
* accessible to a third-party device.
*
* This API only supports pinned, GPU-resident memory, such as that provided
* by cudaMalloc().
*
* This API may sleep.
*
* @param[in] p2p_token
* A token that uniquely identifies the P2P mapping.
* @param[in] va_space
* A GPU virtual address space qualifier.
* @param[in] virtual_address
* The start address in the specified virtual address space.
* Address must be aligned to the 64KB boundary.
* @param[in] length
* The length of the requested P2P mapping.
* Length must be a multiple of 64KB.
* @param[out] page_table
* A pointer to an array of structures with P2P PTEs.
* @param[in] free_callback
* A pointer to the function to be invoked when the pages
* underlying the virtual address range are freed
* implicitly.
* If NULL, persistent pages will be returned.
* This means the pages underlying the range of GPU virtual memory
* will persist until explicitly freed by nvidia_p2p_put_pages().
* Persistent GPU memory mappings are not supported on PowerPC,
* MIG-enabled devices and vGPU.
* @param[in] data
* A non-NULL opaque pointer to private data to be passed to the
* callback function.
*
* @return
* 0 upon successful completion.
* -EINVAL if an invalid argument was supplied.
* -ENOTSUPP if the requested operation is not supported.
* -ENOMEM if the driver failed to allocate memory or if
* insufficient resources were available to complete the operation.
* -EIO if an unknown error occurred.
*/
int nvidia_p2p_get_pages(uint64_t p2p_token, uint32_t va_space,
uint64_t virtual_address,
uint64_t length,
struct nvidia_p2p_page_table **page_table,
void (*free_callback)(void *data),
void *data);
#define NVIDIA_P2P_DMA_MAPPING_VERSION 0x00020003
#define NVIDIA_P2P_DMA_MAPPING_VERSION_COMPATIBLE(p) \
    NVIDIA_P2P_VERSION_COMPATIBLE(p, NVIDIA_P2P_DMA_MAPPING_VERSION)
struct pci_dev;
/* DMA mapping returned by nvidia_p2p_dma_map_pages(); allocated and owned
 * by the NVIDIA driver, released via nvidia_p2p_dma_unmap_pages() /
 * nvidia_p2p_free_dma_mapping(). Check version before use. */
typedef
struct nvidia_p2p_dma_mapping {
    uint32_t version; /* NVIDIA_P2P_DMA_MAPPING_VERSION encoding */
    enum nvidia_p2p_page_size_type page_size_type;
    uint32_t entries;        /* number of addresses in dma_addresses[] */
    uint64_t *dma_addresses; /* bus addresses usable by the peer device */
    void *private;           /* NVIDIA driver internal state; do not touch */
    struct pci_dev *pci_dev; /* peer device this mapping was created for */
} nvidia_p2p_dma_mapping_t;
/*
* @brief
* Make the physical pages retrieved using nvidia_p2p_get_pages accessible to
* a third-party device.
*
* @param[in] peer
* The struct pci_dev * of the peer device that needs to DMA to/from the
* mapping.
* @param[in] page_table
* The page table outlining the physical pages underlying the mapping, as
* retrieved with nvidia_p2p_get_pages().
* @param[out] dma_mapping
* The DMA mapping containing the DMA addresses to use on the third-party
* device.
*
* @return
* 0 upon successful completion.
* -EINVAL if an invalid argument was supplied.
* -ENOTSUPP if the requested operation is not supported.
* -EIO if an unknown error occurred.
*/
int nvidia_p2p_dma_map_pages(struct pci_dev *peer,
struct nvidia_p2p_page_table *page_table,
struct nvidia_p2p_dma_mapping **dma_mapping);
/*
* @brief
* Unmap the physical pages previously mapped to the third-party device by
* nvidia_p2p_dma_map_pages().
*
* @param[in] peer
* The struct pci_dev * of the peer device that the DMA mapping belongs to.
* @param[in] page_table
* The page table backing the DMA mapping to be unmapped.
* @param[in] dma_mapping
* The DMA mapping containing the DMA addresses used by the third-party
* device, as retrieved with nvidia_p2p_dma_map_pages(). After this call
* returns, neither this struct nor the addresses contained within will be
* valid for use by the third-party device.
*
* @return
* 0 upon successful completion.
* -EINVAL if an invalid argument was supplied.
* -EIO if an unknown error occurred.
*/
int nvidia_p2p_dma_unmap_pages(struct pci_dev *peer,
struct nvidia_p2p_page_table *page_table,
struct nvidia_p2p_dma_mapping *dma_mapping);
/*
* @brief
* Release a set of pages previously made accessible to
* a third-party device.
*
* @param[in] p2p_token
* A token that uniquely identifies the P2P mapping.
* @param[in] va_space
* A GPU virtual address space qualifier.
* @param[in] virtual_address
* The start address in the specified virtual address space.
* @param[in] page_table
* A pointer to the array of structures with P2P PTEs.
*
* @return
* 0 upon successful completion.
* -EINVAL if an invalid argument was supplied.
* -EIO if an unknown error occurred.
*/
int nvidia_p2p_put_pages(uint64_t p2p_token, uint32_t va_space,
uint64_t virtual_address,
struct nvidia_p2p_page_table *page_table);
/*
* @brief
* Free a third-party P2P page table. (This function is a no-op.)
*
* @param[in] page_table
* A pointer to the array of structures with P2P PTEs.
*
* @return
* 0 upon successful completion.
* -EINVAL if an invalid argument was supplied.
*/
int nvidia_p2p_free_page_table(struct nvidia_p2p_page_table *page_table);
/*
* @brief
* Free a third-party P2P DMA mapping. (This function is a no-op.)
*
* @param[in] dma_mapping
* A pointer to the DMA mapping structure.
*
* @return
* 0 upon successful completion.
* -EINVAL if an invalid argument was supplied.
*/
int nvidia_p2p_free_dma_mapping(struct nvidia_p2p_dma_mapping *dma_mapping);
#define NVIDIA_P2P_RSYNC_DRIVER_VERSION 0x00010001
#define NVIDIA_P2P_RSYNC_DRIVER_VERSION_COMPATIBLE(p) \
    NVIDIA_P2P_VERSION_COMPATIBLE(p, NVIDIA_P2P_RSYNC_DRIVER_VERSION)
/* Callback table an rsync driver passes to
 * nvidia_p2p_register_rsync_driver(); semantics of each callback are
 * documented at the registration function below. */
typedef
struct nvidia_p2p_rsync_driver {
    uint32_t version; /* NVIDIA_P2P_RSYNC_DRIVER_VERSION encoding */
    int (*get_relaxed_ordering_mode)(int *mode, void *data);
    void (*put_relaxed_ordering_mode)(int mode, void *data);
    /* Must not sleep or reschedule; may be invoked under spinlocks. */
    void (*wait_for_rsync)(struct pci_dev *gpu, void *data);
} nvidia_p2p_rsync_driver_t;
/*
* @brief
* Registers the rsync driver.
*
* @param[in] driver
* A pointer to the rsync driver structure. The NVIDIA driver would use,
*
* get_relaxed_ordering_mode to obtain a reference to the current relaxed
* ordering mode (treated as a boolean) from the rsync driver.
*
* put_relaxed_ordering_mode to release a reference to the current relaxed
* ordering mode back to the rsync driver. The NVIDIA driver will call this
* function once for each successful call to get_relaxed_ordering_mode, and
* the relaxed ordering mode must not change until the last reference is
* released.
*
* wait_for_rsync to call into the rsync module to issue RSYNC. This callback
* can't sleep or re-schedule as it may arrive under spinlocks.
* @param[in] data
* A pointer to the rsync driver's private data.
*
* @Returns
* 0 upon successful completion.
* -EINVAL parameters are incorrect.
* -EBUSY if a module is already registered or GPU devices are in use.
*/
int nvidia_p2p_register_rsync_driver(nvidia_p2p_rsync_driver_t *driver,
void *data);
/*
* @brief
* Unregisters the rsync driver.
*
* @param[in] driver
* A pointer to the rsync driver structure.
* @param[in] data
* A pointer to the rsync driver's private data.
*/
void nvidia_p2p_unregister_rsync_driver(nvidia_p2p_rsync_driver_t *driver,
void *data);
#define NVIDIA_P2P_RSYNC_REG_INFO_VERSION 0x00020001
#define NVIDIA_P2P_RSYNC_REG_INFO_VERSION_COMPATIBLE(p) \
    NVIDIA_P2P_VERSION_COMPATIBLE(p, NVIDIA_P2P_RSYNC_REG_INFO_VERSION)
/* One rsync (GEN-ID) register mapping entry, associating an NPU/GPU pair
 * with its register window and topology identifiers. */
typedef struct nvidia_p2p_rsync_reg {
    void *ptr;               /* mapped register pointer */
    size_t size;             /* size of the mapped window */
    struct pci_dev *ibmnpu;  /* owning NPU device */
    struct pci_dev *gpu;     /* associated GPU device */
    uint32_t cluster_id;     /* currently always 0 (see note above API) */
    uint32_t socket_id;
} nvidia_p2p_rsync_reg_t;
/* Array of rsync register entries, allocated by
 * nvidia_p2p_get_rsync_registers() and freed by
 * nvidia_p2p_put_rsync_registers(). */
typedef struct nvidia_p2p_rsync_reg_info {
    uint32_t version; /* NVIDIA_P2P_RSYNC_REG_INFO_VERSION encoding */
    nvidia_p2p_rsync_reg_t *regs; /* array of 'entries' elements */
    size_t entries;
} nvidia_p2p_rsync_reg_info_t;
/*
* @brief
* Gets rsync (GEN-ID) register information associated with the supported
* NPUs.
*
* The caller would use the returned information {GPU device, NPU device,
* socket-id, cluster-id} to pick the optimal generation registers to issue
* RSYNC (NVLink HW flush).
*
* The interface allocates structures to return the information, hence
* nvidia_p2p_put_rsync_registers() must be called to free the structures.
*
* Note, cluster-id is hardcoded to zero as early system configurations would
* only support cluster mode i.e. all devices would share the same cluster-id
* (0). In the future, appropriate kernel support would be needed to query
* cluster-ids.
*
* @param[out] reg_info
* A pointer to the rsync reg info structure.
*
* @Returns
* 0 Upon successful completion. Otherwise, returns negative value.
*/
int nvidia_p2p_get_rsync_registers(nvidia_p2p_rsync_reg_info_t **reg_info);
/*
* @brief
* Frees the structures allocated by nvidia_p2p_get_rsync_registers().
*
* @param[in] reg_info
* A pointer to the rsync reg info structure.
*/
void nvidia_p2p_put_rsync_registers(nvidia_p2p_rsync_reg_info_t *reg_info);
#endif /* _NV_P2P_H_ */

View File

@@ -0,0 +1,61 @@
###########################################################################
# Kbuild fragment for nvidia-peermem.ko
###########################################################################
#
# Define NVIDIA_PEERMEM_{SOURCES,OBJECTS}
#
NVIDIA_PEERMEM_SOURCES =
NVIDIA_PEERMEM_SOURCES += nvidia-peermem/nvidia-peermem.c
NVIDIA_PEERMEM_OBJECTS = $(patsubst %.c,%.o,$(NVIDIA_PEERMEM_SOURCES))
obj-m += nvidia-peermem.o
nvidia-peermem-y := $(NVIDIA_PEERMEM_OBJECTS)
NVIDIA_PEERMEM_KO = nvidia-peermem/nvidia-peermem.ko
NV_KERNEL_MODULE_TARGETS += $(NVIDIA_PEERMEM_KO)
#
# Define nvidia-peermem.ko-specific CFLAGS.
#
NVIDIA_PEERMEM_CFLAGS += -I$(src)/nvidia-peermem
NVIDIA_PEERMEM_CFLAGS += -UDEBUG -U_DEBUG -DNDEBUG -DNV_BUILD_MODULE_INSTANCES=0
#
# In case of MOFED installation, nvidia-peermem compilation
# needs paths to the MOFED headers in CFLAGS.
# MOFED's Module.symvers is needed for the build
# to find the additional ib_* symbols.
#
OFA_DIR := /usr/src/ofa_kernel
# Candidate MOFED header locations, most specific (per-arch, per-kernel)
# first; the first existing directory wins, falling back to OFA_DIR.
OFA_CANDIDATES = $(OFA_DIR)/$(ARCH)/$(KERNELRELEASE) $(OFA_DIR)/$(KERNELRELEASE) $(OFA_DIR)/default /var/lib/dkms/mlnx-ofed-kernel
MLNX_OFED_KERNEL := $(shell for d in $(OFA_CANDIDATES); do \
    if [ -d "$$d" ]; then \
        echo "$$d"; \
        exit 0; \
    fi; \
    done; \
    echo $(OFA_DIR) \
    )
# Only wire up MOFED includes/symbols when the resolved directory exists;
# otherwise build against in-tree RDMA headers.
ifneq ($(shell test -d $(MLNX_OFED_KERNEL) && echo "true" || echo "" ),)
NVIDIA_PEERMEM_CFLAGS += -I$(MLNX_OFED_KERNEL)/include -I$(MLNX_OFED_KERNEL)/include/rdma
KBUILD_EXTRA_SYMBOLS := $(MLNX_OFED_KERNEL)/Module.symvers
endif
#
# Register the conftests needed by nvidia-peermem.ko
#
NV_OBJECTS_DEPEND_ON_CONFTEST += $(NVIDIA_PEERMEM_OBJECTS)
NV_CONFTEST_GENERIC_COMPILE_TESTS += ib_peer_memory_symbols
NV_CONFTEST_FUNCTION_COMPILE_TESTS +=
NV_CONFTEST_TYPE_COMPILE_TESTS +=

View File

@@ -0,0 +1,523 @@
/* SPDX-License-Identifier: Linux-OpenIB */
/*
* Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include <linux/mm.h>
#include <linux/dma-mapping.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/errno.h>
#include <linux/hugetlb.h>
#include <linux/pci.h>
#include "nv-p2p.h"
#include "peer_mem.h"
#include "conftest.h"
#define DRV_NAME "nv_mem"
#define DRV_VERSION NV_VERSION_STRING
MODULE_AUTHOR("Yishai Hadas");
MODULE_DESCRIPTION("NVIDIA GPU memory plug-in");
MODULE_LICENSE("Linux-OpenIB");
MODULE_VERSION(DRV_VERSION);
/* Values accepted by the peerdirect_support module parameter. */
enum {
    NV_MEM_PEERDIRECT_SUPPORT_DEFAULT = 0,
    NV_MEM_PEERDIRECT_SUPPORT_LEGACY = 1,
};
static int peerdirect_support = NV_MEM_PEERDIRECT_SUPPORT_DEFAULT;
module_param(peerdirect_support, int, S_IRUGO);
MODULE_PARM_DESC(peerdirect_support, "Set level of support for Peer-direct, 0 [default] or 1 [legacy, for example MLNX_OFED 4.9 LTS]");
#define peer_err(FMT, ARGS...) printk(KERN_ERR "nvidia-peermem" " %s:%d " FMT, __FUNCTION__, __LINE__, ## ARGS)
#if defined(NV_MLNX_IB_PEER_MEM_SYMBOLS_PRESENT)
/* Fallbacks for older kernels that only provide ACCESS_ONCE. */
#ifndef READ_ONCE
#define READ_ONCE(x) ACCESS_ONCE(x)
#endif
#ifndef WRITE_ONCE
#define WRITE_ONCE(x, val) ({ ACCESS_ONCE(x) = (val); })
#endif
/* NVIDIA P2P operates on fixed 64KB GPU pages. */
#define GPU_PAGE_SHIFT 16
#define GPU_PAGE_SIZE ((u64)1 << GPU_PAGE_SHIFT)
#define GPU_PAGE_OFFSET (GPU_PAGE_SIZE-1)
#define GPU_PAGE_MASK (~GPU_PAGE_OFFSET)
/* Invalidation entry point into the IB core, filled in by
 * ib_register_peer_memory_client(). */
invalidate_peer_memory mem_invalidate_callback;
/* Registration handle for the traditional (callback-based) client. */
static void *reg_handle = NULL;
/* Registration handle for the "nc" (persistent pages) client. */
static void *reg_handle_nc = NULL;
/* Per-acquire state tracking one GPU virtual address range. */
struct nv_mem_context {
    struct nvidia_p2p_page_table *page_table;   /* from nvidia_p2p_get_pages() */
    struct nvidia_p2p_dma_mapping *dma_mapping; /* from nvidia_p2p_dma_map_pages() */
    u64 core_context;     /* opaque IB core ticket used for invalidation */
    u64 page_virt_start;  /* range start, rounded down to GPU page */
    u64 page_virt_end;    /* range end, rounded up to GPU page */
    size_t mapped_size;   /* page_virt_end - page_virt_start */
    unsigned long npages; /* entries in the DMA mapping */
    unsigned long page_size; /* set to GPU_PAGE_SIZE in get_pages() */
    /* Task executing our invalidate callback; lets dma_unmap/put_pages
     * detect re-entry from that callback and skip double teardown. */
    struct task_struct *callback_task;
    int sg_allocated;         /* sg_head holds an allocated table */
    struct sg_table sg_head;  /* copy of the sg_table populated in nv_dma_map() */
};
/*
 * Free callback passed to nvidia_p2p_get_pages() in nv_mem_get_pages().
 * Invoked by the NVIDIA driver when the pages backing the GPU virtual
 * range are freed implicitly; forwards the invalidation to the IB core and
 * then releases the driver-owned page table and DMA mapping structures.
 */
static void nv_get_p2p_free_callback(void *data)
{
    int ret = 0;
    struct nv_mem_context *nv_mem_context = (struct nv_mem_context *)data;
    struct nvidia_p2p_page_table *page_table = NULL;
    struct nvidia_p2p_dma_mapping *dma_mapping = NULL;

    /* Pin the module so it cannot unload while this teardown runs. */
    __module_get(THIS_MODULE);
    if (!nv_mem_context) {
        peer_err("nv_get_p2p_free_callback -- invalid nv_mem_context\n");
        goto out;
    }
    if (!nv_mem_context->page_table) {
        peer_err("nv_get_p2p_free_callback -- invalid page_table\n");
        goto out;
    }
    /* Save page_table locally to prevent it being freed as part of nv_mem_release
     * in case it's called internally by that callback.
     */
    page_table = nv_mem_context->page_table;
    if (!nv_mem_context->dma_mapping) {
        peer_err("nv_get_p2p_free_callback -- invalid dma_mapping\n");
        goto out;
    }
    dma_mapping = nv_mem_context->dma_mapping;
    /* For now don't set nv_mem_context->page_table to NULL,
     * confirmed by NVIDIA that inflight put_pages with valid pointer will fail gracefully.
     */
    /* Record the current task so nv_dma_unmap()/nv_mem_put_pages() can
     * detect re-entry from this callback and skip their own teardown. */
    nv_mem_context->callback_task = current;
    (*mem_invalidate_callback) (reg_handle, nv_mem_context->core_context);
    nv_mem_context->callback_task = NULL;
    ret = nvidia_p2p_free_dma_mapping(dma_mapping);
    if (ret)
        peer_err("nv_get_p2p_free_callback -- error %d while calling nvidia_p2p_free_dma_mapping()\n", ret);
    ret = nvidia_p2p_free_page_table(page_table);
    if (ret)
        peer_err("nv_get_p2p_free_callback -- error %d while calling nvidia_p2p_free_page_table()\n", ret);
out:
    module_put(THIS_MODULE);
    return;
}
/* At that function we don't call IB core - no ticket exists */
static void nv_mem_dummy_callback(void *data)
{
struct nv_mem_context *nv_mem_context = (struct nv_mem_context *)data;
int ret = 0;
__module_get(THIS_MODULE);
ret = nvidia_p2p_free_page_table(nv_mem_context->page_table);
if (ret)
peer_err("nv_mem_dummy_callback -- error %d while calling nvidia_p2p_free_page_table()\n", ret);
module_put(THIS_MODULE);
return;
}
/* acquire return code: 1 mine, 0 - not mine */
static int nv_mem_acquire(unsigned long addr, size_t size, void *peer_mem_private_data,
char *peer_mem_name, void **client_context)
{
int ret = 0;
struct nv_mem_context *nv_mem_context;
nv_mem_context = kzalloc(sizeof *nv_mem_context, GFP_KERNEL);
if (!nv_mem_context)
/* Error case handled as not mine */
return 0;
nv_mem_context->page_virt_start = addr & GPU_PAGE_MASK;
nv_mem_context->page_virt_end = (addr + size + GPU_PAGE_SIZE - 1) & GPU_PAGE_MASK;
nv_mem_context->mapped_size = nv_mem_context->page_virt_end - nv_mem_context->page_virt_start;
ret = nvidia_p2p_get_pages(0, 0, nv_mem_context->page_virt_start, nv_mem_context->mapped_size,
&nv_mem_context->page_table, nv_mem_dummy_callback, nv_mem_context);
if (ret < 0)
goto err;
ret = nvidia_p2p_put_pages(0, 0, nv_mem_context->page_virt_start,
nv_mem_context->page_table);
if (ret < 0) {
/* Not expected, however in case callback was called on that buffer just before
put pages we'll expect to fail gracefully (confirmed by NVIDIA) and return an error.
*/
peer_err("nv_mem_acquire -- error %d while calling nvidia_p2p_put_pages()\n", ret);
goto err;
}
/* 1 means mine */
*client_context = nv_mem_context;
__module_get(THIS_MODULE);
return 1;
err:
kfree(nv_mem_context);
/* Error case handled as not mine */
return 0;
}
/*
 * dma_map peer-memory callback: create the DMA mapping for the previously
 * pinned GPU pages and populate sg_head with one entry per 64KB GPU page.
 * Returns 0 on success or a negative errno; on success *nmap holds the
 * number of mapped entries.
 */
static int nv_dma_map(struct sg_table *sg_head, void *context,
                      struct device *dma_device, int dmasync,
                      int *nmap)
{
    int i, ret;
    struct scatterlist *sg;
    struct nv_mem_context *nv_mem_context =
        (struct nv_mem_context *) context;
    struct nvidia_p2p_page_table *page_table = nv_mem_context->page_table;
    struct nvidia_p2p_dma_mapping *dma_mapping;
    struct pci_dev *pdev = to_pci_dev(dma_device);

    /* The whole sg construction below assumes fixed 64KB GPU pages. */
    if (page_table->page_size != NVIDIA_P2P_PAGE_SIZE_64KB) {
        peer_err("nv_dma_map -- assumption of 64KB pages failed size_id=%u\n",
                 nv_mem_context->page_table->page_size);
        return -EINVAL;
    }
    if (!pdev) {
        peer_err("nv_dma_map -- invalid pci_dev\n");
        return -EINVAL;
    }
    ret = nvidia_p2p_dma_map_pages(pdev, page_table, &dma_mapping);
    if (ret) {
        peer_err("nv_dma_map -- error %d while calling nvidia_p2p_dma_map_pages()\n", ret);
        return ret;
    }
    /* Reject mappings produced under an incompatible struct layout. */
    if (!NVIDIA_P2P_DMA_MAPPING_VERSION_COMPATIBLE(dma_mapping)) {
        peer_err("error, incompatible dma mapping version 0x%08x\n",
                 dma_mapping->version);
        nvidia_p2p_dma_unmap_pages(pdev, page_table, dma_mapping);
        return -EINVAL;
    }
    nv_mem_context->npages = dma_mapping->entries;
    ret = sg_alloc_table(sg_head, dma_mapping->entries, GFP_KERNEL);
    if (ret) {
        /* Unwind the DMA mapping created above before failing. */
        nvidia_p2p_dma_unmap_pages(pdev, page_table, dma_mapping);
        return ret;
    }
    nv_mem_context->dma_mapping = dma_mapping;
    nv_mem_context->sg_allocated = 1;
    /* Fill one sg entry per GPU page; no struct page backs these
     * addresses, hence sg_set_page() with a NULL page. */
    for_each_sg(sg_head->sgl, sg, nv_mem_context->npages, i) {
        sg_set_page(sg, NULL, nv_mem_context->page_size, 0);
        sg->dma_address = dma_mapping->dma_addresses[i];
        sg->dma_length = nv_mem_context->page_size;
    }
    /* Keep a copy so later callbacks can validate the caller's sg_head. */
    nv_mem_context->sg_head = *sg_head;
    *nmap = nv_mem_context->npages;
    return 0;
}
static int nv_dma_unmap(struct sg_table *sg_head, void *context,
struct device *dma_device)
{
struct pci_dev *pdev = to_pci_dev(dma_device);
struct nv_mem_context *nv_mem_context =
(struct nv_mem_context *)context;
if (!nv_mem_context) {
peer_err("nv_dma_unmap -- invalid nv_mem_context\n");
return -EINVAL;
}
if (WARN_ON(0 != memcmp(sg_head, &nv_mem_context->sg_head, sizeof(*sg_head))))
return -EINVAL;
if (nv_mem_context->callback_task == current)
goto out;
if (nv_mem_context->dma_mapping)
nvidia_p2p_dma_unmap_pages(pdev, nv_mem_context->page_table,
nv_mem_context->dma_mapping);
out:
return 0;
}
static void nv_mem_put_pages(struct sg_table *sg_head, void *context)
{
int ret = 0;
struct nv_mem_context *nv_mem_context =
(struct nv_mem_context *) context;
if (!nv_mem_context) {
peer_err("nv_mem_put_pages -- invalid nv_mem_context\n");
return;
}
if (WARN_ON(0 != memcmp(sg_head, &nv_mem_context->sg_head, sizeof(*sg_head))))
return;
if (nv_mem_context->callback_task == current)
return;
ret = nvidia_p2p_put_pages(0, 0, nv_mem_context->page_virt_start,
nv_mem_context->page_table);
#ifdef _DEBUG_ONLY_
/* Here we expect an error in real life cases that should be ignored - not printed.
* (e.g. concurrent callback with that call)
*/
if (ret < 0) {
printk(KERN_ERR "error %d while calling nvidia_p2p_put_pages, page_table=%p \n",
ret, nv_mem_context->page_table);
}
#endif
return;
}
/* release peer-memory callback: free the per-range context allocated in
 * nv_mem_acquire() and drop the module reference taken there. */
static void nv_mem_release(void *context)
{
    struct nv_mem_context *ctx = context;

    /* Free the sg_table populated in nv_dma_map(), if any. */
    if (ctx->sg_allocated) {
        sg_free_table(&ctx->sg_head);
        ctx->sg_allocated = 0;
    }
    kfree(ctx);
    module_put(THIS_MODULE);
}
/* get_pages peer-memory callback (traditional client): pin the GPU pages
 * for the range probed in nv_mem_acquire(), registering
 * nv_get_p2p_free_callback() for implicit invalidation. */
static int nv_mem_get_pages(unsigned long addr,
                            size_t size, int write, int force,
                            struct sg_table *sg_head,
                            void *client_context,
                            u64 core_context)
{
    struct nv_mem_context *ctx = client_context;
    int rc;

    if (ctx == NULL)
        return -EINVAL;

    ctx->core_context = core_context;
    ctx->page_size = GPU_PAGE_SIZE;

    rc = nvidia_p2p_get_pages(0, 0, ctx->page_virt_start, ctx->mapped_size,
                              &ctx->page_table, nv_get_p2p_free_callback, ctx);
    if (rc < 0) {
        peer_err("error %d while calling nvidia_p2p_get_pages()\n", rc);
        return rc;
    }

    /* No extra access to ctx->page_table here: we are not called under a
     * lock and may race with an inflight invalidate callback on this
     * buffer. Any further handling is deferred to nv_dma_map(). */
    return 0;
}
/* get_page_size peer-memory callback: report the mapping's page size
 * (always GPU_PAGE_SIZE, set in the get_pages callbacks). */
static unsigned long nv_mem_get_page_size(void *context)
{
    struct nv_mem_context *ctx = context;

    return ctx->page_size;
}
/* Traditional peer-memory client ("nv_mem"): uses the NVIDIA driver's
 * free callback for invalidation. The ex_size/flags fields of the
 * extended wrapper are filled in at init time based on
 * peerdirect_support. */
static struct peer_memory_client_ex nv_mem_client_ex = { .client = {
    .acquire = nv_mem_acquire,
    .get_pages = nv_mem_get_pages,
    .dma_map = nv_dma_map,
    .dma_unmap = nv_dma_unmap,
    .put_pages = nv_mem_put_pages,
    .get_page_size = nv_mem_get_page_size,
    .release = nv_mem_release,
}};
/* get_pages peer-memory callback for the "nc" client: pin the GPU pages
 * as persistent pages (NULL free callback), so they remain valid until
 * explicitly released via nv_mem_put_pages(). */
static int nv_mem_get_pages_nc(unsigned long addr,
                               size_t size, int write, int force,
                               struct sg_table *sg_head,
                               void *client_context,
                               u64 core_context)
{
    struct nv_mem_context *ctx = client_context;
    int rc;

    if (ctx == NULL)
        return -EINVAL;

    ctx->core_context = core_context;
    ctx->page_size = GPU_PAGE_SIZE;

    /* NULL callback requests persistent pages from the NVIDIA driver. */
    rc = nvidia_p2p_get_pages(0, 0, ctx->page_virt_start, ctx->mapped_size,
                              &ctx->page_table, NULL, NULL);
    if (rc < 0) {
        peer_err("error %d while calling nvidia_p2p_get_pages() with NULL callback\n", rc);
        return rc;
    }

    return 0;
}
/* "nc" peer-memory client ("nv_mem_nc"): identical to the traditional
 * client except get_pages requests persistent pages (no invalidate
 * callback from the NVIDIA driver). */
static struct peer_memory_client nv_mem_client_nc = {
    .acquire = nv_mem_acquire,
    .get_pages = nv_mem_get_pages_nc,
    .dma_map = nv_dma_map,
    .dma_unmap = nv_dma_unmap,
    .put_pages = nv_mem_put_pages,
    .get_page_size = nv_mem_get_page_size,
    .release = nv_mem_release,
};
#endif /* NV_MLNX_IB_PEER_MEM_SYMBOLS_PRESENT */
/* Validate the peerdirect_support module parameter.
 * Returns 0 when it holds a recognized value, -EINVAL otherwise. */
static int nv_mem_param_conf_check(void)
{
    if (peerdirect_support == NV_MEM_PEERDIRECT_SUPPORT_DEFAULT ||
        peerdirect_support == NV_MEM_PEERDIRECT_SUPPORT_LEGACY)
        return 0;

    peer_err("invalid peerdirect_support param value %d\n", peerdirect_support);
    return -EINVAL;
}
/*
 * Module init: validate the peerdirect_support parameter, then register
 * both peer-memory clients with the IB core:
 *  - nv_mem_client_ex ("nv_mem"): traditional client relying on the
 *    NVIDIA driver's invalidate callback.
 *  - nv_mem_client_nc ("nv_mem_nc"): persistent-pages client; requires
 *    nvidia_p2p_cap_persistent_pages support in nvidia.ko.
 * If either registration fails, any successful registration is rolled
 * back. Returns 0 on success or a negative errno; always -EINVAL when
 * built without NV_MLNX_IB_PEER_MEM_SYMBOLS_PRESENT.
 */
static int __init nv_mem_client_init(void)
{
    /* Fix: declarations must precede statements — the original declared
     * "int status" after the conf-check statements, which breaks under
     * the kernel's -Werror=declaration-after-statement builds. */
    int rc;
#if defined (NV_MLNX_IB_PEER_MEM_SYMBOLS_PRESENT)
    int status = 0;
#endif

    rc = nv_mem_param_conf_check();
    if (rc) {
        return rc;
    }

#if defined (NV_MLNX_IB_PEER_MEM_SYMBOLS_PRESENT)
    // off by one, to leave space for the trailing '1' which is flagging
    // the new client type
    BUG_ON(strlen(DRV_NAME) > IB_PEER_MEMORY_NAME_MAX-1);
    strcpy(nv_mem_client_ex.client.name, DRV_NAME);

    // [VER_MAX-1]=1 <-- last byte is used as flag
    // [VER_MAX-2]=0 <-- version string terminator
    BUG_ON(strlen(DRV_VERSION) > IB_PEER_MEMORY_VER_MAX-2);
    strcpy(nv_mem_client_ex.client.version, DRV_VERSION);
    nv_mem_client_ex.client.version[IB_PEER_MEMORY_VER_MAX-1] = 1;

    if (peerdirect_support != NV_MEM_PEERDIRECT_SUPPORT_LEGACY) {
        nv_mem_client_ex.ex_size = sizeof(struct peer_memory_client_ex);
        // PEER_MEM_INVALIDATE_UNMAPS allow clients to opt out of
        // unmap/put_pages during invalidation, i.e. the client tells the
        // infiniband layer that it does not need to call
        // unmap/put_pages in the invalidation callback
        nv_mem_client_ex.flags = PEER_MEM_INVALIDATE_UNMAPS;
    } else {
        nv_mem_client_ex.ex_size = 0;
        nv_mem_client_ex.flags = 0;
    }

    reg_handle = ib_register_peer_memory_client(&nv_mem_client_ex.client,
                                                &mem_invalidate_callback);
    if (!reg_handle) {
        peer_err("nv_mem_client_init -- error while registering traditional client\n");
        status = -EINVAL;
        goto out;
    }

    // The nc client enables support for persistent pages.
    // Thanks to this check, nvidia-peermem requires the new symbol from nvidia.ko, which
    // prevents users to unintentionally load this module with unsupported nvidia.ko.
    BUG_ON(!nvidia_p2p_cap_persistent_pages);
    strcpy(nv_mem_client_nc.name, DRV_NAME "_nc");
    strcpy(nv_mem_client_nc.version, DRV_VERSION);
    reg_handle_nc = ib_register_peer_memory_client(&nv_mem_client_nc, NULL);
    if (!reg_handle_nc) {
        peer_err("nv_mem_client_init -- error while registering nc client\n");
        status = -EINVAL;
        goto out;
    }

out:
    if (status) {
        /* Roll back whichever registration(s) succeeded. */
        if (reg_handle) {
            ib_unregister_peer_memory_client(reg_handle);
            reg_handle = NULL;
        }
        if (reg_handle_nc) {
            ib_unregister_peer_memory_client(reg_handle_nc);
            reg_handle_nc = NULL;
        }
    }
    return status;
#else
    return -EINVAL;
#endif
}
/*
 * Module exit: unregister whichever peer-memory clients were registered
 * in nv_mem_client_init(). The IB core waits for outstanding users as
 * part of unregistration.
 */
static void __exit nv_mem_client_cleanup(void)
{
#if defined (NV_MLNX_IB_PEER_MEM_SYMBOLS_PRESENT)
    if (reg_handle)
        ib_unregister_peer_memory_client(reg_handle);
    if (reg_handle_nc)
        ib_unregister_peer_memory_client(reg_handle_nc);
#endif
}
module_init(nv_mem_client_init);
module_exit(nv_mem_client_cleanup);

View File

@@ -0,0 +1,196 @@
/* SPDX-License-Identifier: Linux-OpenIB */
/*
* Copyright (c) 2014-2020, Mellanox Technologies. All rights reserved.
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#ifndef RDMA_PEER_MEM_H
#define RDMA_PEER_MEM_H
#include <linux/scatterlist.h>
#define IB_PEER_MEMORY_NAME_MAX 64
#define IB_PEER_MEMORY_VER_MAX 16
/*
* Prior versions used a void * for core_context, at some point this was
* switched to use u64. Be careful if compiling this as 32 bit. To help the
* value of core_context is limited to u32 so it should work OK despite the
* type change.
*/
#define PEER_MEM_U64_CORE_CONTEXT
struct device;
/**
 * struct peer_memory_client - registration information for user virtual
 * memory handlers
 *
 * The peer_memory_client scheme allows a driver to register with the ib_umem
 * system that it has the ability to understand user virtual address ranges
 * that are not compatible with get_user_pages(). For instance VMAs created
 * with io_remap_pfn_range(), or other driver special VMA.
 *
 * For ranges the interface understands it can provide a DMA mapped sg_table
 * for use by the ib_umem, allowing user virtual ranges that cannot be
 * supported by get_user_pages() to be used as umems.
 *
 * Lifecycle of a range: acquire() -> get_pages() -> dma_map() ->
 * dma_unmap() -> put_pages() -> release(). Each member's contract is
 * documented below.
 */
struct peer_memory_client {
/* Identification strings reported for this client; buffer sizes are
 * IB_PEER_MEMORY_NAME_MAX / IB_PEER_MEMORY_VER_MAX. */
char name[IB_PEER_MEMORY_NAME_MAX];
char version[IB_PEER_MEMORY_VER_MAX];
/**
 * acquire - Begin working with a user space virtual address range
 *
 * @addr - Virtual address to be checked whether belongs to peer.
 * @size - Length of the virtual memory area starting at addr.
 * @peer_mem_private_data - Obsolete, always NULL
 * @peer_mem_name - Obsolete, always NULL
 * @client_context - Returns an opaque value for this acquire use in
 * other APIs
 *
 * Returns 1 if the peer_memory_client supports the entire virtual
 * address range, 0 or -ERRNO otherwise. If 1 is returned then
 * release() will be called to release the acquire().
 */
int (*acquire)(unsigned long addr, size_t size,
void *peer_mem_private_data, char *peer_mem_name,
void **client_context);
/**
 * get_pages - Fill in the first part of a sg_table for a virtual
 * address range
 *
 * @addr - Virtual address to be checked whether belongs to peer.
 * @size - Length of the virtual memory area starting at addr.
 * @write - Always 1
 * @force - 1 if write is required
 * @sg_head - Obsolete, always NULL
 * @client_context - Value returned by acquire()
 * @core_context - Value to be passed to invalidate_peer_memory for
 * this get
 *
 * addr/size are passed as the raw virtual address range requested by
 * the user, it is not aligned to any page size. get_pages() is always
 * followed by dma_map().
 *
 * Upon return the caller can call the invalidate_callback().
 *
 * Returns 0 on success, -ERRNO on failure. After success put_pages()
 * will be called to return the pages.
 */
int (*get_pages)(unsigned long addr, size_t size, int write, int force,
struct sg_table *sg_head, void *client_context,
u64 core_context);
/**
 * dma_map - Create a DMA mapped sg_table
 *
 * @sg_head - The sg_table to allocate
 * @client_context - Value returned by acquire()
 * @dma_device - The device that will be doing DMA from these addresses
 * @dmasync - Obsolete, always 0
 * @nmap - Returns the number of dma mapped entries in the sg_head
 *
 * Must be called after get_pages(). This must fill in the sg_head with
 * DMA mapped SGLs for dma_device. Each SGL start and end must meet a
 * minimum alignment of at least PAGE_SIZE, though individual sgls can
 * be multiples of PAGE_SIZE, in any mixture. Since the user virtual
 * address/size are not page aligned, the implementation must increase
 * it to the logical alignment when building the SGLs.
 *
 * Returns 0 on success, -ERRNO on failure. After success dma_unmap()
 * will be called to unmap the pages. On failure sg_head must be left
 * untouched or point to a valid sg_table.
 */
int (*dma_map)(struct sg_table *sg_head, void *client_context,
struct device *dma_device, int dmasync, int *nmap);
/**
 * dma_unmap - Unmap a DMA mapped sg_table
 *
 * @sg_head - The sg_table to unmap
 * @client_context - Value returned by acquire()
 * @dma_device - The device that will be doing DMA from these addresses
 *
 * sg_head will not be touched after this function returns.
 *
 * Must return 0.
 */
int (*dma_unmap)(struct sg_table *sg_head, void *client_context,
struct device *dma_device);
/**
 * put_pages - Unpin a SGL
 *
 * @sg_head - The sg_table to unpin
 * @client_context - Value returned by acquire()
 *
 * sg_head must be freed on return.
 */
void (*put_pages)(struct sg_table *sg_head, void *client_context);
/* Obsolete, not used */
unsigned long (*get_page_size)(void *client_context);
/**
 * release - Undo acquire
 *
 * @client_context - Value returned by acquire()
 *
 * If acquire() returns 1 then release() must be called. All
 * get_pages() and dma_map()'s must be undone before calling this
 * function.
 */
void (*release)(void *client_context);
};
/* Bit values for the 'flags' member of struct peer_memory_client_ex */
enum {
/*
 * NOTE(review): the name suggests the client's invalidation path also
 * performs the unmap/unpin itself — confirm the exact semantics
 * against the ib_core consumer of this flag.
 */
PEER_MEM_INVALIDATE_UNMAPS = 1 << 0,
};
/*
 * Extended registration structure embedding the base
 * struct peer_memory_client as its first member.
 */
struct peer_memory_client_ex {
struct peer_memory_client client; /* base callbacks and identification */
/*
 * NOTE(review): presumably the size of the client's extended structure,
 * used by ib_core to detect which extension fields are present —
 * confirm against the registration implementation.
 */
size_t ex_size;
u32 flags; /* PEER_MEM_* bits from the enum above */
};
/*
 * If invalidate_callback() is non-NULL then the client will only support
 * umems which can be invalidated. The caller may call the
 * invalidate_callback() after acquire(); when it returns, the range will no
 * longer have DMA active, and release() will have been called.
 *
 * Note: The implementation locking must ensure that get_pages(), and
 * dma_map() do not have locking dependencies with invalidate_callback(). The
 * ib_core will wait until any concurrent get_pages() or dma_map() completes
 * before returning.
 *
 * Similarly, this can call dma_unmap(), put_pages() and release() from within
 * the callback, or will wait for another thread doing those operations to
 * complete.
 *
 * For these reasons the user of invalidate_callback() must be careful with
 * locking.
 *
 * @reg_handle - handle returned by ib_register_peer_memory_client()
 * @core_context - the value that was supplied to get_pages(); identifies
 * which pinned range is being invalidated
 */
typedef int (*invalidate_peer_memory)(void *reg_handle, u64 core_context);
/**
 * ib_register_peer_memory_client - register a peer memory client with ib_core
 *
 * @peer_client - the client's callback table (struct peer_memory_client)
 * @invalidate_callback - pointer to an invalidate_peer_memory function
 * pointer. NOTE(review): given the double indirection this appears to be an
 * out-parameter through which ib_core hands back the invalidation entry
 * point the client should call — confirm against the ib_core implementation.
 *
 * Returns an opaque registration handle to pass to
 * ib_unregister_peer_memory_client(), or NULL-style failure per the
 * implementation (not visible from this header).
 */
void *
ib_register_peer_memory_client(const struct peer_memory_client *peer_client,
invalidate_peer_memory *invalidate_callback);
/**
 * ib_unregister_peer_memory_client - undo ib_register_peer_memory_client()
 *
 * @reg_handle - handle returned by ib_register_peer_memory_client()
 */
void ib_unregister_peer_memory_client(void *reg_handle);
#endif