570.86.15

Bernhard Stoeckner
2025-01-27 19:36:56 +01:00
parent 9d0b0414a5
commit 54d69484da
1166 changed files with 318863 additions and 182687 deletions

View File

@@ -64,7 +64,9 @@
* old or even just user disabled. If we should use LKCA, include headers, else
* define stubs to return errors.
*/
#if defined(NV_CRYPTO_PRESENT) && defined (NV_CONFIG_CRYPTO_PRESENT)
#if defined(NV_CRYPTO_PRESENT) && defined (NV_CONFIG_CRYPTO_PRESENT) && \
(defined(NV_CRYPTO_AKCIPHER_VERIFY_PRESENT) || \
(defined(NV_CRYPTO_SIG_H_PRESENT) && defined(NV_ECC_DIGITS_FROM_BYTES_PRESENT)))
#define USE_LKCA 1
#endif

View File

@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
@@ -30,14 +30,26 @@ MODULE_SOFTDEP("pre: ecdh_generic,ecdsa_generic");
#include <crypto/akcipher.h>
#include <crypto/ecdh.h>
#include <crypto/internal/ecc.h>
#ifndef NV_CRYPTO_AKCIPHER_VERIFY_PRESENT
#include <crypto/sig.h>
struct signature
{
u64 r[ECC_MAX_DIGITS];
u64 s[ECC_MAX_DIGITS];
};
#endif // NV_CRYPTO_AKCIPHER_VERIFY_PRESENT
#define ECDSA_PUBKEY_HEADER_XY_PRESENT (0x4)
struct ecc_ctx {
unsigned int curve_id;
u64 priv_key[ECC_MAX_DIGITS]; // In big endian
struct {
// ecdsa wants byte preceding pub_key to be set to '4'
u64 pub_key_prefix;
// ecdsa pubkey has header indicating length of pubkey
u8 padding[7];
u8 pub_key_prefix;
u64 pub_key[2 * ECC_MAX_DIGITS];
};
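/*
 * Illustrative note, not part of the commit: kernel ECDSA expects the public
 * key as a single buffer of 1 + 2 * keysize bytes whose leading byte is 0x04,
 * the "uncompressed point, X and Y present" marker. Placing a one-byte
 * prefix field immediately before pub_key[] lets the driver pass
 * (u8 *)&pub_key_prefix with length ctx->size + 1 to the crypto API without
 * copying the key material.
 */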
@@ -46,7 +58,7 @@ struct ecc_ctx {
char const *name;
int size;
};
#endif
#endif // USE_LKCA
void *libspdm_ec_new_by_nid(size_t nid)
{
@@ -77,7 +89,7 @@ void *libspdm_ec_new_by_nid(size_t nid)
ctx->priv_key_set = false;
return ctx;
#endif
#endif // USE_LKCA
}
void libspdm_ec_free(void *ec_context)
@@ -109,7 +121,7 @@ bool lkca_ecdsa_set_priv_key(void *context, uint8_t *key, size_t key_size)
ctx->pub_key_set = true;
ctx->priv_key_set = true;
return true;
#endif
#endif // USE_LKCA
}
bool lkca_ec_set_pub_key(void *ec_context, const uint8_t *public_key,
@@ -139,7 +151,7 @@ bool lkca_ec_set_pub_key(void *ec_context, const uint8_t *public_key,
memcpy(ctx->pub_key, public_key, public_key_size);
ctx->pub_key_set = true;
return true;
#endif
#endif // USE_LKCA
}
bool lkca_ec_get_pub_key(void *ec_context, uint8_t *public_key,
@@ -158,7 +170,7 @@ bool lkca_ec_get_pub_key(void *ec_context, uint8_t *public_key,
memcpy(public_key, ctx->pub_key, ctx->size);
return true;
#endif
#endif // USE_LKCA
}
bool lkca_ec_generate_key(void *ec_context, uint8_t *public_data,
@@ -185,7 +197,7 @@ bool lkca_ec_generate_key(void *ec_context, uint8_t *public_data,
ctx->pub_key_set = true;
return true;
#endif
#endif // USE_LKCA
}
bool lkca_ec_compute_key(void *ec_context, const uint8_t *peer_public,
@@ -218,28 +230,87 @@ bool lkca_ec_compute_key(void *ec_context, const uint8_t *peer_public,
*key_size = ctx->size / 2;
return true;
#endif
#endif // USE_LKCA
}
bool lkca_ecdsa_verify(void *ec_context, size_t hash_nid,
const uint8_t *message_hash, size_t hash_size,
const uint8_t *signature, size_t sig_size)
#ifndef NV_CRYPTO_AKCIPHER_VERIFY_PRESENT
static bool lkca_ecdsa_verify_crypto_sig(void *ec_context, size_t hash_nid,
const uint8_t *message_hash, size_t hash_size,
const uint8_t *signature, size_t sig_size)
{
#ifndef USE_LKCA
return false;
#else
#else // USE_LKCA
struct ecc_ctx *ctx = ec_context;
u8 *pub_key;
int err;
DECLARE_CRYPTO_WAIT(wait);
struct crypto_sig * tfm = NULL;
struct signature sig;
if (sig_size != ctx->size || !ctx->pub_key_set)
{
return false;
}
tfm = crypto_alloc_sig(ctx->name, CRYPTO_ALG_TYPE_SIG, 0);
if (IS_ERR(tfm)) {
pr_info("crypto_alloc_sig failed in lkca_ecdsa_verify\n");
return false;
}
// modify header of pubkey to indicate size
pub_key = (u8 *) &(ctx->pub_key_prefix);
*pub_key = ECDSA_PUBKEY_HEADER_XY_PRESENT;
err = crypto_sig_set_pubkey(tfm, pub_key, ctx->size + 1);
if (err != 0)
{
pr_info("crypto_sig_set_pubkey failed in lkca_ecdsa_verify: %d", -err);
goto failTfm;
}
//
// Compared to the way we receive the signature, we need to:
// - swap order of all digits
// - swap endianness for each digit
//
memset(&sig, 0, sizeof(sig));
ecc_digits_from_bytes(signature, ctx->size/2, sig.r, ECC_MAX_DIGITS);
ecc_digits_from_bytes(signature + ctx->size/2, ctx->size/2, sig.s, ECC_MAX_DIGITS);
err = crypto_sig_verify(tfm, (void *)&sig, sizeof(sig), message_hash, hash_size);
if (err != 0)
{
pr_info("crypto_sig_verify failed in lkca_ecdsa_verify %d\n", -err);
}
failTfm:
crypto_free_sig(tfm);
return err == 0;
#endif // USE_LKCA
}
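/*
 * Illustrative note, not part of the commit: the incoming signature is the
 * raw concatenation r || s, each half ctx->size/2 bytes in big-endian byte
 * order. ecc_digits_from_bytes() repacks each half into the u64 digit
 * arrays of struct signature, least-significant digit first, which is the
 * layout handed to crypto_sig_verify() above. For P-256, for example, the
 * first 32 bytes of the buffer become sig.r[0..3], with sig.r[0] holding the
 * least-significant eight bytes of r.
 */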
#else // NV_CRYPTO_AKCIPHER_VERIFY_PRESENT
static bool lkca_ecdsa_verify_akcipher(void *ec_context, size_t hash_nid,
const uint8_t *message_hash, size_t hash_size,
const uint8_t *signature, size_t sig_size)
{
#ifndef USE_LKCA
return false;
#else // USE_LKCA
struct ecc_ctx *ctx = ec_context;
u8 *pub_key;
int err;
DECLARE_CRYPTO_WAIT(wait);
// Roundabout way
u64 ber_max_len = 3 + 2 * (4 + (ECC_MAX_BYTES));
u64 ber_len = 0;
u8 *ber = NULL;
u8 *pub_key;
struct akcipher_request *req = NULL;
struct crypto_akcipher *tfm = NULL;
struct scatterlist sg;
DECLARE_CRYPTO_WAIT(wait);
int err;
if (sig_size != ctx->size) {
return false;
@@ -251,21 +322,21 @@ bool lkca_ecdsa_verify(void *ec_context, size_t hash_nid,
tfm = crypto_alloc_akcipher(ctx->name, CRYPTO_ALG_TYPE_AKCIPHER, 0);
if (IS_ERR(tfm)) {
pr_info("ALLOC FAILED\n");
pr_info("crypto_alloc_akcipher failed in lkca_ecdsa_verify\n");
return false;
}
pub_key = (u8 *) ctx->pub_key;
pub_key--; // Go back into byte of pub_key_prefix
*pub_key = 4; // And set it to 4 to placate kernel
// modify header of pubkey to indicate size
pub_key = (u8 *) &(ctx->pub_key_prefix);
*pub_key = ECDSA_PUBKEY_HEADER_XY_PRESENT;
if ((err = crypto_akcipher_set_pub_key(tfm, pub_key, ctx->size + 1)) != 0) {
pr_info("SET PUB KEY FAILED: %d\n", -err);
pr_info("crypto_akcipher_set_pub_key failed in lkca_ecdsa_verify: %d\n", -err);
goto failTfm;
}
req = akcipher_request_alloc(tfm, GFP_KERNEL);
if (IS_ERR(req)) {
pr_info("REQUEST ALLOC FAILED\n");
pr_info("akcipher_request_alloc failed in lkca_ecdsa_verify\n");
goto failTfm;
}
@@ -310,9 +381,8 @@ bool lkca_ecdsa_verify(void *ec_context, size_t hash_nid,
CRYPTO_TFM_REQ_MAY_SLEEP, crypto_req_done, &wait);
akcipher_request_set_crypt(req, &sg, NULL, ber_len, hash_size);
err = crypto_wait_req(crypto_akcipher_verify(req), &wait);
if (err != 0){
pr_info("Verify FAILED %d\n", -err);
pr_info("crypto_akcipher_verify failed in lkca_ecdsa_verify %d\n", -err);
}
kfree(ber);
@@ -322,5 +392,19 @@ failTfm:
crypto_free_akcipher(tfm);
return err == 0;
#endif
#endif // USE_LKCA
}
#endif // NV_CRYPTO_AKCIPHER_VERIFY_PRESENT
bool lkca_ecdsa_verify(void *ec_context, size_t hash_nid,
const uint8_t *message_hash, size_t hash_size,
const uint8_t *signature, size_t sig_size)
{
#ifndef NV_CRYPTO_AKCIPHER_VERIFY_PRESENT
return lkca_ecdsa_verify_crypto_sig(ec_context, hash_nid, message_hash, hash_size,
signature, sig_size);
#else // NV_CRYPTO_AKCIPHER_VERIFY_PRESENT
return lkca_ecdsa_verify_akcipher(ec_context, hash_nid, message_hash, hash_size,
signature, sig_size);
#endif // NV_CRYPTO_AKCIPHER_VERIFY_PRESENT
}

View File

@@ -44,11 +44,15 @@ static struct file_operations g_nv_caps_imex_fops =
.release = nv_caps_imex_release
};
struct
static struct class *g_nv_caps_imex_class;
static struct
{
NvBool initialized;
struct cdev cdev;
dev_t devno;
dev_t channel0;
struct device *dev_channel0;
} g_nv_caps_imex;
int NV_API_CALL nv_caps_imex_channel_get(int fd)
@@ -93,6 +97,72 @@ int NV_API_CALL nv_caps_imex_channel_count(void)
return NVreg_ImexChannelCount;
}
static void nv_caps_imex_remove_channel0(void)
{
if (g_nv_caps_imex_class == NULL)
return;
device_destroy(g_nv_caps_imex_class, g_nv_caps_imex.channel0);
class_destroy(g_nv_caps_imex_class);
g_nv_caps_imex_class = NULL;
}
#if defined(NV_CLASS_DEVNODE_HAS_CONST_ARG)
static char *nv_caps_imex_devnode(const struct device *dev, umode_t *mode)
#else
static char *nv_caps_imex_devnode(struct device *dev, umode_t *mode)
#endif
{
if (!mode)
return NULL;
//
// Handle only world visible channel0, otherwise let the kernel apply
// defaults (root only access)
//
if (dev->devt == g_nv_caps_imex.channel0)
*mode = S_IRUGO | S_IWUGO;
return NULL;
}
static int nv_caps_imex_add_channel0(void)
{
#if defined(NV_CLASS_CREATE_HAS_NO_OWNER_ARG)
g_nv_caps_imex_class = class_create("nvidia-caps-imex-channels");
#else
g_nv_caps_imex_class = class_create(THIS_MODULE, "nvidia-caps-imex-channels");
#endif
if (IS_ERR(g_nv_caps_imex_class))
{
nv_printf(NV_DBG_ERRORS, "nv-caps-imex failed to register class.\n");
return -1;
}
// Install udev callback
g_nv_caps_imex_class->devnode = nv_caps_imex_devnode;
g_nv_caps_imex.dev_channel0 = device_create(g_nv_caps_imex_class, NULL,
g_nv_caps_imex.channel0, NULL,
"nvidia-caps-imex-channels!channel%d", 0);
if (IS_ERR(g_nv_caps_imex.dev_channel0))
{
nv_printf(NV_DBG_ERRORS, "nv-caps-imex failed to create channel0.\n");
class_destroy(g_nv_caps_imex_class);
g_nv_caps_imex_class = NULL;
return -1;
}
nv_printf(NV_DBG_ERRORS, "nv-caps-imex channel0 created. "
"Make sure you are aware of the IMEX security model.\n");
return 0;
}
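/*
 * Illustrative note, not part of the commit: with the devnode callback
 * installed above, devtmpfs/udev should expose the node as
 *     /dev/nvidia-caps-imex-channels/channel0
 * (the '!' in the device_create() name acts as a path separator), with 0666
 * permissions, while the remaining IMEX channel nodes keep the kernel's
 * root-only defaults.
 */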
int NV_API_CALL nv_caps_imex_init(void)
{
int rc;
@@ -106,14 +176,13 @@ int NV_API_CALL nv_caps_imex_init(void)
if (NVreg_ImexChannelCount == 0)
{
nv_printf(NV_DBG_INFO, "nv-caps-imex is disabled.\n");
// Disable channel creation as well
NVreg_CreateImexChannel0 = 0;
return 0;
}
rc = alloc_chrdev_region(&g_nv_caps_imex.devno, 0,
g_nv_caps_imex_class = NULL;
g_nv_caps_imex.dev_channel0 = NULL;
rc = alloc_chrdev_region(&g_nv_caps_imex.channel0, 0,
NVreg_ImexChannelCount,
"nvidia-caps-imex-channels");
if (rc < 0)
@@ -126,7 +195,7 @@ int NV_API_CALL nv_caps_imex_init(void)
g_nv_caps_imex.cdev.owner = THIS_MODULE;
rc = cdev_add(&g_nv_caps_imex.cdev, g_nv_caps_imex.devno,
rc = cdev_add(&g_nv_caps_imex.cdev, g_nv_caps_imex.channel0,
NVreg_ImexChannelCount);
if (rc < 0)
{
@@ -134,12 +203,22 @@ int NV_API_CALL nv_caps_imex_init(void)
goto cdev_add_fail;
}
if (NVreg_CreateImexChannel0 == 1)
{
rc = nv_caps_imex_add_channel0();
if (rc < 0)
goto channel0_add_fail;
}
g_nv_caps_imex.initialized = NV_TRUE;
return 0;
channel0_add_fail:
cdev_del(&g_nv_caps_imex.cdev);
cdev_add_fail:
unregister_chrdev_region(g_nv_caps_imex.devno, NVreg_ImexChannelCount);
unregister_chrdev_region(g_nv_caps_imex.channel0, NVreg_ImexChannelCount);
return rc;
}
@@ -151,9 +230,11 @@ void NV_API_CALL nv_caps_imex_exit(void)
return;
}
nv_caps_imex_remove_channel0();
cdev_del(&g_nv_caps_imex.cdev);
unregister_chrdev_region(g_nv_caps_imex.devno, NVreg_ImexChannelCount);
unregister_chrdev_region(g_nv_caps_imex.channel0, NVreg_ImexChannelCount);
g_nv_caps_imex.initialized = NV_FALSE;
}

View File

@@ -370,67 +370,6 @@ static void nv_dma_unmap_scatterlist(nv_dma_map_t *dma_map)
nv_destroy_dma_map_scatterlist(dma_map);
}
static void nv_dma_nvlink_addr_compress
(
nv_dma_device_t *dma_dev,
NvU64 *va_array,
NvU64 page_count,
NvBool contig
)
{
#if defined(NVCPU_PPC64LE)
NvU64 addr = 0;
NvU64 i;
/*
* On systems that support NVLink sysmem links, apply the required address
* compression scheme when links are trained. Otherwise check that PCIe and
* NVLink DMA mappings are equivalent as per requirements of Bug 1920398.
*/
if (dma_dev->nvlink)
{
for (i = 0; i < (contig ? 1 : page_count); i++)
{
va_array[i] = nv_compress_nvlink_addr(va_array[i]);
}
return;
}
for (i = 0; i < (contig ? 1 : page_count); i++)
{
addr = nv_compress_nvlink_addr(va_array[i]);
if (WARN_ONCE(va_array[i] != addr,
"unexpected DMA address compression (0x%llx, 0x%llx)\n",
va_array[i], addr))
{
break;
}
}
#endif
}
static void nv_dma_nvlink_addr_decompress
(
nv_dma_device_t *dma_dev,
NvU64 *va_array,
NvU64 page_count,
NvBool contig
)
{
#if defined(NVCPU_PPC64LE)
NvU64 i;
if (dma_dev->nvlink)
{
for (i = 0; i < (contig ? 1 : page_count); i++)
{
va_array[i] = nv_expand_nvlink_addr(va_array[i]);
}
}
#endif
}
NV_STATUS NV_API_CALL nv_dma_map_sgt(
nv_dma_device_t *dma_dev,
NvU64 page_count,
@@ -479,8 +418,6 @@ NV_STATUS NV_API_CALL nv_dma_map_sgt(
else
{
*priv = dma_map;
nv_dma_nvlink_addr_compress(dma_dev, va_array, dma_map->page_count,
dma_map->contiguous);
}
return status;
@@ -575,8 +512,6 @@ static NV_STATUS NV_API_CALL nv_dma_map_pages(
else
{
*priv = dma_map;
nv_dma_nvlink_addr_compress(dma_dev, va_array, dma_map->page_count,
dma_map->contiguous);
}
return status;
@@ -908,8 +843,6 @@ NV_STATUS NV_API_CALL nv_dma_map_mmio
*va = *va + dma_dev->addressable_range.start;
}
nv_dma_nvlink_addr_compress(dma_dev, va, page_count, NV_TRUE);
return NV_OK;
#else
return NV_ERR_NOT_SUPPORTED;
@@ -924,8 +857,6 @@ void NV_API_CALL nv_dma_unmap_mmio
)
{
#if defined(NV_DMA_MAP_RESOURCE_PRESENT)
nv_dma_nvlink_addr_decompress(dma_dev, &va, page_count, NV_TRUE);
if (nv_dma_use_map_resource(dma_dev))
{
dma_unmap_resource(dma_dev->dev, va, page_count * PAGE_SIZE,
@@ -974,15 +905,6 @@ void NV_API_CALL nv_dma_cache_invalidate
#endif
}
/* Enable DMA-mapping over NVLink */
void NV_API_CALL nv_dma_enable_nvlink
(
nv_dma_device_t *dma_dev
)
{
dma_dev->nvlink = NV_TRUE;
}
#if defined(NV_LINUX_DMA_BUF_H_PRESENT) && \
defined(NV_DRM_AVAILABLE) && defined(NV_DRM_DRM_GEM_H_PRESENT)

View File

@@ -90,6 +90,20 @@ typedef struct nv_dma_buf_file_private
// fetched during dma-buf create/reuse instead of in map.
//
NvBool static_phys_addrs;
//
// Type of mapping requested, one of:
// NV_DMABUF_EXPORT_MAPPING_TYPE_DEFAULT
// NV_DMABUF_EXPORT_MAPPING_TYPE_FORCE_PCIE
//
NvU8 mapping_type;
//
// On some coherent platforms requesting mapping_type FORCE_PCIE,
// peer-to-peer is expected to bypass the IOMMU due to hardware
// limitations. On such systems, IOMMU map/unmap will be skipped.
//
NvBool skip_iommu;
} nv_dma_buf_file_private_t;
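/*
 * Illustrative summary, not part of the commit: mapping_type is copied from
 * the exporter-supplied params->mappingType at dma-buf create time and must
 * match on reuse. For FORCE_PCIE on coherent platforms, the map path takes
 * the BAR1 PFN route instead of C2C pages, and skip_iommu (set once the
 * topology check passes) suppresses the peer IOMMU map/unmap calls.
 */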
static void
@@ -380,6 +394,7 @@ nv_put_phys_addresses(
// Per-handle memArea is freed by RM
rm_dma_buf_unmap_mem_handle(sp, priv->nv, priv->h_client,
priv->handles[index].h_memory,
priv->mapping_type,
priv->handles[index].mem_info,
priv->static_phys_addrs,
priv->handles[index].memArea);
@@ -508,6 +523,7 @@ nv_dma_buf_get_phys_addresses (
priv->handles[index].h_memory,
mrangeMake(priv->handles[index].offset,
priv->handles[index].size),
priv->mapping_type,
priv->handles[index].mem_info,
priv->static_phys_addrs,
&priv->handles[index].memArea);
@@ -557,22 +573,34 @@ failed:
static void
nv_dma_buf_unmap_pages(
struct device *dev,
struct sg_table *sgt
struct sg_table *sgt,
nv_dma_buf_file_private_t *priv
)
{
if (priv->skip_iommu)
{
return;
}
dma_unmap_sg(dev, sgt->sgl, sgt->nents, DMA_BIDIRECTIONAL);
}
static void
nv_dma_buf_unmap_pfns(
struct device *dev,
struct sg_table *sgt
struct sg_table *sgt,
nv_dma_buf_file_private_t *priv
)
{
nv_dma_device_t peer_dma_dev = {{ 0 }};
struct scatterlist *sg = sgt->sgl;
NvU32 i;
if (priv->skip_iommu)
{
return;
}
peer_dma_dev.dev = dev;
peer_dma_dev.addressable_range.limit = (NvU64)dev->dma_mask;
@@ -729,11 +757,14 @@ nv_dma_buf_map_pfns (
goto unmap_pfns;
}
status = nv_dma_map_peer(&peer_dma_dev, priv->nv->dma_dev, 0x1,
(sg_len >> PAGE_SHIFT), &dma_addr);
if (status != NV_OK)
if (!priv->skip_iommu)
{
goto unmap_pfns;
status = nv_dma_map_peer(&peer_dma_dev, priv->nv->dma_dev, 0x1,
(sg_len >> PAGE_SHIFT), &dma_addr);
if (status != NV_OK)
{
goto unmap_pfns;
}
}
sg_set_page(sg, NULL, sg_len, 0);
@@ -755,7 +786,7 @@ nv_dma_buf_map_pfns (
unmap_pfns:
sgt->nents = mapped_nents;
nv_dma_buf_unmap_pfns(dev, sgt);
nv_dma_buf_unmap_pfns(dev, sgt, priv);
sg_free_table(sgt);
@@ -777,12 +808,14 @@ nv_dma_buf_map(
nv_dma_buf_file_private_t *priv = buf->priv;
//
// On non-coherent platforms, importers must be able to handle peer
// MMIO resources not backed by struct page.
// On non-coherent platforms, and on coherent platforms requesting
// PCIe mapping, importers must be able to handle peer MMIO resources
// not backed by struct page.
//
#if defined(NV_DMA_BUF_HAS_DYNAMIC_ATTACHMENT) && \
defined(NV_DMA_BUF_ATTACHMENT_HAS_PEER2PEER)
if (!priv->nv->coherent &&
if (((!priv->nv->coherent) ||
(priv->mapping_type == NV_DMABUF_EXPORT_MAPPING_TYPE_FORCE_PCIE)) &&
dma_buf_attachment_is_dynamic(attachment) &&
!attachment->peer2peer)
{
@@ -794,6 +827,17 @@ nv_dma_buf_map(
mutex_lock(&priv->lock);
if (priv->mapping_type == NV_DMABUF_EXPORT_MAPPING_TYPE_FORCE_PCIE)
{
if(!nv_pci_is_valid_topology_for_direct_pci(priv->nv, attachment->dev))
{
nv_printf(NV_DBG_ERRORS,
"NVRM: topology not supported for mapping type FORCE_PCIE\n");
return NULL;
}
priv->skip_iommu = NV_TRUE;
}
if (priv->num_objects != priv->total_objects)
{
goto unlock_priv;
@@ -808,7 +852,12 @@ nv_dma_buf_map(
}
}
if (priv->nv->coherent)
//
// For MAPPING_TYPE_FORCE_PCIE on coherent platforms,
// get the BAR1 PFN scatterlist instead of C2C pages.
//
if ((priv->nv->coherent) &&
(priv->mapping_type == NV_DMABUF_EXPORT_MAPPING_TYPE_DEFAULT))
{
sgt = nv_dma_buf_map_pages(attachment->dev, priv);
}
@@ -849,13 +898,14 @@ nv_dma_buf_unmap(
mutex_lock(&priv->lock);
if (priv->nv->coherent)
if ((priv->nv->coherent) &&
(priv->mapping_type == NV_DMABUF_EXPORT_MAPPING_TYPE_DEFAULT))
{
nv_dma_buf_unmap_pages(attachment->dev, sgt);
nv_dma_buf_unmap_pages(attachment->dev, sgt, priv);
}
else
{
nv_dma_buf_unmap_pfns(attachment->dev, sgt);
nv_dma_buf_unmap_pfns(attachment->dev, sgt, priv);
}
//
@@ -1048,6 +1098,8 @@ nv_dma_buf_create(
priv->total_size = params->totalSize;
priv->nv = nv;
priv->can_mmap = NV_FALSE;
priv->mapping_type = params->mappingType;
priv->skip_iommu = NV_FALSE;
rc = nv_kmem_cache_alloc_stack(&sp);
if (rc != 0)
@@ -1066,6 +1118,7 @@ nv_dma_buf_create(
status = rm_dma_buf_get_client_and_device(sp, priv->nv,
params->hClient,
params->handles[0],
priv->mapping_type,
&priv->h_client,
&priv->h_device,
&priv->h_subdevice,
@@ -1208,7 +1261,8 @@ nv_dma_buf_reuse(
}
if ((priv->total_objects < params->numObjects) ||
(params->index > (priv->total_objects - params->numObjects)))
(params->index > (priv->total_objects - params->numObjects)) ||
(params->mappingType != priv->mapping_type))
{
status = NV_ERR_INVALID_ARGUMENT;
goto unlock_priv;
@@ -1281,6 +1335,12 @@ nv_dma_buf_export(
return NV_ERR_INVALID_ARGUMENT;
}
if ((params->mappingType != NV_DMABUF_EXPORT_MAPPING_TYPE_DEFAULT) &&
(params->mappingType != NV_DMABUF_EXPORT_MAPPING_TYPE_FORCE_PCIE))
{
return NV_ERR_INVALID_ARGUMENT;
}
//
// If fd >= 0, dma-buf already exists with this fd, so get dma-buf from fd.
// If fd == -1, dma-buf is not created yet, so create it and then store

View File

@@ -1,441 +0,0 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2017-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
/*
* nv-ibmnpu.c - interface with the ibmnpu (IBM NVLink Processing Unit) "module"
*/
#include "nv-linux.h"
#include "nv-ibmnpu.h"
#if defined(NVCPU_PPC64LE)
#include "nv-rsync.h"
/*
* Temporary query to get the L1D cache block size directly from the device
* tree for the offline cache flush workaround, since the ppc64_caches symbol
* is unavailable to us.
*/
const NvU32 P9_L1D_CACHE_DEFAULT_BLOCK_SIZE = 0x80;
static NvU32 nv_ibm_get_cpu_l1d_cache_block_size(void)
{
const __be32 *block_size_prop;
/*
* Attempt to look up the block size from device tree. If unavailable, just
* return the default that we see on these systems.
*/
struct device_node *cpu = of_find_node_by_type(NULL, "cpu");
if (!cpu)
{
return P9_L1D_CACHE_DEFAULT_BLOCK_SIZE;
}
block_size_prop = of_get_property(cpu, "d-cache-block-size", NULL);
if (!block_size_prop)
{
return P9_L1D_CACHE_DEFAULT_BLOCK_SIZE;
}
return be32_to_cpu(*block_size_prop);
}
/*
* GPU device memory can be exposed to the kernel as NUMA node memory via the
* IBMNPU devices associated with the GPU. The platform firmware will specify
* the parameters of where the memory lives in the system address space via
* firmware properties on the IBMNPU devices. These properties specify what
* memory can be accessed through the IBMNPU device, and the driver can online
* a GPU device's memory into the range accessible by its associated IBMNPU
* devices.
*
* This function calls over to the IBMNPU driver to query the parameters from
* firmware, and validates that the resulting parameters are acceptable.
*/
static void nv_init_ibmnpu_numa_info(nv_state_t *nv)
{
nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
nv_npu_numa_info_t *npu_numa_info = &nvl->npu->numa_info;
struct pci_dev *npu_dev = nvl->npu->devs[0];
NvU64 spa, gpa, aper_size;
/*
* Terminology:
* - system physical address (spa): 47-bit NVIDIA physical address, which
* is the CPU real address with the NVLink address compression scheme
* already applied in firmware.
* - guest physical address (gpa): 56-bit physical address as seen by the
* operating system. This is the base address that we should use for
* onlining device memory.
*/
nvl->numa_info.node_id = ibmnpu_device_get_memory_config(npu_dev, &spa, &gpa,
&aper_size);
if (nvl->numa_info.node_id == NUMA_NO_NODE)
{
NV_DEV_PRINTF(NV_DBG_SETUP, nv, "No NUMA memory aperture found\n");
return;
}
/* Validate that the compressed system physical address is not too wide */
if (spa & (~(BIT_ULL(nv_volta_dma_addr_size) - 1)))
{
NV_DEV_PRINTF(NV_DBG_ERRORS, nv,
"Invalid NUMA memory system pa 0x%llx"
" on IBM-NPU device %04x:%02x:%02x.%u\n",
spa, NV_PCI_DOMAIN_NUMBER(npu_dev), NV_PCI_BUS_NUMBER(npu_dev),
NV_PCI_SLOT_NUMBER(npu_dev), PCI_FUNC(npu_dev->devfn));
goto invalid_numa_config;
}
/*
* Validate that the guest physical address is aligned to 128GB.
* This alignment requirement comes from the Volta address space
* size on POWER9.
*/
if (!IS_ALIGNED(gpa, BIT_ULL(nv_volta_addr_space_width)))
{
NV_DEV_PRINTF(NV_DBG_ERRORS, nv,
"Invalid alignment in NUMA memory guest pa 0x%llx"
" on IBM-NPU device %04x:%02x:%02x.%u\n",
gpa, NV_PCI_DOMAIN_NUMBER(npu_dev), NV_PCI_BUS_NUMBER(npu_dev),
NV_PCI_SLOT_NUMBER(npu_dev), PCI_FUNC(npu_dev->devfn));
goto invalid_numa_config;
}
/* Validate that the aperture can map all of the device's framebuffer */
if (aper_size < nv->fb->size)
{
NV_DEV_PRINTF(NV_DBG_ERRORS, nv,
"Insufficient NUMA memory aperture size 0x%llx"
" on IBM-NPU device %04x:%02x:%02x.%u (0x%llx required)\n",
aper_size, NV_PCI_DOMAIN_NUMBER(npu_dev),
NV_PCI_BUS_NUMBER(npu_dev), NV_PCI_SLOT_NUMBER(npu_dev),
PCI_FUNC(npu_dev->devfn), nv->fb->size);
goto invalid_numa_config;
}
npu_numa_info->compr_sys_phys_addr = spa;
npu_numa_info->guest_phys_addr = gpa;
if (NVreg_EnableUserNUMAManagement)
{
NV_ATOMIC_SET(nvl->numa_info.status, NV_IOCTL_NUMA_STATUS_OFFLINE);
}
else
{
NV_DEV_PRINTF(NV_DBG_SETUP, nv, "User-mode NUMA onlining disabled.\n");
nvl->numa_info.node_id = NUMA_NO_NODE;
}
NV_DEV_PRINTF(NV_DBG_SETUP, nv, "NUMA memory aperture: "
"[spa = 0x%llx, gpa = 0x%llx, aper_size = 0x%llx]\n",
spa, gpa, aper_size);
/* Get the CPU's L1D cache block size for offlining cache flush */
npu_numa_info->l1d_cache_block_size = nv_ibm_get_cpu_l1d_cache_block_size();
return;
invalid_numa_config:
NV_DEV_PRINTF(NV_DBG_ERRORS, nv,
"NUMA memory aperture disabled due to invalid firmware configuration\n");
nvl->numa_info.node_id = NUMA_NO_NODE;
}
void nv_init_ibmnpu_info(nv_state_t *nv)
{
#if defined(NV_PNV_PCI_GET_NPU_DEV_PRESENT)
nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
struct pci_dev *npu_dev = pnv_pci_get_npu_dev(nvl->pci_dev, 0);
NvU8 dev_count;
if (!npu_dev)
{
return;
}
if (os_alloc_mem((void **)&nvl->npu, sizeof(nv_ibmnpu_info_t)) != NV_OK)
{
return;
}
os_mem_set(nvl->npu, 0, sizeof(nv_ibmnpu_info_t));
/* Find any other IBMNPU devices attached to this GPU */
for (nvl->npu->devs[0] = npu_dev, dev_count = 1;
dev_count < NV_MAX_ATTACHED_IBMNPUS; dev_count++)
{
nvl->npu->devs[dev_count] = pnv_pci_get_npu_dev(nvl->pci_dev, dev_count);
if (!nvl->npu->devs[dev_count])
{
break;
}
}
nvl->npu->dev_count = dev_count;
/*
* If we run out of space for IBMNPU devices, NV_MAX_ATTACHED_IBMNPUS will
* need to be bumped.
*/
WARN_ON((dev_count == NV_MAX_ATTACHED_IBMNPUS) &&
pnv_pci_get_npu_dev(nvl->pci_dev, dev_count));
ibmnpu_device_get_genregs_info(npu_dev, &nvl->npu->genregs);
if (nvl->npu->genregs.size > 0)
{
NV_DEV_PRINTF(NV_DBG_SETUP, nv,
"IBM-NPU device %04x:%02x:%02x.%u associated with GPU "
" has a generation register space 0x%llx-0x%llx\n",
NV_PCI_DOMAIN_NUMBER(npu_dev), NV_PCI_BUS_NUMBER(npu_dev),
NV_PCI_SLOT_NUMBER(npu_dev), PCI_FUNC(npu_dev->devfn),
nvl->npu->genregs.start_addr,
nvl->npu->genregs.start_addr + nvl->npu->genregs.size - 1);
}
else
{
NV_DEV_PRINTF(NV_DBG_SETUP, nv,
"IBM-NPU device %04x:%02x:%02x.%u associated with GPU "
"does not support generation registers\n",
NV_PCI_DOMAIN_NUMBER(npu_dev), NV_PCI_BUS_NUMBER(npu_dev),
NV_PCI_SLOT_NUMBER(npu_dev), PCI_FUNC(npu_dev->devfn));
}
nv_init_ibmnpu_numa_info(nv);
#endif
}
void nv_destroy_ibmnpu_info(nv_state_t *nv)
{
nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
if (nvl->npu != NULL)
{
os_free_mem(nvl->npu);
nvl->npu = NULL;
}
}
int nv_init_ibmnpu_devices(nv_state_t *nv)
{
NvU8 i;
nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
if (!nvl->npu)
{
return 0;
}
for (i = 0; i < nvl->npu->dev_count; i++)
{
NV_DEV_PRINTF(NV_DBG_SETUP, nv,
"Initializing IBM-NPU device %04x:%02x:%02x.%u\n",
NV_PCI_DOMAIN_NUMBER(nvl->npu->devs[i]),
NV_PCI_BUS_NUMBER(nvl->npu->devs[i]),
NV_PCI_SLOT_NUMBER(nvl->npu->devs[i]),
PCI_FUNC(nvl->npu->devs[i]->devfn));
if (ibmnpu_init_device(nvl->npu->devs[i]) != NVL_SUCCESS)
{
nv_unregister_ibmnpu_devices(nv);
return -EIO;
}
nvl->npu->initialized_dev_count++;
}
return 0;
}
void nv_unregister_ibmnpu_devices(nv_state_t *nv)
{
NvU8 i;
nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
if (!nvl->npu)
{
return;
}
for (i = 0; i < nvl->npu->initialized_dev_count; i++)
{
NV_DEV_PRINTF(NV_DBG_SETUP, nv,
"Unregistering IBM-NPU device %04x:%02x:%02x.%u\n",
NV_PCI_DOMAIN_NUMBER(nvl->npu->devs[i]),
NV_PCI_BUS_NUMBER(nvl->npu->devs[i]),
NV_PCI_SLOT_NUMBER(nvl->npu->devs[i]),
PCI_FUNC(nvl->npu->devs[i]->devfn));
ibmnpu_unregister_device(nvl->npu->devs[i]);
}
nvl->npu->initialized_dev_count = 0;
}
NV_STATUS NV_API_CALL nv_get_ibmnpu_genreg_info(nv_state_t *nv, NvU64 *addr,
NvU64 *size, void **device)
{
nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
if (nvl->npu == NULL || nvl->npu->genregs.size == 0)
{
return NV_ERR_NOT_SUPPORTED;
}
if (addr)
{
*addr = nvl->npu->genregs.start_addr;
}
if (size)
{
*size = nvl->npu->genregs.size;
}
if (device)
{
*device = (void*)nvl->npu->devs[0];
}
return NV_OK;
}
NV_STATUS NV_API_CALL nv_get_ibmnpu_relaxed_ordering_mode(nv_state_t *nv,
NvBool *mode)
{
nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
if (nvl->npu == NULL || nvl->npu->genregs.size == 0)
{
return NV_ERR_NOT_SUPPORTED;
}
*mode = nv_get_rsync_relaxed_ordering_mode(nv);
return NV_OK;
}
void NV_API_CALL nv_wait_for_ibmnpu_rsync(nv_state_t *nv)
{
nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
if (nvl->npu == NULL || nvl->npu->genregs.size == 0)
{
return;
}
nv_wait_for_rsync(nv);
}
int nv_get_ibmnpu_chip_id(nv_state_t *nv)
{
nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
if (nvl->npu == NULL)
{
return -1;
}
return ibmnpu_device_get_chip_id(nvl->npu->devs[0]);
}
void NV_API_CALL nv_ibmnpu_cache_flush_range(nv_state_t *nv, NvU64 cpu_virtual, NvU64 size)
{
nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
NvU64 offset, cbsize;
/*
* The range is commonly an ioremap()ed mapping of the GPU's ATS range and
* needs to be compared against the created mappings. Alternatively, kernel
* page tables can be dumped through sysfs if CONFIG_PPC_PTDUMP is enabled.
*/
NV_DEV_PRINTF(NV_DBG_INFO, nv,
"Flushing CPU virtual range [0x%llx, 0x%llx)\n",
cpu_virtual, cpu_virtual + size);
cbsize = nvl->npu->numa_info.l1d_cache_block_size;
asm volatile("sync; isync" ::: "memory");
/* Force eviction of any cache lines from the NUMA-onlined region. */
for (offset = 0; offset < size; offset += cbsize)
{
asm volatile("dcbf %0,%1" :: "r" (cpu_virtual), "r" (offset) : "memory");
/* Reschedule if necessary to avoid lockup warnings */
cond_resched();
}
asm volatile("sync; isync" ::: "memory");
}
#else
void nv_init_ibmnpu_info(nv_state_t *nv)
{
}
void nv_destroy_ibmnpu_info(nv_state_t *nv)
{
}
int nv_init_ibmnpu_devices(nv_state_t *nv)
{
return 0;
}
void nv_unregister_ibmnpu_devices(nv_state_t *nv)
{
}
NV_STATUS NV_API_CALL nv_get_ibmnpu_genreg_info(nv_state_t *nv, NvU64 *addr,
NvU64 *size, void **device)
{
return NV_ERR_NOT_SUPPORTED;
}
NV_STATUS NV_API_CALL nv_get_ibmnpu_relaxed_ordering_mode(nv_state_t *nv,
NvBool *mode)
{
return NV_ERR_NOT_SUPPORTED;
}
void NV_API_CALL nv_wait_for_ibmnpu_rsync(nv_state_t *nv)
{
}
int nv_get_ibmnpu_chip_id(nv_state_t *nv)
{
return -1;
}
void NV_API_CALL nv_ibmnpu_cache_flush_range(nv_state_t *nv, NvU64 virtual, NvU64 size)
{
}
void nv_ibmnpu_cache_flush_numa_region(nv_state_t *nv)
{
}
#endif

View File

@@ -1,80 +0,0 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2017 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef _NV_IBMNPU_H_
#define _NV_IBMNPU_H_
#if defined(NVCPU_PPC64LE)
#include "ibmnpu_linux.h"
#define NV_MAX_ATTACHED_IBMNPUS 6
typedef struct nv_npu_numa_info
{
/*
* 47-bit NVIDIA 'system physical address': the hypervisor real 56-bit
* address with NVLink address compression scheme applied.
*/
NvU64 compr_sys_phys_addr;
/*
* 56-bit NVIDIA 'guest physical address'/host virtual address. On
* unvirtualized systems, applying the NVLink address compression scheme
* to this address should be the same as compr_sys_phys_addr.
*/
NvU64 guest_phys_addr;
/*
* L1 data cache block size on P9 - needed to manually flush/invalidate the
* NUMA region from the CPU caches after offlining.
*/
NvU32 l1d_cache_block_size;
} nv_npu_numa_info_t;
struct nv_ibmnpu_info
{
NvU8 dev_count;
NvU8 initialized_dev_count;
struct pci_dev *devs[NV_MAX_ATTACHED_IBMNPUS];
ibmnpu_genregs_info_t genregs;
nv_npu_numa_info_t numa_info;
};
/*
* TODO: These parameters are specific to Volta/P9 configurations, and may
* need to be determined dynamically in the future.
*/
static const NvU32 nv_volta_addr_space_width = 37;
static const NvU32 nv_volta_dma_addr_size = 47;
#endif
void nv_init_ibmnpu_info(nv_state_t *nv);
void nv_destroy_ibmnpu_info(nv_state_t *nv);
int nv_init_ibmnpu_devices(nv_state_t *nv);
void nv_unregister_ibmnpu_devices(nv_state_t *nv);
int nv_get_ibmnpu_chip_id(nv_state_t *nv);
void nv_ibmnpu_cache_flush_numa_region(nv_state_t *nv);
#endif

View File

@@ -597,7 +597,8 @@ int nvidia_mmap_helper(
// TODO: Refactor is needed as part of bug#2001704.
//
if ((nv_get_numa_status(nvl) == NV_NUMA_STATUS_ONLINE) &&
!IS_REG_OFFSET(nv, access_start, access_len))
!IS_REG_OFFSET(nv, access_start, access_len) &&
(mmap_context->num_pages != 0))
{
ret = nvidia_mmap_numa(vma, mmap_context);
if (ret)

View File

@@ -25,7 +25,6 @@
#include "os-interface.h"
#include "nv-linux.h"
#include "nv-ibmnpu.h"
#include "nv-rsync.h"
#include "nv-p2p.h"
@@ -53,6 +52,7 @@ typedef struct nv_p2p_mem_info {
} dma_mapping_list;
void *private;
void *mig_info;
NvBool force_pcie;
} nv_p2p_mem_info_t;
// declared and created in nv.c
@@ -391,6 +391,7 @@ static int nv_p2p_get_pages(
uint32_t va_space,
uint64_t virtual_address,
uint64_t length,
uint8_t flags,
struct nvidia_p2p_page_table **page_table,
void (*free_callback)(void * data),
void *data
@@ -413,6 +414,7 @@ static int nv_p2p_get_pages(
NvU64 temp_length;
NvU8 *gpu_uuid = NULL;
NvU8 uuid[NVIDIA_P2P_GPU_UUID_LEN] = {0};
NvBool force_pcie = !!(flags & NVIDIA_P2P_FLAGS_FORCE_BAR1_MAPPING);
int rc;
if (!NV_IS_ALIGNED64(virtual_address, NVRM_P2P_PAGESIZE_BIG_64K) ||
@@ -426,6 +428,12 @@ static int nv_p2p_get_pages(
return -EINVAL;
}
// Forced PCIe mappings are not supported for non-persistent APIs
if ((free_callback != NULL) && force_pcie)
{
return -ENOTSUPP;
}
rc = nv_kmem_cache_alloc_stack(&sp);
if (rc != 0)
{
@@ -444,6 +452,8 @@ static int nv_p2p_get_pages(
INIT_LIST_HEAD(&mem_info->dma_mapping_list.list_head);
NV_INIT_MUTEX(&mem_info->dma_mapping_list.lock);
mem_info->force_pcie = force_pcie;
*page_table = &(mem_info->page_table);
/*
@@ -509,7 +519,8 @@ static int nv_p2p_get_pages(
status = rm_p2p_get_pages_persistent(sp, virtual_address, length,
&mem_info->private,
physical_addresses, &entries,
*page_table, gpu_info, &mem_info->mig_info);
force_pcie, *page_table, gpu_info,
&mem_info->mig_info);
if (status != NV_OK)
{
goto failed;
@@ -647,7 +658,8 @@ int nvidia_p2p_get_pages(
return nv_p2p_get_pages(NV_P2P_PAGE_TABLE_TYPE_NON_PERSISTENT,
p2p_token, va_space, virtual_address,
length, page_table, free_callback, data);
length, NVIDIA_P2P_FLAGS_DEFAULT,
page_table, free_callback, data);
}
NV_EXPORT_SYMBOL(nvidia_p2p_get_pages);
@@ -658,13 +670,8 @@ int nvidia_p2p_get_pages_persistent(
uint32_t flags
)
{
if (flags != 0)
{
return -EINVAL;
}
return nv_p2p_get_pages(NV_P2P_PAGE_TABLE_TYPE_PERSISTENT, 0, 0,
virtual_address, length, page_table,
virtual_address, length, flags, page_table,
NULL, NULL);
}
NV_EXPORT_SYMBOL(nvidia_p2p_get_pages_persistent);
@@ -779,6 +786,15 @@ int nvidia_p2p_dma_map_pages(
mem_info = container_of(page_table, nv_p2p_mem_info_t, page_table);
//
// Only CPU mappings are supported for forced PCIe config through
// nv-p2p APIs. IO mappings will not be supported.
//
if (mem_info->force_pcie)
{
return -ENOTSUPP;
}
rc = nv_kmem_cache_alloc_stack(&sp);
if (rc != 0)
{
@@ -989,12 +1005,7 @@ int nvidia_p2p_get_rsync_registers(
)
{
nv_linux_state_t *nvl;
nv_state_t *nv;
NV_STATUS status;
void *ptr = NULL;
NvU64 addr;
NvU64 size;
struct pci_dev *ibmnpu = NULL;
NvU32 index = 0;
NvU32 count = 0;
nvidia_p2p_rsync_reg_info_t *info = NULL;
@@ -1030,34 +1041,9 @@ int nvidia_p2p_get_rsync_registers(
return -ENOMEM;
}
for (nvl = nv_linux_devices; nvl; nvl = nvl->next)
{
nv = NV_STATE_PTR(nvl);
addr = 0;
size = 0;
status = nv_get_ibmnpu_genreg_info(nv, &addr, &size, (void**)&ibmnpu);
if (status != NV_OK)
{
continue;
}
ptr = nv_ioremap_nocache(addr, size);
if (ptr == NULL)
{
continue;
}
regs[index].ptr = ptr;
regs[index].size = size;
regs[index].gpu = nvl->pci_dev;
regs[index].ibmnpu = ibmnpu;
regs[index].cluster_id = 0;
regs[index].socket_id = nv_get_ibmnpu_chip_id(nv);
index++;
}
// TODO: This function will always fail with -ENODEV because the logic that
// incremented 'index' was removed. It should be cleaned up in a future
// change.
UNLOCK_NV_LINUX_DEVICES();

View File

@@ -189,6 +189,12 @@ int nvidia_p2p_get_pages( uint64_t p2p_token, uint32_t va_space,
struct nvidia_p2p_page_table **page_table,
void (*free_callback)(void *data), void *data);
/*
* Flags to be used with persistent APIs
*/
#define NVIDIA_P2P_FLAGS_DEFAULT 0
#define NVIDIA_P2P_FLAGS_FORCE_BAR1_MAPPING 1
/*
* @brief
* Pin and make the pages underlying a range of GPU virtual memory
@@ -212,7 +218,11 @@ int nvidia_p2p_get_pages( uint64_t p2p_token, uint32_t va_space,
* @param[out] page_table
* A pointer to an array of structures with P2P PTEs.
* @param[in] flags
* Must be set to zero for now.
* NVIDIA_P2P_FLAGS_DEFAULT:
* Default value to be used if no specific behavior is expected.
* NVIDIA_P2P_FLAGS_FORCE_BAR1_MAPPING:
* Force BAR1 mappings on certain coherent platforms,
* subject to capability and supported topology.
*
* @return
* 0 upon successful completion.
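/*
 * Illustrative sketch, not part of this header: how a third-party kernel
 * driver might request a forced BAR1/PCIe mapping. The address and length
 * are placeholders and must satisfy the usual 64K alignment rules; only the
 * persistent API accepts a non-default flag value.
 */
static inline int example_pin_over_pcie(uint64_t gpu_va, uint64_t len,
                                        struct nvidia_p2p_page_table **pt)
{
    return nvidia_p2p_get_pages_persistent(gpu_va, len, pt,
                                           NVIDIA_P2P_FLAGS_FORCE_BAR1_MAPPING);
}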

View File

@@ -24,7 +24,6 @@
#include "nv-pci-table.h"
#include "nv-pci-types.h"
#include "nv-pci.h"
#include "nv-ibmnpu.h"
#include "nv-msi.h"
#include "nv-hypervisor.h"
@@ -46,6 +45,8 @@
#include <linux/pci-ats.h>
#endif
extern int NVreg_GrdmaPciTopoCheckOverride;
static void
nv_check_and_exclude_gpu(
nvidia_stack_t *sp,
@@ -492,6 +493,12 @@ nv_init_coherent_link_info
NV_DEV_PRINTF(NV_DBG_INFO, nv, "\tNVRM: GPU memory NUMA node: %u\n", node);
}
#if NV_IS_EXPORT_SYMBOL_GPL_pci_ats_supported
nv->ats_support = pci_ats_supported(nvl->pci_dev);
#elif defined(NV_PCI_DEV_HAS_ATS_ENABLED)
nv->ats_support = nvl->pci_dev->ats_enabled;
#endif
if (NVreg_EnableUserNUMAManagement && !os_is_vgx_hyper())
{
NV_ATOMIC_SET(nvl->numa_info.status, NV_IOCTL_NUMA_STATUS_OFFLINE);
@@ -785,32 +792,18 @@ next_bar:
NV_ATOMIC_SET(nvl->numa_info.status, NV_IOCTL_NUMA_STATUS_DISABLED);
nvl->numa_info.node_id = NUMA_NO_NODE;
nv_init_ibmnpu_info(nv);
nv_init_coherent_link_info(nv);
#if defined(NVCPU_PPC64LE)
// Use HW NUMA support as a proxy for ATS support. This is true in the only
// PPC64LE platform where ATS is currently supported (IBM P9).
nv_ats_supported &= nv_platform_supports_numa(nvl);
#else
#if NV_IS_EXPORT_SYMBOL_GPL_pci_ats_supported
nv_ats_supported &= pci_ats_supported(pci_dev);
#elif defined(NV_PCI_DEV_HAS_ATS_ENABLED)
nv_ats_supported &= pci_dev->ats_enabled;
#else
nv_ats_supported = NV_FALSE;
nv->ats_support = nv_platform_supports_numa(nvl);
#endif
#endif
if (nv_ats_supported)
if (nv->ats_support)
{
NV_DEV_PRINTF(NV_DBG_INFO, nv, "ATS supported by this GPU!\n");
}
else
{
NV_DEV_PRINTF(NV_DBG_INFO, nv, "ATS not supported by this GPU. "
"Disabling ATS support for all the GPUs in the system!\n");
}
nv_ats_supported |= nv->ats_support;
pci_set_master(pci_dev);
@@ -929,7 +922,6 @@ err_zero_dev:
rm_free_private_state(sp, nv);
err_not_supported:
nv_ats_supported = prev_nv_ats_supported;
nv_destroy_ibmnpu_info(nv);
nv_lock_destroy_locks(sp, nv);
if (nvl != NULL)
{
@@ -1079,9 +1071,6 @@ nv_pci_remove(struct pci_dev *pci_dev)
nvl->sysfs_config_file = NULL;
}
nv_unregister_ibmnpu_devices(nv);
nv_destroy_ibmnpu_info(nv);
if (NV_ATOMIC_READ(nvl->usage_count) == 0)
{
nv_lock_destroy_locks(sp, nv);
@@ -1297,97 +1286,28 @@ nv_pci_count_devices(void)
return count;
}
#if defined(NV_PCI_ERROR_RECOVERY)
static pci_ers_result_t
nv_pci_error_detected(
struct pci_dev *pci_dev,
nv_pci_channel_state_t error
/*
* On coherent platforms that support BAR1 mappings for GPUDirect RDMA,
* dma-buf and nv-p2p subsystems need to ensure the 2 devices belong to
* the same IOMMU group.
*/
NvBool nv_pci_is_valid_topology_for_direct_pci(
nv_state_t *nv,
struct device *dev
)
{
nv_linux_state_t *nvl = pci_get_drvdata(pci_dev);
struct pci_dev *pdev0 = to_pci_dev(nv->dma_dev->dev);
struct pci_dev *pdev1 = to_pci_dev(dev);
if ((nvl == NULL) || (nvl->pci_dev != pci_dev))
if (!nv->coherent)
{
nv_printf(NV_DBG_ERRORS, "NVRM: %s: invalid device!\n", __FUNCTION__);
return PCI_ERS_RESULT_NONE;
return NV_FALSE;
}
/*
* Tell Linux to continue recovery of the device. The kernel will enable
* MMIO for the GPU and call the mmio_enabled callback.
*/
return PCI_ERS_RESULT_CAN_RECOVER;
return (NVreg_GrdmaPciTopoCheckOverride != 0) ||
(pdev0->dev.iommu_group == pdev1->dev.iommu_group);
}
static pci_ers_result_t
nv_pci_mmio_enabled(
struct pci_dev *pci_dev
)
{
NV_STATUS status = NV_OK;
nv_stack_t *sp = NULL;
nv_linux_state_t *nvl = pci_get_drvdata(pci_dev);
nv_state_t *nv = NULL;
if ((nvl == NULL) || (nvl->pci_dev != pci_dev))
{
nv_printf(NV_DBG_ERRORS, "NVRM: %s: invalid device!\n", __FUNCTION__);
goto done;
}
nv = NV_STATE_PTR(nvl);
if (nv_kmem_cache_alloc_stack(&sp) != 0)
{
nv_printf(NV_DBG_ERRORS, "NVRM: %s: failed to allocate stack!\n",
__FUNCTION__);
goto done;
}
NV_DEV_PRINTF(NV_DBG_ERRORS, nv, "A fatal error was detected.\n");
/*
* MMIO should be re-enabled now. If we still get bad reads, there's
* likely something wrong with the adapter itself that will require a
* reset. This should let us know whether the GPU has completely fallen
* off the bus or just did something the host didn't like.
*/
status = rm_is_supported_device(sp, nv);
if (status != NV_OK)
{
NV_DEV_PRINTF(NV_DBG_ERRORS, nv,
"The kernel has enabled MMIO for the device,\n"
"NVRM: but it still appears unreachable. The device\n"
"NVRM: will not function properly until it is reset.\n");
}
status = rm_log_gpu_crash(sp, nv);
if (status != NV_OK)
{
NV_DEV_PRINTF_STATUS(NV_DBG_ERRORS, nv, status,
"Failed to log crash data\n");
goto done;
}
done:
if (sp != NULL)
{
nv_kmem_cache_free_stack(sp);
}
/*
* Tell Linux to abandon recovery of the device. The kernel might be able
* to recover the device, but RM and clients don't yet support that.
*/
return PCI_ERS_RESULT_DISCONNECT;
}
struct pci_error_handlers nv_pci_error_handlers = {
.error_detected = nv_pci_error_detected,
.mmio_enabled = nv_pci_mmio_enabled,
};
#endif
#if defined(CONFIG_PM)
extern struct dev_pm_ops nv_pm_ops;
#endif
@@ -1405,9 +1325,6 @@ struct pci_driver nv_pci_driver = {
#if defined(CONFIG_PM)
.driver.pm = &nv_pm_ops,
#endif
#if defined(NV_PCI_ERROR_RECOVERY)
.err_handler = &nv_pci_error_handlers,
#endif
};
void nv_pci_unregister_driver(void)

View File

@@ -32,7 +32,6 @@
#include "nv_compiler.h"
#include "nv-reg.h"
#include "conftest/patches.h"
#include "nv-ibmnpu.h"
#define NV_DEFINE_SINGLE_NVRM_PROCFS_FILE(name) \
NV_DEFINE_SINGLE_PROCFS_FILE_READ_ONLY(name, nv_system_pm_lock)
@@ -215,7 +214,8 @@ nv_procfs_read_power(
seq_printf(s, "S0ix Power Management:\n");
seq_printf(s, " Platform Support: %s\n",
nv_platform_supports_s0ix() ? "Supported" : "Not Supported");
seq_printf(s, " Status: %s\n", power_info.s0ix_status);
seq_printf(s, " Status: %s\n\n", power_info.s0ix_status);
seq_printf(s, "Notebook Dynamic Boost: %s\n", power_info.db_support);
nv_kmem_cache_free_stack(sp);
return 0;

View File

@@ -386,32 +386,6 @@
#define __NV_IGNORE_MMIO_CHECK IgnoreMMIOCheck
#define NV_REG_IGNORE_MMIO_CHECK NV_REG_STRING(__NV_IGNORE_MMIO_CHECK)
/*
* Option: TCEBypassMode
*
* Description:
*
* When this option is enabled, the NVIDIA kernel module will attempt to setup
* all GPUs in "TCE bypass mode", in which DMA mappings of system memory bypass
* the IOMMU/TCE remapping hardware on IBM POWER systems. This is typically
* necessary for CUDA applications in which large system memory mappings may
* exceed the default TCE remapping capacity when operated in non-bypass mode.
*
* This option has no effect on non-POWER platforms.
*
* Possible Values:
*
* 0: system default TCE mode on all GPUs
* 1: enable TCE bypass mode on all GPUs
* 2: disable TCE bypass mode on all GPUs
*/
#define __NV_TCE_BYPASS_MODE TCEBypassMode
#define NV_REG_TCE_BYPASS_MODE NV_REG_STRING(__NV_TCE_BYPASS_MODE)
#define NV_TCE_BYPASS_MODE_DEFAULT 0
#define NV_TCE_BYPASS_MODE_ENABLE 1
#define NV_TCE_BYPASS_MODE_DISABLE 2
/*
* Option: pci
*
@@ -899,7 +873,7 @@
*
* This option allows users to specify whether the NVIDIA driver must create
* the IMEX channel 0 by default. The channel will be created automatically
* when an application (e.g. nvidia-smi, nvidia-persistenced) is run.
* when the NVIDIA open GPU kernel module is loaded.
*
* Note that users are advised to enable this option only in trusted
* environments where it is acceptable for applications to share the same
@@ -915,6 +889,26 @@
#define __NV_CREATE_IMEX_CHANNEL_0 CreateImexChannel0
#define NV_CREATE_IMEX_CHANNEL_0 NV_REG_STRING(__CREATE_IMEX_CHANNEL_0)
/*
* Option: NVreg_GrdmaPciTopoCheckOverride
*
* Description:
*
* This option is applicable only on coherent systems with BAR1 enabled to allow
* maximum bandwidth between GPU and a third party device over a dedicated
* PCIe link instead of over C2C for GPUDirect RDMA use-cases.
* Such a config is only supported for a specific topology which is checked by
* the GPU driver's dma-buf and nv-p2p subsystems.
*
* This option allows the user to override the driver's topology check.
*
* Possible values:
* 0 - Do not override topology check (default).
* 1 - Override topology check.
*/
#define __NV_GRDMA_PCI_TOPO_CHECK_OVERRIDE GrdmaPciTopoCheckOverride
#define NV_GRDMA_PCI_TOPO_CHECK_OVERRIDE NV_REG_STRING(__NV_GRDMA_PCI_TOPO_CHECK_OVERRIDE)
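/*
 * Illustrative usage, not part of the commit: the override is exposed as a
 * module parameter, so it would typically be set from a modprobe
 * configuration file, e.g.
 *     options nvidia NVreg_GrdmaPciTopoCheckOverride=1
 * (the exact module name depends on the installed driver flavor).
 */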
#if defined(NV_DEFINE_REGISTRY_KEY_TABLE)
/*
@@ -931,7 +925,6 @@ NV_DEFINE_REG_ENTRY(__NV_INITIALIZE_SYSTEM_MEMORY_ALLOCATIONS, 1);
NV_DEFINE_REG_ENTRY(__NV_USE_PAGE_ATTRIBUTE_TABLE, ~0);
NV_DEFINE_REG_ENTRY(__NV_ENABLE_PCIE_GEN3, 0);
NV_DEFINE_REG_ENTRY(__NV_ENABLE_MSI, 1);
NV_DEFINE_REG_ENTRY(__NV_TCE_BYPASS_MODE, NV_TCE_BYPASS_MODE_DEFAULT);
NV_DEFINE_REG_ENTRY(__NV_ENABLE_STREAM_MEMOPS, 0);
NV_DEFINE_REG_ENTRY(__NV_RM_PROFILING_ADMIN_ONLY_PARAMETER, 1);
NV_DEFINE_REG_ENTRY(__NV_PRESERVE_VIDEO_MEMORY_ALLOCATIONS, 0);
@@ -966,6 +959,7 @@ NV_DEFINE_REG_STRING_ENTRY(__NV_RM_NVLINK_BW, NULL);
NV_DEFINE_REG_ENTRY(__NV_RM_NVLINK_BW_LINK_COUNT, 0);
NV_DEFINE_REG_ENTRY_GLOBAL(__NV_IMEX_CHANNEL_COUNT, 2048);
NV_DEFINE_REG_ENTRY_GLOBAL(__NV_CREATE_IMEX_CHANNEL_0, 0);
NV_DEFINE_REG_ENTRY_GLOBAL(__NV_GRDMA_PCI_TOPO_CHECK_OVERRIDE, 0);
/*
*----------------registry database definition----------------------
@@ -993,7 +987,6 @@ nv_parm_t nv_parms[] = {
NV_DEFINE_PARAMS_TABLE_ENTRY(__NV_KMALLOC_HEAP_MAX_SIZE),
NV_DEFINE_PARAMS_TABLE_ENTRY(__NV_VMALLOC_HEAP_MAX_SIZE),
NV_DEFINE_PARAMS_TABLE_ENTRY(__NV_IGNORE_MMIO_CHECK),
NV_DEFINE_PARAMS_TABLE_ENTRY(__NV_TCE_BYPASS_MODE),
NV_DEFINE_PARAMS_TABLE_ENTRY(__NV_ENABLE_STREAM_MEMOPS),
NV_DEFINE_PARAMS_TABLE_ENTRY(__NV_ENABLE_USER_NUMA_MANAGEMENT),
NV_DEFINE_PARAMS_TABLE_ENTRY(__NV_NVLINK_DISABLE),
@@ -1015,6 +1008,7 @@ nv_parm_t nv_parms[] = {
NV_DEFINE_PARAMS_TABLE_ENTRY(__NV_DMA_REMAP_PEER_MMIO),
NV_DEFINE_PARAMS_TABLE_ENTRY(__NV_IMEX_CHANNEL_COUNT),
NV_DEFINE_PARAMS_TABLE_ENTRY(__NV_CREATE_IMEX_CHANNEL_0),
NV_DEFINE_PARAMS_TABLE_ENTRY(__NV_GRDMA_PCI_TOPO_CHECK_OVERRIDE),
{NULL, NULL}
};

View File

@@ -159,43 +159,3 @@ void nv_unregister_rsync_driver(
up(&g_rsync_info.lock);
}
NvBool nv_get_rsync_relaxed_ordering_mode(
nv_state_t *nv
)
{
nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
/* shouldn't be called without opening a device */
WARN_ON(NV_ATOMIC_READ(nvl->usage_count) == 0);
/*
* g_rsync_info.relaxed_ordering_mode can be safely accessed outside of
* g_rsync_info.lock once a device is opened. During nvidia_open(), we
* lock the relaxed ordering state by ref-counting the rsync module
* through get_relaxed_ordering_mode.
*/
return g_rsync_info.relaxed_ordering_mode;
}
void nv_wait_for_rsync(
nv_state_t *nv
)
{
nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
/* shouldn't be called without opening a device */
WARN_ON(NV_ATOMIC_READ(nvl->usage_count) == 0);
/*
* g_rsync_info.relaxed_ordering_mode can be safely accessed outside of
* g_rsync_info.lock once a device is opened. During nvidia_open(), we
* block unregistration of the rsync driver by ref-counting the module
* through get_relaxed_ordering_mode.
*/
if (g_rsync_info.relaxed_ordering_mode)
{
WARN_ON(g_rsync_info.wait_for_rsync == NULL);
g_rsync_info.wait_for_rsync(nvl->pci_dev, g_rsync_info.data);
}
}

View File

@@ -51,7 +51,5 @@ void nv_unregister_rsync_driver(
void (*put_relaxed_ordering_mode)(int mode, void *data),
void (*wait_for_rsync)(struct pci_dev *gpu, void *data),
void *data);
NvBool nv_get_rsync_relaxed_ordering_mode(nv_state_t *nv);
void nv_wait_for_rsync(nv_state_t *nv);
#endif

View File

@@ -386,7 +386,12 @@ NV_STATUS nv_alloc_contig_pages(
if (at->flags.node)
{
NV_ALLOC_PAGES_NODE(virt_addr, at->node_id, at->order, gfp_mask);
unsigned long ptr = 0ULL;
NV_ALLOC_PAGES_NODE(ptr, at->node_id, at->order, gfp_mask);
if (ptr != 0)
{
virt_addr = (unsigned long) page_address((void *)ptr);
}
}
else
{
@@ -538,7 +543,16 @@ NV_STATUS nv_alloc_system_pages(
}
else if (at->flags.node)
{
NV_ALLOC_PAGES_NODE(virt_addr, at->node_id, at->order, gfp_mask);
unsigned long ptr = 0ULL;
NV_ALLOC_PAGES_NODE(ptr, at->node_id, at->order, gfp_mask);
if (ptr != 0)
{
virt_addr = (unsigned long) page_address((void *)ptr);
}
else
{
virt_addr = 0;
}
}
else
{

View File

@@ -50,7 +50,6 @@
#include "nvlink_caps.h"
#include "nv-hypervisor.h"
#include "nv-ibmnpu.h"
#include "nv-rsync.h"
#include "nv-kthread-q.h"
#include "nv-pat.h"
@@ -127,7 +126,11 @@ MODULE_ALIAS_CHARDEV_MAJOR(NV_MAJOR_DEVICE_NUMBER);
* DMA_BUF namespace is added by commit id 16b0314aa746
* ("dma-buf: move dma-buf symbols into the DMA_BUF module namespace") in 5.16
*/
#if defined(NV_MODULE_IMPORT_NS_TAKES_CONSTANT)
MODULE_IMPORT_NS(DMA_BUF);
#else
MODULE_IMPORT_NS("DMA_BUF");
#endif // defined(NV_MODULE_IMPORT_NS_TAKES_CONSTANT)
#endif // defined(MODULE_IMPORT_NS)
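/*
 * Illustrative note (assumption, not stated in the diff): newer kernels
 * changed MODULE_IMPORT_NS() to take a quoted string instead of a bare
 * identifier, so the conftest check above is presumably selecting between
 * the two call forms at build time.
 */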
const NvBool nv_is_rm_firmware_supported_os = NV_TRUE;
@@ -182,13 +185,11 @@ static void *nvidia_pte_t_cache;
void *nvidia_stack_t_cache;
static nvidia_stack_t *__nv_init_sp;
static int nv_tce_bypass_mode = NV_TCE_BYPASS_MODE_DEFAULT;
struct semaphore nv_linux_devices_lock;
// True if all the successfully probed devices support ATS
// True if at least one of the successfully probed devices support ATS
// Assigned at device probe (module init) time
NvBool nv_ats_supported = NV_TRUE;
NvBool nv_ats_supported;
// allow an easy way to convert all debug printfs related to events
// back and forth between 'info' and 'errors'
@@ -589,20 +590,11 @@ nv_registry_keys_init(nv_stack_t *sp)
NvU32 data;
/*
* Determine the TCE bypass mode here so it can be used during
* device probe. Also determine whether we should allow
* user-mode NUMA onlining of device memory.
* Determine whether we should allow user-mode NUMA onlining of device
* memory.
*/
if (NVCPU_IS_PPC64LE)
{
status = rm_read_registry_dword(sp, nv,
NV_REG_TCE_BYPASS_MODE,
&data);
if ((status == NV_OK) && ((int)data != NV_TCE_BYPASS_MODE_DEFAULT))
{
nv_tce_bypass_mode = data;
}
if (NVreg_EnableUserNUMAManagement)
{
/* Force on the core RM registry key to match. */
@@ -1349,15 +1341,6 @@ static int nv_start_device(nv_state_t *nv, nvidia_stack_t *sp)
power_ref = NV_TRUE;
}
rc = nv_init_ibmnpu_devices(nv);
if (rc != 0)
{
nv_printf(NV_DBG_ERRORS,
"NVRM: failed to initialize ibmnpu devices attached to GPU with minor number %d\n",
nvl->minor_num);
goto failed;
}
if (!(nv->flags & NV_FLAG_PERSISTENT_SW_STATE))
{
rc = nv_dev_alloc_stacks(nvl);
@@ -1558,8 +1541,6 @@ failed:
nv_dev_free_stacks(nvl);
nv_unregister_ibmnpu_devices(nv);
if (power_ref)
{
rm_unref_dynamic_power(sp, nv, NV_DYNAMIC_PM_COARSE);
@@ -1975,6 +1956,20 @@ void nv_shutdown_adapter(nvidia_stack_t *sp,
rm_shutdown_adapter(sp, nv);
if (nv->flags & NV_FLAG_TRIGGER_FLR)
{
if (nvl->pci_dev)
{
nv_printf(NV_DBG_INFO, "NVRM: Trigger FLR!\n");
os_pci_trigger_flr((void *)nvl->pci_dev);
}
else
{
nv_printf(NV_DBG_ERRORS, "NVRM: FLR not supported by the device!\n");
}
nv->flags &= ~NV_FLAG_TRIGGER_FLR;
}
if (nv_platform_use_auto_online(nvl))
nv_kthread_q_stop(&nvl->remove_numa_memory_q);
}
@@ -2037,8 +2032,6 @@ static void nv_stop_device(nv_state_t *nv, nvidia_stack_t *sp)
/* leave INIT flag alone so we don't reinit every time */
nv->flags &= ~NV_FLAG_OPEN;
nv_unregister_ibmnpu_devices(nv);
if (!(nv->flags & NV_FLAG_PERSISTENT_SW_STATE))
{
rm_unref_dynamic_power(sp, nv, NV_DYNAMIC_PM_COARSE);
@@ -3102,17 +3095,10 @@ nv_set_dma_address_size(
)
{
nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
NvU64 start_addr = nv_get_dma_start_address(nv);
NvU64 new_mask = (((NvU64)1) << phys_addr_bits) - 1;
nvl->dma_dev.addressable_range.limit = start_addr + new_mask;
nvl->dma_dev.addressable_range.limit = new_mask;
/*
* The only scenario in which we definitely should not update the DMA mask
* is on POWER, when using TCE bypass mode (see nv_get_dma_start_address()
* for details), since the meaning of the DMA mask is overloaded in that
* case.
*/
if (!nvl->tce_bypass_enabled)
{
dma_set_mask(&nvl->pci_dev->dev, new_mask);
@@ -3741,35 +3727,6 @@ NV_STATUS NV_API_CALL nv_alloc_pages(
if (unencrypted)
at->flags.unencrypted = NV_TRUE;
#if defined(NVCPU_PPC64LE)
/*
* Starting on Power9 systems, DMA addresses for NVLink are no longer the
* same as used over PCIe. There is an address compression scheme required
* for NVLink ONLY which impacts the upper address bits of the DMA address.
*
* This divergence between PCIe and NVLink DMA mappings breaks assumptions
* in the driver where during initialization we allocate system memory
* for the GPU to access over PCIe before NVLink is trained -- and some of
* these mappings persist on the GPU. If these persistent mappings are not
* equivalent they will cause invalid DMA accesses from the GPU once we
* switch to NVLink.
*
* To work around this we limit all system memory allocations from the driver
* during the period before NVLink is enabled to be from NUMA node 0 (CPU 0)
* which has a CPU real address with the upper address bits (above bit 42)
* set to 0. Effectively making the PCIe and NVLink DMA mappings equivalent
* allowing persistent system memory mappings already programmed on the GPU
* to remain valid after NVLink is enabled.
*
* See Bug 1920398 for more details.
*/
if (nv && nvl->npu && !nvl->dma_dev.nvlink)
{
at->flags.node = NV_TRUE;
at->node_id = 0;
}
#endif
if (node_id != NUMA_NO_NODE)
{
at->flags.node = NV_TRUE;
@@ -4873,154 +4830,6 @@ NV_STATUS NV_API_CALL nv_log_error(
return status;
}
NvU64 NV_API_CALL nv_get_dma_start_address(
nv_state_t *nv
)
{
#if defined(NVCPU_PPC64LE)
struct pci_dev *pci_dev;
dma_addr_t dma_addr;
NvU64 saved_dma_mask;
nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
/*
* If TCE bypass is disabled via a module parameter, then just return
* the default (which is 0).
*
* Otherwise, the DMA start address only needs to be set once, and it
* won't change afterward. Just return the cached value if asked again,
* to avoid the kernel printing redundant messages to the kernel
* log when we call pci_set_dma_mask().
*/
if ((nv_tce_bypass_mode == NV_TCE_BYPASS_MODE_DISABLE) ||
(nvl->tce_bypass_enabled))
{
return nvl->dma_dev.addressable_range.start;
}
pci_dev = nvl->pci_dev;
/*
* Linux on IBM POWER8 offers 2 different DMA set-ups, sometimes
* referred to as "windows".
*
* The "default window" provides a 2GB region of PCI address space
* located below the 32-bit line. The IOMMU is used to provide a
* "rich" mapping--any page in system memory can be mapped at an
* arbitrary address within this window. The mappings are dynamic
* and pass in and out of being as pci_map*()/pci_unmap*() calls
* are made.
*
* Dynamic DMA Windows (sometimes "Huge DDW") provides a linear
* mapping of the system's entire physical address space at some
* fixed offset above the 59-bit line. IOMMU is still used, and
* pci_map*()/pci_unmap*() are still required, but mappings are
* static. They're effectively set up in advance, and any given
* system page will always map to the same PCI bus address. I.e.
* physical 0x00000000xxxxxxxx => PCI 0x08000000xxxxxxxx
*
* This driver does not support the 2G default window because
* of its limited size, and for reasons having to do with UVM.
*
* Linux on POWER8 will only provide the DDW-style full linear
* mapping when the driver claims support for 64-bit DMA addressing
* (a pre-requisite because the PCI addresses used in this case will
* be near the top of the 64-bit range). The linear mapping
* is not available in all system configurations.
*
* Detect whether the linear mapping is present by claiming
* 64-bit support and then mapping physical page 0. For historical
* reasons, Linux on POWER8 will never map a page to PCI address 0x0.
* In the "default window" case page 0 will be mapped to some
* non-zero address below the 32-bit line. In the
* DDW/linear-mapping case, it will be mapped to address 0 plus
* some high-order offset.
*
* If the linear mapping is present and sane then return the offset
* as the starting address for all DMA mappings.
*/
saved_dma_mask = pci_dev->dma_mask;
if (dma_set_mask(&pci_dev->dev, DMA_BIT_MASK(64)) != 0)
{
goto done;
}
dma_addr = dma_map_single(&pci_dev->dev, NULL, 1, DMA_BIDIRECTIONAL);
if (dma_mapping_error(&pci_dev->dev, dma_addr))
{
dma_set_mask(&pci_dev->dev, saved_dma_mask);
goto done;
}
dma_unmap_single(&pci_dev->dev, dma_addr, 1, DMA_BIDIRECTIONAL);
/*
* From IBM: "For IODA2, native DMA bypass or KVM TCE-based implementation
* of full 64-bit DMA support will establish a window in address-space
* with the high 14 bits being constant and the bottom up-to-50 bits
* varying with the mapping."
*
* Unfortunately, we don't have any good interfaces or definitions from
* the kernel to get information about the DMA offset assigned by OS.
* However, we have been told that the offset will be defined by the top
* 14 bits of the address, and bits 40-49 will not vary for any DMA
* mappings until 1TB of system memory is surpassed; this limitation is
* essential for us to function properly since our current GPUs only
* support 40 physical address bits. We are in a fragile place where we
* need to tell the OS that we're capable of 64-bit addressing, while
* relying on the assumption that the top 24 bits will not vary in this
* case.
*
* The way we try to compute the window, then, is mask the trial mapping
* against the DMA capabilities of the device. That way, devices with
* greater addressing capabilities will only take the bits it needs to
* define the window.
*/
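    /*
     * Worked example (illustrative, values assumed): if the device's previous
     * mask was DMA_BIT_MASK(40) and the DDW trial mapping of page 0 comes
     * back as dma_addr = 0x0800000000000000, then
     *
     *   dma_addr & DMA_BIT_MASK(32)  == 0                   -> not the 2G default window
     *   dma_addr & ~DMA_BIT_MASK(40) == 0x0800000000000000  -> fixed window offset
     *
     * and that constant high-order offset becomes the DMA start address
     * cached below.
     */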
if ((dma_addr & DMA_BIT_MASK(32)) != 0)
{
/*
* Huge DDW not available - page 0 mapped to non-zero address below
* the 32-bit line.
*/
nv_printf(NV_DBG_WARNINGS,
"NVRM: DMA window limited by platform\n");
dma_set_mask(&pci_dev->dev, saved_dma_mask);
goto done;
}
else if ((dma_addr & saved_dma_mask) != 0)
{
NvU64 memory_size = NV_NUM_PHYSPAGES * PAGE_SIZE;
if ((dma_addr & ~saved_dma_mask) !=
((dma_addr + memory_size) & ~saved_dma_mask))
{
/*
* The physical window straddles our addressing limit boundary,
* e.g., for an adapter that can address up to 1TB, the window
* crosses the 40-bit limit so that the lower end of the range
* has different bits 63:40 than the higher end of the range.
* We can only handle a single, static value for bits 63:40, so
* we must fall back here.
*/
nv_printf(NV_DBG_WARNINGS,
"NVRM: DMA window limited by memory size\n");
dma_set_mask(&pci_dev->dev, saved_dma_mask);
goto done;
}
}
nvl->tce_bypass_enabled = NV_TRUE;
nvl->dma_dev.addressable_range.start = dma_addr & ~(saved_dma_mask);
/* Update the coherent mask to match */
dma_set_coherent_mask(&pci_dev->dev, pci_dev->dma_mask);
done:
return nvl->dma_dev.addressable_range.start;
#else
return 0;
#endif
}
NV_STATUS NV_API_CALL nv_set_primary_vga_status(
nv_state_t *nv
)
@@ -5041,38 +4850,6 @@ NV_STATUS NV_API_CALL nv_set_primary_vga_status(
#endif
}
NV_STATUS NV_API_CALL nv_pci_trigger_recovery(
nv_state_t *nv
)
{
NV_STATUS status = NV_ERR_NOT_SUPPORTED;
#if defined(NV_PCI_ERROR_RECOVERY)
nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
/*
* Calling readl() on PPC64LE allows the kernel to check the device's PCI
* channel state and update it accordingly. This needs to be done before
* checking if the PCI channel is offline, so that we don't check stale
* state.
*
* This will also kick off the recovery process for the device.
*/
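    /*
     * Note (explanatory): on EEH-capable platforms an MMIO read returning all
     * 1s (0xFFFFFFFF) is the conventional signature of a frozen PCI channel or
     * a device that has dropped off the bus, which is what the register read
     * below is testing for.
     */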
if (NV_PCI_ERROR_RECOVERY_ENABLED())
{
if (readl(nv->regs->map) == 0xFFFFFFFF)
{
if (pci_channel_offline(nvl->pci_dev))
{
NV_DEV_PRINTF(NV_DBG_ERRORS, nv,
"PCI channel for the device is offline\n");
status = NV_OK;
}
}
}
#endif
return status;
}
NvBool NV_API_CALL nv_requires_dma_remap(
nv_state_t *nv
)
@@ -5304,44 +5081,6 @@ NV_STATUS NV_API_CALL nv_get_device_memory_config(
{
NV_STATUS status = NV_ERR_NOT_SUPPORTED;
#if defined(NVCPU_PPC64LE)
nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
if (!nv_platform_supports_numa(nvl))
{
return NV_ERR_NOT_SUPPORTED;
}
if (node_id != NULL)
{
*node_id = nvl->numa_info.node_id;
}
{
nv_npu_numa_info_t *numa_info;
numa_info = &nvl->npu->numa_info;
if (compr_addr_sys_phys != NULL)
{
*compr_addr_sys_phys =
numa_info->compr_sys_phys_addr;
}
if (addr_guest_phys != NULL)
{
*addr_guest_phys =
numa_info->guest_phys_addr;
}
}
if (addr_width != NULL)
{
*addr_width = nv_volta_dma_addr_size - nv_volta_addr_space_width;
}
status = NV_OK;
#endif
#if defined(NVCPU_AARCH64)
nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
@@ -5374,68 +5113,6 @@ NV_STATUS NV_API_CALL nv_get_device_memory_config(
return status;
}
#if defined(NVCPU_PPC64LE)
NV_STATUS NV_API_CALL nv_get_nvlink_line_rate(
nv_state_t *nvState,
NvU32 *linerate
)
{
#if defined(NV_PNV_PCI_GET_NPU_DEV_PRESENT)
nv_linux_state_t *nvl;
struct pci_dev *npuDev;
NvU32 *pSpeedPtr = NULL;
NvU32 speed;
int len;
if (nvState != NULL)
nvl = NV_GET_NVL_FROM_NV_STATE(nvState);
else
return NV_ERR_INVALID_ARGUMENT;
if (!nvl->npu)
{
return NV_ERR_NOT_SUPPORTED;
}
npuDev = nvl->npu->devs[0];
if (!npuDev->dev.of_node)
{
nv_printf(NV_DBG_ERRORS, "NVRM: %s: OF Node not found in IBM-NPU device node\n",
__FUNCTION__);
return NV_ERR_NOT_SUPPORTED;
}
pSpeedPtr = (NvU32 *) of_get_property(npuDev->dev.of_node, "ibm,nvlink-speed", &len);
if (pSpeedPtr)
{
speed = (NvU32) be32_to_cpup(pSpeedPtr);
}
else
{
return NV_ERR_NOT_SUPPORTED;
}
if (!speed)
{
return NV_ERR_NOT_SUPPORTED;
}
else
{
*linerate = speed;
}
return NV_OK;
#endif
return NV_ERR_NOT_SUPPORTED;
}
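/*
 * Illustrative sketch only (not part of this change): the device-tree lookup
 * above could equivalently use the of_property_read_u32() helper, which
 * performs the big-endian conversion internally. `np` is assumed to be the
 * IBM-NPU device node (npuDev->dev.of_node); error handling is collapsed.
 */
static NV_STATUS nv_read_nvlink_speed_sketch(
    struct device_node *np,
    NvU32 *linerate
)
{
    u32 speed;

    if (of_property_read_u32(np, "ibm,nvlink-speed", &speed) != 0 || speed == 0)
    {
        return NV_ERR_NOT_SUPPORTED;
    }

    *linerate = speed;
    return NV_OK;
}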
#endif
NV_STATUS NV_API_CALL nv_indicate_idle(
nv_state_t *nv
)
@@ -5972,9 +5649,7 @@ void NV_API_CALL nv_disallow_runtime_suspend
void NV_API_CALL nv_flush_coherent_cpu_cache_range(nv_state_t *nv, NvU64 cpu_virtual, NvU64 size)
{
#if NVCPU_IS_PPC64LE
return nv_ibmnpu_cache_flush_range(nv, cpu_virtual, size);
#elif NVCPU_IS_AARCH64
#if NVCPU_IS_AARCH64
NvU64 va, cbsize;
NvU64 end_cpu_virtual = cpu_virtual + size;
@@ -5983,8 +5658,6 @@ void NV_API_CALL nv_flush_coherent_cpu_cache_range(nv_state_t *nv, NvU64 cpu_vir
cpu_virtual, end_cpu_virtual);
cbsize = cache_line_size();
// Align address to line size
cpu_virtual = NV_ALIGN_UP(cpu_virtual, cbsize);
// Force eviction of any cache lines from the NUMA-onlined region.
for (va = cpu_virtual; va < end_cpu_virtual; va += cbsize)
@@ -6147,7 +5820,7 @@ void NV_API_CALL nv_get_screen_info(
*pFbHeight = registered_fb[i]->var.yres;
*pFbDepth = registered_fb[i]->var.bits_per_pixel;
*pFbPitch = registered_fb[i]->fix.line_length;
*pFbSize = (NvU64)(*pFbHeight) * (NvU64)(*pFbPitch);
*pFbSize = registered_fb[i]->fix.smem_len;
return;
}
}

View File

@@ -25,7 +25,6 @@ NVIDIA_SOURCES += nvidia/nv-modeset-interface.c
NVIDIA_SOURCES += nvidia/nv-pci-table.c
NVIDIA_SOURCES += nvidia/nv-kthread-q.c
NVIDIA_SOURCES += nvidia/nv-memdbg.c
NVIDIA_SOURCES += nvidia/nv-ibmnpu.c
NVIDIA_SOURCES += nvidia/nv-report-err.c
NVIDIA_SOURCES += nvidia/nv-rsync.c
NVIDIA_SOURCES += nvidia/nv-msi.c

View File

@@ -40,9 +40,6 @@ NVIDIA_KO = nvidia/nvidia.ko
NVIDIA_BINARY_OBJECT := $(src)/nvidia/nv-kernel.o_binary
NVIDIA_BINARY_OBJECT_O := nvidia/nv-kernel.o
quiet_cmd_symlink = SYMLINK $@
cmd_symlink = ln -sf $< $@
targets += $(NVIDIA_BINARY_OBJECT_O)
$(obj)/$(NVIDIA_BINARY_OBJECT_O): $(NVIDIA_BINARY_OBJECT) FORCE
@@ -134,7 +131,6 @@ NV_CONFTEST_FUNCTION_COMPILE_TESTS += dma_map_page_attrs
NV_CONFTEST_FUNCTION_COMPILE_TESTS += write_cr4
NV_CONFTEST_FUNCTION_COMPILE_TESTS += of_find_node_by_phandle
NV_CONFTEST_FUNCTION_COMPILE_TESTS += of_node_to_nid
NV_CONFTEST_FUNCTION_COMPILE_TESTS += pnv_pci_get_npu_dev
NV_CONFTEST_FUNCTION_COMPILE_TESTS += of_get_ibm_chip_id
NV_CONFTEST_FUNCTION_COMPILE_TESTS += pci_stop_and_remove_bus_device
NV_CONFTEST_FUNCTION_COMPILE_TESTS += pci_rebar_get_possible_sizes
@@ -182,6 +178,7 @@ NV_CONFTEST_FUNCTION_COMPILE_TESTS += gpio_get_value
NV_CONFTEST_FUNCTION_COMPILE_TESTS += gpio_set_value
NV_CONFTEST_FUNCTION_COMPILE_TESTS += gpio_to_irq
NV_CONFTEST_FUNCTION_COMPILE_TESTS += icc_get
NV_CONFTEST_FUNCTION_COMPILE_TESTS += devm_of_icc_get
NV_CONFTEST_FUNCTION_COMPILE_TESTS += icc_put
NV_CONFTEST_FUNCTION_COMPILE_TESTS += icc_set_bw
NV_CONFTEST_FUNCTION_COMPILE_TESTS += dma_buf_export_args
@@ -230,8 +227,10 @@ NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_tsec_comms_alloc_me
NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_tsec_comms_free_gscco_mem
NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_memory_block_size_bytes
NV_CONFTEST_SYMBOL_COMPILE_TESTS += crypto
NV_CONFTEST_SYMBOL_COMPILE_TESTS += crypto_akcipher_verify
NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_follow_pte
NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_gpl_pci_ats_supported
NV_CONFTEST_SYMBOL_COMPILE_TESTS += ecc_digits_from_bytes
NV_CONFTEST_TYPE_COMPILE_TESTS += dma_ops
NV_CONFTEST_TYPE_COMPILE_TESTS += swiotlb_dma_ops
@@ -256,6 +255,8 @@ NV_CONFTEST_TYPE_COMPILE_TESTS += vm_area_struct_has_const_vm_flags
NV_CONFTEST_TYPE_COMPILE_TESTS += memory_failure_has_trapno_arg
NV_CONFTEST_TYPE_COMPILE_TESTS += foll_longterm_present
NV_CONFTEST_TYPE_COMPILE_TESTS += bus_type_has_iommu_ops
NV_CONFTEST_TYPE_COMPILE_TESTS += class_create_has_no_owner_arg
NV_CONFTEST_TYPE_COMPILE_TESTS += class_devnode_has_const_arg
NV_CONFTEST_GENERIC_COMPILE_TESTS += dom0_kernel_present
NV_CONFTEST_GENERIC_COMPILE_TESTS += nvidia_vgpu_kvm_build
@@ -274,3 +275,6 @@ NV_CONFTEST_GENERIC_COMPILE_TESTS += mdev_available
NV_CONFTEST_GENERIC_COMPILE_TESTS += cmd_uphy_display_port_init
NV_CONFTEST_GENERIC_COMPILE_TESTS += cmd_uphy_display_port_off
NV_CONFTEST_GENERIC_COMPILE_TESTS += memory_failure_mf_sw_simulated_defined
NV_CONFTEST_GENERIC_COMPILE_TESTS += device_vm_build
NV_CONFTEST_GENERIC_COMPILE_TESTS += pcie_reset_flr
NV_CONFTEST_GENERIC_COMPILE_TESTS += module_import_ns_takes_constant
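# Note (explanatory, not part of the build change): each NV_CONFTEST_* entry
# above names a small probe that conftest.sh compiles at build time; a passing
# probe is expected to emit a matching NV_<TEST>_PRESENT define (for example
# NV_CRYPTO_AKCIPHER_VERIFY_PRESENT or NV_ECC_DIGITS_FROM_BYTES_PRESENT) that
# the C sources then test with #if defined(...).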

View File

@@ -34,6 +34,7 @@
#include <linux/cpuset.h>
#include <linux/pid.h>
#include <linux/pid_namespace.h>
#if defined(CONFIG_LOCKDEP)
#include <linux/lockdep.h>
#endif // CONFIG_LOCKDEP
@@ -2645,3 +2646,16 @@ NV_STATUS NV_API_CALL os_find_ns_pid(void *pid_info, NvU32 *ns_pid)
return NV_OK;
}
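/*
 * Report whether the calling task is in the initial PID namespace, i.e. not
 * running inside a container-created PID namespace.
 */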
NvBool NV_API_CALL os_is_init_ns(void)
{
return (task_active_pid_ns(current) == &init_pid_ns);
}
NV_STATUS NV_API_CALL os_device_vm_present(void)
{
#if defined(NV_DEVICE_VM_BUILD)
return NV_OK;
#else
return NV_ERR_NOT_SUPPORTED;
#endif
}

View File

@@ -274,10 +274,11 @@ NV_STATUS NV_API_CALL os_lock_user_pages(
NV_STATUS NV_API_CALL os_unlock_user_pages(
NvU64 page_count,
void *page_array
void *page_array,
NvU32 flags
)
{
NvBool write = 1;
NvBool write = FLD_TEST_DRF(_LOCK_USER_PAGES, _FLAGS, _WRITE, _YES, flags);
struct page **user_pages = page_array;
NvU32 i;
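    /*
     * Hedged sketch of how the remainder of this function is expected to use
     * `write` (the actual body lies outside this hunk): pages pinned for a
     * writable mapping are marked dirty before being released, e.g.
     *
     *   for (i = 0; i < page_count; i++)
     *   {
     *       if (write)
     *           set_page_dirty_lock(user_pages[i]);
     *       put_page(user_pages[i]);
     *   }
     */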

View File

@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 1999-2020 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 1999-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
@@ -202,3 +202,35 @@ os_enable_pci_req_atomics(
#endif
return NV_ERR_NOT_SUPPORTED;
}
void NV_API_CALL os_pci_trigger_flr(void *handle)
{
struct pci_dev *pdev = (struct pci_dev *) handle;
int ret;
ret = pci_save_state(pdev);
if (ret)
{
nv_printf(NV_DBG_ERRORS,
"NVRM: %s() PCI save state failed, Skip FLR\n", __FUNCTION__);
return;
}
#if defined(NV_PCIE_RESET_FLR_PRESENT)
    // If PCI_RESET_DO_RESET is not defined in a particular kernel version,
    // define it as 0: passing 0 makes pcie_reset_flr() perform the reset
    // rather than merely probe for FLR support.
#ifndef PCI_RESET_DO_RESET
#define PCI_RESET_DO_RESET 0
#endif
ret = pcie_reset_flr(pdev, PCI_RESET_DO_RESET);
if (ret)
{
nv_printf(NV_DBG_ERRORS,
"NVRM: %s() PCI FLR might have failed\n", __FUNCTION__);
}
#else
nv_printf(NV_DBG_ERRORS,
"NVRM: %s() PCI FLR not supported\n", __FUNCTION__);
#endif
pci_restore_state(pdev);
return;
}
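/*
 * Note (explanatory): an FLR clears the function's configuration space, so
 * the pci_save_state()/pci_restore_state() pair above brackets the reset to
 * preserve BAR and MSI/MSI-X programming across it.
 */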