mirror of
https://github.com/NVIDIA/open-gpu-kernel-modules.git
535.43.02
@@ -1,29 +1,33 @@
/*******************************************************************************
Copyright (c) 2013 NVIDIA Corporation
/*
* SPDX-FileCopyrightText: Copyright (c) 2003-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
deal in the Software without restriction, including without limitation the
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.

*******************************************************************************/

#ifndef __cla06fsubch_h__
#define __cla06fsubch_h__
#ifndef _cla06fsubch_h_
#define _cla06fsubch_h_

#define NVA06F_SUBCHANNEL_2D 3
#define NVA06F_SUBCHANNEL_3D 0
#define NVA06F_SUBCHANNEL_COMPUTE 1
#define NVA06F_SUBCHANNEL_COPY_ENGINE 4
#define NVA06F_SUBCHANNEL_I2M 2

#endif // {__cla06fsubch_h__}
#endif // _cla06fsubch_h_

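/*
 * Illustrative sketch, not part of this change: the NVA06F_SUBCHANNEL_* values
 * above are what a pushbuffer encodes in the method header's SUBCHANNEL field
 * (bits 15:13 of the *_DMA_METHOD_SUBCHANNEL definitions in the GPFIFO classes
 * later in this diff).  The helper name and the assumption that the caller has
 * already built a method header word are illustrative only.
 */
static inline unsigned int route_to_copy_engine(unsigned int method_header)
{
    /* Clear bits 15:13 and insert NVA06F_SUBCHANNEL_COPY_ENGINE (4). */
    return (method_header & ~(0x7u << 13)) |
           ((unsigned int)NVA06F_SUBCHANNEL_COPY_ENGINE << 13);
}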
@@ -1,25 +1,25 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2021-2022 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
deal in the Software without restriction, including without limitation the
|
||||
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
sell copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
DEALINGS IN THE SOFTWARE.
|
||||
|
||||
*******************************************************************************/
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef _cla16f_h_
|
||||
#define _cla16f_h_
|
||||
@@ -30,9 +30,48 @@ extern "C" {
|
||||
|
||||
#include "nvtypes.h"
|
||||
|
||||
#define KEPLER_CHANNEL_GPFIFO_B (0x0000A16F)
|
||||
/* class KEPLER_CHANNEL_GPFIFO */
|
||||
/*
|
||||
* Documentation for KEPLER_CHANNEL_GPFIFO can be found in dev_pbdma.ref,
|
||||
* chapter "User Control Registers". It is documented as device NV_UDMA.
|
||||
* The GPFIFO format itself is also documented in dev_pbdma.ref,
|
||||
* NV_PPBDMA_GP_ENTRY_*. The pushbuffer format is documented in dev_ram.ref,
|
||||
* chapter "FIFO DMA RAM", NV_FIFO_DMA_*.
|
||||
*
|
||||
*/
|
||||
#define KEPLER_CHANNEL_GPFIFO_B (0x0000A16F)
|
||||
|
||||
/* pio method data structure */
|
||||
typedef volatile struct _cla16f_tag0 {
|
||||
NvV32 Reserved00[0x7c0];
|
||||
} NvA16FTypedef, KEPLER_ChannelGPFifoB;
|
||||
#define NVA16F_TYPEDEF KEPLER_CHANNELChannelGPFifo
|
||||
/* dma flow control data structure */
|
||||
typedef volatile struct _cla16f_tag1 {
|
||||
NvU32 Ignored00[0x010]; /* 0000-003f*/
|
||||
NvU32 Put; /* put offset, read/write 0040-0043*/
|
||||
NvU32 Get; /* get offset, read only 0044-0047*/
|
||||
NvU32 Reference; /* reference value, read only 0048-004b*/
|
||||
NvU32 PutHi; /* high order put offset bits 004c-004f*/
|
||||
NvU32 Ignored01[0x002]; /* 0050-0057*/
|
||||
NvU32 TopLevelGet; /* top level get offset, read only 0058-005b*/
|
||||
NvU32 TopLevelGetHi; /* high order top level get bits 005c-005f*/
|
||||
NvU32 GetHi; /* high order get offset bits 0060-0063*/
|
||||
NvU32 Ignored02[0x007]; /* 0064-007f*/
|
||||
NvU32 Ignored03; /* used to be engine yield 0080-0083*/
|
||||
NvU32 Ignored04[0x001]; /* 0084-0087*/
|
||||
NvU32 GPGet; /* GP FIFO get offset, read only 0088-008b*/
|
||||
NvU32 GPPut; /* GP FIFO put offset 008c-008f*/
|
||||
NvU32 Ignored05[0x5c];
|
||||
} NvA16FControl, KeplerBControlGPFifo;
|
||||
/* fields and values */
|
||||
#define NVA16F_NUMBER_OF_SUBCHANNELS (8)
|
||||
#define NVA16F_SET_OBJECT (0x00000000)
|
||||
#define NVA16F_SET_OBJECT_NVCLASS 15:0
|
||||
#define NVA16F_SET_OBJECT_ENGINE 20:16
|
||||
#define NVA16F_SET_OBJECT_ENGINE_SW 0x0000001f
|
||||
#define NVA16F_ILLEGAL (0x00000004)
|
||||
#define NVA16F_ILLEGAL_HANDLE 31:0
|
||||
#define NVA16F_NOP (0x00000008)
|
||||
#define NVA16F_NOP_HANDLE 31:0
|
||||
#define NVA16F_SEMAPHOREA (0x00000010)
|
||||
@@ -100,6 +139,12 @@ extern "C" {
|
||||
#define NVA16F_SET_REFERENCE_COUNT 31:0
|
||||
#define NVA16F_WFI (0x00000078)
|
||||
#define NVA16F_WFI_HANDLE 31:0
|
||||
#define NVA16F_CRC_CHECK (0x0000007c)
|
||||
#define NVA16F_CRC_CHECK_VALUE 31:0
|
||||
#define NVA16F_YIELD (0x00000080)
|
||||
#define NVA16F_YIELD_OP 1:0
|
||||
#define NVA16F_YIELD_OP_NOP 0x00000000
|
||||
|
||||
|
||||
/* GPFIFO entry format */
|
||||
#define NVA16F_GP_ENTRY__SIZE 8
|
||||
@@ -126,13 +171,28 @@ extern "C" {
|
||||
#define NVA16F_GP_ENTRY1_OPCODE_PB_CRC 0x00000003
|
||||
|
||||
/* dma method formats */
|
||||
#define NVA16F_DMA_METHOD_ADDRESS_OLD 12:2
|
||||
#define NVA16F_DMA_METHOD_ADDRESS 11:0
|
||||
#define NVA16F_DMA_SUBDEVICE_MASK 15:4
|
||||
#define NVA16F_DMA_METHOD_SUBCHANNEL 15:13
|
||||
#define NVA16F_DMA_TERT_OP 17:16
|
||||
#define NVA16F_DMA_TERT_OP_GRP0_INC_METHOD (0x00000000)
|
||||
#define NVA16F_DMA_TERT_OP_GRP0_SET_SUB_DEV_MASK (0x00000001)
|
||||
#define NVA16F_DMA_TERT_OP_GRP0_STORE_SUB_DEV_MASK (0x00000002)
|
||||
#define NVA16F_DMA_TERT_OP_GRP0_USE_SUB_DEV_MASK (0x00000003)
|
||||
#define NVA16F_DMA_TERT_OP_GRP2_NON_INC_METHOD (0x00000000)
|
||||
#define NVA16F_DMA_METHOD_COUNT_OLD 28:18
|
||||
#define NVA16F_DMA_METHOD_COUNT 28:16
|
||||
#define NVA16F_DMA_IMMD_DATA 28:16
|
||||
#define NVA16F_DMA_SEC_OP 31:29
|
||||
#define NVA16F_DMA_SEC_OP_GRP0_USE_TERT (0x00000000)
|
||||
#define NVA16F_DMA_SEC_OP_INC_METHOD (0x00000001)
|
||||
#define NVA16F_DMA_SEC_OP_GRP2_USE_TERT (0x00000002)
|
||||
#define NVA16F_DMA_SEC_OP_NON_INC_METHOD (0x00000003)
|
||||
|
||||
#define NVA16F_DMA_SEC_OP_IMMD_DATA_METHOD (0x00000004)
|
||||
#define NVA16F_DMA_SEC_OP_ONE_INC (0x00000005)
|
||||
#define NVA16F_DMA_SEC_OP_RESERVED6 (0x00000006)
|
||||
#define NVA16F_DMA_SEC_OP_END_PB_SEGMENT (0x00000007)
|
||||
/* dma incrementing method format */
|
||||
#define NVA16F_DMA_INCR_ADDRESS 11:0
|
||||
#define NVA16F_DMA_INCR_SUBCHANNEL 15:13
|
||||
@@ -140,7 +200,6 @@ extern "C" {
|
||||
#define NVA16F_DMA_INCR_OPCODE 31:29
|
||||
#define NVA16F_DMA_INCR_OPCODE_VALUE (0x00000001)
|
||||
#define NVA16F_DMA_INCR_DATA 31:0
|
||||
|
||||
/* dma non-incrementing method format */
|
||||
#define NVA16F_DMA_NONINCR_ADDRESS 11:0
|
||||
#define NVA16F_DMA_NONINCR_SUBCHANNEL 15:13
|
||||
@@ -148,13 +207,45 @@ extern "C" {
|
||||
#define NVA16F_DMA_NONINCR_OPCODE 31:29
|
||||
#define NVA16F_DMA_NONINCR_OPCODE_VALUE (0x00000003)
|
||||
#define NVA16F_DMA_NONINCR_DATA 31:0
|
||||
|
||||
/* dma increment-once method format */
|
||||
#define NVA16F_DMA_ONEINCR_ADDRESS 11:0
|
||||
#define NVA16F_DMA_ONEINCR_SUBCHANNEL 15:13
|
||||
#define NVA16F_DMA_ONEINCR_COUNT 28:16
|
||||
#define NVA16F_DMA_ONEINCR_OPCODE 31:29
|
||||
#define NVA16F_DMA_ONEINCR_OPCODE_VALUE (0x00000005)
|
||||
#define NVA16F_DMA_ONEINCR_DATA 31:0
|
||||
/* dma no-operation format */
|
||||
#define NVA16F_DMA_NOP (0x00000000)
|
||||
/* dma immediate-data format */
|
||||
#define NVA16F_DMA_IMMD_ADDRESS 11:0
|
||||
#define NVA16F_DMA_IMMD_SUBCHANNEL 15:13
|
||||
#define NVA16F_DMA_IMMD_DATA 28:16
|
||||
#define NVA16F_DMA_IMMD_OPCODE 31:29
|
||||
#define NVA16F_DMA_IMMD_OPCODE_VALUE (0x00000004)
|
||||
/* dma set sub-device mask format */
|
||||
#define NVA16F_DMA_SET_SUBDEVICE_MASK_VALUE 15:4
|
||||
#define NVA16F_DMA_SET_SUBDEVICE_MASK_OPCODE 31:16
|
||||
#define NVA16F_DMA_SET_SUBDEVICE_MASK_OPCODE_VALUE (0x00000001)
|
||||
/* dma store sub-device mask format */
|
||||
#define NVA16F_DMA_STORE_SUBDEVICE_MASK_VALUE 15:4
|
||||
#define NVA16F_DMA_STORE_SUBDEVICE_MASK_OPCODE 31:16
|
||||
#define NVA16F_DMA_STORE_SUBDEVICE_MASK_OPCODE_VALUE (0x00000002)
|
||||
/* dma use sub-device mask format */
|
||||
#define NVA16F_DMA_USE_SUBDEVICE_MASK_OPCODE 31:16
|
||||
#define NVA16F_DMA_USE_SUBDEVICE_MASK_OPCODE_VALUE (0x00000003)
|
||||
/* dma end-segment format */
|
||||
#define NVA16F_DMA_ENDSEG_OPCODE 31:29
|
||||
#define NVA16F_DMA_ENDSEG_OPCODE_VALUE (0x00000007)
|
||||
/* dma legacy incrementing/non-incrementing formats */
|
||||
#define NVA16F_DMA_ADDRESS 12:2
|
||||
#define NVA16F_DMA_SUBCH 15:13
|
||||
#define NVA16F_DMA_OPCODE3 17:16
|
||||
#define NVA16F_DMA_OPCODE3_NONE (0x00000000)
|
||||
#define NVA16F_DMA_COUNT 28:18
|
||||
#define NVA16F_DMA_OPCODE 31:29
|
||||
#define NVA16F_DMA_OPCODE_METHOD (0x00000000)
|
||||
#define NVA16F_DMA_OPCODE_NONINC_METHOD (0x00000002)
|
||||
#define NVA16F_DMA_DATA 31:0
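/*
 * Illustrative sketch, not part of this change: packing an incrementing method
 * header from the NVA16F_DMA_* fields above (SEC_OP 31:29, METHOD_COUNT 28:16,
 * METHOD_SUBCHANNEL 15:13, METHOD_ADDRESS 11:0).  Treating ADDRESS as the
 * method offset in dwords is an assumption based on the old byte-address field
 * (12:2) collapsing to 11:0; the helper name is illustrative only.
 */
static inline NvU32 nva16f_incr_method_header(NvU32 addr_dwords, NvU32 subch, NvU32 count)
{
    return ((NvU32)NVA16F_DMA_SEC_OP_INC_METHOD << 29) | /* SEC_OP            31:29 */
           ((count & 0x1fffu) << 16)                   | /* METHOD_COUNT      28:16 */
           ((subch & 0x7u) << 13)                      | /* METHOD_SUBCHANNEL 15:13 */
           (addr_dwords & 0xfffu);                       /* METHOD_ADDRESS     11:0 */
}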
#ifdef __cplusplus
|
||||
}; /* extern "C" */
|
||||
|
||||
@@ -1,24 +1,26 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2014 NVidia Corporation
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
deal in the Software without restriction, including without limitation the
|
||||
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
sell copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
DEALINGS IN THE SOFTWARE.
|
||||
*******************************************************************************/
|
||||
#ifndef _clb069_h_
|
||||
#define _clb069_h_
|
||||
|
||||
|
||||
@@ -1,28 +1,28 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2014 NVIDIA Corporation
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
deal in the Software without restriction, including without limitation the
|
||||
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
sell copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
DEALINGS IN THE SOFTWARE.
|
||||
|
||||
*******************************************************************************/
|
||||
|
||||
#ifndef _clB06f_h_
|
||||
#define _clB06f_h_
|
||||
#ifndef _clb06f_h_
|
||||
#define _clb06f_h_
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
@@ -30,10 +30,46 @@ extern "C" {
|
||||
|
||||
#include "nvtypes.h"
|
||||
|
||||
/* class MAXWELL_CHANNEL_GPFIFO */
|
||||
/*
|
||||
* Documentation for MAXWELL_CHANNEL_GPFIFO can be found in dev_pbdma.ref,
|
||||
* chapter "User Control Registers". It is documented as device NV_UDMA.
|
||||
* The GPFIFO format itself is also documented in dev_pbdma.ref,
|
||||
* NV_PPBDMA_GP_ENTRY_*. The pushbuffer format is documented in dev_ram.ref,
|
||||
* chapter "FIFO DMA RAM", NV_FIFO_DMA_*.
|
||||
*
|
||||
*/
|
||||
#define MAXWELL_CHANNEL_GPFIFO_A (0x0000B06F)
|
||||
|
||||
/* class MAXWELL_CHANNEL_GPFIFO */
|
||||
#define NVB06F_TYPEDEF MAXWELL_CHANNELChannelGPFifoA
|
||||
|
||||
/* dma flow control data structure */
|
||||
typedef volatile struct _clb06f_tag0 {
|
||||
NvU32 Ignored00[0x010]; /* 0000-003f*/
|
||||
NvU32 Put; /* put offset, read/write 0040-0043*/
|
||||
NvU32 Get; /* get offset, read only 0044-0047*/
|
||||
NvU32 Reference; /* reference value, read only 0048-004b*/
|
||||
NvU32 PutHi; /* high order put offset bits 004c-004f*/
|
||||
NvU32 Ignored01[0x002]; /* 0050-0057*/
|
||||
NvU32 TopLevelGet; /* top level get offset, read only 0058-005b*/
|
||||
NvU32 TopLevelGetHi; /* high order top level get bits 005c-005f*/
|
||||
NvU32 GetHi; /* high order get offset bits 0060-0063*/
|
||||
NvU32 Ignored02[0x007]; /* 0064-007f*/
|
||||
NvU32 Ignored03; /* used to be engine yield 0080-0083*/
|
||||
NvU32 Ignored04[0x001]; /* 0084-0087*/
|
||||
NvU32 GPGet; /* GP FIFO get offset, read only 0088-008b*/
|
||||
NvU32 GPPut; /* GP FIFO put offset 008c-008f*/
|
||||
NvU32 Ignored05[0x5c];
|
||||
} Nvb06FControl, MaxwellAControlGPFifo;
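/*
 * Illustrative sketch, not part of this change: the structure above is the
 * per-channel USERD window that a user-space driver maps to drive the GP FIFO.
 * Two minimal helpers, assuming the caller handles ring-space checks and any
 * memory barriers the platform requires; the helper names are illustrative.
 */
static inline NvU64 nvb06f_read_pb_get(MaxwellAControlGPFifo *ctrl)
{
    /* 64-bit pushbuffer get offset assembled from Get (0044) and GetHi (0060). */
    return ((NvU64)ctrl->GetHi << 32) | ctrl->Get;
}

static inline void nvb06f_gp_kick(MaxwellAControlGPFifo *ctrl, NvU32 new_gp_put)
{
    /* Publish a new GP FIFO put index; hardware fetches entries up to it. */
    ctrl->GPPut = new_gp_put;
}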
/* fields and values */
|
||||
#define NVB06F_NUMBER_OF_SUBCHANNELS (8)
|
||||
#define NVB06F_SET_OBJECT (0x00000000)
|
||||
#define NVB06F_SET_OBJECT_NVCLASS 15:0
|
||||
#define NVB06F_SET_OBJECT_ENGINE 20:16
|
||||
#define NVB06F_SET_OBJECT_ENGINE_SW 0x0000001f
|
||||
#define NVB06F_ILLEGAL (0x00000004)
|
||||
#define NVB06F_ILLEGAL_HANDLE 31:0
|
||||
#define NVB06F_NOP (0x00000008)
|
||||
#define NVB06F_NOP_HANDLE 31:0
|
||||
#define NVB06F_SEMAPHOREA (0x00000010)
|
||||
@@ -47,6 +83,8 @@ extern "C" {
|
||||
#define NVB06F_SEMAPHORED_OPERATION_ACQUIRE 0x00000001
|
||||
#define NVB06F_SEMAPHORED_OPERATION_RELEASE 0x00000002
|
||||
#define NVB06F_SEMAPHORED_OPERATION_ACQ_GEQ 0x00000004
|
||||
#define NVB06F_SEMAPHORED_OPERATION_ACQ_AND 0x00000008
|
||||
#define NVB06F_SEMAPHORED_OPERATION_REDUCTION 0x00000010
|
||||
#define NVB06F_SEMAPHORED_ACQUIRE_SWITCH 12:12
|
||||
#define NVB06F_SEMAPHORED_ACQUIRE_SWITCH_DISABLED 0x00000000
|
||||
#define NVB06F_SEMAPHORED_ACQUIRE_SWITCH_ENABLED 0x00000001
|
||||
@@ -56,8 +94,22 @@ extern "C" {
|
||||
#define NVB06F_SEMAPHORED_RELEASE_SIZE 24:24
|
||||
#define NVB06F_SEMAPHORED_RELEASE_SIZE_16BYTE 0x00000000
|
||||
#define NVB06F_SEMAPHORED_RELEASE_SIZE_4BYTE 0x00000001
|
||||
|
||||
#define NVB06F_SEMAPHORED_REDUCTION 30:27
|
||||
#define NVB06F_SEMAPHORED_REDUCTION_MIN 0x00000000
|
||||
#define NVB06F_SEMAPHORED_REDUCTION_MAX 0x00000001
|
||||
#define NVB06F_SEMAPHORED_REDUCTION_XOR 0x00000002
|
||||
#define NVB06F_SEMAPHORED_REDUCTION_AND 0x00000003
|
||||
#define NVB06F_SEMAPHORED_REDUCTION_OR 0x00000004
|
||||
#define NVB06F_SEMAPHORED_REDUCTION_ADD 0x00000005
|
||||
#define NVB06F_SEMAPHORED_REDUCTION_INC 0x00000006
|
||||
#define NVB06F_SEMAPHORED_REDUCTION_DEC 0x00000007
|
||||
#define NVB06F_SEMAPHORED_FORMAT 31:31
|
||||
#define NVB06F_SEMAPHORED_FORMAT_SIGNED 0x00000000
|
||||
#define NVB06F_SEMAPHORED_FORMAT_UNSIGNED 0x00000001
|
||||
#define NVB06F_NON_STALL_INTERRUPT (0x00000020)
|
||||
#define NVB06F_NON_STALL_INTERRUPT_HANDLE 31:0
|
||||
#define NVB06F_FB_FLUSH (0x00000024)
|
||||
#define NVB06F_FB_FLUSH_HANDLE 31:0
|
||||
// NOTE - MEM_OP_A and MEM_OP_B have been removed for gm20x to make room for
|
||||
// possible future MEM_OP features. MEM_OP_C/D have identical functionality
|
||||
// to the previous MEM_OP_A/B methods.
|
||||
@@ -84,10 +136,27 @@ extern "C" {
|
||||
#define NVB06F_MEM_OP_D_OPERATION_L2_CLEAN_COMPTAGS 0x0000000f
|
||||
#define NVB06F_MEM_OP_D_OPERATION_L2_FLUSH_DIRTY 0x00000010
|
||||
#define NVB06F_MEM_OP_D_TLB_INVALIDATE_ADDR_HI 7:0
|
||||
#define NVB06F_SET_REFERENCE (0x00000050)
|
||||
#define NVB06F_SET_REFERENCE_COUNT 31:0
|
||||
#define NVB06F_WFI (0x00000078)
|
||||
#define NVB06F_WFI_SCOPE 0:0
|
||||
#define NVB06F_WFI_SCOPE_CURRENT_SCG_TYPE 0x00000000
|
||||
#define NVB06F_WFI_SCOPE_ALL 0x00000001
|
||||
#define NVB06F_CRC_CHECK (0x0000007c)
|
||||
#define NVB06F_CRC_CHECK_VALUE 31:0
|
||||
#define NVB06F_YIELD (0x00000080)
|
||||
#define NVB06F_YIELD_OP 1:0
|
||||
#define NVB06F_YIELD_OP_NOP 0x00000000
|
||||
#define NVB06F_YIELD_OP_PBDMA_TIMESLICE 0x00000001
|
||||
#define NVB06F_YIELD_OP_RUNLIST_TIMESLICE 0x00000002
|
||||
#define NVB06F_YIELD_OP_TSG 0x00000003
|
||||
|
||||
|
||||
/* GPFIFO entry format */
|
||||
#define NVB06F_GP_ENTRY__SIZE 8
|
||||
#define NVB06F_GP_ENTRY0_FETCH 0:0
|
||||
#define NVB06F_GP_ENTRY0_FETCH_UNCONDITIONAL 0x00000000
|
||||
#define NVB06F_GP_ENTRY0_FETCH_CONDITIONAL 0x00000001
|
||||
#define NVB06F_GP_ENTRY0_GET 31:2
|
||||
#define NVB06F_GP_ENTRY0_OPERAND 31:0
|
||||
#define NVB06F_GP_ENTRY1_GET_HI 7:0
|
||||
@@ -98,11 +167,38 @@ extern "C" {
|
||||
#define NVB06F_GP_ENTRY1_LEVEL_MAIN 0x00000000
|
||||
#define NVB06F_GP_ENTRY1_LEVEL_SUBROUTINE 0x00000001
|
||||
#define NVB06F_GP_ENTRY1_LENGTH 30:10
|
||||
#define NVB06F_GP_ENTRY1_SYNC 31:31
|
||||
#define NVB06F_GP_ENTRY1_SYNC_PROCEED 0x00000000
|
||||
#define NVB06F_GP_ENTRY1_SYNC_WAIT 0x00000001
|
||||
#define NVB06F_GP_ENTRY1_OPCODE 7:0
|
||||
#define NVB06F_GP_ENTRY1_OPCODE_NOP 0x00000000
|
||||
#define NVB06F_GP_ENTRY1_OPCODE_ILLEGAL 0x00000001
|
||||
#define NVB06F_GP_ENTRY1_OPCODE_GP_CRC 0x00000002
|
||||
#define NVB06F_GP_ENTRY1_OPCODE_PB_CRC 0x00000003
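/*
 * Illustrative sketch, not part of this change: building one 8-byte GP entry
 * (NVB06F_GP_ENTRY__SIZE) from the fields above.  Treating GET/GET_HI as the
 * 4-byte-aligned pushbuffer address split across bits 31:2 and 39:32, and
 * LENGTH as the segment length in 32-bit methods, are assumptions consistent
 * with those bit ranges; the helper name is illustrative only.
 */
static inline void nvb06f_make_gp_entry(NvU32 entry[2], NvU64 pushbuf_va, NvU32 length_dwords)
{
    entry[0] = (NvU32)pushbuf_va & ~0x3u;                 /* GP_ENTRY0_GET    31:2  */
    entry[1] = (NvU32)((pushbuf_va >> 32) & 0xffu) |      /* GP_ENTRY1_GET_HI  7:0  */
               ((length_dwords & 0x1fffffu) << 10);       /* GP_ENTRY1_LENGTH 30:10 */
}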
/* dma method formats */
|
||||
#define NVB06F_DMA_METHOD_ADDRESS_OLD 12:2
|
||||
#define NVB06F_DMA_METHOD_ADDRESS 11:0
|
||||
#define NVB06F_DMA_SUBDEVICE_MASK 15:4
|
||||
#define NVB06F_DMA_METHOD_SUBCHANNEL 15:13
|
||||
#define NVB06F_DMA_TERT_OP 17:16
|
||||
#define NVB06F_DMA_TERT_OP_GRP0_INC_METHOD (0x00000000)
|
||||
#define NVB06F_DMA_TERT_OP_GRP0_SET_SUB_DEV_MASK (0x00000001)
|
||||
#define NVB06F_DMA_TERT_OP_GRP0_STORE_SUB_DEV_MASK (0x00000002)
|
||||
#define NVB06F_DMA_TERT_OP_GRP0_USE_SUB_DEV_MASK (0x00000003)
|
||||
#define NVB06F_DMA_TERT_OP_GRP2_NON_INC_METHOD (0x00000000)
|
||||
#define NVB06F_DMA_METHOD_COUNT_OLD 28:18
|
||||
#define NVB06F_DMA_METHOD_COUNT 28:16
|
||||
#define NVB06F_DMA_IMMD_DATA 28:16
|
||||
#define NVB06F_DMA_SEC_OP 31:29
|
||||
#define NVB06F_DMA_SEC_OP_GRP0_USE_TERT (0x00000000)
|
||||
#define NVB06F_DMA_SEC_OP_INC_METHOD (0x00000001)
|
||||
#define NVB06F_DMA_SEC_OP_GRP2_USE_TERT (0x00000002)
|
||||
#define NVB06F_DMA_SEC_OP_NON_INC_METHOD (0x00000003)
|
||||
#define NVB06F_DMA_SEC_OP_IMMD_DATA_METHOD (0x00000004)
|
||||
#define NVB06F_DMA_SEC_OP_ONE_INC (0x00000005)
|
||||
#define NVB06F_DMA_SEC_OP_RESERVED6 (0x00000006)
|
||||
#define NVB06F_DMA_SEC_OP_END_PB_SEGMENT (0x00000007)
|
||||
/* dma incrementing method format */
|
||||
#define NVB06F_DMA_INCR_ADDRESS 11:0
|
||||
#define NVB06F_DMA_INCR_SUBCHANNEL 15:13
|
||||
@@ -132,9 +228,33 @@ extern "C" {
|
||||
#define NVB06F_DMA_IMMD_DATA 28:16
|
||||
#define NVB06F_DMA_IMMD_OPCODE 31:29
|
||||
#define NVB06F_DMA_IMMD_OPCODE_VALUE (0x00000004)
|
||||
/* dma set sub-device mask format */
|
||||
#define NVB06F_DMA_SET_SUBDEVICE_MASK_VALUE 15:4
|
||||
#define NVB06F_DMA_SET_SUBDEVICE_MASK_OPCODE 31:16
|
||||
#define NVB06F_DMA_SET_SUBDEVICE_MASK_OPCODE_VALUE (0x00000001)
|
||||
/* dma store sub-device mask format */
|
||||
#define NVB06F_DMA_STORE_SUBDEVICE_MASK_VALUE 15:4
|
||||
#define NVB06F_DMA_STORE_SUBDEVICE_MASK_OPCODE 31:16
|
||||
#define NVB06F_DMA_STORE_SUBDEVICE_MASK_OPCODE_VALUE (0x00000002)
|
||||
/* dma use sub-device mask format */
|
||||
#define NVB06F_DMA_USE_SUBDEVICE_MASK_OPCODE 31:16
|
||||
#define NVB06F_DMA_USE_SUBDEVICE_MASK_OPCODE_VALUE (0x00000003)
|
||||
/* dma end-segment format */
|
||||
#define NVB06F_DMA_ENDSEG_OPCODE 31:29
|
||||
#define NVB06F_DMA_ENDSEG_OPCODE_VALUE (0x00000007)
|
||||
/* dma legacy incrementing/non-incrementing formats */
|
||||
#define NVB06F_DMA_ADDRESS 12:2
|
||||
#define NVB06F_DMA_SUBCH 15:13
|
||||
#define NVB06F_DMA_OPCODE3 17:16
|
||||
#define NVB06F_DMA_OPCODE3_NONE (0x00000000)
|
||||
#define NVB06F_DMA_COUNT 28:18
|
||||
#define NVB06F_DMA_OPCODE 31:29
|
||||
#define NVB06F_DMA_OPCODE_METHOD (0x00000000)
|
||||
#define NVB06F_DMA_OPCODE_NONINC_METHOD (0x00000002)
|
||||
#define NVB06F_DMA_DATA 31:0
|
||||
|
||||
#ifdef __cplusplus
|
||||
}; /* extern "C" */
|
||||
#endif
|
||||
|
||||
#endif /* _clB06F_h_ */
|
||||
#endif /* _clb06f_h_ */
|
||||
|
||||
@@ -1,19 +1,19 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2014 NVIDIA Corporation
|
||||
Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
deal in the Software without restriction, including without limitation the
|
||||
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
sell copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
Permission is hereby granted, free of charge, to any person obtaining a
|
||||
copy of this software and associated documentation files (the "Software"),
|
||||
to deal in the Software without restriction, including without limitation
|
||||
the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
and/or sell copies of the Software, and to permit persons to whom the
|
||||
Software is furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
@@ -32,6 +32,10 @@ extern "C" {
|
||||
|
||||
#define MAXWELL_DMA_COPY_A (0x0000B0B5)
|
||||
|
||||
#define NVB0B5_NOP (0x00000100)
|
||||
#define NVB0B5_NOP_PARAMETER 31:0
|
||||
#define NVB0B5_PM_TRIGGER (0x00000140)
|
||||
#define NVB0B5_PM_TRIGGER_V 31:0
|
||||
#define NVB0B5_SET_SEMAPHORE_A (0x00000240)
|
||||
#define NVB0B5_SET_SEMAPHORE_A_UPPER 7:0
|
||||
#define NVB0B5_SET_SEMAPHORE_B (0x00000244)
|
||||
@@ -183,9 +187,75 @@ extern "C" {
|
||||
#define NVB0B5_SET_REMAP_COMPONENTS_NUM_DST_COMPONENTS_TWO (0x00000001)
|
||||
#define NVB0B5_SET_REMAP_COMPONENTS_NUM_DST_COMPONENTS_THREE (0x00000002)
|
||||
#define NVB0B5_SET_REMAP_COMPONENTS_NUM_DST_COMPONENTS_FOUR (0x00000003)
|
||||
|
||||
#define NVB0B5_SET_DST_BLOCK_SIZE (0x0000070C)
|
||||
#define NVB0B5_SET_DST_BLOCK_SIZE_WIDTH 3:0
|
||||
#define NVB0B5_SET_DST_BLOCK_SIZE_WIDTH_QUARTER_GOB (0x0000000E)
|
||||
#define NVB0B5_SET_DST_BLOCK_SIZE_WIDTH_ONE_GOB (0x00000000)
|
||||
#define NVB0B5_SET_DST_BLOCK_SIZE_HEIGHT 7:4
|
||||
#define NVB0B5_SET_DST_BLOCK_SIZE_HEIGHT_ONE_GOB (0x00000000)
|
||||
#define NVB0B5_SET_DST_BLOCK_SIZE_HEIGHT_TWO_GOBS (0x00000001)
|
||||
#define NVB0B5_SET_DST_BLOCK_SIZE_HEIGHT_FOUR_GOBS (0x00000002)
|
||||
#define NVB0B5_SET_DST_BLOCK_SIZE_HEIGHT_EIGHT_GOBS (0x00000003)
|
||||
#define NVB0B5_SET_DST_BLOCK_SIZE_HEIGHT_SIXTEEN_GOBS (0x00000004)
|
||||
#define NVB0B5_SET_DST_BLOCK_SIZE_HEIGHT_THIRTYTWO_GOBS (0x00000005)
|
||||
#define NVB0B5_SET_DST_BLOCK_SIZE_DEPTH 11:8
|
||||
#define NVB0B5_SET_DST_BLOCK_SIZE_DEPTH_ONE_GOB (0x00000000)
|
||||
#define NVB0B5_SET_DST_BLOCK_SIZE_DEPTH_TWO_GOBS (0x00000001)
|
||||
#define NVB0B5_SET_DST_BLOCK_SIZE_DEPTH_FOUR_GOBS (0x00000002)
|
||||
#define NVB0B5_SET_DST_BLOCK_SIZE_DEPTH_EIGHT_GOBS (0x00000003)
|
||||
#define NVB0B5_SET_DST_BLOCK_SIZE_DEPTH_SIXTEEN_GOBS (0x00000004)
|
||||
#define NVB0B5_SET_DST_BLOCK_SIZE_DEPTH_THIRTYTWO_GOBS (0x00000005)
|
||||
#define NVB0B5_SET_DST_BLOCK_SIZE_GOB_HEIGHT 15:12
|
||||
#define NVB0B5_SET_DST_BLOCK_SIZE_GOB_HEIGHT_GOB_HEIGHT_TESLA_4 (0x00000000)
|
||||
#define NVB0B5_SET_DST_BLOCK_SIZE_GOB_HEIGHT_GOB_HEIGHT_FERMI_8 (0x00000001)
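/*
 * Illustrative sketch, not part of this change: assembling the data word for
 * NVB0B5_SET_DST_BLOCK_SIZE from the fields above (WIDTH 3:0, HEIGHT 7:4,
 * DEPTH 11:8, GOB_HEIGHT 15:12).  The particular block-linear layout chosen
 * here (1x4x1 GOBs, Fermi 8-row GOBs) is only an example.
 */
static inline NvU32 nvb0b5_dst_block_size_example(void)
{
    return (NvU32)NVB0B5_SET_DST_BLOCK_SIZE_WIDTH_ONE_GOB                         | /* 3:0   */
           ((NvU32)NVB0B5_SET_DST_BLOCK_SIZE_HEIGHT_FOUR_GOBS              << 4)  | /* 7:4   */
           ((NvU32)NVB0B5_SET_DST_BLOCK_SIZE_DEPTH_ONE_GOB                 << 8)  | /* 11:8  */
           ((NvU32)NVB0B5_SET_DST_BLOCK_SIZE_GOB_HEIGHT_GOB_HEIGHT_FERMI_8 << 12);  /* 15:12 */
}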
#define NVB0B5_SET_DST_WIDTH (0x00000710)
|
||||
#define NVB0B5_SET_DST_WIDTH_V 31:0
|
||||
#define NVB0B5_SET_DST_HEIGHT (0x00000714)
|
||||
#define NVB0B5_SET_DST_HEIGHT_V 31:0
|
||||
#define NVB0B5_SET_DST_DEPTH (0x00000718)
|
||||
#define NVB0B5_SET_DST_DEPTH_V 31:0
|
||||
#define NVB0B5_SET_DST_LAYER (0x0000071C)
|
||||
#define NVB0B5_SET_DST_LAYER_V 31:0
|
||||
#define NVB0B5_SET_DST_ORIGIN (0x00000720)
|
||||
#define NVB0B5_SET_DST_ORIGIN_X 15:0
|
||||
#define NVB0B5_SET_DST_ORIGIN_Y 31:16
|
||||
#define NVB0B5_SET_SRC_BLOCK_SIZE (0x00000728)
|
||||
#define NVB0B5_SET_SRC_BLOCK_SIZE_WIDTH 3:0
|
||||
#define NVB0B5_SET_SRC_BLOCK_SIZE_WIDTH_QUARTER_GOB (0x0000000E)
|
||||
#define NVB0B5_SET_SRC_BLOCK_SIZE_WIDTH_ONE_GOB (0x00000000)
|
||||
#define NVB0B5_SET_SRC_BLOCK_SIZE_HEIGHT 7:4
|
||||
#define NVB0B5_SET_SRC_BLOCK_SIZE_HEIGHT_ONE_GOB (0x00000000)
|
||||
#define NVB0B5_SET_SRC_BLOCK_SIZE_HEIGHT_TWO_GOBS (0x00000001)
|
||||
#define NVB0B5_SET_SRC_BLOCK_SIZE_HEIGHT_FOUR_GOBS (0x00000002)
|
||||
#define NVB0B5_SET_SRC_BLOCK_SIZE_HEIGHT_EIGHT_GOBS (0x00000003)
|
||||
#define NVB0B5_SET_SRC_BLOCK_SIZE_HEIGHT_SIXTEEN_GOBS (0x00000004)
|
||||
#define NVB0B5_SET_SRC_BLOCK_SIZE_HEIGHT_THIRTYTWO_GOBS (0x00000005)
|
||||
#define NVB0B5_SET_SRC_BLOCK_SIZE_DEPTH 11:8
|
||||
#define NVB0B5_SET_SRC_BLOCK_SIZE_DEPTH_ONE_GOB (0x00000000)
|
||||
#define NVB0B5_SET_SRC_BLOCK_SIZE_DEPTH_TWO_GOBS (0x00000001)
|
||||
#define NVB0B5_SET_SRC_BLOCK_SIZE_DEPTH_FOUR_GOBS (0x00000002)
|
||||
#define NVB0B5_SET_SRC_BLOCK_SIZE_DEPTH_EIGHT_GOBS (0x00000003)
|
||||
#define NVB0B5_SET_SRC_BLOCK_SIZE_DEPTH_SIXTEEN_GOBS (0x00000004)
|
||||
#define NVB0B5_SET_SRC_BLOCK_SIZE_DEPTH_THIRTYTWO_GOBS (0x00000005)
|
||||
#define NVB0B5_SET_SRC_BLOCK_SIZE_GOB_HEIGHT 15:12
|
||||
#define NVB0B5_SET_SRC_BLOCK_SIZE_GOB_HEIGHT_GOB_HEIGHT_TESLA_4 (0x00000000)
|
||||
#define NVB0B5_SET_SRC_BLOCK_SIZE_GOB_HEIGHT_GOB_HEIGHT_FERMI_8 (0x00000001)
|
||||
#define NVB0B5_SET_SRC_WIDTH (0x0000072C)
|
||||
#define NVB0B5_SET_SRC_WIDTH_V 31:0
|
||||
#define NVB0B5_SET_SRC_HEIGHT (0x00000730)
|
||||
#define NVB0B5_SET_SRC_HEIGHT_V 31:0
|
||||
#define NVB0B5_SET_SRC_DEPTH (0x00000734)
|
||||
#define NVB0B5_SET_SRC_DEPTH_V 31:0
|
||||
#define NVB0B5_SET_SRC_LAYER (0x00000738)
|
||||
#define NVB0B5_SET_SRC_LAYER_V 31:0
|
||||
#define NVB0B5_SET_SRC_ORIGIN (0x0000073C)
|
||||
#define NVB0B5_SET_SRC_ORIGIN_X 15:0
|
||||
#define NVB0B5_SET_SRC_ORIGIN_Y 31:16
|
||||
#define NVB0B5_PM_TRIGGER_END (0x00001114)
|
||||
#define NVB0B5_PM_TRIGGER_END_V 31:0
|
||||
|
||||
#ifdef __cplusplus
|
||||
}; /* extern "C" */
|
||||
#endif
|
||||
#endif // _clb0b5_h
|
||||
|
||||
|
||||
@@ -1,25 +1,25 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2014 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
deal in the Software without restriction, including without limitation the
|
||||
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
sell copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
DEALINGS IN THE SOFTWARE.
|
||||
|
||||
*******************************************************************************/
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef _clc06f_h_
|
||||
#define _clc06f_h_
|
||||
@@ -30,10 +30,47 @@ extern "C" {
|
||||
|
||||
#include "nvtypes.h"
|
||||
|
||||
/* class PASCAL_CHANNEL_GPFIFO */
|
||||
/*
|
||||
* Documentation for PASCAL_CHANNEL_GPFIFO can be found in dev_pbdma.ref,
|
||||
* chapter "User Control Registers". It is documented as device NV_UDMA.
|
||||
* The GPFIFO format itself is also documented in dev_pbdma.ref,
|
||||
* NV_PPBDMA_GP_ENTRY_*. The pushbuffer format is documented in dev_ram.ref,
|
||||
* chapter "FIFO DMA RAM", NV_FIFO_DMA_*.
|
||||
*
|
||||
* Note there is no .mfs file for this class.
|
||||
*/
|
||||
#define PASCAL_CHANNEL_GPFIFO_A (0x0000C06F)
|
||||
|
||||
/* class PASCAL_CHANNEL_GPFIFO_A */
|
||||
#define NVC06F_TYPEDEF PASCAL_CHANNELChannelGPFifoA
|
||||
|
||||
/* dma flow control data structure */
|
||||
typedef volatile struct Nvc06fControl_struct {
|
||||
NvU32 Ignored00[0x010]; /* 0000-003f*/
|
||||
NvU32 Put; /* put offset, read/write 0040-0043*/
|
||||
NvU32 Get; /* get offset, read only 0044-0047*/
|
||||
NvU32 Reference; /* reference value, read only 0048-004b*/
|
||||
NvU32 PutHi; /* high order put offset bits 004c-004f*/
|
||||
NvU32 Ignored01[0x002]; /* 0050-0057*/
|
||||
NvU32 TopLevelGet; /* top level get offset, read only 0058-005b*/
|
||||
NvU32 TopLevelGetHi; /* high order top level get bits 005c-005f*/
|
||||
NvU32 GetHi; /* high order get offset bits 0060-0063*/
|
||||
NvU32 Ignored02[0x007]; /* 0064-007f*/
|
||||
NvU32 Ignored03; /* used to be engine yield 0080-0083*/
|
||||
NvU32 Ignored04[0x001]; /* 0084-0087*/
|
||||
NvU32 GPGet; /* GP FIFO get offset, read only 0088-008b*/
|
||||
NvU32 GPPut; /* GP FIFO put offset 008c-008f*/
|
||||
NvU32 Ignored05[0x5c];
|
||||
} Nvc06fControl, PascalAControlGPFifo;
|
||||
|
||||
/* fields and values */
|
||||
#define NVC06F_NUMBER_OF_SUBCHANNELS (8)
|
||||
#define NVC06F_SET_OBJECT (0x00000000)
|
||||
#define NVC06F_SET_OBJECT_NVCLASS 15:0
|
||||
#define NVC06F_SET_OBJECT_ENGINE 20:16
|
||||
#define NVC06F_SET_OBJECT_ENGINE_SW 0x0000001f
|
||||
#define NVC06F_ILLEGAL (0x00000004)
|
||||
#define NVC06F_ILLEGAL_HANDLE 31:0
|
||||
#define NVC06F_NOP (0x00000008)
|
||||
#define NVC06F_NOP_HANDLE 31:0
|
||||
#define NVC06F_SEMAPHOREA (0x00000010)
|
||||
@@ -47,54 +84,33 @@ extern "C" {
|
||||
#define NVC06F_SEMAPHORED_OPERATION_ACQUIRE 0x00000001
|
||||
#define NVC06F_SEMAPHORED_OPERATION_RELEASE 0x00000002
|
||||
#define NVC06F_SEMAPHORED_OPERATION_ACQ_GEQ 0x00000004
|
||||
#define NVC06F_SEMAPHORED_OPERATION_ACQ_AND 0x00000008
|
||||
#define NVC06F_SEMAPHORED_OPERATION_REDUCTION 0x00000010
|
||||
#define NVC06F_SEMAPHORED_ACQUIRE_SWITCH 12:12
|
||||
#define NVC06F_SEMAPHORED_ACQUIRE_SWITCH_DISABLED 0x00000000
|
||||
#define NVC06F_SEMAPHORED_ACQUIRE_SWITCH_ENABLED 0x00000001
|
||||
|
||||
|
||||
/* GPFIFO entry format */
|
||||
#define NVC06F_GP_ENTRY__SIZE 8
|
||||
#define NVC06F_GP_ENTRY0_GET 31:2
|
||||
#define NVC06F_GP_ENTRY0_OPERAND 31:0
|
||||
#define NVC06F_GP_ENTRY1_GET_HI 7:0
|
||||
#define NVC06F_GP_ENTRY1_PRIV 8:8
|
||||
#define NVC06F_GP_ENTRY1_PRIV_USER 0x00000000
|
||||
#define NVC06F_GP_ENTRY1_PRIV_KERNEL 0x00000001
|
||||
#define NVC06F_GP_ENTRY1_LEVEL 9:9
|
||||
#define NVC06F_GP_ENTRY1_LEVEL_MAIN 0x00000000
|
||||
#define NVC06F_GP_ENTRY1_LEVEL_SUBROUTINE 0x00000001
|
||||
#define NVC06F_GP_ENTRY1_LENGTH 30:10
|
||||
|
||||
/* dma incrementing method format */
|
||||
#define NVC06F_DMA_INCR_ADDRESS 11:0
|
||||
#define NVC06F_DMA_INCR_SUBCHANNEL 15:13
|
||||
#define NVC06F_DMA_INCR_COUNT 28:16
|
||||
#define NVC06F_DMA_INCR_OPCODE 31:29
|
||||
#define NVC06F_DMA_INCR_OPCODE_VALUE (0x00000001)
|
||||
#define NVC06F_DMA_INCR_DATA 31:0
|
||||
/* dma non-incrementing method format */
|
||||
#define NVC06F_DMA_NONINCR_ADDRESS 11:0
|
||||
#define NVC06F_DMA_NONINCR_SUBCHANNEL 15:13
|
||||
#define NVC06F_DMA_NONINCR_COUNT 28:16
|
||||
#define NVC06F_DMA_NONINCR_OPCODE 31:29
|
||||
#define NVC06F_DMA_NONINCR_OPCODE_VALUE (0x00000003)
|
||||
#define NVC06F_DMA_NONINCR_DATA 31:0
|
||||
/* dma increment-once method format */
|
||||
#define NVC06F_DMA_ONEINCR_ADDRESS 11:0
|
||||
#define NVC06F_DMA_ONEINCR_SUBCHANNEL 15:13
|
||||
#define NVC06F_DMA_ONEINCR_COUNT 28:16
|
||||
#define NVC06F_DMA_ONEINCR_OPCODE 31:29
|
||||
#define NVC06F_DMA_ONEINCR_OPCODE_VALUE (0x00000005)
|
||||
#define NVC06F_DMA_ONEINCR_DATA 31:0
|
||||
/* dma no-operation format */
|
||||
#define NVC06F_DMA_NOP (0x00000000)
|
||||
/* dma immediate-data format */
|
||||
#define NVC06F_DMA_IMMD_ADDRESS 11:0
|
||||
#define NVC06F_DMA_IMMD_SUBCHANNEL 15:13
|
||||
#define NVC06F_DMA_IMMD_DATA 28:16
|
||||
#define NVC06F_DMA_IMMD_OPCODE 31:29
|
||||
#define NVC06F_DMA_IMMD_OPCODE_VALUE (0x00000004)
|
||||
|
||||
#define NVC06F_SEMAPHORED_RELEASE_WFI 20:20
|
||||
#define NVC06F_SEMAPHORED_RELEASE_WFI_EN 0x00000000
|
||||
#define NVC06F_SEMAPHORED_RELEASE_WFI_DIS 0x00000001
|
||||
#define NVC06F_SEMAPHORED_RELEASE_SIZE 24:24
|
||||
#define NVC06F_SEMAPHORED_RELEASE_SIZE_16BYTE 0x00000000
|
||||
#define NVC06F_SEMAPHORED_RELEASE_SIZE_4BYTE 0x00000001
|
||||
#define NVC06F_SEMAPHORED_REDUCTION 30:27
|
||||
#define NVC06F_SEMAPHORED_REDUCTION_MIN 0x00000000
|
||||
#define NVC06F_SEMAPHORED_REDUCTION_MAX 0x00000001
|
||||
#define NVC06F_SEMAPHORED_REDUCTION_XOR 0x00000002
|
||||
#define NVC06F_SEMAPHORED_REDUCTION_AND 0x00000003
|
||||
#define NVC06F_SEMAPHORED_REDUCTION_OR 0x00000004
|
||||
#define NVC06F_SEMAPHORED_REDUCTION_ADD 0x00000005
|
||||
#define NVC06F_SEMAPHORED_REDUCTION_INC 0x00000006
|
||||
#define NVC06F_SEMAPHORED_REDUCTION_DEC 0x00000007
|
||||
#define NVC06F_SEMAPHORED_FORMAT 31:31
|
||||
#define NVC06F_SEMAPHORED_FORMAT_SIGNED 0x00000000
|
||||
#define NVC06F_SEMAPHORED_FORMAT_UNSIGNED 0x00000001
|
||||
#define NVC06F_NON_STALL_INTERRUPT (0x00000020)
|
||||
#define NVC06F_NON_STALL_INTERRUPT_HANDLE 31:0
|
||||
#define NVC06F_FB_FLUSH (0x00000024) // Deprecated - use MEMBAR TYPE SYS_MEMBAR
|
||||
#define NVC06F_FB_FLUSH_HANDLE 31:0
|
||||
// NOTE - MEM_OP_A and MEM_OP_B have been replaced in gp100 with methods for
|
||||
// specifying the page address for a targeted TLB invalidate and the uTLB for
|
||||
// a targeted REPLAY_CANCEL for UVM.
|
||||
@@ -153,19 +169,142 @@ extern "C" {
|
||||
#define NVC06F_MEM_OP_D_OPERATION_L2_PEERMEM_INVALIDATE 0x0000000d
|
||||
#define NVC06F_MEM_OP_D_OPERATION_L2_SYSMEM_INVALIDATE 0x0000000e
|
||||
// CLEAN_LINES is an alias for Tegra/GPU IP usage
|
||||
#define NVC06F_MEM_OP_D_OPERATION_L2_INVALIDATE_CLEAN_LINES 0x0000000e
|
||||
// This B alias is confusing but it was missed as part of the update. Left here
|
||||
// for compatibility.
|
||||
#define NVC06F_MEM_OP_B_OPERATION_L2_INVALIDATE_CLEAN_LINES 0x0000000e
|
||||
#define NVC06F_MEM_OP_D_OPERATION_L2_CLEAN_COMPTAGS 0x0000000f
|
||||
#define NVC06F_MEM_OP_D_OPERATION_L2_FLUSH_DIRTY 0x00000010
|
||||
#define NVC06F_MEM_OP_D_OPERATION_L2_WAIT_FOR_SYS_PENDING_READS 0x00000015
|
||||
#define NVC06F_SET_REFERENCE (0x00000050)
|
||||
#define NVC06F_SET_REFERENCE_COUNT 31:0
|
||||
|
||||
// Syncpoint methods are only available on Tegra parts. Attempting to use
|
||||
// them on discrete GPUs will result in Host raising NV_PPBDMA_INTR_0_METHOD.
|
||||
#define NVC06F_SYNCPOINTA (0x00000070)
|
||||
#define NVC06F_SYNCPOINTA_PAYLOAD 31:0
|
||||
#define NVC06F_SYNCPOINTB (0x00000074)
|
||||
#define NVC06F_SYNCPOINTB_OPERATION 0:0
|
||||
#define NVC06F_SYNCPOINTB_OPERATION_WAIT 0x00000000
|
||||
#define NVC06F_SYNCPOINTB_OPERATION_INCR 0x00000001
|
||||
#define NVC06F_SYNCPOINTB_WAIT_SWITCH 4:4
|
||||
#define NVC06F_SYNCPOINTB_WAIT_SWITCH_DIS 0x00000000
|
||||
#define NVC06F_SYNCPOINTB_WAIT_SWITCH_EN 0x00000001
|
||||
#define NVC06F_SYNCPOINTB_SYNCPT_INDEX 19:8
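/*
 * Illustrative sketch, not part of this change: encoding the data word for
 * NVC06F_SYNCPOINTB from the fields above (OPERATION 0:0, WAIT_SWITCH 4:4,
 * SYNCPT_INDEX 19:8).  Per the note above, syncpoint methods only apply to
 * Tegra parts; the helper name is illustrative only.
 */
static inline NvU32 nvc06f_syncpointb_incr(NvU32 syncpt_index)
{
    return (NvU32)NVC06F_SYNCPOINTB_OPERATION_INCR |      /* OPERATION     0:0 */
           ((syncpt_index & 0xfffu) << 8);                /* SYNCPT_INDEX 19:8 */
}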
#define NVC06F_WFI (0x00000078)
|
||||
#define NVC06F_WFI_SCOPE 0:0
|
||||
#define NVC06F_WFI_SCOPE_CURRENT_SCG_TYPE 0x00000000
|
||||
#define NVC06F_WFI_SCOPE_ALL 0x00000001
|
||||
#define NVC06F_CRC_CHECK (0x0000007c)
|
||||
#define NVC06F_CRC_CHECK_VALUE 31:0
|
||||
#define NVC06F_YIELD (0x00000080)
|
||||
#define NVC06F_YIELD_OP 1:0
|
||||
#define NVC06F_YIELD_OP_NOP 0x00000000
|
||||
#define NVC06F_YIELD_OP_PBDMA_TIMESLICE 0x00000001
|
||||
#define NVC06F_YIELD_OP_RUNLIST_TIMESLICE 0x00000002
|
||||
#define NVC06F_YIELD_OP_TSG 0x00000003
|
||||
|
||||
|
||||
/* GPFIFO entry format */
|
||||
#define NVC06F_GP_ENTRY__SIZE 8
|
||||
#define NVC06F_GP_ENTRY0_FETCH 0:0
|
||||
#define NVC06F_GP_ENTRY0_FETCH_UNCONDITIONAL 0x00000000
|
||||
#define NVC06F_GP_ENTRY0_FETCH_CONDITIONAL 0x00000001
|
||||
#define NVC06F_GP_ENTRY0_GET 31:2
|
||||
#define NVC06F_GP_ENTRY0_OPERAND 31:0
|
||||
#define NVC06F_GP_ENTRY1_GET_HI 7:0
|
||||
#define NVC06F_GP_ENTRY1_PRIV 8:8
|
||||
#define NVC06F_GP_ENTRY1_PRIV_USER 0x00000000
|
||||
#define NVC06F_GP_ENTRY1_PRIV_KERNEL 0x00000001
|
||||
#define NVC06F_GP_ENTRY1_LEVEL 9:9
|
||||
#define NVC06F_GP_ENTRY1_LEVEL_MAIN 0x00000000
|
||||
#define NVC06F_GP_ENTRY1_LEVEL_SUBROUTINE 0x00000001
|
||||
#define NVC06F_GP_ENTRY1_LENGTH 30:10
|
||||
#define NVC06F_GP_ENTRY1_SYNC 31:31
|
||||
#define NVC06F_GP_ENTRY1_SYNC_PROCEED 0x00000000
|
||||
#define NVC06F_GP_ENTRY1_SYNC_WAIT 0x00000001
|
||||
#define NVC06F_GP_ENTRY1_OPCODE 7:0
|
||||
#define NVC06F_GP_ENTRY1_OPCODE_NOP 0x00000000
|
||||
#define NVC06F_GP_ENTRY1_OPCODE_ILLEGAL 0x00000001
|
||||
#define NVC06F_GP_ENTRY1_OPCODE_GP_CRC 0x00000002
|
||||
#define NVC06F_GP_ENTRY1_OPCODE_PB_CRC 0x00000003
|
||||
|
||||
/* dma method formats */
|
||||
#define NVC06F_DMA_METHOD_ADDRESS_OLD 12:2
|
||||
#define NVC06F_DMA_METHOD_ADDRESS 11:0
|
||||
#define NVC06F_DMA_SUBDEVICE_MASK 15:4
|
||||
#define NVC06F_DMA_METHOD_SUBCHANNEL 15:13
|
||||
#define NVC06F_DMA_TERT_OP 17:16
|
||||
#define NVC06F_DMA_TERT_OP_GRP0_INC_METHOD (0x00000000)
|
||||
#define NVC06F_DMA_TERT_OP_GRP0_SET_SUB_DEV_MASK (0x00000001)
|
||||
#define NVC06F_DMA_TERT_OP_GRP0_STORE_SUB_DEV_MASK (0x00000002)
|
||||
#define NVC06F_DMA_TERT_OP_GRP0_USE_SUB_DEV_MASK (0x00000003)
|
||||
#define NVC06F_DMA_TERT_OP_GRP2_NON_INC_METHOD (0x00000000)
|
||||
#define NVC06F_DMA_METHOD_COUNT_OLD 28:18
|
||||
#define NVC06F_DMA_METHOD_COUNT 28:16
|
||||
#define NVC06F_DMA_IMMD_DATA 28:16
|
||||
#define NVC06F_DMA_SEC_OP 31:29
|
||||
#define NVC06F_DMA_SEC_OP_GRP0_USE_TERT (0x00000000)
|
||||
#define NVC06F_DMA_SEC_OP_INC_METHOD (0x00000001)
|
||||
#define NVC06F_DMA_SEC_OP_GRP2_USE_TERT (0x00000002)
|
||||
#define NVC06F_DMA_SEC_OP_NON_INC_METHOD (0x00000003)
|
||||
#define NVC06F_DMA_SEC_OP_IMMD_DATA_METHOD (0x00000004)
|
||||
#define NVC06F_DMA_SEC_OP_ONE_INC (0x00000005)
|
||||
#define NVC06F_DMA_SEC_OP_RESERVED6 (0x00000006)
|
||||
#define NVC06F_DMA_SEC_OP_END_PB_SEGMENT (0x00000007)
|
||||
/* dma incrementing method format */
|
||||
#define NVC06F_DMA_INCR_ADDRESS 11:0
|
||||
#define NVC06F_DMA_INCR_SUBCHANNEL 15:13
|
||||
#define NVC06F_DMA_INCR_COUNT 28:16
|
||||
#define NVC06F_DMA_INCR_OPCODE 31:29
|
||||
#define NVC06F_DMA_INCR_OPCODE_VALUE (0x00000001)
|
||||
#define NVC06F_DMA_INCR_DATA 31:0
|
||||
/* dma non-incrementing method format */
|
||||
#define NVC06F_DMA_NONINCR_ADDRESS 11:0
|
||||
#define NVC06F_DMA_NONINCR_SUBCHANNEL 15:13
|
||||
#define NVC06F_DMA_NONINCR_COUNT 28:16
|
||||
#define NVC06F_DMA_NONINCR_OPCODE 31:29
|
||||
#define NVC06F_DMA_NONINCR_OPCODE_VALUE (0x00000003)
|
||||
#define NVC06F_DMA_NONINCR_DATA 31:0
|
||||
/* dma increment-once method format */
|
||||
#define NVC06F_DMA_ONEINCR_ADDRESS 11:0
|
||||
#define NVC06F_DMA_ONEINCR_SUBCHANNEL 15:13
|
||||
#define NVC06F_DMA_ONEINCR_COUNT 28:16
|
||||
#define NVC06F_DMA_ONEINCR_OPCODE 31:29
|
||||
#define NVC06F_DMA_ONEINCR_OPCODE_VALUE (0x00000005)
|
||||
#define NVC06F_DMA_ONEINCR_DATA 31:0
|
||||
/* dma no-operation format */
|
||||
#define NVC06F_DMA_NOP (0x00000000)
|
||||
/* dma immediate-data format */
|
||||
#define NVC06F_DMA_IMMD_ADDRESS 11:0
|
||||
#define NVC06F_DMA_IMMD_SUBCHANNEL 15:13
|
||||
#define NVC06F_DMA_IMMD_DATA 28:16
|
||||
#define NVC06F_DMA_IMMD_OPCODE 31:29
|
||||
#define NVC06F_DMA_IMMD_OPCODE_VALUE (0x00000004)
|
||||
/* dma set sub-device mask format */
|
||||
#define NVC06F_DMA_SET_SUBDEVICE_MASK_VALUE 15:4
|
||||
#define NVC06F_DMA_SET_SUBDEVICE_MASK_OPCODE 31:16
|
||||
#define NVC06F_DMA_SET_SUBDEVICE_MASK_OPCODE_VALUE (0x00000001)
|
||||
/* dma store sub-device mask format */
|
||||
#define NVC06F_DMA_STORE_SUBDEVICE_MASK_VALUE 15:4
|
||||
#define NVC06F_DMA_STORE_SUBDEVICE_MASK_OPCODE 31:16
|
||||
#define NVC06F_DMA_STORE_SUBDEVICE_MASK_OPCODE_VALUE (0x00000002)
|
||||
/* dma use sub-device mask format */
|
||||
#define NVC06F_DMA_USE_SUBDEVICE_MASK_OPCODE 31:16
|
||||
#define NVC06F_DMA_USE_SUBDEVICE_MASK_OPCODE_VALUE (0x00000003)
|
||||
/* dma end-segment format */
|
||||
#define NVC06F_DMA_ENDSEG_OPCODE 31:29
|
||||
#define NVC06F_DMA_ENDSEG_OPCODE_VALUE (0x00000007)
|
||||
/* dma legacy incrementing/non-incrementing formats */
|
||||
#define NVC06F_DMA_ADDRESS 12:2
|
||||
#define NVC06F_DMA_SUBCH 15:13
|
||||
#define NVC06F_DMA_OPCODE3 17:16
|
||||
#define NVC06F_DMA_OPCODE3_NONE (0x00000000)
|
||||
#define NVC06F_DMA_COUNT 28:18
|
||||
#define NVC06F_DMA_OPCODE 31:29
|
||||
#define NVC06F_DMA_OPCODE_METHOD (0x00000000)
|
||||
#define NVC06F_DMA_OPCODE_NONINC_METHOD (0x00000002)
|
||||
#define NVC06F_DMA_DATA 31:0
|
||||
|
||||
#ifdef __cplusplus
|
||||
}; /* extern "C" */
|
||||
#endif
|
||||
|
||||
@@ -1,19 +1,19 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2014 NVIDIA Corporation
|
||||
Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
deal in the Software without restriction, including without limitation the
|
||||
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
sell copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
Permission is hereby granted, free of charge, to any person obtaining a
|
||||
copy of this software and associated documentation files (the "Software"),
|
||||
to deal in the Software without restriction, including without limitation
|
||||
the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
and/or sell copies of the Software, and to permit persons to whom the
|
||||
Software is furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
@@ -32,6 +32,10 @@ extern "C" {
|
||||
|
||||
#define PASCAL_DMA_COPY_A (0x0000C0B5)
|
||||
|
||||
#define NVC0B5_NOP (0x00000100)
|
||||
#define NVC0B5_NOP_PARAMETER 31:0
|
||||
#define NVC0B5_PM_TRIGGER (0x00000140)
|
||||
#define NVC0B5_PM_TRIGGER_V 31:0
|
||||
#define NVC0B5_SET_SEMAPHORE_A (0x00000240)
|
||||
#define NVC0B5_SET_SEMAPHORE_A_UPPER 16:0
|
||||
#define NVC0B5_SET_SEMAPHORE_B (0x00000244)
|
||||
@@ -115,6 +119,10 @@ extern "C" {
|
||||
#define NVC0B5_LAUNCH_DMA_SRC_BYPASS_L2 20:20
|
||||
#define NVC0B5_LAUNCH_DMA_SRC_BYPASS_L2_USE_PTE_SETTING (0x00000000)
|
||||
#define NVC0B5_LAUNCH_DMA_SRC_BYPASS_L2_FORCE_VOLATILE (0x00000001)
|
||||
#define NVC0B5_LAUNCH_DMA_DST_BYPASS_L2 21:21
|
||||
#define NVC0B5_LAUNCH_DMA_DST_BYPASS_L2_USE_PTE_SETTING (0x00000000)
|
||||
#define NVC0B5_LAUNCH_DMA_DST_BYPASS_L2_FORCE_VOLATILE (0x00000001)
|
||||
#define NVC0B5_LAUNCH_DMA_RESERVED 31:28
|
||||
#define NVC0B5_OFFSET_IN_UPPER (0x00000400)
|
||||
#define NVC0B5_OFFSET_IN_UPPER_UPPER 16:0
|
||||
#define NVC0B5_OFFSET_IN_LOWER (0x00000404)
|
||||
@@ -183,6 +191,68 @@ extern "C" {
|
||||
#define NVC0B5_SET_REMAP_COMPONENTS_NUM_DST_COMPONENTS_TWO (0x00000001)
|
||||
#define NVC0B5_SET_REMAP_COMPONENTS_NUM_DST_COMPONENTS_THREE (0x00000002)
|
||||
#define NVC0B5_SET_REMAP_COMPONENTS_NUM_DST_COMPONENTS_FOUR (0x00000003)
|
||||
#define NVC0B5_SET_DST_BLOCK_SIZE (0x0000070C)
|
||||
#define NVC0B5_SET_DST_BLOCK_SIZE_WIDTH 3:0
|
||||
#define NVC0B5_SET_DST_BLOCK_SIZE_WIDTH_ONE_GOB (0x00000000)
|
||||
#define NVC0B5_SET_DST_BLOCK_SIZE_HEIGHT 7:4
|
||||
#define NVC0B5_SET_DST_BLOCK_SIZE_HEIGHT_ONE_GOB (0x00000000)
|
||||
#define NVC0B5_SET_DST_BLOCK_SIZE_HEIGHT_TWO_GOBS (0x00000001)
|
||||
#define NVC0B5_SET_DST_BLOCK_SIZE_HEIGHT_FOUR_GOBS (0x00000002)
|
||||
#define NVC0B5_SET_DST_BLOCK_SIZE_HEIGHT_EIGHT_GOBS (0x00000003)
|
||||
#define NVC0B5_SET_DST_BLOCK_SIZE_HEIGHT_SIXTEEN_GOBS (0x00000004)
|
||||
#define NVC0B5_SET_DST_BLOCK_SIZE_HEIGHT_THIRTYTWO_GOBS (0x00000005)
|
||||
#define NVC0B5_SET_DST_BLOCK_SIZE_DEPTH 11:8
|
||||
#define NVC0B5_SET_DST_BLOCK_SIZE_DEPTH_ONE_GOB (0x00000000)
|
||||
#define NVC0B5_SET_DST_BLOCK_SIZE_DEPTH_TWO_GOBS (0x00000001)
|
||||
#define NVC0B5_SET_DST_BLOCK_SIZE_DEPTH_FOUR_GOBS (0x00000002)
|
||||
#define NVC0B5_SET_DST_BLOCK_SIZE_DEPTH_EIGHT_GOBS (0x00000003)
|
||||
#define NVC0B5_SET_DST_BLOCK_SIZE_DEPTH_SIXTEEN_GOBS (0x00000004)
|
||||
#define NVC0B5_SET_DST_BLOCK_SIZE_DEPTH_THIRTYTWO_GOBS (0x00000005)
|
||||
#define NVC0B5_SET_DST_BLOCK_SIZE_GOB_HEIGHT 15:12
|
||||
#define NVC0B5_SET_DST_BLOCK_SIZE_GOB_HEIGHT_GOB_HEIGHT_FERMI_8 (0x00000001)
|
||||
#define NVC0B5_SET_DST_WIDTH (0x00000710)
|
||||
#define NVC0B5_SET_DST_WIDTH_V 31:0
|
||||
#define NVC0B5_SET_DST_HEIGHT (0x00000714)
|
||||
#define NVC0B5_SET_DST_HEIGHT_V 31:0
|
||||
#define NVC0B5_SET_DST_DEPTH (0x00000718)
|
||||
#define NVC0B5_SET_DST_DEPTH_V 31:0
|
||||
#define NVC0B5_SET_DST_LAYER (0x0000071C)
|
||||
#define NVC0B5_SET_DST_LAYER_V 31:0
|
||||
#define NVC0B5_SET_DST_ORIGIN (0x00000720)
|
||||
#define NVC0B5_SET_DST_ORIGIN_X 15:0
|
||||
#define NVC0B5_SET_DST_ORIGIN_Y 31:16
|
||||
#define NVC0B5_SET_SRC_BLOCK_SIZE (0x00000728)
|
||||
#define NVC0B5_SET_SRC_BLOCK_SIZE_WIDTH 3:0
|
||||
#define NVC0B5_SET_SRC_BLOCK_SIZE_WIDTH_ONE_GOB (0x00000000)
|
||||
#define NVC0B5_SET_SRC_BLOCK_SIZE_HEIGHT 7:4
|
||||
#define NVC0B5_SET_SRC_BLOCK_SIZE_HEIGHT_ONE_GOB (0x00000000)
|
||||
#define NVC0B5_SET_SRC_BLOCK_SIZE_HEIGHT_TWO_GOBS (0x00000001)
|
||||
#define NVC0B5_SET_SRC_BLOCK_SIZE_HEIGHT_FOUR_GOBS (0x00000002)
|
||||
#define NVC0B5_SET_SRC_BLOCK_SIZE_HEIGHT_EIGHT_GOBS (0x00000003)
|
||||
#define NVC0B5_SET_SRC_BLOCK_SIZE_HEIGHT_SIXTEEN_GOBS (0x00000004)
|
||||
#define NVC0B5_SET_SRC_BLOCK_SIZE_HEIGHT_THIRTYTWO_GOBS (0x00000005)
|
||||
#define NVC0B5_SET_SRC_BLOCK_SIZE_DEPTH 11:8
|
||||
#define NVC0B5_SET_SRC_BLOCK_SIZE_DEPTH_ONE_GOB (0x00000000)
|
||||
#define NVC0B5_SET_SRC_BLOCK_SIZE_DEPTH_TWO_GOBS (0x00000001)
|
||||
#define NVC0B5_SET_SRC_BLOCK_SIZE_DEPTH_FOUR_GOBS (0x00000002)
|
||||
#define NVC0B5_SET_SRC_BLOCK_SIZE_DEPTH_EIGHT_GOBS (0x00000003)
|
||||
#define NVC0B5_SET_SRC_BLOCK_SIZE_DEPTH_SIXTEEN_GOBS (0x00000004)
|
||||
#define NVC0B5_SET_SRC_BLOCK_SIZE_DEPTH_THIRTYTWO_GOBS (0x00000005)
|
||||
#define NVC0B5_SET_SRC_BLOCK_SIZE_GOB_HEIGHT 15:12
|
||||
#define NVC0B5_SET_SRC_BLOCK_SIZE_GOB_HEIGHT_GOB_HEIGHT_FERMI_8 (0x00000001)
|
||||
#define NVC0B5_SET_SRC_WIDTH (0x0000072C)
|
||||
#define NVC0B5_SET_SRC_WIDTH_V 31:0
|
||||
#define NVC0B5_SET_SRC_HEIGHT (0x00000730)
|
||||
#define NVC0B5_SET_SRC_HEIGHT_V 31:0
|
||||
#define NVC0B5_SET_SRC_DEPTH (0x00000734)
|
||||
#define NVC0B5_SET_SRC_DEPTH_V 31:0
|
||||
#define NVC0B5_SET_SRC_LAYER (0x00000738)
|
||||
#define NVC0B5_SET_SRC_LAYER_V 31:0
|
||||
#define NVC0B5_SET_SRC_ORIGIN (0x0000073C)
|
||||
#define NVC0B5_SET_SRC_ORIGIN_X 15:0
|
||||
#define NVC0B5_SET_SRC_ORIGIN_Y 31:16
|
||||
#define NVC0B5_PM_TRIGGER_END (0x00001114)
|
||||
#define NVC0B5_PM_TRIGGER_END_V 31:0
|
||||
|
||||
#ifdef __cplusplus
|
||||
}; /* extern "C" */
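The HI:LO tokens in the method defines above are bit ranges, not values: NVC0B5_SET_DST_BLOCK_SIZE_HEIGHT 7:4 names bits 7..4 of the 32-bit method data. Below is a minimal, hedged sketch of how such ranges are typically consumed; the FIELD_* helpers are illustrative stand-ins, not the driver's own DRF macros.

#include <stdint.h>

/* "HI:LO" expands inside a conditional: (1 ? 7:4) == 7 and (0 ? 7:4) == 4. */
#define FIELD_HI(f)      (1 ? f)
#define FIELD_LO(f)      (0 ? f)
#define FIELD_MASK(f)    ((0xFFFFFFFFu >> (31 - FIELD_HI(f) + FIELD_LO(f))) << FIELD_LO(f))
#define FIELD_NUM(f, v)  (((uint32_t)(v) << FIELD_LO(f)) & FIELD_MASK(f))

/* Example: a 1x2x1-GOB destination block using the Fermi 8-row GOB height. */
static inline uint32_t dst_block_size_value(void)
{
    return FIELD_NUM(NVC0B5_SET_DST_BLOCK_SIZE_WIDTH,      NVC0B5_SET_DST_BLOCK_SIZE_WIDTH_ONE_GOB) |
           FIELD_NUM(NVC0B5_SET_DST_BLOCK_SIZE_HEIGHT,     NVC0B5_SET_DST_BLOCK_SIZE_HEIGHT_TWO_GOBS) |
           FIELD_NUM(NVC0B5_SET_DST_BLOCK_SIZE_DEPTH,      NVC0B5_SET_DST_BLOCK_SIZE_DEPTH_ONE_GOB) |
           FIELD_NUM(NVC0B5_SET_DST_BLOCK_SIZE_GOB_HEIGHT, NVC0B5_SET_DST_BLOCK_SIZE_GOB_HEIGHT_GOB_HEIGHT_FERMI_8);
}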
|
||||
|
||||
@@ -1,19 +1,19 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2014 NVIDIA Corporation
|
||||
Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
deal in the Software without restriction, including without limitation the
|
||||
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
sell copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
Permission is hereby granted, free of charge, to any person obtaining a
|
||||
copy of this software and associated documentation files (the "Software"),
|
||||
to deal in the Software without restriction, including without limitation
|
||||
the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
and/or sell copies of the Software, and to permit persons to whom the
|
||||
Software is furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
@@ -32,6 +32,10 @@ extern "C" {
|
||||
|
||||
#define PASCAL_DMA_COPY_B (0x0000C1B5)
|
||||
|
||||
#define NVC1B5_NOP (0x00000100)
|
||||
#define NVC1B5_NOP_PARAMETER 31:0
|
||||
#define NVC1B5_PM_TRIGGER (0x00000140)
|
||||
#define NVC1B5_PM_TRIGGER_V 31:0
|
||||
#define NVC1B5_SET_SEMAPHORE_A (0x00000240)
|
||||
#define NVC1B5_SET_SEMAPHORE_A_UPPER 16:0
|
||||
#define NVC1B5_SET_SEMAPHORE_B (0x00000244)
|
||||
@@ -115,6 +119,14 @@ extern "C" {
|
||||
#define NVC1B5_LAUNCH_DMA_SRC_BYPASS_L2 20:20
|
||||
#define NVC1B5_LAUNCH_DMA_SRC_BYPASS_L2_USE_PTE_SETTING (0x00000000)
|
||||
#define NVC1B5_LAUNCH_DMA_SRC_BYPASS_L2_FORCE_VOLATILE (0x00000001)
|
||||
#define NVC1B5_LAUNCH_DMA_DST_BYPASS_L2 21:21
|
||||
#define NVC1B5_LAUNCH_DMA_DST_BYPASS_L2_USE_PTE_SETTING (0x00000000)
|
||||
#define NVC1B5_LAUNCH_DMA_DST_BYPASS_L2_FORCE_VOLATILE (0x00000001)
|
||||
#define NVC1B5_LAUNCH_DMA_VPRMODE 23:22
|
||||
#define NVC1B5_LAUNCH_DMA_VPRMODE_VPR_NONE (0x00000000)
|
||||
#define NVC1B5_LAUNCH_DMA_VPRMODE_VPR_VID2VID (0x00000001)
|
||||
#define NVC1B5_LAUNCH_DMA_RESERVED_START_OF_COPY 24:24
|
||||
#define NVC1B5_LAUNCH_DMA_RESERVED_ERR_CODE 31:28
|
||||
#define NVC1B5_OFFSET_IN_UPPER (0x00000400)
|
||||
#define NVC1B5_OFFSET_IN_UPPER_UPPER 16:0
|
||||
#define NVC1B5_OFFSET_IN_LOWER (0x00000404)
|
||||
@@ -183,6 +195,76 @@ extern "C" {
|
||||
#define NVC1B5_SET_REMAP_COMPONENTS_NUM_DST_COMPONENTS_TWO (0x00000001)
|
||||
#define NVC1B5_SET_REMAP_COMPONENTS_NUM_DST_COMPONENTS_THREE (0x00000002)
|
||||
#define NVC1B5_SET_REMAP_COMPONENTS_NUM_DST_COMPONENTS_FOUR (0x00000003)
|
||||
#define NVC1B5_SET_DST_BLOCK_SIZE (0x0000070C)
|
||||
#define NVC1B5_SET_DST_BLOCK_SIZE_WIDTH 3:0
|
||||
#define NVC1B5_SET_DST_BLOCK_SIZE_WIDTH_ONE_GOB (0x00000000)
|
||||
#define NVC1B5_SET_DST_BLOCK_SIZE_HEIGHT 7:4
|
||||
#define NVC1B5_SET_DST_BLOCK_SIZE_HEIGHT_ONE_GOB (0x00000000)
|
||||
#define NVC1B5_SET_DST_BLOCK_SIZE_HEIGHT_TWO_GOBS (0x00000001)
|
||||
#define NVC1B5_SET_DST_BLOCK_SIZE_HEIGHT_FOUR_GOBS (0x00000002)
|
||||
#define NVC1B5_SET_DST_BLOCK_SIZE_HEIGHT_EIGHT_GOBS (0x00000003)
|
||||
#define NVC1B5_SET_DST_BLOCK_SIZE_HEIGHT_SIXTEEN_GOBS (0x00000004)
|
||||
#define NVC1B5_SET_DST_BLOCK_SIZE_HEIGHT_THIRTYTWO_GOBS (0x00000005)
|
||||
#define NVC1B5_SET_DST_BLOCK_SIZE_DEPTH 11:8
|
||||
#define NVC1B5_SET_DST_BLOCK_SIZE_DEPTH_ONE_GOB (0x00000000)
|
||||
#define NVC1B5_SET_DST_BLOCK_SIZE_DEPTH_TWO_GOBS (0x00000001)
|
||||
#define NVC1B5_SET_DST_BLOCK_SIZE_DEPTH_FOUR_GOBS (0x00000002)
|
||||
#define NVC1B5_SET_DST_BLOCK_SIZE_DEPTH_EIGHT_GOBS (0x00000003)
|
||||
#define NVC1B5_SET_DST_BLOCK_SIZE_DEPTH_SIXTEEN_GOBS (0x00000004)
|
||||
#define NVC1B5_SET_DST_BLOCK_SIZE_DEPTH_THIRTYTWO_GOBS (0x00000005)
|
||||
#define NVC1B5_SET_DST_BLOCK_SIZE_GOB_HEIGHT 15:12
|
||||
#define NVC1B5_SET_DST_BLOCK_SIZE_GOB_HEIGHT_GOB_HEIGHT_FERMI_8 (0x00000001)
|
||||
#define NVC1B5_SET_DST_WIDTH (0x00000710)
|
||||
#define NVC1B5_SET_DST_WIDTH_V 31:0
|
||||
#define NVC1B5_SET_DST_HEIGHT (0x00000714)
|
||||
#define NVC1B5_SET_DST_HEIGHT_V 31:0
|
||||
#define NVC1B5_SET_DST_DEPTH (0x00000718)
|
||||
#define NVC1B5_SET_DST_DEPTH_V 31:0
|
||||
#define NVC1B5_SET_DST_LAYER (0x0000071C)
|
||||
#define NVC1B5_SET_DST_LAYER_V 31:0
|
||||
#define NVC1B5_SET_DST_ORIGIN (0x00000720)
|
||||
#define NVC1B5_SET_DST_ORIGIN_X 15:0
|
||||
#define NVC1B5_SET_DST_ORIGIN_Y 31:16
|
||||
#define NVC1B5_SET_SRC_BLOCK_SIZE (0x00000728)
|
||||
#define NVC1B5_SET_SRC_BLOCK_SIZE_WIDTH 3:0
|
||||
#define NVC1B5_SET_SRC_BLOCK_SIZE_WIDTH_ONE_GOB (0x00000000)
|
||||
#define NVC1B5_SET_SRC_BLOCK_SIZE_HEIGHT 7:4
|
||||
#define NVC1B5_SET_SRC_BLOCK_SIZE_HEIGHT_ONE_GOB (0x00000000)
|
||||
#define NVC1B5_SET_SRC_BLOCK_SIZE_HEIGHT_TWO_GOBS (0x00000001)
|
||||
#define NVC1B5_SET_SRC_BLOCK_SIZE_HEIGHT_FOUR_GOBS (0x00000002)
|
||||
#define NVC1B5_SET_SRC_BLOCK_SIZE_HEIGHT_EIGHT_GOBS (0x00000003)
|
||||
#define NVC1B5_SET_SRC_BLOCK_SIZE_HEIGHT_SIXTEEN_GOBS (0x00000004)
|
||||
#define NVC1B5_SET_SRC_BLOCK_SIZE_HEIGHT_THIRTYTWO_GOBS (0x00000005)
|
||||
#define NVC1B5_SET_SRC_BLOCK_SIZE_DEPTH 11:8
|
||||
#define NVC1B5_SET_SRC_BLOCK_SIZE_DEPTH_ONE_GOB (0x00000000)
|
||||
#define NVC1B5_SET_SRC_BLOCK_SIZE_DEPTH_TWO_GOBS (0x00000001)
|
||||
#define NVC1B5_SET_SRC_BLOCK_SIZE_DEPTH_FOUR_GOBS (0x00000002)
|
||||
#define NVC1B5_SET_SRC_BLOCK_SIZE_DEPTH_EIGHT_GOBS (0x00000003)
|
||||
#define NVC1B5_SET_SRC_BLOCK_SIZE_DEPTH_SIXTEEN_GOBS (0x00000004)
|
||||
#define NVC1B5_SET_SRC_BLOCK_SIZE_DEPTH_THIRTYTWO_GOBS (0x00000005)
|
||||
#define NVC1B5_SET_SRC_BLOCK_SIZE_GOB_HEIGHT 15:12
|
||||
#define NVC1B5_SET_SRC_BLOCK_SIZE_GOB_HEIGHT_GOB_HEIGHT_FERMI_8 (0x00000001)
|
||||
#define NVC1B5_SET_SRC_WIDTH (0x0000072C)
|
||||
#define NVC1B5_SET_SRC_WIDTH_V 31:0
|
||||
#define NVC1B5_SET_SRC_HEIGHT (0x00000730)
|
||||
#define NVC1B5_SET_SRC_HEIGHT_V 31:0
|
||||
#define NVC1B5_SET_SRC_DEPTH (0x00000734)
|
||||
#define NVC1B5_SET_SRC_DEPTH_V 31:0
|
||||
#define NVC1B5_SET_SRC_LAYER (0x00000738)
|
||||
#define NVC1B5_SET_SRC_LAYER_V 31:0
|
||||
#define NVC1B5_SET_SRC_ORIGIN (0x0000073C)
|
||||
#define NVC1B5_SET_SRC_ORIGIN_X 15:0
|
||||
#define NVC1B5_SET_SRC_ORIGIN_Y 31:16
|
||||
#define NVC1B5_SRC_ORIGIN_X (0x00000744)
|
||||
#define NVC1B5_SRC_ORIGIN_X_VALUE 31:0
|
||||
#define NVC1B5_SRC_ORIGIN_Y (0x00000748)
|
||||
#define NVC1B5_SRC_ORIGIN_Y_VALUE 31:0
|
||||
#define NVC1B5_DST_ORIGIN_X (0x0000074C)
|
||||
#define NVC1B5_DST_ORIGIN_X_VALUE 31:0
|
||||
#define NVC1B5_DST_ORIGIN_Y (0x00000750)
|
||||
#define NVC1B5_DST_ORIGIN_Y_VALUE 31:0
|
||||
#define NVC1B5_PM_TRIGGER_END (0x00001114)
|
||||
#define NVC1B5_PM_TRIGGER_END_V 31:0
|
||||
|
||||
#ifdef __cplusplus
|
||||
}; /* extern "C" */
|
||||
|
||||
@@ -1,19 +1,19 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2016-2022 NVIDIA Corporation
|
||||
Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
deal in the Software without restriction, including without limitation the
|
||||
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
sell copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
Permission is hereby granted, free of charge, to any person obtaining a
|
||||
copy of this software and associated documentation files (the "Software"),
|
||||
to deal in the Software without restriction, including without limitation
|
||||
the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
and/or sell copies of the Software, and to permit persons to whom the
|
||||
Software is furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
@@ -32,6 +32,10 @@ extern "C" {
|
||||
|
||||
#define VOLTA_DMA_COPY_A (0x0000C3B5)
|
||||
|
||||
#define NVC3B5_NOP (0x00000100)
|
||||
#define NVC3B5_NOP_PARAMETER 31:0
|
||||
#define NVC3B5_PM_TRIGGER (0x00000140)
|
||||
#define NVC3B5_PM_TRIGGER_V 31:0
|
||||
#define NVC3B5_SET_SEMAPHORE_A (0x00000240)
|
||||
#define NVC3B5_SET_SEMAPHORE_A_UPPER 16:0
|
||||
#define NVC3B5_SET_SEMAPHORE_B (0x00000244)
|
||||
@@ -126,8 +130,6 @@ extern "C" {
|
||||
#define NVC3B5_LAUNCH_DMA_VPRMODE 23:22
|
||||
#define NVC3B5_LAUNCH_DMA_VPRMODE_VPR_NONE (0x00000000)
|
||||
#define NVC3B5_LAUNCH_DMA_VPRMODE_VPR_VID2VID (0x00000001)
|
||||
#define NVC3B5_LAUNCH_DMA_VPRMODE_VPR_VID2SYS (0x00000002)
|
||||
#define NVC3B5_LAUNCH_DMA_VPRMODE_VPR_SYS2VID (0x00000003)
|
||||
#define NVC3B5_LAUNCH_DMA_RESERVED_START_OF_COPY 24:24
|
||||
#define NVC3B5_LAUNCH_DMA_RESERVED_ERR_CODE 31:28
|
||||
#define NVC3B5_OFFSET_IN_UPPER (0x00000400)
|
||||
@@ -198,6 +200,76 @@ extern "C" {
|
||||
#define NVC3B5_SET_REMAP_COMPONENTS_NUM_DST_COMPONENTS_TWO (0x00000001)
|
||||
#define NVC3B5_SET_REMAP_COMPONENTS_NUM_DST_COMPONENTS_THREE (0x00000002)
|
||||
#define NVC3B5_SET_REMAP_COMPONENTS_NUM_DST_COMPONENTS_FOUR (0x00000003)
|
||||
#define NVC3B5_SET_DST_BLOCK_SIZE (0x0000070C)
|
||||
#define NVC3B5_SET_DST_BLOCK_SIZE_WIDTH 3:0
|
||||
#define NVC3B5_SET_DST_BLOCK_SIZE_WIDTH_ONE_GOB (0x00000000)
|
||||
#define NVC3B5_SET_DST_BLOCK_SIZE_HEIGHT 7:4
|
||||
#define NVC3B5_SET_DST_BLOCK_SIZE_HEIGHT_ONE_GOB (0x00000000)
|
||||
#define NVC3B5_SET_DST_BLOCK_SIZE_HEIGHT_TWO_GOBS (0x00000001)
|
||||
#define NVC3B5_SET_DST_BLOCK_SIZE_HEIGHT_FOUR_GOBS (0x00000002)
|
||||
#define NVC3B5_SET_DST_BLOCK_SIZE_HEIGHT_EIGHT_GOBS (0x00000003)
|
||||
#define NVC3B5_SET_DST_BLOCK_SIZE_HEIGHT_SIXTEEN_GOBS (0x00000004)
|
||||
#define NVC3B5_SET_DST_BLOCK_SIZE_HEIGHT_THIRTYTWO_GOBS (0x00000005)
|
||||
#define NVC3B5_SET_DST_BLOCK_SIZE_DEPTH 11:8
|
||||
#define NVC3B5_SET_DST_BLOCK_SIZE_DEPTH_ONE_GOB (0x00000000)
|
||||
#define NVC3B5_SET_DST_BLOCK_SIZE_DEPTH_TWO_GOBS (0x00000001)
|
||||
#define NVC3B5_SET_DST_BLOCK_SIZE_DEPTH_FOUR_GOBS (0x00000002)
|
||||
#define NVC3B5_SET_DST_BLOCK_SIZE_DEPTH_EIGHT_GOBS (0x00000003)
|
||||
#define NVC3B5_SET_DST_BLOCK_SIZE_DEPTH_SIXTEEN_GOBS (0x00000004)
|
||||
#define NVC3B5_SET_DST_BLOCK_SIZE_DEPTH_THIRTYTWO_GOBS (0x00000005)
|
||||
#define NVC3B5_SET_DST_BLOCK_SIZE_GOB_HEIGHT 15:12
|
||||
#define NVC3B5_SET_DST_BLOCK_SIZE_GOB_HEIGHT_GOB_HEIGHT_FERMI_8 (0x00000001)
|
||||
#define NVC3B5_SET_DST_WIDTH (0x00000710)
|
||||
#define NVC3B5_SET_DST_WIDTH_V 31:0
|
||||
#define NVC3B5_SET_DST_HEIGHT (0x00000714)
|
||||
#define NVC3B5_SET_DST_HEIGHT_V 31:0
|
||||
#define NVC3B5_SET_DST_DEPTH (0x00000718)
|
||||
#define NVC3B5_SET_DST_DEPTH_V 31:0
|
||||
#define NVC3B5_SET_DST_LAYER (0x0000071C)
|
||||
#define NVC3B5_SET_DST_LAYER_V 31:0
|
||||
#define NVC3B5_SET_DST_ORIGIN (0x00000720)
|
||||
#define NVC3B5_SET_DST_ORIGIN_X 15:0
|
||||
#define NVC3B5_SET_DST_ORIGIN_Y 31:16
|
||||
#define NVC3B5_SET_SRC_BLOCK_SIZE (0x00000728)
|
||||
#define NVC3B5_SET_SRC_BLOCK_SIZE_WIDTH 3:0
|
||||
#define NVC3B5_SET_SRC_BLOCK_SIZE_WIDTH_ONE_GOB (0x00000000)
|
||||
#define NVC3B5_SET_SRC_BLOCK_SIZE_HEIGHT 7:4
|
||||
#define NVC3B5_SET_SRC_BLOCK_SIZE_HEIGHT_ONE_GOB (0x00000000)
|
||||
#define NVC3B5_SET_SRC_BLOCK_SIZE_HEIGHT_TWO_GOBS (0x00000001)
|
||||
#define NVC3B5_SET_SRC_BLOCK_SIZE_HEIGHT_FOUR_GOBS (0x00000002)
|
||||
#define NVC3B5_SET_SRC_BLOCK_SIZE_HEIGHT_EIGHT_GOBS (0x00000003)
|
||||
#define NVC3B5_SET_SRC_BLOCK_SIZE_HEIGHT_SIXTEEN_GOBS (0x00000004)
|
||||
#define NVC3B5_SET_SRC_BLOCK_SIZE_HEIGHT_THIRTYTWO_GOBS (0x00000005)
|
||||
#define NVC3B5_SET_SRC_BLOCK_SIZE_DEPTH 11:8
|
||||
#define NVC3B5_SET_SRC_BLOCK_SIZE_DEPTH_ONE_GOB (0x00000000)
|
||||
#define NVC3B5_SET_SRC_BLOCK_SIZE_DEPTH_TWO_GOBS (0x00000001)
|
||||
#define NVC3B5_SET_SRC_BLOCK_SIZE_DEPTH_FOUR_GOBS (0x00000002)
|
||||
#define NVC3B5_SET_SRC_BLOCK_SIZE_DEPTH_EIGHT_GOBS (0x00000003)
|
||||
#define NVC3B5_SET_SRC_BLOCK_SIZE_DEPTH_SIXTEEN_GOBS (0x00000004)
|
||||
#define NVC3B5_SET_SRC_BLOCK_SIZE_DEPTH_THIRTYTWO_GOBS (0x00000005)
|
||||
#define NVC3B5_SET_SRC_BLOCK_SIZE_GOB_HEIGHT 15:12
|
||||
#define NVC3B5_SET_SRC_BLOCK_SIZE_GOB_HEIGHT_GOB_HEIGHT_FERMI_8 (0x00000001)
|
||||
#define NVC3B5_SET_SRC_WIDTH (0x0000072C)
|
||||
#define NVC3B5_SET_SRC_WIDTH_V 31:0
|
||||
#define NVC3B5_SET_SRC_HEIGHT (0x00000730)
|
||||
#define NVC3B5_SET_SRC_HEIGHT_V 31:0
|
||||
#define NVC3B5_SET_SRC_DEPTH (0x00000734)
|
||||
#define NVC3B5_SET_SRC_DEPTH_V 31:0
|
||||
#define NVC3B5_SET_SRC_LAYER (0x00000738)
|
||||
#define NVC3B5_SET_SRC_LAYER_V 31:0
|
||||
#define NVC3B5_SET_SRC_ORIGIN (0x0000073C)
|
||||
#define NVC3B5_SET_SRC_ORIGIN_X 15:0
|
||||
#define NVC3B5_SET_SRC_ORIGIN_Y 31:16
|
||||
#define NVC3B5_SRC_ORIGIN_X (0x00000744)
|
||||
#define NVC3B5_SRC_ORIGIN_X_VALUE 31:0
|
||||
#define NVC3B5_SRC_ORIGIN_Y (0x00000748)
|
||||
#define NVC3B5_SRC_ORIGIN_Y_VALUE 31:0
|
||||
#define NVC3B5_DST_ORIGIN_X (0x0000074C)
|
||||
#define NVC3B5_DST_ORIGIN_X_VALUE 31:0
|
||||
#define NVC3B5_DST_ORIGIN_Y (0x00000750)
|
||||
#define NVC3B5_DST_ORIGIN_Y_VALUE 31:0
|
||||
#define NVC3B5_PM_TRIGGER_END (0x00001114)
|
||||
#define NVC3B5_PM_TRIGGER_END_V 31:0
|
||||
|
||||
#ifdef __cplusplus
|
||||
}; /* extern "C" */
|
||||
|
||||
97
kernel-open/nvidia-uvm/clcba2.h
Normal file
@@ -0,0 +1,97 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2021-2022 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
deal in the Software without restriction, including without limitation the
|
||||
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
sell copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
DEALINGS IN THE SOFTWARE.
|
||||
|
||||
*******************************************************************************/
|
||||
|
||||
#include "nvtypes.h"
|
||||
|
||||
#ifndef _clcba2_h_
|
||||
#define _clcba2_h_
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define HOPPER_SEC2_WORK_LAUNCH_A (0x0000CBA2)
|
||||
|
||||
#define NVCBA2_DECRYPT_COPY_SRC_ADDR_HI (0x00000400)
|
||||
#define NVCBA2_DECRYPT_COPY_SRC_ADDR_HI_DATA 24:0
|
||||
#define NVCBA2_DECRYPT_COPY_SRC_ADDR_LO (0x00000404)
|
||||
#define NVCBA2_DECRYPT_COPY_SRC_ADDR_LO_DATA 31:4
|
||||
#define NVCBA2_DECRYPT_COPY_DST_ADDR_HI (0x00000408)
|
||||
#define NVCBA2_DECRYPT_COPY_DST_ADDR_HI_DATA 24:0
|
||||
#define NVCBA2_DECRYPT_COPY_DST_ADDR_LO (0x0000040c)
|
||||
#define NVCBA2_DECRYPT_COPY_DST_ADDR_LO_DATA 31:4
|
||||
#define NVCBA2_DECRYPT_COPY_SIZE (0x00000410)
|
||||
#define NVCBA2_DECRYPT_COPY_SIZE_DATA 31:2
|
||||
#define NVCBA2_DECRYPT_COPY_AUTH_TAG_ADDR_HI (0x00000414)
|
||||
#define NVCBA2_DECRYPT_COPY_AUTH_TAG_ADDR_HI_DATA 24:0
|
||||
#define NVCBA2_DECRYPT_COPY_AUTH_TAG_ADDR_LO (0x00000418)
|
||||
#define NVCBA2_DECRYPT_COPY_AUTH_TAG_ADDR_LO_DATA 31:4
|
||||
#define NVCBA2_METHOD_STREAM_AUTH_TAG_ADDR_HI (0x0000041C)
|
||||
#define NVCBA2_METHOD_STREAM_AUTH_TAG_ADDR_HI_DATA 24:0
|
||||
#define NVCBA2_METHOD_STREAM_AUTH_TAG_ADDR_LO (0x00000420)
|
||||
#define NVCBA2_METHOD_STREAM_AUTH_TAG_ADDR_LO_DATA 31:4
|
||||
#define NVCBA2_SEMAPHORE_A (0x00000440)
|
||||
#define NVCBA2_SEMAPHORE_A_UPPER 24:0
|
||||
#define NVCBA2_SEMAPHORE_B (0x00000444)
|
||||
#define NVCBA2_SEMAPHORE_B_LOWER 31:2
|
||||
#define NVCBA2_SET_SEMAPHORE_PAYLOAD_LOWER (0x00000448)
|
||||
#define NVCBA2_SET_SEMAPHORE_PAYLOAD_LOWER_DATA 31:0
|
||||
#define NVCBA2_SET_SEMAPHORE_PAYLOAD_UPPER (0x0000044C)
|
||||
#define NVCBA2_SET_SEMAPHORE_PAYLOAD_UPPER_DATA 31:0
|
||||
#define NVCBA2_SEMAPHORE_D (0x00000450)
|
||||
#define NVCBA2_SEMAPHORE_D_NOTIFY_INTR 0:0
|
||||
#define NVCBA2_SEMAPHORE_D_NOTIFY_INTR_DISABLE (0x00000000)
|
||||
#define NVCBA2_SEMAPHORE_D_NOTIFY_INTR_ENABLE (0x00000001)
|
||||
#define NVCBA2_SEMAPHORE_D_PAYLOAD_SIZE 1:1
|
||||
#define NVCBA2_SEMAPHORE_D_PAYLOAD_SIZE_32_BIT (0x00000000)
|
||||
#define NVCBA2_SEMAPHORE_D_PAYLOAD_SIZE_64_BIT (0x00000001)
|
||||
#define NVCBA2_SEMAPHORE_D_TIMESTAMP 2:2
|
||||
#define NVCBA2_SEMAPHORE_D_TIMESTAMP_DISABLE (0x00000000)
|
||||
#define NVCBA2_SEMAPHORE_D_TIMESTAMP_ENABLE (0x00000001)
|
||||
#define NVCBA2_SEMAPHORE_D_FLUSH_DISABLE 3:3
|
||||
#define NVCBA2_SEMAPHORE_D_FLUSH_DISABLE_FALSE (0x00000000)
|
||||
#define NVCBA2_SEMAPHORE_D_FLUSH_DISABLE_TRUE (0x00000001)
|
||||
#define NVCBA2_EXECUTE (0x00000470)
|
||||
#define NVCBA2_EXECUTE_NOTIFY 0:0
|
||||
#define NVCBA2_EXECUTE_NOTIFY_DISABLE (0x00000000)
|
||||
#define NVCBA2_EXECUTE_NOTIFY_ENABLE (0x00000001)
|
||||
#define NVCBA2_EXECUTE_NOTIFY_ON 1:1
|
||||
#define NVCBA2_EXECUTE_NOTIFY_ON_END (0x00000000)
|
||||
#define NVCBA2_EXECUTE_NOTIFY_ON_BEGIN (0x00000001)
|
||||
#define NVCBA2_EXECUTE_FLUSH_DISABLE 2:2
|
||||
#define NVCBA2_EXECUTE_FLUSH_DISABLE_FALSE (0x00000000)
|
||||
#define NVCBA2_EXECUTE_FLUSH_DISABLE_TRUE (0x00000001)
|
||||
#define NVCBA2_EXECUTE_NOTIFY_INTR 3:3
|
||||
#define NVCBA2_EXECUTE_NOTIFY_INTR_DISABLE (0x00000000)
|
||||
#define NVCBA2_EXECUTE_NOTIFY_INTR_ENABLE (0x00000001)
|
||||
#define NVCBA2_EXECUTE_PAYLOAD_SIZE 4:4
|
||||
#define NVCBA2_EXECUTE_PAYLOAD_SIZE_32_BIT (0x00000000)
|
||||
#define NVCBA2_EXECUTE_PAYLOAD_SIZE_64_BIT (0x00000001)
|
||||
#define NVCBA2_EXECUTE_TIMESTAMP 5:5
|
||||
#define NVCBA2_EXECUTE_TIMESTAMP_DISABLE (0x00000000)
|
||||
#define NVCBA2_EXECUTE_TIMESTAMP_ENABLE (0x00000001)
|
||||
|
||||
#ifdef __cplusplus
|
||||
}; /* extern "C" */
|
||||
#endif
|
||||
#endif // _clcba2_h
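The HOPPER_SEC2_WORK_LAUNCH_A fields above are mostly single-bit flags at fixed positions (0:0, 1:1, ...). A hedged sketch of composing the NVCBA2_SEMAPHORE_D method data from them follows; how the value is actually written into a channel's method stream is outside this sketch.

#include <stdint.h>

/* Each flag is one bit wide, so its bit position doubles as the shift. */
static inline uint32_t semaphore_d_value(int want_intr, int payload_64bit, int want_timestamp)
{
    uint32_t v = 0;

    if (want_intr)
        v |= (uint32_t)NVCBA2_SEMAPHORE_D_NOTIFY_INTR_ENABLE << 0;   /* bits 0:0 */
    if (payload_64bit)
        v |= (uint32_t)NVCBA2_SEMAPHORE_D_PAYLOAD_SIZE_64_BIT << 1;  /* bits 1:1 */
    if (want_timestamp)
        v |= (uint32_t)NVCBA2_SEMAPHORE_D_TIMESTAMP_ENABLE << 2;     /* bits 2:2 */

    return v;                        /* FLUSH_DISABLE (bit 3) stays FALSE */
}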
|
||||
@@ -301,7 +301,7 @@ static void _q_flush_function(void *args)
|
||||
static void _raw_q_flush(nv_kthread_q_t *q)
|
||||
{
|
||||
nv_kthread_q_item_t q_item;
|
||||
DECLARE_COMPLETION(completion);
|
||||
DECLARE_COMPLETION_ONSTACK(completion);
|
||||
|
||||
nv_kthread_q_item_init(&q_item, _q_flush_function, &completion);
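For context on this hunk: DECLARE_COMPLETION_ONSTACK() is the kernel idiom for completions that live on the stack (it hands lockdep a per-instance key), whereas plain DECLARE_COMPLETION() is intended for static or global objects. A minimal illustrative sketch, not the driver's code:

#include <linux/completion.h>

static void wait_for_helper_example(void (*kick)(struct completion *))
{
    DECLARE_COMPLETION_ONSTACK(done);   /* on-stack => ONSTACK variant */

    kick(&done);                        /* helper eventually calls complete(&done) */
    wait_for_completion(&done);
}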
|
||||
|
||||
|
||||
@@ -1,6 +1,11 @@
|
||||
NVIDIA_UVM_SOURCES ?=
|
||||
NVIDIA_UVM_SOURCES_CXX ?=
|
||||
|
||||
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_ats_sva.c
|
||||
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_conf_computing.c
|
||||
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_sec2_test.c
|
||||
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_maxwell_sec2.c
|
||||
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_hopper_sec2.c
|
||||
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_common.c
|
||||
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_linux.c
|
||||
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_debug_optimized.c
|
||||
|
||||
@@ -84,7 +84,9 @@ NV_CONFTEST_FUNCTION_COMPILE_TESTS += ioasid_get
|
||||
NV_CONFTEST_FUNCTION_COMPILE_TESTS += mm_pasid_set
|
||||
NV_CONFTEST_FUNCTION_COMPILE_TESTS += migrate_vma_setup
|
||||
NV_CONFTEST_FUNCTION_COMPILE_TESTS += mmget_not_zero
|
||||
NV_CONFTEST_FUNCTION_COMPILE_TESTS += mmgrab
|
||||
NV_CONFTEST_FUNCTION_COMPILE_TESTS += iommu_sva_bind_device_has_drvdata_arg
|
||||
NV_CONFTEST_FUNCTION_COMPILE_TESTS += vm_fault_to_errno
|
||||
|
||||
NV_CONFTEST_TYPE_COMPILE_TESTS += backing_dev_info
|
||||
NV_CONFTEST_TYPE_COMPILE_TESTS += mm_context_t
|
||||
@@ -104,5 +106,7 @@ NV_CONFTEST_TYPE_COMPILE_TESTS += mm_has_mmap_lock
|
||||
NV_CONFTEST_TYPE_COMPILE_TESTS += migrate_vma_added_flags
|
||||
NV_CONFTEST_TYPE_COMPILE_TESTS += migrate_device_range
|
||||
NV_CONFTEST_TYPE_COMPILE_TESTS += vm_area_struct_has_const_vm_flags
|
||||
NV_CONFTEST_TYPE_COMPILE_TESTS += handle_mm_fault_has_mm_arg
|
||||
NV_CONFTEST_TYPE_COMPILE_TESTS += handle_mm_fault_has_pt_regs_arg
|
||||
|
||||
NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_int_active_memcg
|
||||
|
||||
@@ -28,6 +28,7 @@
|
||||
#include "uvm_lock.h"
|
||||
#include "uvm_test.h"
|
||||
#include "uvm_va_space.h"
|
||||
#include "uvm_va_space_mm.h"
|
||||
#include "uvm_va_range.h"
|
||||
#include "uvm_va_block.h"
|
||||
#include "uvm_tools.h"
|
||||
@@ -72,6 +73,11 @@ uvm_fd_type_t uvm_fd_type(struct file *filp, void **ptr_val)
|
||||
BUILD_BUG_ON(__alignof__(uvm_va_space_t) < (1UL << UVM_FD_TYPE_BITS));
|
||||
break;
|
||||
|
||||
case UVM_FD_MM:
|
||||
UVM_ASSERT(ptr);
|
||||
BUILD_BUG_ON(__alignof__(struct file) < (1UL << UVM_FD_TYPE_BITS));
|
||||
break;
|
||||
|
||||
default:
|
||||
UVM_ASSERT(0);
|
||||
}
|
||||
@@ -82,6 +88,106 @@ uvm_fd_type_t uvm_fd_type(struct file *filp, void **ptr_val)
|
||||
return type;
|
||||
}
|
||||
|
||||
void *uvm_fd_get_type(struct file *filp, uvm_fd_type_t type)
|
||||
{
|
||||
void *ptr;
|
||||
|
||||
UVM_ASSERT(uvm_file_is_nvidia_uvm(filp));
|
||||
|
||||
if (uvm_fd_type(filp, &ptr) == type)
|
||||
return ptr;
|
||||
else
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static NV_STATUS uvm_api_mm_initialize(UVM_MM_INITIALIZE_PARAMS *params, struct file *filp)
|
||||
{
|
||||
uvm_va_space_t *va_space;
|
||||
uvm_va_space_mm_t *va_space_mm;
|
||||
struct file *uvm_file;
|
||||
uvm_fd_type_t old_fd_type;
|
||||
struct mm_struct *mm;
|
||||
NV_STATUS status;
|
||||
|
||||
uvm_file = fget(params->uvmFd);
|
||||
if (!uvm_file_is_nvidia_uvm(uvm_file)) {
|
||||
status = NV_ERR_INVALID_ARGUMENT;
|
||||
goto err;
|
||||
}
|
||||
|
||||
if (uvm_fd_type(uvm_file, (void **)&va_space) != UVM_FD_VA_SPACE) {
|
||||
status = NV_ERR_INVALID_ARGUMENT;
|
||||
goto err;
|
||||
}
|
||||
|
||||
// Tell userspace the MM FD is not required and it may be released
|
||||
// with no loss of functionality.
|
||||
if (!uvm_va_space_mm_enabled(va_space)) {
|
||||
status = NV_WARN_NOTHING_TO_DO;
|
||||
goto err;
|
||||
}
|
||||
|
||||
old_fd_type = nv_atomic_long_cmpxchg((atomic_long_t *)&filp->private_data,
|
||||
UVM_FD_UNINITIALIZED,
|
||||
UVM_FD_INITIALIZING);
|
||||
old_fd_type &= UVM_FD_TYPE_MASK;
|
||||
if (old_fd_type != UVM_FD_UNINITIALIZED) {
|
||||
status = NV_ERR_IN_USE;
|
||||
goto err;
|
||||
}
|
||||
|
||||
va_space_mm = &va_space->va_space_mm;
|
||||
uvm_spin_lock(&va_space_mm->lock);
|
||||
switch (va_space->va_space_mm.state) {
|
||||
// We only allow the va_space_mm to be initialised once. If
|
||||
// userspace passed the UVM FD to another process it is up to
|
||||
// userspace to ensure it also passes the UVM MM FD that
|
||||
// initialised the va_space_mm or arranges some other way to keep
|
||||
// a reference on the FD.
|
||||
case UVM_VA_SPACE_MM_STATE_ALIVE:
|
||||
status = NV_ERR_INVALID_STATE;
|
||||
goto err_release_unlock;
|
||||
break;
|
||||
|
||||
// Once userspace has released the va_space_mm the GPU is
|
||||
// effectively dead and no new work can be started. We don't
|
||||
// support re-initializing once userspace has closed the FD.
|
||||
case UVM_VA_SPACE_MM_STATE_RELEASED:
|
||||
status = NV_ERR_PAGE_TABLE_NOT_AVAIL;
|
||||
goto err_release_unlock;
|
||||
break;
|
||||
|
||||
// Keep the warnings at bay
|
||||
case UVM_VA_SPACE_MM_STATE_UNINITIALIZED:
|
||||
mm = va_space->va_space_mm.mm;
|
||||
if (!mm || !mmget_not_zero(mm)) {
|
||||
status = NV_ERR_PAGE_TABLE_NOT_AVAIL;
|
||||
goto err_release_unlock;
|
||||
}
|
||||
|
||||
va_space_mm->state = UVM_VA_SPACE_MM_STATE_ALIVE;
|
||||
break;
|
||||
|
||||
default:
|
||||
UVM_ASSERT(0);
|
||||
break;
|
||||
}
|
||||
uvm_spin_unlock(&va_space_mm->lock);
|
||||
atomic_long_set_release((atomic_long_t *)&filp->private_data, (long)uvm_file | UVM_FD_MM);
|
||||
|
||||
return NV_OK;
|
||||
|
||||
err_release_unlock:
|
||||
uvm_spin_unlock(&va_space_mm->lock);
|
||||
atomic_long_set_release((atomic_long_t *)&filp->private_data, UVM_FD_UNINITIALIZED);
|
||||
|
||||
err:
|
||||
if (uvm_file)
|
||||
fput(uvm_file);
|
||||
|
||||
return status;
|
||||
}
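The UVM_FD_* bookkeeping above works by tagging filp->private_data: the pointed-to objects are aligned to more than 1 << UVM_FD_TYPE_BITS bytes (hence the BUILD_BUG_ONs), so the low bits are free to carry a uvm_fd_type_t. A stand-alone sketch of that tagging scheme; the names and the 2-bit width are illustrative, not taken from the headers.

#include <stdint.h>

#define FD_TYPE_BITS 2u
#define FD_TYPE_MASK ((1ul << FD_TYPE_BITS) - 1)

/* Pack a sufficiently aligned pointer together with a small type tag. */
static inline unsigned long fd_tag(void *ptr, unsigned long type)
{
    return (unsigned long)(uintptr_t)ptr | (type & FD_TYPE_MASK);
}

/* Recover both halves; this mirrors what uvm_fd_type() does with private_data. */
static inline void *fd_untag(unsigned long tagged, unsigned long *type_out)
{
    *type_out = tagged & FD_TYPE_MASK;
    return (void *)(uintptr_t)(tagged & ~FD_TYPE_MASK);
}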
|
||||
|
||||
// Called when opening /dev/nvidia-uvm. This code doesn't take any UVM locks, so
|
||||
// there's no need to acquire g_uvm_global.pm.lock, but if that changes the PM
|
||||
// lock will need to be taken.
|
||||
@@ -147,20 +253,44 @@ static void uvm_release_deferred(void *data)
|
||||
uvm_up_read(&g_uvm_global.pm.lock);
|
||||
}
|
||||
|
||||
static void uvm_mm_release(struct file *filp, struct file *uvm_file)
|
||||
{
|
||||
uvm_va_space_t *va_space = uvm_va_space_get(uvm_file);
|
||||
uvm_va_space_mm_t *va_space_mm = &va_space->va_space_mm;
|
||||
struct mm_struct *mm = va_space_mm->mm;
|
||||
|
||||
if (uvm_va_space_mm_enabled(va_space)) {
|
||||
uvm_va_space_mm_unregister(va_space);
|
||||
|
||||
if (uvm_va_space_mm_enabled(va_space))
|
||||
uvm_mmput(mm);
|
||||
|
||||
va_space_mm->mm = NULL;
|
||||
fput(uvm_file);
|
||||
}
|
||||
}
|
||||
|
||||
static int uvm_release(struct inode *inode, struct file *filp)
|
||||
{
|
||||
void *ptr;
|
||||
uvm_va_space_t *va_space;
|
||||
uvm_fd_type_t fd_type;
|
||||
int ret;
|
||||
|
||||
fd_type = uvm_fd_type(filp, (void **)&va_space);
|
||||
fd_type = uvm_fd_type(filp, &ptr);
|
||||
UVM_ASSERT(fd_type != UVM_FD_INITIALIZING);
|
||||
if (fd_type == UVM_FD_UNINITIALIZED) {
|
||||
uvm_kvfree(filp->f_mapping);
|
||||
return 0;
|
||||
}
|
||||
else if (fd_type == UVM_FD_MM) {
|
||||
uvm_kvfree(filp->f_mapping);
|
||||
uvm_mm_release(filp, (struct file *)ptr);
|
||||
return 0;
|
||||
}
|
||||
|
||||
UVM_ASSERT(fd_type == UVM_FD_VA_SPACE);
|
||||
va_space = (uvm_va_space_t *)ptr;
|
||||
filp->private_data = NULL;
|
||||
filp->f_mapping = NULL;
|
||||
|
||||
@@ -756,6 +886,13 @@ out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
bool uvm_vma_is_managed(struct vm_area_struct *vma)
|
||||
{
|
||||
return vma->vm_ops == &uvm_vm_ops_disabled ||
|
||||
vma->vm_ops == &uvm_vm_ops_managed ||
|
||||
vma->vm_ops == &uvm_vm_ops_semaphore_pool;
|
||||
}
|
||||
|
||||
static int uvm_mmap_entry(struct file *filp, struct vm_area_struct *vma)
|
||||
{
|
||||
UVM_ENTRY_RET(uvm_mmap(filp, vma));
|
||||
@@ -804,6 +941,9 @@ static NV_STATUS uvm_api_initialize(UVM_INITIALIZE_PARAMS *params, struct file *
|
||||
else
|
||||
status = NV_OK;
|
||||
}
|
||||
else if (old_fd_type == UVM_FD_MM) {
|
||||
status = NV_ERR_INVALID_ARGUMENT;
|
||||
}
|
||||
else {
|
||||
UVM_ASSERT(old_fd_type == UVM_FD_INITIALIZING);
|
||||
status = NV_ERR_BUSY_RETRY;
|
||||
@@ -827,6 +967,7 @@ static long uvm_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
|
||||
return 0;
|
||||
|
||||
UVM_ROUTE_CMD_STACK_NO_INIT_CHECK(UVM_INITIALIZE, uvm_api_initialize);
|
||||
UVM_ROUTE_CMD_STACK_NO_INIT_CHECK(UVM_MM_INITIALIZE, uvm_api_mm_initialize);
|
||||
|
||||
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_PAGEABLE_MEM_ACCESS, uvm_api_pageable_mem_access);
|
||||
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_PAGEABLE_MEM_ACCESS_ON_GPU, uvm_api_pageable_mem_access_on_gpu);
|
||||
|
||||
@@ -54,7 +54,7 @@
|
||||
#ifndef _UVM_H_
|
||||
#define _UVM_H_
|
||||
|
||||
#define UVM_API_LATEST_REVISION 7
|
||||
#define UVM_API_LATEST_REVISION 8
|
||||
|
||||
#if !defined(UVM_API_REVISION)
|
||||
#error "please define UVM_API_REVISION macro to a desired version number or UVM_API_LATEST_REVISION macro"
|
||||
@@ -410,6 +410,12 @@ NV_STATUS UvmRegisterGpuSmc(const NvProcessorUuid *gpuUuid,
|
||||
// location will have their range group association changed to
|
||||
// UVM_RANGE_GROUP_ID_NONE.
|
||||
//
|
||||
// If the Confidential Computing feature is enabled in the system, any VA
|
||||
// ranges allocated using UvmAllocSemaphorePool and owned by this GPU will be
|
||||
// unmapped from all GPUs and the CPU. UvmFree must still be called on those
|
||||
// ranges to reclaim the VA. See UvmAllocSemaphorePool to determine which GPU
|
||||
// is considered the owner.
|
||||
//
|
||||
// Arguments:
|
||||
// gpuUuid: (INPUT)
|
||||
// UUID of the GPU to unregister.
|
||||
@@ -1094,10 +1100,12 @@ NV_STATUS UvmAllowMigrationRangeGroups(const NvU64 *rangeGroupIds,
|
||||
// Creates a new mapping in the virtual address space of the process, populates
|
||||
// it at the specified preferred location, maps it on the provided list of
|
||||
// processors if feasible and associates the range with the given range group.
|
||||
// If the preferredLocationUuid is the UUID of the CPU, preferred location is
|
||||
// set to all CPU nodes allowed by the global and thread memory policies.
|
||||
//
|
||||
// This API is equivalent to the following code sequence:
|
||||
// UvmMemMap(base, length);
|
||||
// UvmSetPreferredLocation(base, length, preferredLocationUuid);
|
||||
// UvmSetPreferredLocation(base, length, preferredLocationUuid, -1);
|
||||
// for (i = 0; i < accessedByCount; i++) {
|
||||
// UvmSetAccessedBy(base, length, &accessedByUuids[i]);
|
||||
// }
|
||||
@@ -1262,6 +1270,12 @@ NV_STATUS UvmCleanUpZombieResources(void);
|
||||
//
|
||||
// The VA range can be unmapped and freed via a call to UvmFree.
|
||||
//
|
||||
// If the Confidential Computing feature is enabled in the system, at least one
|
||||
// GPU must be provided in the perGpuAttribs array. The first GPU in the array
|
||||
// is considered the owning GPU. If the owning GPU is unregistered via
|
||||
// UvmUnregisterGpu, this allocation will no longer be usable.
|
||||
// See UvmUnregisterGpu.
|
||||
//
|
||||
// Arguments:
|
||||
// base: (INPUT)
|
||||
// Base address of the virtual address range.
|
||||
@@ -1298,6 +1312,8 @@ NV_STATUS UvmCleanUpZombieResources(void);
|
||||
// NV_ERR_INVALID_ARGUMENT:
|
||||
// perGpuAttribs is NULL but gpuAttribsCount is non-zero or vice-versa,
|
||||
// or caching is requested on more than one GPU.
|
||||
// The Confidential Computing feature is enabled and the perGpuAttribs
|
||||
// list is empty.
|
||||
//
|
||||
// NV_ERR_NOT_SUPPORTED:
|
||||
// The current process is not the one which called UvmInitialize, and
|
||||
@@ -1444,7 +1460,7 @@ NV_STATUS UvmMigrate(void *base,
|
||||
NV_STATUS UvmMigrate(void *base,
|
||||
NvLength length,
|
||||
const NvProcessorUuid *destinationUuid,
|
||||
NvU32 preferredCpuMemoryNode);
|
||||
NvS32 preferredCpuMemoryNode);
|
||||
#endif
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
@@ -1537,7 +1553,7 @@ NV_STATUS UvmMigrateAsync(void *base,
|
||||
NV_STATUS UvmMigrateAsync(void *base,
|
||||
NvLength length,
|
||||
const NvProcessorUuid *destinationUuid,
|
||||
NvU32 preferredCpuMemoryNode,
|
||||
NvS32 preferredCpuMemoryNode,
|
||||
void *semaphoreAddress,
|
||||
NvU32 semaphorePayload);
|
||||
#endif
|
||||
@@ -2217,11 +2233,10 @@ NV_STATUS UvmDisableReadDuplication(void *base,
|
||||
// supported by the specified processor.
|
||||
//
|
||||
// The virtual address range specified by (base, length) must have been
|
||||
// allocated via a call to either UvmAlloc or UvmMemMap, or be supported
|
||||
// system-allocated pageable memory. If the input range is pageable memory and
|
||||
// at least one GPU in the system supports transparent access to pageable
|
||||
// memory, the behavior described below does not take effect and the preferred
|
||||
// location of the pages in the given range does not change.
|
||||
// allocated via a call to either UvmAlloc or UvmMemMap (managed memory), or be
|
||||
// supported system-allocated pageable memory. If the input range corresponds to
|
||||
// a file backed shared mapping and at least one GPU in the system supports
|
||||
// transparent access to pageable memory, the behavior below is not guaranteed.
|
||||
//
|
||||
// If any pages in the VA range are associated with a range group that was made
|
||||
// non-migratable via UvmPreventMigrationRangeGroups, then those pages are
|
||||
@@ -2240,17 +2255,17 @@ NV_STATUS UvmDisableReadDuplication(void *base,
|
||||
// not cause a migration if a mapping for that page from that processor can be
|
||||
// established without migrating the page.
|
||||
//
|
||||
// When a page migrates away from its preferred location, the mapping on the
|
||||
// preferred location's processor is cleared so that the next access from that
|
||||
// processor will cause a fault and migrate the page back to its preferred
|
||||
// location. In other words, a page is mapped on the preferred location's
|
||||
// processor only if the page is in its preferred location. Thus, when the
|
||||
// preferred location changes, mappings to pages in the given range are removed
|
||||
// from the new preferred location if the pages are resident in a different
|
||||
// processor. Note that if the preferred location's processor is a GPU, then a
|
||||
// mapping from that GPU to a page in the VA range is only created if a GPU VA
|
||||
// space has been registered for that GPU and the page is in its preferred
|
||||
// location.
|
||||
// When a page that was allocated via either UvmAlloc or UvmMemMap migrates away
|
||||
// from its preferred location, the mapping on the preferred location's
|
||||
// processor is cleared so that the next access from that processor will cause a
|
||||
// fault and migrate the page back to its preferred location. In other words, a
|
||||
// page is mapped on the preferred location's processor only if the page is in
|
||||
// its preferred location. Thus, when the preferred location changes, mappings
|
||||
// to pages in the given range are removed from the new preferred location if
|
||||
// the pages are resident in a different processor. Note that if the preferred
|
||||
// location's processor is a GPU, then a mapping from that GPU to a page in the
|
||||
// VA range is only created if a GPU VA space has been registered for that GPU
|
||||
// and the page is in its preferred location.
|
||||
//
|
||||
// If read duplication has been enabled for any pages in this VA range and
|
||||
// UvmPreventMigrationRangeGroups has not been called on the range group that
|
||||
@@ -2263,7 +2278,7 @@ NV_STATUS UvmDisableReadDuplication(void *base,
|
||||
//
|
||||
// If the preferred location processor is present in the accessed-by list of any
|
||||
// of the pages in this VA range, then the migration and mapping policies
|
||||
// associated with associated with the accessed-by list.
|
||||
// associated with this API override those associated with the accessed-by list.
|
||||
//
|
||||
// The state set by this API can be cleared either by calling
|
||||
// UvmUnsetPreferredLocation for the same VA range or by calling
|
||||
@@ -2284,35 +2299,66 @@ NV_STATUS UvmDisableReadDuplication(void *base,
|
||||
// preferredLocationUuid: (INPUT)
|
||||
// UUID of the preferred location.
|
||||
//
|
||||
// preferredCpuNumaNode: (INPUT)
|
||||
// Preferred CPU NUMA memory node used if preferredLocationUuid is the
|
||||
// UUID of the CPU. -1 is a special value which indicates all CPU nodes
|
||||
// allowed by the global and thread memory policies. This argument is
|
||||
// ignored if preferredLocationUuid refers to a GPU or the given virtual
|
||||
// address range corresponds to managed memory. If NUMA is not enabled,
|
||||
// only 0 or -1 is allowed.
|
||||
//
|
||||
// Errors:
|
||||
// NV_ERR_INVALID_ADDRESS:
|
||||
// base and length are not properly aligned, or the range does not
|
||||
// represent a valid UVM allocation, or the range is pageable memory and
|
||||
// the system does not support accessing pageable memory, or the range
|
||||
// does not represent a supported Operating System allocation.
|
||||
// One of the following occurred:
|
||||
// - base and length are not properly aligned.
|
||||
// - The range does not represent a valid UVM allocation.
|
||||
// - The range is pageable memory and the system does not support
|
||||
// accessing pageable memory.
|
||||
// - The range does not represent a supported Operating System
|
||||
// allocation.
|
||||
//
|
||||
// NV_ERR_OUT_OF_RANGE:
|
||||
// The VA range exceeds the largest virtual address supported by the
|
||||
// specified processor.
|
||||
//
|
||||
// NV_ERR_INVALID_DEVICE:
|
||||
// preferredLocationUuid is neither the UUID of the CPU nor the UUID of
|
||||
// a GPU that was registered by this process. Or at least one page in
|
||||
// VA range belongs to a non-migratable range group and the specified
|
||||
// UUID represents a fault-capable GPU. Or preferredLocationUuid is the
|
||||
// UUID of a non-fault-capable GPU and at least one page in the VA range
|
||||
// belongs to a non-migratable range group and another non-fault-capable
|
||||
// GPU is in the accessed-by list of the same page but P2P support
|
||||
// between both GPUs has not been enabled.
|
||||
// One of the following occurred:
|
||||
// - preferredLocationUuid is neither the UUID of the CPU nor the UUID
|
||||
// of a GPU that was registered by this process.
|
||||
// - At least one page in VA range belongs to a non-migratable range
|
||||
// group and the specified UUID represents a fault-capable GPU.
|
||||
// - preferredLocationUuid is the UUID of a non-fault-capable GPU and at
|
||||
// least one page in the VA range belongs to a non-migratable range
|
||||
// group and another non-fault-capable GPU is in the accessed-by list
|
||||
// of the same page but P2P support between both GPUs has not been
|
||||
// enabled.
|
||||
//
|
||||
// NV_ERR_INVALID_ARGUMENT:
|
||||
//     One of the following occurred:
|
||||
// - preferredLocationUuid is the UUID of a CPU and preferredCpuNumaNode
|
||||
// refers to a registered GPU.
|
||||
// - preferredCpuNumaNode is invalid and preferredLocationUuid is the
|
||||
// UUID of the CPU.
|
||||
//
|
||||
// NV_ERR_NOT_SUPPORTED:
|
||||
// The UVM file descriptor is associated with another process and the
|
||||
// input virtual range corresponds to system-allocated pageable memory.
|
||||
//
|
||||
// NV_ERR_GENERIC:
|
||||
// Unexpected error. We try hard to avoid returning this error code,
|
||||
// because it is not very informative.
|
||||
//
|
||||
//------------------------------------------------------------------------------
|
||||
#if UVM_API_REV_IS_AT_MOST(7)
|
||||
NV_STATUS UvmSetPreferredLocation(void *base,
|
||||
NvLength length,
|
||||
const NvProcessorUuid *preferredLocationUuid);
|
||||
#else
|
||||
NV_STATUS UvmSetPreferredLocation(void *base,
|
||||
NvLength length,
|
||||
const NvProcessorUuid *preferredLocationUuid,
|
||||
NvS32 preferredCpuNumaNode);
|
||||
#endif
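A hedged usage sketch of the revision-dependent prototypes above; UUID lookup and error handling are application-specific and omitted.

/* Prefer CPU residency for [base, base + length), letting the kernel pick any
 * allowed NUMA node (-1), per the preferredCpuNumaNode description above. */
static NV_STATUS prefer_cpu_any_node(void *base, NvLength length,
                                     const NvProcessorUuid *cpuUuid)
{
#if UVM_API_REV_IS_AT_MOST(7)
    return UvmSetPreferredLocation(base, length, cpuUuid);
#else
    return UvmSetPreferredLocation(base, length, cpuUuid, -1);
#endif
}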
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// UvmUnsetPreferredLocation
|
||||
@@ -2326,10 +2372,9 @@ NV_STATUS UvmSetPreferredLocation(void *base,
|
||||
//
|
||||
// The virtual address range specified by (base, length) must have been
|
||||
// allocated via a call to either UvmAlloc or UvmMemMap, or be supported
|
||||
// system-allocated pageable memory. If the input range is pageable memory and
|
||||
// at least one GPU in the system supports transparent access to pageable
|
||||
// memory, the behavior described below does not take effect and the preferred
|
||||
// location of the pages in the given range does not change.
|
||||
// system-allocated pageable memory. If the input range corresponds to a file
|
||||
// backed shared mapping and at least one GPU in the system supports transparent
|
||||
// access to pageable memory, the behavior below is not guaranteed.
|
||||
//
|
||||
// If the VA range is associated with a non-migratable range group, then that
|
||||
// association is cleared. i.e. the pages in this VA range have their range
|
||||
@@ -2348,10 +2393,18 @@ NV_STATUS UvmSetPreferredLocation(void *base,
|
||||
//
|
||||
// Errors:
|
||||
// NV_ERR_INVALID_ADDRESS:
|
||||
// base and length are not properly aligned or the range does not
|
||||
// represent a valid UVM allocation, or the range is pageable memory and
|
||||
// the system does not support accessing pageable memory, or the range
|
||||
// does not represent a supported Operating System allocation.
|
||||
//     One of the following occurred:
|
||||
// - base and length are not properly aligned or the range does not
|
||||
// represent a valid UVM allocation.
|
||||
// - The range is pageable memory and the system does not support
|
||||
// accessing pageable memory.
|
||||
// - The range does not represent a supported Operating System
|
||||
// allocation.
|
||||
// - The range contains both managed and pageable memory allocations.
|
||||
//
|
||||
// NV_ERR_NOT_SUPPORTED:
|
||||
// The UVM file descriptor is associated with another process and the
|
||||
// input virtual range corresponds to system-allocated pageable memory.
|
||||
//
|
||||
// NV_ERR_GENERIC:
|
||||
// Unexpected error. We try hard to avoid returning this error code,
|
||||
@@ -2632,13 +2685,34 @@ NV_STATUS UvmDisableSystemWideAtomics(const NvProcessorUuid *gpuUuid);
|
||||
// NV_ERR_INVALID_STATE:
|
||||
// UVM was not initialized before calling this function.
|
||||
//
|
||||
// NV_ERR_GENERIC:
|
||||
// Unexpected error. We try hard to avoid returning this error code,
|
||||
// because it is not very informative.
|
||||
//
|
||||
//------------------------------------------------------------------------------
|
||||
NV_STATUS UvmGetFileDescriptor(UvmFileDescriptor *returnedFd);
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// UvmGetMmFileDescriptor
|
||||
//
|
||||
// Returns the UVM file descriptor currently being used to keep the
|
||||
// memory management context valid. The data type of the returned file
|
||||
// descriptor is platform specific.
|
||||
//
|
||||
// If UvmInitialize has not yet been called, an error is returned.
|
||||
//
|
||||
// Arguments:
|
||||
// returnedFd: (OUTPUT)
|
||||
// A platform specific file descriptor.
|
||||
//
|
||||
// Error codes:
|
||||
// NV_ERR_INVALID_ARGUMENT:
|
||||
// returnedFd is NULL.
|
||||
//
|
||||
// NV_ERR_INVALID_STATE:
|
||||
// UVM was not initialized before calling this function.
|
||||
//
|
||||
// NV_ERR_NOT_SUPPORTED:
|
||||
// This file descriptor is not required on this platform.
|
||||
//------------------------------------------------------------------------------
|
||||
NV_STATUS UvmGetMmFileDescriptor(UvmFileDescriptor *returnedFd);
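A short, hedged example of calling UvmGetMmFileDescriptor while tolerating platforms where the descriptor is not required, per the NV_ERR_NOT_SUPPORTED case above.

static NV_STATUS query_mm_fd(UvmFileDescriptor *fd_out, int *needed_out)
{
    NV_STATUS status = UvmGetMmFileDescriptor(fd_out);

    if (status == NV_ERR_NOT_SUPPORTED) {
        *needed_out = 0;             /* nothing to keep open on this platform */
        return NV_OK;
    }

    *needed_out = (status == NV_OK);
    return status;
}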
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// UvmIs8Supported
|
||||
//
|
||||
@@ -3761,7 +3835,6 @@ NV_STATUS UvmToolsDisableCounters(UvmToolsCountersHandle counters,
|
||||
// NV_ERR_INVALID_ARGUMENT:
|
||||
// Read spans more than a single target process allocation.
|
||||
//
|
||||
//
|
||||
//------------------------------------------------------------------------------
|
||||
NV_STATUS UvmToolsReadProcessMemory(UvmToolsSessionHandle session,
|
||||
void *buffer,
|
||||
|
||||
@@ -49,11 +49,13 @@ void uvm_hal_ada_arch_init_properties(uvm_parent_gpu_t *parent_gpu)
|
||||
// A single top level PDE on Ada covers 128 TB and that's the minimum size
|
||||
// that can be used.
|
||||
parent_gpu->rm_va_base = 0;
|
||||
parent_gpu->rm_va_size = 128ull * 1024 * 1024 * 1024 * 1024;
|
||||
parent_gpu->rm_va_size = 128 * UVM_SIZE_1TB;
|
||||
|
||||
parent_gpu->uvm_mem_va_base = 384ull * 1024 * 1024 * 1024 * 1024;
|
||||
parent_gpu->uvm_mem_va_base = 384 * UVM_SIZE_1TB;
|
||||
parent_gpu->uvm_mem_va_size = UVM_MEM_VA_SIZE;
|
||||
|
||||
parent_gpu->ce_phys_vidmem_write_supported = true;
|
||||
|
||||
parent_gpu->peer_copy_mode = g_uvm_global.peer_copy_mode;
|
||||
|
||||
// Not all units on Ada support 49-bit addressing, including those which
|
||||
|
||||
@@ -47,14 +47,16 @@ void uvm_hal_ampere_arch_init_properties(uvm_parent_gpu_t *parent_gpu)
|
||||
// A single top level PDE on Ampere covers 128 TB and that's the minimum
|
||||
// size that can be used.
|
||||
parent_gpu->rm_va_base = 0;
|
||||
parent_gpu->rm_va_size = 128ull * 1024 * 1024 * 1024 * 1024;
|
||||
parent_gpu->rm_va_size = 128 * UVM_SIZE_1TB;
|
||||
|
||||
parent_gpu->uvm_mem_va_base = 384ull * 1024 * 1024 * 1024 * 1024;
|
||||
parent_gpu->uvm_mem_va_base = 384 * UVM_SIZE_1TB;
|
||||
parent_gpu->uvm_mem_va_size = UVM_MEM_VA_SIZE;
|
||||
|
||||
// See uvm_mmu.h for mapping placement
|
||||
parent_gpu->flat_vidmem_va_base = 136ull * 1024 * 1024 * 1024 * 1024;
|
||||
parent_gpu->flat_sysmem_va_base = 256ull * 1024 * 1024 * 1024 * 1024;
|
||||
parent_gpu->flat_vidmem_va_base = 136 * UVM_SIZE_1TB;
|
||||
parent_gpu->flat_sysmem_va_base = 256 * UVM_SIZE_1TB;
|
||||
|
||||
parent_gpu->ce_phys_vidmem_write_supported = true;
|
||||
|
||||
parent_gpu->peer_copy_mode = g_uvm_global.peer_copy_mode;
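Both VA-layout hunks above replace hand-multiplied terabyte constants with UVM_SIZE_1TB. The definitions below are an assumption for illustration only; the real ones live elsewhere in the UVM headers.

#define UVM_SIZE_1KB (1024ull)
#define UVM_SIZE_1MB (1024ull * UVM_SIZE_1KB)
#define UVM_SIZE_1GB (1024ull * UVM_SIZE_1MB)
#define UVM_SIZE_1TB (1024ull * UVM_SIZE_1GB)

/* With these, 128 * UVM_SIZE_1TB == 128ull * 1024 * 1024 * 1024 * 1024, so the
 * rewrite is cosmetic and no base or size values change. */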
|
||||
|
||||
|
||||
@@ -183,7 +183,10 @@ void uvm_hal_ampere_ce_memcopy_patch_src_c6b5(uvm_push_t *push, uvm_gpu_address_
|
||||
src->address -= uvm_pushbuffer_get_gpu_va_for_push(push->channel->pool->manager->pushbuffer, push);
|
||||
}
|
||||
|
||||
bool uvm_hal_ampere_ce_memset_is_valid_c6b5(uvm_push_t *push, uvm_gpu_address_t dst, size_t element_size)
|
||||
bool uvm_hal_ampere_ce_memset_is_valid_c6b5(uvm_push_t *push,
|
||||
uvm_gpu_address_t dst,
|
||||
size_t num_elements,
|
||||
size_t element_size)
|
||||
{
|
||||
uvm_gpu_t *gpu = uvm_push_get_gpu(push);
|
||||
|
||||
|
||||
@@ -201,21 +201,20 @@ static bool uvm_api_range_invalid_64k(NvU64 base, NvU64 length)
|
||||
return uvm_api_range_invalid_aligned(base, length, UVM_PAGE_SIZE_64K);
|
||||
}
|
||||
|
||||
// Returns true if the interval [start, start + length -1] is entirely covered
|
||||
// by vmas.
|
||||
//
|
||||
// LOCKING: mm->mmap_lock must be held in at least read mode.
|
||||
bool uvm_is_valid_vma_range(struct mm_struct *mm, NvU64 start, NvU64 length);
|
||||
typedef enum
|
||||
{
|
||||
UVM_API_RANGE_TYPE_MANAGED,
|
||||
UVM_API_RANGE_TYPE_HMM,
|
||||
UVM_API_RANGE_TYPE_ATS,
|
||||
UVM_API_RANGE_TYPE_INVALID
|
||||
} uvm_api_range_type_t;
|
||||
|
||||
// Check that the interval [base, base + length) is fully covered by UVM
|
||||
// managed ranges (NV_OK is returned), or (if ATS is enabled and mm != NULL)
|
||||
// fully covered by valid vmas (NV_WARN_NOTHING_TO_DO is returned), or (if HMM
|
||||
// is enabled and mm != NULL) fully covered by valid vmas (NV_OK is returned).
|
||||
// Any other input results in a return status of NV_ERR_INVALID_ADDRESS.
|
||||
// If the interval [base, base + length) is fully covered by VMAs which all have
|
||||
// the same uvm_api_range_type_t, that range type is returned.
|
||||
//
|
||||
// LOCKING: va_space->lock must be held in at least read mode. If mm != NULL,
|
||||
// mm->mmap_lock must also be held in at least read mode.
|
||||
NV_STATUS uvm_api_range_type_check(uvm_va_space_t *va_space, struct mm_struct *mm, NvU64 base, NvU64 length);
|
||||
uvm_api_range_type_t uvm_api_range_type_check(uvm_va_space_t *va_space, struct mm_struct *mm, NvU64 base, NvU64 length);
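A hypothetical caller sketch (not from the source) showing how the new enum return might be consumed in place of the old NV_OK / NV_WARN_NOTHING_TO_DO / NV_ERR_INVALID_ADDRESS convention described above.

static NV_STATUS handle_range_example(uvm_va_space_t *va_space, struct mm_struct *mm,
                                      NvU64 base, NvU64 length)
{
    switch (uvm_api_range_type_check(va_space, mm, base, length)) {
        case UVM_API_RANGE_TYPE_MANAGED:
        case UVM_API_RANGE_TYPE_HMM:
            return NV_OK;                   /* serviced by UVM-managed paths */
        case UVM_API_RANGE_TYPE_ATS:
            return NV_WARN_NOTHING_TO_DO;   /* valid pageable range, ATS covers it */
        default:
            return NV_ERR_INVALID_ADDRESS;
    }
}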
|
||||
|
||||
NV_STATUS uvm_api_pageable_mem_access_on_gpu(UVM_PAGEABLE_MEM_ACCESS_ON_GPU_PARAMS *params, struct file *filp);
|
||||
NV_STATUS uvm_api_register_gpu(UVM_REGISTER_GPU_PARAMS *params, struct file *filp);
|
||||
|
||||
@@ -57,6 +57,10 @@ NV_STATUS uvm_ats_add_gpu(uvm_parent_gpu_t *parent_gpu)
|
||||
|
||||
return uvm_ats_ibm_add_gpu(parent_gpu);
|
||||
}
|
||||
else if (UVM_ATS_SVA_SUPPORTED()) {
|
||||
if (g_uvm_global.ats.enabled)
|
||||
return uvm_ats_sva_add_gpu(parent_gpu);
|
||||
}
|
||||
|
||||
return NV_OK;
|
||||
}
|
||||
@@ -71,6 +75,10 @@ void uvm_ats_remove_gpu(uvm_parent_gpu_t *parent_gpu)
|
||||
|
||||
uvm_ats_ibm_remove_gpu(parent_gpu);
|
||||
}
|
||||
else if (UVM_ATS_SVA_SUPPORTED()) {
|
||||
if (g_uvm_global.ats.enabled)
|
||||
uvm_ats_sva_remove_gpu(parent_gpu);
|
||||
}
|
||||
}
|
||||
|
||||
NV_STATUS uvm_ats_bind_gpu(uvm_gpu_va_space_t *gpu_va_space)
|
||||
@@ -87,6 +95,8 @@ NV_STATUS uvm_ats_bind_gpu(uvm_gpu_va_space_t *gpu_va_space)
|
||||
|
||||
if (UVM_ATS_IBM_SUPPORTED())
|
||||
status = uvm_ats_ibm_bind_gpu(gpu_va_space);
|
||||
else if (UVM_ATS_SVA_SUPPORTED())
|
||||
status = uvm_ats_sva_bind_gpu(gpu_va_space);
|
||||
|
||||
return status;
|
||||
}
|
||||
@@ -100,6 +110,8 @@ void uvm_ats_unbind_gpu(uvm_gpu_va_space_t *gpu_va_space)
|
||||
|
||||
if (UVM_ATS_IBM_SUPPORTED())
|
||||
uvm_ats_ibm_unbind_gpu(gpu_va_space);
|
||||
else if (UVM_ATS_SVA_SUPPORTED())
|
||||
uvm_ats_sva_unbind_gpu(gpu_va_space);
|
||||
}
|
||||
|
||||
NV_STATUS uvm_ats_register_gpu_va_space(uvm_gpu_va_space_t *gpu_va_space)
|
||||
@@ -126,6 +138,8 @@ NV_STATUS uvm_ats_register_gpu_va_space(uvm_gpu_va_space_t *gpu_va_space)
|
||||
|
||||
if (UVM_ATS_IBM_SUPPORTED())
|
||||
status = uvm_ats_ibm_register_gpu_va_space(gpu_va_space);
|
||||
else if (UVM_ATS_SVA_SUPPORTED())
|
||||
status = uvm_ats_sva_register_gpu_va_space(gpu_va_space);
|
||||
|
||||
if (status == NV_OK)
|
||||
uvm_processor_mask_set(&va_space->ats.registered_gpu_va_spaces, gpu_id);
|
||||
@@ -148,6 +162,8 @@ void uvm_ats_unregister_gpu_va_space(uvm_gpu_va_space_t *gpu_va_space)
|
||||
|
||||
if (UVM_ATS_IBM_SUPPORTED())
|
||||
uvm_ats_ibm_unregister_gpu_va_space(gpu_va_space);
|
||||
else if (UVM_ATS_SVA_SUPPORTED())
|
||||
uvm_ats_sva_unregister_gpu_va_space(gpu_va_space);
|
||||
|
||||
uvm_va_space_down_write(va_space);
|
||||
uvm_processor_mask_clear(&va_space->ats.registered_gpu_va_spaces, gpu_id);
|
||||
|
||||
@@ -29,7 +29,9 @@
|
||||
#include "uvm_ats_ibm.h"
|
||||
#include "nv_uvm_types.h"
|
||||
|
||||
#define UVM_ATS_SUPPORTED() (UVM_ATS_IBM_SUPPORTED())
|
||||
#include "uvm_ats_sva.h"
|
||||
|
||||
#define UVM_ATS_SUPPORTED() (UVM_ATS_IBM_SUPPORTED() || UVM_ATS_SVA_SUPPORTED())
|
||||
|
||||
typedef struct
|
||||
{
|
||||
@@ -41,6 +43,7 @@ typedef struct
|
||||
{
|
||||
uvm_ibm_va_space_t ibm;
|
||||
|
||||
uvm_sva_va_space_t sva;
|
||||
};
|
||||
} uvm_ats_va_space_t;
|
||||
|
||||
@@ -58,6 +61,7 @@ typedef struct
|
||||
{
|
||||
uvm_ibm_gpu_va_space_t ibm;
|
||||
|
||||
uvm_sva_gpu_va_space_t sva;
|
||||
};
|
||||
} uvm_ats_gpu_va_space_t;
|
||||
|
||||
@@ -90,6 +94,8 @@ void uvm_ats_remove_gpu(uvm_parent_gpu_t *parent_gpu);
|
||||
// LOCKING: mmap_lock must be lockable.
|
||||
// VA space lock must be lockable.
|
||||
// gpu_va_space->gpu must be retained.
|
||||
// mm must be retained with uvm_va_space_mm_retain() iff
|
||||
// UVM_ATS_SVA_SUPPORTED() is 1
|
||||
NV_STATUS uvm_ats_bind_gpu(uvm_gpu_va_space_t *gpu_va_space);
|
||||
|
||||
// Decrements the refcount on the {gpu, mm} pair. Removes the binding from the
|
||||
|
||||
@@ -41,18 +41,18 @@ MODULE_PARM_DESC(uvm_exp_perf_prefetch_ats_order_non_replayable,
|
||||
// the module parameters, clamped to the vma containing fault_addr (if any).
|
||||
// Note that this means the region contains fault_addr but may not begin at
|
||||
// fault_addr.
|
||||
static void expand_fault_region(struct mm_struct *mm,
|
||||
NvU64 fault_addr,
|
||||
static void expand_fault_region(struct vm_area_struct *vma,
|
||||
NvU64 start,
|
||||
size_t length,
|
||||
uvm_fault_client_type_t client_type,
|
||||
unsigned long *start,
|
||||
unsigned long *size)
|
||||
unsigned long *migrate_start,
|
||||
unsigned long *migrate_length)
|
||||
{
|
||||
struct vm_area_struct *vma;
|
||||
unsigned int order;
|
||||
unsigned long outer, aligned_start, aligned_size;
|
||||
|
||||
*start = fault_addr;
|
||||
*size = PAGE_SIZE;
|
||||
*migrate_start = start;
|
||||
*migrate_length = length;
|
||||
|
||||
if (client_type == UVM_FAULT_CLIENT_TYPE_HUB)
|
||||
order = uvm_exp_perf_prefetch_ats_order_non_replayable;
|
||||
@@ -62,32 +62,31 @@ static void expand_fault_region(struct mm_struct *mm,
|
||||
if (order == 0)
|
||||
return;
|
||||
|
||||
vma = find_vma_intersection(mm, fault_addr, fault_addr + 1);
|
||||
if (!vma)
|
||||
return;
|
||||
|
||||
UVM_ASSERT(vma);
|
||||
UVM_ASSERT(order < BITS_PER_LONG - PAGE_SHIFT);
|
||||
|
||||
aligned_size = (1UL << order) * PAGE_SIZE;
|
||||
|
||||
aligned_start = fault_addr & ~(aligned_size - 1);
|
||||
aligned_start = start & ~(aligned_size - 1);
|
||||
|
||||
*start = max(vma->vm_start, aligned_start);
|
||||
*migrate_start = max(vma->vm_start, aligned_start);
|
||||
outer = min(vma->vm_end, aligned_start + aligned_size);
|
||||
*size = outer - *start;
|
||||
*migrate_length = outer - *migrate_start;
|
||||
}
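A stand-alone sketch of the clamp performed by expand_fault_region(), with the vma bounds passed explicitly since this is not kernel code; PAGE_SIZE is assumed to be 4 KiB for the example.

#include <stdint.h>

static void expand_region_sketch(uint64_t fault_addr, unsigned order,
                                 uint64_t vm_start, uint64_t vm_end,
                                 uint64_t *out_start, uint64_t *out_len)
{
    const uint64_t page_size     = 4096;                          /* assumed */
    const uint64_t aligned_size  = ((uint64_t)1 << order) * page_size;
    const uint64_t aligned_start = fault_addr & ~(aligned_size - 1);
    const uint64_t start = aligned_start > vm_start ? aligned_start : vm_start;
    const uint64_t limit = aligned_start + aligned_size;
    const uint64_t outer = limit < vm_end ? limit : vm_end;

    /* E.g. order 4: a fault at 0x12345 expands to the surrounding 64 KiB
     * window [0x10000, 0x20000), clipped to the vma. */
    *out_start = start;
    *out_len   = outer - start;
}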
|
||||
|
||||
static NV_STATUS uvm_ats_service_fault(uvm_gpu_va_space_t *gpu_va_space,
|
||||
NvU64 fault_addr,
|
||||
uvm_fault_access_type_t access_type,
|
||||
uvm_fault_client_type_t client_type)
|
||||
static NV_STATUS service_ats_faults(uvm_gpu_va_space_t *gpu_va_space,
|
||||
struct vm_area_struct *vma,
|
||||
NvU64 start,
|
||||
size_t length,
|
||||
uvm_fault_access_type_t access_type,
|
||||
uvm_fault_client_type_t client_type)
|
||||
{
|
||||
uvm_va_space_t *va_space = gpu_va_space->va_space;
|
||||
struct mm_struct *mm = va_space->va_space_mm.mm;
|
||||
bool write = (access_type >= UVM_FAULT_ACCESS_TYPE_WRITE);
|
||||
NV_STATUS status;
|
||||
NvU64 start;
|
||||
NvU64 length;
|
||||
NvU64 user_space_start;
|
||||
NvU64 user_space_length;
|
||||
|
||||
// Request uvm_migrate_pageable() to touch the corresponding page after
|
||||
// population.
|
||||
@@ -124,16 +123,14 @@ static NV_STATUS uvm_ats_service_fault(uvm_gpu_va_space_t *gpu_va_space,
|
||||
.populate_permissions = write ? UVM_POPULATE_PERMISSIONS_WRITE : UVM_POPULATE_PERMISSIONS_ANY,
|
||||
.touch = true,
|
||||
.skip_mapped = true,
|
||||
.user_space_start = &start,
|
||||
.user_space_length = &length,
|
||||
.user_space_start = &user_space_start,
|
||||
.user_space_length = &user_space_length,
|
||||
};
|
||||
|
||||
UVM_ASSERT(uvm_ats_can_service_faults(gpu_va_space, mm));
|
||||
|
||||
expand_fault_region(mm, fault_addr, client_type, &uvm_migrate_args.start, &uvm_migrate_args.length);
|
||||
expand_fault_region(vma, start, length, client_type, &uvm_migrate_args.start, &uvm_migrate_args.length);
|
||||
|
||||
// TODO: Bug 2103669: Service more than a single fault at a time
|
||||
//
|
||||
// We are trying to use migrate_vma API in the kernel (if it exists) to
|
||||
// populate and map the faulting region on the GPU. We want to do this only
|
||||
// on the first touch. That is, pages which are not already mapped. So, we
|
||||
@@ -148,114 +145,141 @@ static NV_STATUS uvm_ats_service_fault(uvm_gpu_va_space_t *gpu_va_space,
|
||||
return status;
|
||||
}
|
||||
|
||||
NV_STATUS uvm_ats_service_fault_entry(uvm_gpu_va_space_t *gpu_va_space,
|
||||
uvm_fault_buffer_entry_t *current_entry,
|
||||
uvm_ats_fault_invalidate_t *ats_invalidate)
|
||||
static void flush_tlb_write_faults(uvm_gpu_va_space_t *gpu_va_space,
|
||||
NvU64 addr,
|
||||
size_t size,
|
||||
uvm_fault_client_type_t client_type)
|
||||
{
|
||||
NvU64 gmmu_region_base;
|
||||
bool in_gmmu_region;
|
||||
NV_STATUS status = NV_OK;
|
||||
uvm_fault_access_type_t service_access_type;
|
||||
uvm_ats_fault_invalidate_t *ats_invalidate;
|
||||
|
||||
if (client_type == UVM_FAULT_CLIENT_TYPE_GPC)
|
||||
ats_invalidate = &gpu_va_space->gpu->parent->fault_buffer_info.replayable.ats_invalidate;
|
||||
else
|
||||
ats_invalidate = &gpu_va_space->gpu->parent->fault_buffer_info.non_replayable.ats_invalidate;
|
||||
|
||||
if (!ats_invalidate->write_faults_in_batch) {
|
||||
uvm_tlb_batch_begin(&gpu_va_space->page_tables, &ats_invalidate->write_faults_tlb_batch);
|
||||
ats_invalidate->write_faults_in_batch = true;
|
||||
}
|
||||
|
||||
uvm_tlb_batch_invalidate(&ats_invalidate->write_faults_tlb_batch, addr, size, PAGE_SIZE, UVM_MEMBAR_NONE);
|
||||
}
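// Hypothetical usage sketch (not part of this change): the write-fault TLB
// invalidations accumulated above are issued once per fault batch, e.g.:
//
//     uvm_tracker_t tracker;
//
//     uvm_tracker_init(&tracker);
//
//     // For each serviced ATS write-fault subregion:
//     flush_tlb_write_faults(gpu_va_space, start, length, client_type);
//
//     // Once the whole batch has been serviced:
//     status = uvm_ats_invalidate_tlbs(gpu_va_space, ats_invalidate, &tracker);
//     uvm_tracker_deinit(&tracker);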
|
||||
|
||||
NV_STATUS uvm_ats_service_faults(uvm_gpu_va_space_t *gpu_va_space,
|
||||
struct vm_area_struct *vma,
|
||||
NvU64 base,
|
||||
uvm_ats_fault_context_t *ats_context)
|
||||
{
|
||||
NV_STATUS status = NV_OK;
|
||||
uvm_va_block_region_t subregion;
|
||||
uvm_va_block_region_t region = uvm_va_block_region(0, PAGES_PER_UVM_VA_BLOCK);
|
||||
uvm_page_mask_t *read_fault_mask = &ats_context->read_fault_mask;
|
||||
uvm_page_mask_t *write_fault_mask = &ats_context->write_fault_mask;
|
||||
uvm_page_mask_t *faults_serviced_mask = &ats_context->faults_serviced_mask;
|
||||
uvm_page_mask_t *reads_serviced_mask = &ats_context->reads_serviced_mask;
|
||||
uvm_fault_client_type_t client_type = ats_context->client_type;
|
||||
|
||||
UVM_ASSERT(vma);
|
||||
UVM_ASSERT(IS_ALIGNED(base, UVM_VA_BLOCK_SIZE));
|
||||
UVM_ASSERT(g_uvm_global.ats.enabled);
|
||||
UVM_ASSERT(gpu_va_space);
|
||||
UVM_ASSERT(gpu_va_space->ats.enabled);
|
||||
UVM_ASSERT(uvm_gpu_va_space_state(gpu_va_space) == UVM_GPU_VA_SPACE_STATE_ACTIVE);
|
||||
|
||||
UVM_ASSERT(current_entry->fault_access_type ==
|
||||
uvm_fault_access_type_mask_highest(current_entry->access_type_mask));
|
||||
uvm_page_mask_zero(faults_serviced_mask);
|
||||
uvm_page_mask_zero(reads_serviced_mask);
|
||||
|
||||
service_access_type = current_entry->fault_access_type;
|
||||
if (!(vma->vm_flags & VM_READ))
|
||||
return status;
|
||||
|
||||
// ATS lookups are disabled on all addresses within the same
|
||||
// UVM_GMMU_ATS_GRANULARITY as existing GMMU mappings (see documentation in
|
||||
// uvm_mmu.h). User mode is supposed to reserve VAs as appropriate to
|
||||
// prevent any system memory allocations from falling within the NO_ATS
|
||||
// range of other GMMU mappings, so this shouldn't happen during normal
|
||||
// operation. However, since this scenario may lead to infinite fault loops,
|
||||
// we handle it by canceling the fault.
|
||||
//
|
||||
// TODO: Bug 2103669: Remove redundant VA range lookups
|
||||
gmmu_region_base = UVM_ALIGN_DOWN(current_entry->fault_address, UVM_GMMU_ATS_GRANULARITY);
|
||||
in_gmmu_region = !uvm_va_space_range_empty(current_entry->va_space,
|
||||
gmmu_region_base,
|
||||
gmmu_region_base + UVM_GMMU_ATS_GRANULARITY - 1);
|
||||
if (in_gmmu_region) {
|
||||
status = NV_ERR_INVALID_ADDRESS;
|
||||
}
|
||||
else {
|
||||
// TODO: Bug 2103669: Service more than a single fault at a time
|
||||
status = uvm_ats_service_fault(gpu_va_space,
|
||||
current_entry->fault_address,
|
||||
service_access_type,
|
||||
current_entry->fault_source.client_type);
|
||||
if (!(vma->vm_flags & VM_WRITE)) {
|
||||
// If VMA doesn't have write permissions, all write faults are fatal.
|
||||
// Try servicing such faults for read iff they are also present in
|
||||
// read_fault_mask. This is because for replayable faults, if there are
|
||||
// pending read accesses on the same page, we have to service them
|
||||
// before we can cancel the write/atomic faults. So we try with read
|
||||
// fault access type even though these write faults are fatal.
|
||||
if (ats_context->client_type == UVM_FAULT_CLIENT_TYPE_GPC)
|
||||
uvm_page_mask_and(write_fault_mask, write_fault_mask, read_fault_mask);
|
||||
else
|
||||
uvm_page_mask_zero(write_fault_mask);
|
||||
}
|
||||
|
||||
// Do not flag prefetch faults as fatal unless something fatal happened
|
||||
if (status == NV_ERR_INVALID_ADDRESS) {
|
||||
if (current_entry->fault_access_type != UVM_FAULT_ACCESS_TYPE_PREFETCH) {
|
||||
current_entry->is_fatal = true;
|
||||
current_entry->fatal_reason = uvm_tools_status_to_fatal_fault_reason(status);
|
||||
for_each_va_block_subregion_in_mask(subregion, write_fault_mask, region) {
|
||||
NvU64 start = base + (subregion.first * PAGE_SIZE);
|
||||
size_t length = uvm_va_block_region_num_pages(subregion) * PAGE_SIZE;
|
||||
uvm_fault_access_type_t access_type = (vma->vm_flags & VM_WRITE) ?
|
||||
UVM_FAULT_ACCESS_TYPE_WRITE :
|
||||
UVM_FAULT_ACCESS_TYPE_READ;
|
||||
|
||||
// Compute cancel mode for replayable faults
|
||||
if (current_entry->is_replayable) {
|
||||
if (service_access_type == UVM_FAULT_ACCESS_TYPE_READ || in_gmmu_region)
|
||||
current_entry->replayable.cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_ALL;
|
||||
else
|
||||
current_entry->replayable.cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_WRITE_AND_ATOMIC;
|
||||
UVM_ASSERT(start >= vma->vm_start);
|
||||
UVM_ASSERT((start + length) <= vma->vm_end);
|
||||
|
||||
// If there are pending read accesses on the same page, we have to
|
||||
// service them before we can cancel the write/atomic faults. So we
|
||||
// retry with read fault access type.
|
||||
if (!in_gmmu_region &&
|
||||
current_entry->fault_access_type > UVM_FAULT_ACCESS_TYPE_READ &&
|
||||
uvm_fault_access_type_mask_test(current_entry->access_type_mask, UVM_FAULT_ACCESS_TYPE_READ)) {
|
||||
status = uvm_ats_service_fault(gpu_va_space,
|
||||
current_entry->fault_address,
|
||||
UVM_FAULT_ACCESS_TYPE_READ,
|
||||
current_entry->fault_source.client_type);
|
||||
status = service_ats_faults(gpu_va_space, vma, start, length, access_type, client_type);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
// If read accesses are also invalid, cancel the fault. If a
|
||||
// different error code is returned, exit
|
||||
if (status == NV_ERR_INVALID_ADDRESS)
|
||||
current_entry->replayable.cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_ALL;
|
||||
else if (status != NV_OK)
|
||||
return status;
|
||||
}
|
||||
}
|
||||
if (vma->vm_flags & VM_WRITE) {
|
||||
uvm_page_mask_region_fill(faults_serviced_mask, subregion);
|
||||
|
||||
// The Linux kernel never invalidates TLB entries on mapping
|
||||
// permission upgrade. This is a problem if the GPU has cached
|
||||
// entries with the old permission. The GPU will re-fetch the entry
|
||||
// if the PTE is invalid and page size is not 4K (this is the case
|
||||
// on P9). However, if a page gets upgraded from R/O to R/W and GPU
|
||||
// has the PTEs cached with R/O permissions we will enter an
|
||||
// infinite loop because we just forward the fault to the Linux
|
||||
// kernel and it will see that the permissions in the page table are
|
||||
// correct. Therefore, we flush TLB entries on ATS write faults.
|
||||
flush_tlb_write_faults(gpu_va_space, start, length, client_type);
|
||||
}
|
||||
else {
|
||||
current_entry->is_invalid_prefetch = true;
|
||||
uvm_page_mask_region_fill(reads_serviced_mask, subregion);
|
||||
}
|
||||
|
||||
// Do not fail overall fault servicing due to logical errors
|
||||
status = NV_OK;
|
||||
}
|
||||
|
||||
// The Linux kernel never invalidates TLB entries on mapping permission
|
||||
// upgrade. This is a problem if the GPU has cached entries with the old
|
||||
// permission. The GPU will re-fetch the entry if the PTE is invalid and
|
||||
// page size is not 4K (this is the case on P9). However, if a page gets
|
||||
// upgraded from R/O to R/W and GPU has the PTEs cached with R/O
|
||||
// permissions we will enter an infinite loop because we just forward the
|
||||
// fault to the Linux kernel and it will see that the permissions in the
|
||||
// page table are correct. Therefore, we flush TLB entries on ATS write
|
||||
// faults.
|
||||
if (!current_entry->is_fatal && current_entry->fault_access_type > UVM_FAULT_ACCESS_TYPE_READ) {
|
||||
if (!ats_invalidate->write_faults_in_batch) {
|
||||
uvm_tlb_batch_begin(&gpu_va_space->page_tables, &ats_invalidate->write_faults_tlb_batch);
|
||||
ats_invalidate->write_faults_in_batch = true;
|
||||
}
|
||||
// Remove write faults from read_fault_mask
|
||||
uvm_page_mask_andnot(read_fault_mask, read_fault_mask, write_fault_mask);
|
||||
|
||||
uvm_tlb_batch_invalidate(&ats_invalidate->write_faults_tlb_batch,
|
||||
current_entry->fault_address,
|
||||
PAGE_SIZE,
|
||||
PAGE_SIZE,
|
||||
UVM_MEMBAR_NONE);
|
||||
for_each_va_block_subregion_in_mask(subregion, read_fault_mask, region) {
|
||||
NvU64 start = base + (subregion.first * PAGE_SIZE);
|
||||
size_t length = uvm_va_block_region_num_pages(subregion) * PAGE_SIZE;
|
||||
|
||||
UVM_ASSERT(start >= vma->vm_start);
|
||||
UVM_ASSERT((start + length) <= vma->vm_end);
|
||||
|
||||
status = service_ats_faults(gpu_va_space, vma, start, length, UVM_FAULT_ACCESS_TYPE_READ, client_type);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
uvm_page_mask_region_fill(faults_serviced_mask, subregion);
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
bool uvm_ats_check_in_gmmu_region(uvm_va_space_t *va_space, NvU64 address, uvm_va_range_t *next)
|
||||
{
|
||||
uvm_va_range_t *prev;
|
||||
NvU64 gmmu_region_base = UVM_ALIGN_DOWN(address, UVM_GMMU_ATS_GRANULARITY);
|
||||
|
||||
UVM_ASSERT(va_space);
|
||||
|
||||
if (next) {
|
||||
if (next->node.start <= gmmu_region_base + UVM_GMMU_ATS_GRANULARITY - 1)
|
||||
return true;
|
||||
|
||||
prev = uvm_va_range_container(uvm_range_tree_prev(&va_space->va_range_tree, &next->node));
|
||||
}
|
||||
else {
|
||||
// No VA range exists after address, so check the last VA range in the
|
||||
// tree.
|
||||
prev = uvm_va_range_container(uvm_range_tree_last(&va_space->va_range_tree));
|
||||
}
|
||||
|
||||
return prev && (prev->node.end >= gmmu_region_base);
|
||||
}
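// Worked example (illustrative, granularity value assumed): with a
// UVM_GMMU_ATS_GRANULARITY of 512MB, a fault at 0x200001000 checks the region
// [0x200000000, 0x220000000). If any VA range (and thus GMMU mapping) starts
// below 0x220000000 and ends at or above 0x200000000, the address is treated
// as being in a GMMU region and the ATS fault is cancelled instead of
// serviced.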
|
||||
|
||||
NV_STATUS uvm_ats_invalidate_tlbs(uvm_gpu_va_space_t *gpu_va_space,
|
||||
uvm_ats_fault_invalidate_t *ats_invalidate,
|
||||
uvm_tracker_t *out_tracker)
|
||||
@@ -287,3 +311,4 @@ NV_STATUS uvm_ats_invalidate_tlbs(uvm_gpu_va_space_t *gpu_va_space,
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
|
||||
@@ -25,10 +25,31 @@
|
||||
#include "uvm_lock.h"
|
||||
#include "uvm_global.h"
|
||||
#include "uvm_va_space.h"
|
||||
#include "uvm_gpu.h"
|
||||
|
||||
NV_STATUS uvm_ats_service_fault_entry(uvm_gpu_va_space_t *gpu_va_space,
|
||||
uvm_fault_buffer_entry_t *current_entry,
|
||||
uvm_ats_fault_invalidate_t *ats_invalidate);
|
||||
// Service ATS faults in the range [base, base + UVM_VA_BLOCK_SIZE) with the
// service type requested for individual pages by the page masks set in
// ats_context->read_fault_mask/write_fault_mask. base must be aligned to
// UVM_VA_BLOCK_SIZE. The caller is responsible for ensuring that the faulting
// addresses fall completely within the VMA and that they don't overlap a GMMU
// region (see uvm_ats_check_in_gmmu_region). The caller is also responsible
// for handling any errors returned by this function (fault cancellations
// etc.).
//
// Returns the fault service status in ats_context->faults_serviced_mask. In
// addition, ats_context->reads_serviced_mask indicates whether read servicing
// worked on write faults iff read service was also requested in the
// corresponding bit of read_fault_mask. These returned masks are only valid
// if the return status is NV_OK; any other status indicates a global fault
// servicing failure.
NV_STATUS uvm_ats_service_faults(uvm_gpu_va_space_t *gpu_va_space,
                                 struct vm_area_struct *vma,
                                 NvU64 base,
                                 uvm_ats_fault_context_t *ats_context);
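// Hypothetical caller sketch (not part of this change); helper names such as
// uvm_page_mask_set()/uvm_page_mask_test() are assumed for illustration:
//
//     uvm_page_mask_zero(&ats_context->read_fault_mask);
//     uvm_page_mask_zero(&ats_context->write_fault_mask);
//     uvm_page_mask_set(&ats_context->write_fault_mask, page_index);
//     ats_context->client_type = UVM_FAULT_CLIENT_TYPE_GPC;
//
//     status = uvm_ats_service_faults(gpu_va_space, vma, base, ats_context);
//     if (status == NV_OK &&
//         !uvm_page_mask_test(&ats_context->faults_serviced_mask, page_index)) {
//         // The fault was not serviced and should be treated as fatal.
//     }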
|
||||
|
||||
// Return whether there are any VA ranges (and thus GMMU mappings) within the
|
||||
// UVM_GMMU_ATS_GRANULARITY-aligned region containing address.
|
||||
bool uvm_ats_check_in_gmmu_region(uvm_va_space_t *va_space, NvU64 address, uvm_va_range_t *next);
|
||||
|
||||
// This function performs pending TLB invalidations for ATS and clears the
|
||||
// ats_invalidate->write_faults_in_batch flag
|
||||
|
||||
kernel-open/nvidia-uvm/uvm_ats_sva.c (new file, 156 lines)
@@ -0,0 +1,156 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2018-2023 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
deal in the Software without restriction, including without limitation the
|
||||
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
sell copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
DEALINGS IN THE SOFTWARE.
|
||||
|
||||
*******************************************************************************/
|
||||
|
||||
#include "uvm_ats_sva.h"
|
||||
|
||||
#if UVM_ATS_SVA_SUPPORTED()
|
||||
|
||||
#include "uvm_gpu.h"
|
||||
#include "uvm_va_space.h"
|
||||
#include "uvm_va_space_mm.h"
|
||||
|
||||
#include <linux/iommu.h>
|
||||
#include <linux/mm_types.h>
|
||||
|
||||
// linux/sched/mm.h is needed for mmget_not_zero and mmput to get the mm
|
||||
// reference required for the iommu_sva_bind_device() call. This header is not
|
||||
// present in all the supported versions. Instead of adding a conftest just for
|
||||
// this header file, use UVM_ATS_SVA_SUPPORTED().
|
||||
#include <linux/sched/mm.h>
|
||||
|
||||
// iommu_sva_bind_device() removed the drvdata parameter with commit
|
||||
// 942fd5435dccb273f90176b046ae6bbba60cfbd8 (10/31/2022).
|
||||
#if defined(NV_IOMMU_SVA_BIND_DEVICE_HAS_DRVDATA_ARG)
|
||||
#define UVM_IOMMU_SVA_BIND_DEVICE(dev, mm) iommu_sva_bind_device(dev, mm, NULL)
|
||||
#else
|
||||
#define UVM_IOMMU_SVA_BIND_DEVICE(dev, mm) iommu_sva_bind_device(dev, mm)
|
||||
#endif
|
||||
|
||||
NV_STATUS uvm_ats_sva_add_gpu(uvm_parent_gpu_t *parent_gpu)
{
    int ret;

    ret = iommu_dev_enable_feature(&parent_gpu->pci_dev->dev, IOMMU_DEV_FEAT_SVA);

    return errno_to_nv_status(ret);
}

void uvm_ats_sva_remove_gpu(uvm_parent_gpu_t *parent_gpu)
{
    iommu_dev_disable_feature(&parent_gpu->pci_dev->dev, IOMMU_DEV_FEAT_SVA);
}
|
||||
|
||||
NV_STATUS uvm_ats_sva_bind_gpu(uvm_gpu_va_space_t *gpu_va_space)
|
||||
{
|
||||
NV_STATUS status = NV_OK;
|
||||
struct iommu_sva *iommu_handle;
|
||||
struct pci_dev *pci_dev = gpu_va_space->gpu->parent->pci_dev;
|
||||
uvm_sva_gpu_va_space_t *sva_gpu_va_space = &gpu_va_space->ats.sva;
|
||||
struct mm_struct *mm = gpu_va_space->va_space->va_space_mm.mm;
|
||||
|
||||
UVM_ASSERT(gpu_va_space->ats.enabled);
|
||||
UVM_ASSERT(uvm_gpu_va_space_state(gpu_va_space) == UVM_GPU_VA_SPACE_STATE_INIT);
|
||||
UVM_ASSERT(mm);
|
||||
|
||||
// The mmput() below may trigger the kernel's mm teardown with exit_mmap()
|
||||
// and uvm_va_space_mm_shutdown() and uvm_vm_close_managed() in that path
|
||||
// will try to grab the va_space lock and deadlock if va_space was already
|
||||
// locked.
|
||||
uvm_assert_unlocked_order(UVM_LOCK_ORDER_VA_SPACE);
|
||||
|
||||
// iommu_sva_bind_device() requires an mm_users reference to be held. The mm
// is already retained by UVM so it is still valid, but it may be inactive:
// UVM does not take mm_users and instead maintains a separate refcount
// (retained_count) for the mm in va_space_mm. See the block comment in
// va_space_mm.c for more details. So, return an error if mm_users is zero.
|
||||
if (!mmget_not_zero(mm))
|
||||
return NV_ERR_PAGE_TABLE_NOT_AVAIL;
|
||||
|
||||
// Multiple calls for the {same pci_dev, mm} pair are refcounted by the ARM
|
||||
// SMMU Layer.
|
||||
iommu_handle = UVM_IOMMU_SVA_BIND_DEVICE(&pci_dev->dev, mm);
|
||||
if (IS_ERR(iommu_handle)) {
|
||||
status = errno_to_nv_status(PTR_ERR(iommu_handle));
|
||||
goto out;
|
||||
}
|
||||
|
||||
// If this is not the first bind of the gpu in the mm, then the previously
|
||||
// stored iommu_handle in the gpu_va_space must match the handle returned by
|
||||
// iommu_sva_bind_device().
|
||||
if (sva_gpu_va_space->iommu_handle) {
|
||||
UVM_ASSERT(sva_gpu_va_space->iommu_handle == iommu_handle);
|
||||
nv_kref_get(&sva_gpu_va_space->kref);
|
||||
}
|
||||
else {
|
||||
sva_gpu_va_space->iommu_handle = iommu_handle;
|
||||
nv_kref_init(&sva_gpu_va_space->kref);
|
||||
}
|
||||
|
||||
out:
|
||||
mmput(mm);
|
||||
return status;
|
||||
}
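// Minimal sketch (not part of this change) of the mm_users reference pattern
// used above: the reference is held only around the bind call itself.
//
//     if (!mmget_not_zero(mm))
//         return NV_ERR_PAGE_TABLE_NOT_AVAIL;   // mm is already being torn down
//
//     iommu_handle = UVM_IOMMU_SVA_BIND_DEVICE(&pci_dev->dev, mm);
//     ...
//     mmput(mm);                                // may trigger mm teardown paths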
|
||||
|
||||
static void uvm_sva_reset_iommu_handle(nv_kref_t *nv_kref)
{
    uvm_sva_gpu_va_space_t *sva_gpu_va_space = container_of(nv_kref, uvm_sva_gpu_va_space_t, kref);
    sva_gpu_va_space->iommu_handle = NULL;
}

void uvm_ats_sva_unbind_gpu(uvm_gpu_va_space_t *gpu_va_space)
{
    uvm_sva_gpu_va_space_t *sva_gpu_va_space = &gpu_va_space->ats.sva;

    // ARM SMMU layer decrements the refcount for the {pci_dev, mm} pair.
    // The actual unbind happens only when the refcount reaches zero.
    if (sva_gpu_va_space->iommu_handle) {
        iommu_sva_unbind_device(sva_gpu_va_space->iommu_handle);
        nv_kref_put(&sva_gpu_va_space->kref, uvm_sva_reset_iommu_handle);
    }
}
|
||||
|
||||
NV_STATUS uvm_ats_sva_register_gpu_va_space(uvm_gpu_va_space_t *gpu_va_space)
|
||||
{
|
||||
NvU32 pasid;
|
||||
NV_STATUS status = NV_OK;
|
||||
uvm_sva_gpu_va_space_t *sva_gpu_va_space = &gpu_va_space->ats.sva;
|
||||
|
||||
// A successful iommu_sva_bind_device() should have preceded this call.
|
||||
UVM_ASSERT(sva_gpu_va_space->iommu_handle);
|
||||
|
||||
pasid = iommu_sva_get_pasid(sva_gpu_va_space->iommu_handle);
|
||||
if (pasid == IOMMU_PASID_INVALID)
|
||||
status = errno_to_nv_status(ENODEV);
|
||||
else
|
||||
gpu_va_space->ats.pasid = pasid;
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
void uvm_ats_sva_unregister_gpu_va_space(uvm_gpu_va_space_t *gpu_va_space)
|
||||
{
|
||||
gpu_va_space->ats.pasid = -1U;
|
||||
}
|
||||
|
||||
#endif // UVM_ATS_SVA_SUPPORTED()
|
||||
kernel-open/nvidia-uvm/uvm_ats_sva.h (new file, 112 lines)
@@ -0,0 +1,112 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2018-2023 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
deal in the Software without restriction, including without limitation the
|
||||
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
sell copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
DEALINGS IN THE SOFTWARE.
|
||||
|
||||
*******************************************************************************/
|
||||
|
||||
#ifndef __UVM_ATS_SVA_H__
|
||||
#define __UVM_ATS_SVA_H__
|
||||
|
||||
#include "uvm_gpu.h"
|
||||
#include "uvm_forward_decl.h"
|
||||
|
||||
#include <linux/iommu.h>
|
||||
|
||||
// For ATS support on aarch64, arm_smmu_sva_bind() is needed for
|
||||
// iommu_sva_bind_device() calls. Unfortunately, arm_smmu_sva_bind() is not
|
||||
// conftest-able. We instead look for the presence of ioasid_get() or
|
||||
// mm_pasid_set(). ioasid_get() was added in the same patch series as
|
||||
// arm_smmu_sva_bind() and removed in v6.0. mm_pasid_set() was added in the
|
||||
// same patch as the removal of ioasid_get(). We assume the presence of
|
||||
// arm_smmu_sva_bind() if ioasid_get(v5.11 - v5.17) or mm_pasid_set(v5.18+) is
|
||||
// present.
|
||||
//
|
||||
// arm_smmu_sva_bind() was added with commit
|
||||
// 32784a9562fb0518b12e9797ee2aec52214adf6f and ioasid_get() was added with
|
||||
// commit cb4789b0d19ff231ce9f73376a023341300aed96 (11/23/2020). Commit
|
||||
// 701fac40384f07197b106136012804c3cae0b3de (02/15/2022) removed ioasid_get()
|
||||
// and added mm_pasid_set().
|
||||
#if UVM_CAN_USE_MMU_NOTIFIERS() && (defined(NV_IOASID_GET_PRESENT) || defined(NV_MM_PASID_SET_PRESENT))
|
||||
#define UVM_ATS_SVA_SUPPORTED() 1
|
||||
#else
|
||||
#define UVM_ATS_SVA_SUPPORTED() 0
|
||||
#endif
|
||||
|
||||
typedef struct
|
||||
{
|
||||
int placeholder;
|
||||
} uvm_sva_va_space_t;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
// Reference count for the iommu_handle
|
||||
nv_kref_t kref;
|
||||
struct iommu_sva *iommu_handle;
|
||||
} uvm_sva_gpu_va_space_t;
|
||||
|
||||
#if UVM_ATS_SVA_SUPPORTED()
|
||||
NV_STATUS uvm_ats_sva_add_gpu(uvm_parent_gpu_t *parent_gpu);
|
||||
void uvm_ats_sva_remove_gpu(uvm_parent_gpu_t *parent_gpu);
|
||||
|
||||
// LOCKING: mmap_lock must be lockable
|
||||
// VA space lock must not be held.
|
||||
NV_STATUS uvm_ats_sva_bind_gpu(uvm_gpu_va_space_t *gpu_va_space);
|
||||
|
||||
// LOCKING: VA space lock must not be held.
|
||||
void uvm_ats_sva_unbind_gpu(uvm_gpu_va_space_t *gpu_va_space);
|
||||
|
||||
// LOCKING: None
|
||||
NV_STATUS uvm_ats_sva_register_gpu_va_space(uvm_gpu_va_space_t *gpu_va_space);
|
||||
|
||||
// LOCKING: None
|
||||
void uvm_ats_sva_unregister_gpu_va_space(uvm_gpu_va_space_t *gpu_va_space);
|
||||
#else
|
||||
static NV_STATUS uvm_ats_sva_add_gpu(uvm_parent_gpu_t *parent_gpu)
|
||||
{
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
static void uvm_ats_sva_remove_gpu(uvm_parent_gpu_t *parent_gpu)
|
||||
{
|
||||
|
||||
}
|
||||
|
||||
static NV_STATUS uvm_ats_sva_bind_gpu(uvm_gpu_va_space_t *gpu_va_space)
|
||||
{
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
static void uvm_ats_sva_unbind_gpu(uvm_gpu_va_space_t *gpu_va_space)
|
||||
{
|
||||
|
||||
}
|
||||
|
||||
static NV_STATUS uvm_ats_sva_register_gpu_va_space(uvm_gpu_va_space_t *gpu_va_space)
|
||||
{
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
static void uvm_ats_sva_unregister_gpu_va_space(uvm_gpu_va_space_t *gpu_va_space)
|
||||
{
|
||||
|
||||
}
|
||||
#endif // UVM_ATS_SVA_SUPPORTED
|
||||
|
||||
#endif // __UVM_ATS_SVA_H__
|
||||
@@ -1,5 +1,5 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2015-2022 NVIDIA Corporation
|
||||
Copyright (c) 2015-2023 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
@@ -31,6 +31,7 @@
|
||||
#include "uvm_va_space.h"
|
||||
#include "uvm_rm_mem.h"
|
||||
#include "uvm_mem.h"
|
||||
#include "uvm_gpu.h"
|
||||
|
||||
#define CE_TEST_MEM_SIZE (2 * 1024 * 1024)
|
||||
#define CE_TEST_MEM_END_SIZE 32
|
||||
@@ -53,6 +54,11 @@ static NV_STATUS test_non_pipelined(uvm_gpu_t *gpu)
|
||||
uvm_push_t push;
|
||||
bool is_proxy;
|
||||
|
||||
// TODO: Bug 3839176: the test is waived on Confidential Computing because
|
||||
// it assumes that GPU can access system memory without using encryption.
|
||||
if (uvm_conf_computing_mode_enabled(gpu))
|
||||
return NV_OK;
|
||||
|
||||
status = uvm_rm_mem_alloc_and_map_cpu(gpu, UVM_RM_MEM_TYPE_SYS, CE_TEST_MEM_SIZE, 0, &host_mem);
|
||||
TEST_CHECK_GOTO(status == NV_OK, done);
|
||||
host_ptr = (NvU32 *)uvm_rm_mem_get_cpu_va(host_mem);
|
||||
@@ -67,7 +73,7 @@ static NV_STATUS test_non_pipelined(uvm_gpu_t *gpu)
|
||||
TEST_CHECK_GOTO(status == NV_OK, done);
|
||||
|
||||
is_proxy = uvm_channel_is_proxy(push.channel);
|
||||
host_mem_gpu_va = uvm_rm_mem_get_gpu_va(host_mem, gpu, is_proxy);
|
||||
host_mem_gpu_va = uvm_rm_mem_get_gpu_va(host_mem, gpu, is_proxy).address;
|
||||
|
||||
// All of the following CE transfers are done from a single (L)CE and
|
||||
// disabling pipelining is enough to order them when needed. Only push_end
|
||||
@@ -75,7 +81,7 @@ static NV_STATUS test_non_pipelined(uvm_gpu_t *gpu)
|
||||
|
||||
// Initialize to a bad value
|
||||
for (i = 0; i < CE_TEST_MEM_COUNT; ++i) {
|
||||
mem_gpu_va = uvm_rm_mem_get_gpu_va(mem[i], gpu, is_proxy);
|
||||
mem_gpu_va = uvm_rm_mem_get_gpu_va(mem[i], gpu, is_proxy).address;
|
||||
|
||||
uvm_push_set_flag(&push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
|
||||
uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
|
||||
@@ -84,7 +90,7 @@ static NV_STATUS test_non_pipelined(uvm_gpu_t *gpu)
|
||||
|
||||
// Set the first buffer to 1
|
||||
uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
|
||||
mem_gpu_va = uvm_rm_mem_get_gpu_va(mem[0], gpu, is_proxy);
|
||||
mem_gpu_va = uvm_rm_mem_get_gpu_va(mem[0], gpu, is_proxy).address;
|
||||
gpu->parent->ce_hal->memset_v_4(&push, mem_gpu_va, 1, CE_TEST_MEM_SIZE);
|
||||
|
||||
for (i = 0; i < CE_TEST_MEM_COUNT; ++i) {
|
||||
@@ -92,9 +98,9 @@ static NV_STATUS test_non_pipelined(uvm_gpu_t *gpu)
|
||||
if (dst == CE_TEST_MEM_COUNT)
|
||||
dst_va = host_mem_gpu_va;
|
||||
else
|
||||
dst_va = uvm_rm_mem_get_gpu_va(mem[dst], gpu, is_proxy);
|
||||
dst_va = uvm_rm_mem_get_gpu_va(mem[dst], gpu, is_proxy).address;
|
||||
|
||||
src_va = uvm_rm_mem_get_gpu_va(mem[i], gpu, is_proxy);
|
||||
src_va = uvm_rm_mem_get_gpu_va(mem[i], gpu, is_proxy).address;
|
||||
|
||||
// The first memcpy needs to be non-pipelined as otherwise the previous
|
||||
// memset/memcpy to the source may not be done yet.
|
||||
@@ -168,6 +174,11 @@ static NV_STATUS test_membar(uvm_gpu_t *gpu)
|
||||
uvm_push_t push;
|
||||
NvU32 value;
|
||||
|
||||
// TODO: Bug 3839176: the test is waived on Confidential Computing because
|
||||
// it assumes that GPU can access system memory without using encryption.
|
||||
if (uvm_conf_computing_mode_enabled(gpu))
|
||||
return NV_OK;
|
||||
|
||||
status = uvm_rm_mem_alloc_and_map_cpu(gpu, UVM_RM_MEM_TYPE_SYS, sizeof(NvU32), 0, &host_mem);
|
||||
TEST_CHECK_GOTO(status == NV_OK, done);
|
||||
host_ptr = (NvU32 *)uvm_rm_mem_get_cpu_va(host_mem);
|
||||
@@ -176,7 +187,7 @@ static NV_STATUS test_membar(uvm_gpu_t *gpu)
|
||||
status = uvm_push_begin(gpu->channel_manager, UVM_CHANNEL_TYPE_GPU_TO_CPU, &push, "Membar test");
|
||||
TEST_CHECK_GOTO(status == NV_OK, done);
|
||||
|
||||
host_mem_gpu_va = uvm_rm_mem_get_gpu_va(host_mem, gpu, uvm_channel_is_proxy(push.channel));
|
||||
host_mem_gpu_va = uvm_rm_mem_get_gpu_va(host_mem, gpu, uvm_channel_is_proxy(push.channel)).address;
|
||||
|
||||
for (i = 0; i < REDUCTIONS; ++i) {
|
||||
uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
|
||||
@@ -327,6 +338,11 @@ static NV_STATUS test_memcpy_and_memset_inner(uvm_gpu_t *gpu,
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
if (!gpu->parent->ce_hal->memcopy_is_valid(&push, dst, src)) {
|
||||
TEST_NV_CHECK_RET(uvm_push_end_and_wait(&push));
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
// The input virtual addresses exist in UVM's internal address space, not
|
||||
// the proxy address space
|
||||
if (uvm_channel_is_proxy(push.channel)) {
|
||||
@@ -334,6 +350,16 @@ static NV_STATUS test_memcpy_and_memset_inner(uvm_gpu_t *gpu,
|
||||
return NV_ERR_INVALID_STATE;
|
||||
}
|
||||
|
||||
// If physical accesses aren't supported, silently convert to virtual to
|
||||
// test the flat mapping.
|
||||
TEST_CHECK_RET(gpu_verif_addr.is_virtual);
|
||||
|
||||
if (!src.is_virtual)
|
||||
src = uvm_gpu_address_copy(gpu, uvm_gpu_phys_address(src.aperture, src.address));
|
||||
|
||||
if (!dst.is_virtual)
|
||||
dst = uvm_gpu_address_copy(gpu, uvm_gpu_phys_address(dst.aperture, dst.address));
|
||||
|
||||
// Memset src with the appropriate element size, then memcpy to dst and from
|
||||
// dst to the verif location (physical sysmem).
|
||||
|
||||
@@ -383,17 +409,17 @@ static NV_STATUS test_memcpy_and_memset(uvm_gpu_t *gpu)
|
||||
uvm_mem_t *gpu_uvm_mem = NULL;
|
||||
uvm_rm_mem_t *sys_rm_mem = NULL;
|
||||
uvm_rm_mem_t *gpu_rm_mem = NULL;
|
||||
uvm_gpu_address_t gpu_addresses[4];
|
||||
NvU64 gpu_va;
|
||||
size_t size;
|
||||
uvm_gpu_address_t gpu_addresses[4] = {0};
|
||||
size_t size = gpu->big_page.internal_size;
|
||||
static const size_t element_sizes[] = {1, 4, 8};
|
||||
const size_t iterations = 4;
|
||||
size_t i, j, k, s;
|
||||
uvm_mem_alloc_params_t mem_params = {0};
|
||||
|
||||
size = gpu->big_page.internal_size;
|
||||
|
||||
TEST_NV_CHECK_GOTO(uvm_mem_alloc_sysmem_and_map_cpu_kernel(size, current->mm, &verif_mem), done);
|
||||
if (uvm_conf_computing_mode_enabled(gpu))
|
||||
TEST_NV_CHECK_GOTO(uvm_mem_alloc_sysmem_dma_and_map_cpu_kernel(size, gpu, current->mm, &verif_mem), done);
|
||||
else
|
||||
TEST_NV_CHECK_GOTO(uvm_mem_alloc_sysmem_and_map_cpu_kernel(size, current->mm, &verif_mem), done);
|
||||
TEST_NV_CHECK_GOTO(uvm_mem_map_gpu_kernel(verif_mem, gpu), done);
|
||||
|
||||
gpu_verif_addr = uvm_mem_gpu_address_virtual_kernel(verif_mem, gpu);
|
||||
@@ -432,18 +458,27 @@ static NV_STATUS test_memcpy_and_memset(uvm_gpu_t *gpu)
|
||||
// Virtual address (in UVM's internal address space) backed by vidmem
|
||||
TEST_NV_CHECK_GOTO(uvm_rm_mem_alloc(gpu, UVM_RM_MEM_TYPE_GPU, size, 0, &gpu_rm_mem), done);
|
||||
is_proxy_va_space = false;
|
||||
gpu_va = uvm_rm_mem_get_gpu_va(gpu_rm_mem, gpu, is_proxy_va_space);
|
||||
gpu_addresses[2] = uvm_gpu_address_virtual(gpu_va);
|
||||
gpu_addresses[2] = uvm_rm_mem_get_gpu_va(gpu_rm_mem, gpu, is_proxy_va_space);
|
||||
|
||||
// Virtual address (in UVM's internal address space) backed by sysmem
|
||||
TEST_NV_CHECK_GOTO(uvm_rm_mem_alloc(gpu, UVM_RM_MEM_TYPE_SYS, size, 0, &sys_rm_mem), done);
|
||||
gpu_va = uvm_rm_mem_get_gpu_va(sys_rm_mem, gpu, is_proxy_va_space);
|
||||
gpu_addresses[3] = uvm_gpu_address_virtual(gpu_va);
|
||||
gpu_addresses[3] = uvm_rm_mem_get_gpu_va(sys_rm_mem, gpu, is_proxy_va_space);
|
||||
|
||||
for (i = 0; i < iterations; ++i) {
|
||||
for (j = 0; j < ARRAY_SIZE(gpu_addresses); ++j) {
|
||||
for (k = 0; k < ARRAY_SIZE(gpu_addresses); ++k) {
|
||||
for (s = 0; s < ARRAY_SIZE(element_sizes); s++) {
|
||||
// Because gpu_verif_addr is in sysmem, when the Confidential
|
||||
// Computing feature is enabled, only the following cases are
|
||||
// valid.
|
||||
//
|
||||
// TODO: Bug 3839176: the test partially waived on
|
||||
// Confidential Computing because it assumes that GPU can
|
||||
// access system memory without using encryption.
|
||||
if (uvm_conf_computing_mode_enabled(gpu) &&
|
||||
!(gpu_addresses[k].is_unprotected && gpu_addresses[j].is_unprotected)) {
|
||||
continue;
|
||||
}
|
||||
TEST_NV_CHECK_GOTO(test_memcpy_and_memset_inner(gpu,
|
||||
gpu_addresses[k],
|
||||
gpu_addresses[j],
|
||||
@@ -514,6 +549,11 @@ static NV_STATUS test_semaphore_reduction_inc(uvm_gpu_t *gpu)
|
||||
// Semaphore reduction needs 1 word (4 bytes).
|
||||
const size_t size = sizeof(NvU32);
|
||||
|
||||
// TODO: Bug 3839176: the test is waived on Confidential Computing because
|
||||
// it assumes that GPU can access system memory without using encryption.
|
||||
if (uvm_conf_computing_mode_enabled(gpu))
|
||||
return NV_OK;
|
||||
|
||||
status = test_semaphore_alloc_sem(gpu, size, &mem);
|
||||
TEST_CHECK_RET(status == NV_OK);
|
||||
|
||||
@@ -561,6 +601,11 @@ static NV_STATUS test_semaphore_release(uvm_gpu_t *gpu)
|
||||
// Semaphore release needs 1 word (4 bytes).
|
||||
const size_t size = sizeof(NvU32);
|
||||
|
||||
// TODO: Bug 3839176: the test is waived on Confidential Computing because
|
||||
// it assumes that GPU can access system memory without using encryption.
|
||||
if (uvm_conf_computing_mode_enabled(gpu))
|
||||
return NV_OK;
|
||||
|
||||
status = test_semaphore_alloc_sem(gpu, size, &mem);
|
||||
TEST_CHECK_RET(status == NV_OK);
|
||||
|
||||
@@ -610,6 +655,11 @@ static NV_STATUS test_semaphore_timestamp(uvm_gpu_t *gpu)
|
||||
// The semaphore is 4 words long (16 bytes).
|
||||
const size_t size = 16;
|
||||
|
||||
// TODO: Bug 3839176: the test is waived on Confidential Computing because
|
||||
// it assumes that GPU can access system memory without using encryption.
|
||||
if (uvm_conf_computing_mode_enabled(gpu))
|
||||
return NV_OK;
|
||||
|
||||
status = test_semaphore_alloc_sem(gpu, size, &mem);
|
||||
TEST_CHECK_RET(status == NV_OK);
|
||||
|
||||
@@ -646,6 +696,517 @@ done:
|
||||
return status;
|
||||
}
|
||||
|
||||
static bool mem_match(uvm_mem_t *mem1, uvm_mem_t *mem2, size_t size)
|
||||
{
|
||||
void *mem1_addr;
|
||||
void *mem2_addr;
|
||||
|
||||
UVM_ASSERT(uvm_mem_is_sysmem(mem1));
|
||||
UVM_ASSERT(uvm_mem_is_sysmem(mem2));
|
||||
UVM_ASSERT(mem1->size >= size);
|
||||
UVM_ASSERT(mem2->size >= size);
|
||||
|
||||
mem1_addr = uvm_mem_get_cpu_addr_kernel(mem1);
|
||||
mem2_addr = uvm_mem_get_cpu_addr_kernel(mem2);
|
||||
|
||||
return !memcmp(mem1_addr, mem2_addr, size);
|
||||
}
|
||||
|
||||
static NV_STATUS zero_vidmem(uvm_mem_t *mem)
|
||||
{
|
||||
uvm_push_t push;
|
||||
uvm_gpu_address_t gpu_address;
|
||||
uvm_gpu_t *gpu = mem->backing_gpu;
|
||||
|
||||
UVM_ASSERT(uvm_mem_is_vidmem(mem));
|
||||
|
||||
TEST_NV_CHECK_RET(uvm_push_begin(gpu->channel_manager, UVM_CHANNEL_TYPE_GPU_INTERNAL, &push, "zero vidmem"));
|
||||
|
||||
gpu_address = uvm_mem_gpu_address_virtual_kernel(mem, gpu);
|
||||
gpu->parent->ce_hal->memset_1(&push, gpu_address, 0, mem->size);
|
||||
|
||||
TEST_NV_CHECK_RET(uvm_push_end_and_wait(&push));
|
||||
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
static void write_range_cpu(uvm_mem_t *mem, NvU64 base_val)
|
||||
{
|
||||
NvU64 *mem_cpu_va;
|
||||
unsigned i;
|
||||
|
||||
UVM_ASSERT(uvm_mem_is_sysmem(mem));
|
||||
UVM_ASSERT(IS_ALIGNED(mem->size, sizeof(*mem_cpu_va)));
|
||||
|
||||
mem_cpu_va = (NvU64 *) uvm_mem_get_cpu_addr_kernel(mem);
|
||||
|
||||
for (i = 0; i < (mem->size / sizeof(*mem_cpu_va)); i++)
|
||||
mem_cpu_va[i] = base_val++;
|
||||
}
|
||||
|
||||
static NV_STATUS alloc_vidmem_protected(uvm_gpu_t *gpu, uvm_mem_t **mem, size_t size)
|
||||
{
|
||||
NV_STATUS status;
|
||||
|
||||
UVM_ASSERT(mem);
|
||||
|
||||
*mem = NULL;
|
||||
|
||||
TEST_NV_CHECK_RET(uvm_mem_alloc_vidmem_protected(size, gpu, mem));
|
||||
TEST_NV_CHECK_GOTO(uvm_mem_map_gpu_kernel(*mem, gpu), err);
|
||||
TEST_NV_CHECK_GOTO(zero_vidmem(*mem), err);
|
||||
|
||||
return NV_OK;
|
||||
|
||||
err:
|
||||
uvm_mem_free(*mem);
|
||||
return status;
|
||||
}
|
||||
|
||||
static NV_STATUS alloc_sysmem_unprotected(uvm_gpu_t *gpu, uvm_mem_t **mem, size_t size)
|
||||
{
|
||||
NV_STATUS status;
|
||||
|
||||
UVM_ASSERT(mem);
|
||||
|
||||
*mem = NULL;
|
||||
|
||||
TEST_NV_CHECK_RET(uvm_mem_alloc_sysmem_dma(size, gpu, NULL, mem));
|
||||
TEST_NV_CHECK_GOTO(uvm_mem_map_cpu_kernel(*mem), err);
|
||||
TEST_NV_CHECK_GOTO(uvm_mem_map_gpu_kernel(*mem, gpu), err);
|
||||
|
||||
memset(uvm_mem_get_cpu_addr_kernel(*mem), 0, (*mem)->size);
|
||||
|
||||
return NV_OK;
|
||||
|
||||
err:
|
||||
uvm_mem_free(*mem);
|
||||
return status;
|
||||
}
|
||||
|
||||
static void cpu_encrypt(uvm_channel_t *channel,
|
||||
uvm_mem_t *dst_mem,
|
||||
uvm_mem_t *src_mem,
|
||||
uvm_mem_t *auth_tag_mem,
|
||||
size_t size,
|
||||
NvU32 copy_size)
|
||||
{
|
||||
size_t offset = 0;
|
||||
char *src_plain = (char *) uvm_mem_get_cpu_addr_kernel(src_mem);
|
||||
char *dst_cipher = (char *) uvm_mem_get_cpu_addr_kernel(dst_mem);
|
||||
char *auth_tag_buffer = (char *) uvm_mem_get_cpu_addr_kernel(auth_tag_mem);
|
||||
|
||||
while (offset < size) {
|
||||
uvm_conf_computing_cpu_encrypt(channel, dst_cipher, src_plain, NULL, copy_size, auth_tag_buffer);
|
||||
|
||||
offset += copy_size;
|
||||
dst_cipher += copy_size;
|
||||
src_plain += copy_size;
|
||||
auth_tag_buffer += UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
|
||||
}
|
||||
}
|
||||
|
||||
static void cpu_acquire_encryption_ivs(uvm_channel_t *channel,
|
||||
size_t size,
|
||||
NvU32 copy_size,
|
||||
UvmCslIv *ivs)
|
||||
{
|
||||
size_t offset = 0;
|
||||
int i = 0;
|
||||
|
||||
for (; offset < size; offset += copy_size)
|
||||
uvm_conf_computing_acquire_encryption_iv(channel, &ivs[i++]);
|
||||
}
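// Illustrative note (not part of this change): acquiring the encryption IVs up
// front fixes the IV-to-chunk assignment, so the CPU can later encrypt the
// chunks in any order (see cpu_encrypt_rev()) while each chunk still uses the
// IV slot the channel expects for it.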
|
||||
|
||||
static void cpu_encrypt_rev(uvm_channel_t *channel,
|
||||
uvm_mem_t *dst_mem,
|
||||
uvm_mem_t *src_mem,
|
||||
uvm_mem_t *auth_tag_mem,
|
||||
size_t size,
|
||||
NvU32 copy_size,
|
||||
UvmCslIv *encrypt_iv)
|
||||
{
|
||||
char *src_plain = (char *) uvm_mem_get_cpu_addr_kernel(src_mem);
|
||||
char *dst_cipher = (char *) uvm_mem_get_cpu_addr_kernel(dst_mem);
|
||||
char *auth_tag_buffer = (char *) uvm_mem_get_cpu_addr_kernel(auth_tag_mem);
|
||||
int i;
|
||||
|
||||
// CPU encrypt order is the opposite of the GPU decrypt order
|
||||
for (i = (size / copy_size) - 1; i >= 0; i--) {
|
||||
uvm_conf_computing_cpu_encrypt(channel,
|
||||
dst_cipher + i * copy_size,
|
||||
src_plain + i * copy_size,
|
||||
encrypt_iv + i,
|
||||
copy_size,
|
||||
auth_tag_buffer + i * UVM_CONF_COMPUTING_AUTH_TAG_SIZE);
|
||||
}
|
||||
}
|
||||
|
||||
static NV_STATUS cpu_decrypt_in_order(uvm_channel_t *channel,
|
||||
uvm_mem_t *dst_mem,
|
||||
uvm_mem_t *src_mem,
|
||||
const UvmCslIv *decrypt_iv,
|
||||
uvm_mem_t *auth_tag_mem,
|
||||
size_t size,
|
||||
NvU32 copy_size)
|
||||
{
|
||||
size_t i;
|
||||
char *dst_plain = (char *) uvm_mem_get_cpu_addr_kernel(dst_mem);
|
||||
char *src_cipher = (char *) uvm_mem_get_cpu_addr_kernel(src_mem);
|
||||
char *auth_tag_buffer = (char *) uvm_mem_get_cpu_addr_kernel(auth_tag_mem);
|
||||
|
||||
for (i = 0; i < size / copy_size; i++) {
|
||||
TEST_NV_CHECK_RET(uvm_conf_computing_cpu_decrypt(channel,
|
||||
dst_plain + i * copy_size,
|
||||
src_cipher + i * copy_size,
|
||||
decrypt_iv + i,
|
||||
copy_size,
|
||||
auth_tag_buffer + i * UVM_CONF_COMPUTING_AUTH_TAG_SIZE));
|
||||
}
|
||||
|
||||
return NV_OK;
|
||||
}
|
||||
static NV_STATUS cpu_decrypt_out_of_order(uvm_channel_t *channel,
|
||||
uvm_mem_t *dst_mem,
|
||||
uvm_mem_t *src_mem,
|
||||
const UvmCslIv *decrypt_iv,
|
||||
uvm_mem_t *auth_tag_mem,
|
||||
size_t size,
|
||||
NvU32 copy_size)
|
||||
{
|
||||
int i;
|
||||
char *dst_plain = (char *) uvm_mem_get_cpu_addr_kernel(dst_mem);
|
||||
char *src_cipher = (char *) uvm_mem_get_cpu_addr_kernel(src_mem);
|
||||
char *auth_tag_buffer = (char *) uvm_mem_get_cpu_addr_kernel(auth_tag_mem);
|
||||
|
||||
UVM_ASSERT((size / copy_size) <= INT_MAX);
|
||||
|
||||
// CPU decrypt order is the opposite of the GPU encrypt order
|
||||
for (i = (size / copy_size) - 1; i >= 0; i--) {
|
||||
TEST_NV_CHECK_RET(uvm_conf_computing_cpu_decrypt(channel,
|
||||
dst_plain + i * copy_size,
|
||||
src_cipher + i * copy_size,
|
||||
decrypt_iv + i,
|
||||
copy_size,
|
||||
auth_tag_buffer + i * UVM_CONF_COMPUTING_AUTH_TAG_SIZE));
|
||||
}
|
||||
|
||||
return NV_OK;
|
||||
}
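// Illustrative note (not part of this change): gpu_encrypt() logs one decrypt
// IV per copy_size chunk in submission order, so chunk i always consumes
// decrypt_iv[i]; in-order and out-of-order CPU decryption differ only in the
// order the chunks are visited, not in which IV each chunk uses.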
|
||||
|
||||
// GPU address to use as source or destination in CE decrypt/encrypt operations.
|
||||
// If the uvm_mem backing storage is contiguous in the [offset, offset + size)
|
||||
// interval, the physical address gets priority over the virtual counterpart.
|
||||
static uvm_gpu_address_t gpu_address(uvm_mem_t *mem, uvm_gpu_t *gpu, NvU64 offset, NvU32 size)
|
||||
{
|
||||
uvm_gpu_address_t gpu_virtual_address;
|
||||
|
||||
if (uvm_mem_is_physically_contiguous(mem, offset, size))
|
||||
return uvm_mem_gpu_address_physical(mem, gpu, offset, size);
|
||||
|
||||
gpu_virtual_address = uvm_mem_gpu_address_virtual_kernel(mem, gpu);
|
||||
gpu_virtual_address.address += offset;
|
||||
|
||||
return gpu_virtual_address;
|
||||
}
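// Example (illustrative): for a uvm_mem_t backed by discontiguous chunks, a
// copy that fits entirely inside one chunk can use the physical address
// directly, while a copy spanning a chunk boundary falls back to the flat
// kernel virtual mapping returned by uvm_mem_gpu_address_virtual_kernel().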
|
||||
|
||||
// Automatically get the correct address for the authentication tag. The
|
||||
// addressing mode of the tag should match that of the reference address
|
||||
// (destination pointer for GPU encrypt, source pointer for GPU decrypt)
|
||||
static uvm_gpu_address_t auth_tag_gpu_address(uvm_mem_t *auth_tag_mem,
|
||||
uvm_gpu_t *gpu,
|
||||
size_t offset,
|
||||
uvm_gpu_address_t reference)
|
||||
{
|
||||
uvm_gpu_address_t auth_tag_gpu_address;
|
||||
|
||||
if (!reference.is_virtual)
|
||||
return uvm_mem_gpu_address_physical(auth_tag_mem, gpu, offset, UVM_CONF_COMPUTING_AUTH_TAG_SIZE);
|
||||
|
||||
auth_tag_gpu_address = uvm_mem_gpu_address_virtual_kernel(auth_tag_mem, gpu);
|
||||
auth_tag_gpu_address.address += offset;
|
||||
|
||||
return auth_tag_gpu_address;
|
||||
}
|
||||
|
||||
// Note: no membar is issued in any of the GPU transfers (encryptions)
|
||||
static void gpu_encrypt(uvm_push_t *push,
|
||||
uvm_mem_t *dst_mem,
|
||||
uvm_mem_t *src_mem,
|
||||
uvm_mem_t *auth_tag_mem,
|
||||
UvmCslIv *decrypt_iv,
|
||||
size_t size,
|
||||
NvU32 copy_size)
|
||||
{
|
||||
size_t i;
|
||||
size_t num_iterations = size / copy_size;
|
||||
uvm_gpu_t *gpu = uvm_push_get_gpu(push);
|
||||
|
||||
for (i = 0; i < num_iterations; i++) {
|
||||
uvm_gpu_address_t dst_cipher = gpu_address(dst_mem, gpu, i * copy_size, copy_size);
|
||||
uvm_gpu_address_t src_plain = gpu_address(src_mem, gpu, i * copy_size, copy_size);
|
||||
uvm_gpu_address_t auth_tag = auth_tag_gpu_address(auth_tag_mem,
|
||||
gpu,
|
||||
i * UVM_CONF_COMPUTING_AUTH_TAG_SIZE,
|
||||
dst_cipher);
|
||||
|
||||
uvm_conf_computing_log_gpu_encryption(push->channel, decrypt_iv);
|
||||
|
||||
if (i > 0)
|
||||
uvm_push_set_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
|
||||
|
||||
uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
|
||||
|
||||
gpu->parent->ce_hal->encrypt(push, dst_cipher, src_plain, copy_size, auth_tag);
|
||||
decrypt_iv++;
|
||||
}
|
||||
}
|
||||
|
||||
// Note: no membar is issued in any of the GPU transfers (decryptions)
|
||||
static void gpu_decrypt(uvm_push_t *push,
|
||||
uvm_mem_t *dst_mem,
|
||||
uvm_mem_t *src_mem,
|
||||
uvm_mem_t *auth_tag_mem,
|
||||
size_t size,
|
||||
NvU32 copy_size)
|
||||
{
|
||||
size_t i;
|
||||
size_t num_iterations = size / copy_size;
|
||||
uvm_gpu_t *gpu = uvm_push_get_gpu(push);
|
||||
|
||||
for (i = 0; i < num_iterations; i++) {
|
||||
uvm_gpu_address_t dst_plain = gpu_address(dst_mem, gpu, i * copy_size, copy_size);
|
||||
uvm_gpu_address_t src_cipher = gpu_address(src_mem, gpu, i * copy_size, copy_size);
|
||||
uvm_gpu_address_t auth_tag = auth_tag_gpu_address(auth_tag_mem,
|
||||
gpu,
|
||||
i * UVM_CONF_COMPUTING_AUTH_TAG_SIZE,
|
||||
src_cipher);
|
||||
|
||||
if (i > 0)
|
||||
uvm_push_set_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
|
||||
|
||||
uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
|
||||
|
||||
gpu->parent->ce_hal->decrypt(push, dst_plain, src_cipher, copy_size, auth_tag);
|
||||
}
|
||||
}
|
||||
|
||||
static NV_STATUS test_cpu_to_gpu_roundtrip(uvm_gpu_t *gpu,
|
||||
uvm_channel_type_t decrypt_channel_type,
|
||||
uvm_channel_type_t encrypt_channel_type,
|
||||
size_t size,
|
||||
NvU32 copy_size,
|
||||
bool decrypt_in_order,
|
||||
bool encrypt_in_order)
|
||||
{
|
||||
uvm_push_t push;
|
||||
NvU64 init_value;
|
||||
NV_STATUS status = NV_OK;
|
||||
uvm_mem_t *src_plain = NULL;
|
||||
uvm_mem_t *src_cipher = NULL;
|
||||
uvm_mem_t *dst_cipher = NULL;
|
||||
uvm_mem_t *dst_plain_gpu = NULL;
|
||||
uvm_mem_t *dst_plain = NULL;
|
||||
uvm_mem_t *auth_tag_mem = NULL;
|
||||
size_t auth_tag_buffer_size = (size / copy_size) * UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
|
||||
UvmCslIv *decrypt_iv = NULL;
|
||||
UvmCslIv *encrypt_iv = NULL;
|
||||
uvm_tracker_t tracker;
|
||||
size_t src_plain_size;
|
||||
|
||||
TEST_CHECK_RET(copy_size <= size);
|
||||
TEST_CHECK_RET(IS_ALIGNED(size, copy_size));
|
||||
|
||||
uvm_tracker_init(&tracker);
|
||||
|
||||
decrypt_iv = uvm_kvmalloc_zero((size / copy_size) * sizeof(UvmCslIv));
|
||||
if (!decrypt_iv) {
|
||||
status = NV_ERR_NO_MEMORY;
|
||||
goto out;
|
||||
}
|
||||
|
||||
encrypt_iv = uvm_kvmalloc_zero((size / copy_size) * sizeof(UvmCslIv));
|
||||
if (!encrypt_iv) {
|
||||
status = NV_ERR_NO_MEMORY;
|
||||
goto out;
|
||||
}
|
||||
|
||||
TEST_NV_CHECK_GOTO(alloc_sysmem_unprotected(gpu, &src_cipher, size), out);
|
||||
TEST_NV_CHECK_GOTO(alloc_vidmem_protected(gpu, &dst_plain_gpu, size), out);
|
||||
TEST_NV_CHECK_GOTO(alloc_sysmem_unprotected(gpu, &dst_cipher, size), out);
|
||||
TEST_NV_CHECK_GOTO(alloc_sysmem_unprotected(gpu, &dst_plain, size), out);
|
||||
TEST_NV_CHECK_GOTO(alloc_sysmem_unprotected(gpu, &auth_tag_mem, auth_tag_buffer_size), out);
|
||||
|
||||
// The plaintext CPU buffer size should fit the initialization value
|
||||
src_plain_size = UVM_ALIGN_UP(size, sizeof(init_value));
|
||||
TEST_NV_CHECK_GOTO(alloc_sysmem_unprotected(gpu, &src_plain, src_plain_size), out);
|
||||
|
||||
// Initialize the plaintext CPU buffer using a value that uniquely
|
||||
// identifies the given inputs
|
||||
TEST_CHECK_GOTO((((NvU64) size) < (1ULL << 63)), out);
|
||||
init_value = ((NvU64) decrypt_in_order << 63) | ((NvU64) size) | ((NvU64) copy_size);
|
||||
write_range_cpu(src_plain, init_value);
|
||||
|
||||
TEST_NV_CHECK_GOTO(uvm_push_begin(gpu->channel_manager,
|
||||
decrypt_channel_type,
|
||||
&push,
|
||||
"CPU > GPU decrypt"),
|
||||
out);
|
||||
|
||||
// CPU (decrypted) > CPU (encrypted), using CPU, if in-order
|
||||
// acquire IVs if not in-order
|
||||
if (encrypt_in_order)
|
||||
cpu_encrypt(push.channel, src_cipher, src_plain, auth_tag_mem, size, copy_size);
|
||||
else
|
||||
cpu_acquire_encryption_ivs(push.channel, size, copy_size, encrypt_iv);
|
||||
|
||||
// CPU (encrypted) > GPU (decrypted), using GPU
|
||||
gpu_decrypt(&push, dst_plain_gpu, src_cipher, auth_tag_mem, size, copy_size);
|
||||
|
||||
// Use acquired IVs to encrypt in reverse order
|
||||
if (!encrypt_in_order)
|
||||
cpu_encrypt_rev(push.channel, src_cipher, src_plain, auth_tag_mem, size, copy_size, encrypt_iv);
|
||||
|
||||
uvm_push_end(&push);
|
||||
TEST_NV_CHECK_GOTO(uvm_tracker_add_push(&tracker, &push), out);
|
||||
|
||||
// GPU (decrypted) > CPU (encrypted), using GPU
|
||||
TEST_NV_CHECK_GOTO(uvm_push_begin_acquire(gpu->channel_manager,
|
||||
encrypt_channel_type,
|
||||
&tracker,
|
||||
&push,
|
||||
"GPU > CPU encrypt"),
|
||||
out);
|
||||
|
||||
gpu_encrypt(&push, dst_cipher, dst_plain_gpu, auth_tag_mem, decrypt_iv, size, copy_size);
|
||||
|
||||
TEST_NV_CHECK_GOTO(uvm_push_end_and_wait(&push), out);
|
||||
|
||||
TEST_CHECK_GOTO(!mem_match(src_plain, src_cipher, size), out);
|
||||
|
||||
TEST_CHECK_GOTO(!mem_match(dst_cipher, src_plain, size), out);
|
||||
|
||||
// CPU (encrypted) > CPU (decrypted), using CPU
|
||||
if (decrypt_in_order) {
|
||||
TEST_NV_CHECK_GOTO(cpu_decrypt_in_order(push.channel,
|
||||
dst_plain,
|
||||
dst_cipher,
|
||||
decrypt_iv,
|
||||
auth_tag_mem,
|
||||
size,
|
||||
copy_size),
|
||||
out);
|
||||
}
|
||||
else {
|
||||
TEST_NV_CHECK_GOTO(cpu_decrypt_out_of_order(push.channel,
|
||||
dst_plain,
|
||||
dst_cipher,
|
||||
decrypt_iv,
|
||||
auth_tag_mem,
|
||||
size,
|
||||
copy_size),
|
||||
out);
|
||||
}
|
||||
|
||||
TEST_CHECK_GOTO(mem_match(src_plain, dst_plain, size), out);
|
||||
|
||||
out:
|
||||
uvm_mem_free(auth_tag_mem);
|
||||
uvm_mem_free(dst_plain);
|
||||
uvm_mem_free(dst_plain_gpu);
|
||||
uvm_mem_free(dst_cipher);
|
||||
uvm_mem_free(src_cipher);
|
||||
uvm_mem_free(src_plain);
|
||||
uvm_tracker_deinit(&tracker);
|
||||
uvm_kvfree(decrypt_iv);
|
||||
uvm_kvfree(encrypt_iv);
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
static NV_STATUS test_encryption_decryption(uvm_gpu_t *gpu,
|
||||
uvm_channel_type_t decrypt_channel_type,
|
||||
uvm_channel_type_t encrypt_channel_type)
|
||||
{
|
||||
bool cpu_decrypt_in_order = true;
|
||||
bool cpu_encrypt_in_order = true;
|
||||
size_t size[] = {UVM_PAGE_SIZE_4K, UVM_PAGE_SIZE_4K * 2, UVM_PAGE_SIZE_2M};
|
||||
size_t copy_size[] = {UVM_PAGE_SIZE_4K, UVM_PAGE_SIZE_64K, UVM_PAGE_SIZE_2M};
|
||||
unsigned i;
|
||||
|
||||
struct {
|
||||
bool encrypt_in_order;
|
||||
bool decrypt_in_order;
|
||||
} orders[] = {{true, true}, {true, false}, {false, true}, {false, false}};
|
||||
|
||||
struct {
|
||||
size_t size;
|
||||
NvU32 copy_size;
|
||||
} small_sizes[] = {{1, 1}, {3, 1}, {8, 1}, {2, 2}, {8, 4}, {UVM_PAGE_SIZE_4K - 8, 8}, {UVM_PAGE_SIZE_4K + 8, 8}};
|
||||
|
||||
// Only Confidential Computing uses CE encryption/decryption
|
||||
if (!uvm_conf_computing_mode_enabled(gpu))
|
||||
return NV_OK;
|
||||
|
||||
// Use a size, and copy size, that are not a multiple of common page sizes.
|
||||
for (i = 0; i < ARRAY_SIZE(small_sizes); ++i) {
|
||||
// Skip tests that need large pushbuffer on WLC. Secure work launch
|
||||
// needs to do at least one decrypt operation so tests that only need
|
||||
// one operation work ok. Tests using more operations might overflow
|
||||
// UVM_MAX_WLC_PUSH_SIZE.
|
||||
if (encrypt_channel_type == UVM_CHANNEL_TYPE_WLC && (small_sizes[i].size / small_sizes[i].copy_size > 1))
|
||||
continue;
|
||||
|
||||
TEST_NV_CHECK_RET(test_cpu_to_gpu_roundtrip(gpu,
|
||||
decrypt_channel_type,
|
||||
encrypt_channel_type,
|
||||
small_sizes[i].size,
|
||||
small_sizes[i].copy_size,
|
||||
cpu_decrypt_in_order,
|
||||
cpu_encrypt_in_order));
|
||||
}
|
||||
|
||||
// Use sizes, and copy sizes, that are a multiple of common page sizes.
|
||||
// This is the most typical usage of encrypt/decrypt in the UVM driver.
|
||||
for (i = 0; i < ARRAY_SIZE(orders); ++i) {
|
||||
unsigned j;
|
||||
|
||||
cpu_encrypt_in_order = orders[i].encrypt_in_order;
|
||||
cpu_decrypt_in_order = orders[i].decrypt_in_order;
|
||||
|
||||
for (j = 0; j < ARRAY_SIZE(size); ++j) {
|
||||
unsigned k;
|
||||
|
||||
for (k = 0; k < ARRAY_SIZE(copy_size); ++k) {
|
||||
if (copy_size[k] > size[j])
|
||||
continue;
|
||||
|
||||
// Skip tests that need large pushbuffer on WLC. Secure work
|
||||
// launch needs to do at least one decrypt operation so tests
|
||||
// that only need one operation work ok. Tests using more
|
||||
// operations might overflow UVM_MAX_WLC_PUSH_SIZE.
|
||||
if (encrypt_channel_type == UVM_CHANNEL_TYPE_WLC && (size[j] / copy_size[k] > 1))
|
||||
continue;
|
||||
|
||||
// There is no difference between in-order and out-of-order
|
||||
// decryption when encrypting once.
|
||||
if ((copy_size[k] == size[j]) && !cpu_decrypt_in_order)
|
||||
continue;
|
||||
|
||||
TEST_NV_CHECK_RET(test_cpu_to_gpu_roundtrip(gpu,
|
||||
decrypt_channel_type,
|
||||
encrypt_channel_type,
|
||||
size[j],
|
||||
copy_size[k],
|
||||
cpu_decrypt_in_order,
|
||||
cpu_encrypt_in_order));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
static NV_STATUS test_ce(uvm_va_space_t *va_space, bool skipTimestampTest)
|
||||
{
|
||||
uvm_gpu_t *gpu;
|
||||
@@ -660,6 +1221,8 @@ static NV_STATUS test_ce(uvm_va_space_t *va_space, bool skipTimestampTest)
|
||||
if (!skipTimestampTest)
|
||||
TEST_NV_CHECK_RET(test_semaphore_timestamp(gpu));
|
||||
|
||||
TEST_NV_CHECK_RET(test_encryption_decryption(gpu, UVM_CHANNEL_TYPE_CPU_TO_GPU, UVM_CHANNEL_TYPE_GPU_TO_CPU));
|
||||
TEST_NV_CHECK_RET(test_encryption_decryption(gpu, UVM_CHANNEL_TYPE_WLC, UVM_CHANNEL_TYPE_WLC));
|
||||
}
|
||||
|
||||
return NV_OK;
|
||||
|
||||
File diff suppressed because it is too large
@@ -1,5 +1,5 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2015-2022 NVIDIA Corporation
|
||||
Copyright (c) 2015-2023 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
@@ -51,7 +51,7 @@
|
||||
#define UVM_CHANNEL_NUM_GPFIFO_ENTRIES_MAX (1024 * 1024)
|
||||
|
||||
// Maximum number of channels per pool.
|
||||
#define UVM_CHANNEL_MAX_NUM_CHANNELS_PER_POOL 8
|
||||
#define UVM_CHANNEL_MAX_NUM_CHANNELS_PER_POOL UVM_PUSH_MAX_CONCURRENT_PUSHES
|
||||
|
||||
// Semaphore payloads cannot advance too much between calls to
|
||||
// uvm_gpu_tracking_semaphore_update_completed_value(). In practice the jumps
|
||||
@@ -66,7 +66,7 @@
|
||||
|
||||
#define uvm_channel_pool_assert_locked(pool) ( \
|
||||
{ \
|
||||
if (uvm_channel_pool_is_proxy(pool)) \
|
||||
if (uvm_channel_pool_uses_mutex(pool)) \
|
||||
uvm_assert_mutex_locked(&(pool)->mutex); \
|
||||
else \
|
||||
uvm_assert_spinlock_locked(&(pool)->spinlock); \
|
||||
@@ -94,7 +94,29 @@ typedef enum
|
||||
|
||||
// ^^^^^^
|
||||
// Channel types backed by a CE.
|
||||
UVM_CHANNEL_TYPE_COUNT = UVM_CHANNEL_TYPE_CE_COUNT,
|
||||
// ----------------------------------
|
||||
// Channel types not backed by a CE.
|
||||
// vvvvvv
|
||||
|
||||
// SEC2 channels
|
||||
UVM_CHANNEL_TYPE_SEC2 = UVM_CHANNEL_TYPE_CE_COUNT,
|
||||
|
||||
// ----------------------------------
|
||||
// Channel type with fixed schedules
|
||||
|
||||
// Work Launch Channel (WLC) is a specialized channel
|
||||
// for launching work on other channels when
|
||||
// Confidential Computing is enabled.
|
||||
// It is paired with LCIC (below)
|
||||
UVM_CHANNEL_TYPE_WLC,
|
||||
|
||||
// Launch Confirmation Indicator Channel (LCIC) is a
|
||||
// specialized channel with fixed schedule. It gets
|
||||
// triggered by executing WLC work, and makes sure that
|
||||
// WLC get/put pointers are up-to-date.
|
||||
UVM_CHANNEL_TYPE_LCIC,
|
||||
|
||||
UVM_CHANNEL_TYPE_COUNT,
|
||||
} uvm_channel_type_t;
|
||||
|
||||
typedef enum
|
||||
@@ -112,7 +134,15 @@ typedef enum
|
||||
// There is a single proxy pool and channel per GPU.
|
||||
UVM_CHANNEL_POOL_TYPE_CE_PROXY = (1 << 1),
|
||||
|
||||
UVM_CHANNEL_POOL_TYPE_COUNT = 2,
|
||||
// A pool of SEC2 channels owned by UVM. These channels are backed by a SEC2
|
||||
// engine.
|
||||
UVM_CHANNEL_POOL_TYPE_SEC2 = (1 << 2),
|
||||
|
||||
UVM_CHANNEL_POOL_TYPE_WLC = (1 << 3),
|
||||
|
||||
UVM_CHANNEL_POOL_TYPE_LCIC = (1 << 4),
|
||||
|
||||
UVM_CHANNEL_POOL_TYPE_COUNT = 5,
|
||||
|
||||
// A mask used to select pools of any type.
|
||||
UVM_CHANNEL_POOL_TYPE_MASK = ((1U << UVM_CHANNEL_POOL_TYPE_COUNT) - 1)
|
||||
@@ -136,16 +166,24 @@ struct uvm_gpfifo_entry_struct
|
||||
// this entry.
|
||||
NvU64 tracking_semaphore_value;
|
||||
|
||||
union {
|
||||
struct {
|
||||
// Offset of the pushbuffer in the pushbuffer allocation used by
|
||||
// this entry.
|
||||
NvU32 pushbuffer_offset;
|
||||
|
||||
// Size of the pushbuffer used for this entry.
|
||||
NvU32 pushbuffer_size;
|
||||
};
|
||||
|
||||
// Value of control entry
|
||||
// Exact value of GPFIFO entry copied directly to GPFIFO[PUT] location.
|
||||
NvU64 control_value;
|
||||
};
|
||||
|
||||
// The following fields are only valid when type is
|
||||
// UVM_GPFIFO_ENTRY_TYPE_NORMAL.
|
||||
|
||||
// Offset of the pushbuffer in the pushbuffer allocation used by
|
||||
// this entry.
|
||||
NvU32 pushbuffer_offset;
|
||||
|
||||
// Size of the pushbuffer used for this entry.
|
||||
NvU32 pushbuffer_size;
|
||||
|
||||
// List node used by the pushbuffer tracking
|
||||
struct list_head pending_list_node;
|
||||
|
||||
@@ -160,6 +198,19 @@ typedef struct
|
||||
// Owning channel manager
|
||||
uvm_channel_manager_t *manager;

    // On Volta+ GPUs, all channels in a pool are members of the same TSG, i.e.,
    // num_tsgs is 1. Pre-Volta GPUs also have a single TSG object, but since HW
    // does not support TSG for CE engines, a HW TSG is not created; a TSG
    // object is still required to allocate channels.
    // When Confidential Computing mode is enabled, the WLC and LCIC channel
    // types require one TSG for each WLC/LCIC pair of channels. In this case,
    // we do not use a TSG per channel pool, but instead a TSG per WLC/LCIC
    // channel pair, and num_tsgs equals the number of channel pairs.
    uvmGpuTsgHandle *tsg_handles;

    // Number of TSG handles owned by this pool.
    NvU32 num_tsgs;

    // Channels in this pool
    uvm_channel_t *channels;
|
||||
|
||||
@@ -176,22 +227,26 @@ typedef struct
|
||||
// Lock protecting the state of channels in the pool.
|
||||
//
|
||||
// There are two pool lock types available: spinlock and mutex. The mutex
|
||||
// variant is required when the thread holding the pool lock must
|
||||
// sleep (ex: acquire another mutex) deeper in the call stack, either in UVM
|
||||
// or RM. For example, work submission to proxy channels in SR-IOV heavy
|
||||
// entails calling an RM API that acquires a mutex, so the proxy channel
|
||||
// pool must use the mutex variant.
|
||||
//
|
||||
    // Unless the mutex is required, the spinlock is preferred. This is because,
    // other than for proxy channels, work submission takes little time and does
    // not involve any RM calls, so UVM can avoid any invocation that may result
    // in a sleep. All non-proxy channel pools use the spinlock variant, even in
    // SR-IOV heavy.
|
||||
// variant is required when the thread holding the pool lock must sleep
|
||||
// (ex: acquire another mutex) deeper in the call stack, either in UVM or
|
||||
// RM.
|
||||
union {
|
||||
uvm_spinlock_t spinlock;
|
||||
uvm_mutex_t mutex;
|
||||
};
|
||||
|
||||
// Secure operations require that uvm_push_begin order matches
|
||||
// uvm_push_end order, because the engine's state is used in its internal
|
||||
// operation and each push may modify this state. push_locks is protected by
|
||||
// the channel pool lock.
|
||||
DECLARE_BITMAP(push_locks, UVM_CHANNEL_MAX_NUM_CHANNELS_PER_POOL);
|
||||
|
||||
// Counting semaphore for available and unlocked channels, it must be
|
||||
// acquired before submitting work to a secure channel.
|
||||
uvm_semaphore_t push_sem;
|
||||
|
||||
// See uvm_channel_is_secure() documentation.
|
||||
bool secure;
|
||||
} uvm_channel_pool_t;
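
// Illustrative sketch, not part of this change: taking the pool lock variant
// described above. It assumes the uvm_mutex_lock()/uvm_spin_lock() helpers
// from uvm_lock.h and the uvm_channel_pool_uses_mutex() query declared later
// in this header.
static void example_channel_pool_lock(uvm_channel_pool_t *pool)
{
    // Pools whose submission path may need to sleep take the mutex variant;
    // the rest take the spinlock.
    if (uvm_channel_pool_uses_mutex(pool))
        uvm_mutex_lock(&pool->mutex);
    else
        uvm_spin_lock(&pool->spinlock);
}
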
struct uvm_channel_struct
|
||||
@@ -242,6 +297,66 @@ struct uvm_channel_struct
|
||||
// uvm_channel_end_push().
|
||||
uvm_gpu_tracking_semaphore_t tracking_sem;
|
||||
|
||||
struct
|
||||
{
|
||||
// Secure operations require that uvm_push_begin order matches
|
||||
// uvm_push_end order, because the engine's state is used in
|
||||
// its internal operation and each push may modify this state.
|
||||
uvm_mutex_t push_lock;
|
||||
|
||||
// Every secure channel has cryptographic state in HW, which is
|
||||
// mirrored here for CPU-side operations.
|
||||
UvmCslContext ctx;
|
||||
bool is_ctx_initialized;
|
||||
|
||||
// CPU-side CSL crypto operations which operate on the same CSL state
|
||||
// are not thread-safe, so they must be wrapped in locks at the UVM
|
||||
// level. Encryption, decryption and logging operations must be
|
||||
// protected with the ctx_lock.
|
||||
uvm_mutex_t ctx_lock;
|
||||
} csl;
|
||||
|
||||
struct
|
||||
{
|
||||
        // The value of the GPU-side PUT index.
        // Indirect work submission introduces a delay between updating the CPU
        // put when ending a push, and updating the GPU-visible value via an
        // indirect work launch. This field is used to order multiple pending
        // indirect work launches to match the order of the uvm_push_end()
        // calls that triggered them.
        volatile NvU32 gpu_put;
|
||||
|
||||
// Static pushbuffer for channels with static schedule (WLC/LCIC)
|
||||
uvm_rm_mem_t *static_pb_protected_vidmem;
|
||||
|
||||
// Static pushbuffer staging buffer for WLC
|
||||
uvm_rm_mem_t *static_pb_unprotected_sysmem;
|
||||
void *static_pb_unprotected_sysmem_cpu;
|
||||
void *static_pb_unprotected_sysmem_auth_tag_cpu;
|
||||
|
||||
// The above static locations are required by the WLC (and LCIC)
|
||||
// schedule. Protected sysmem location completes WLC's independence
|
||||
// from the pushbuffer allocator.
|
||||
void *static_pb_protected_sysmem;
|
||||
|
||||
// Static tracking semaphore notifier values
|
||||
// Because of LCIC's fixed schedule, the secure semaphore release
|
||||
// mechanism uses two additional static locations for incrementing the
|
||||
// notifier values. See:
|
||||
// . channel_semaphore_secure_release()
|
||||
// . setup_lcic_schedule()
|
||||
// . internal_channel_submit_work_wlc()
|
||||
uvm_rm_mem_t *static_notifier_unprotected_sysmem;
|
||||
NvU32 *static_notifier_entry_unprotected_sysmem_cpu;
|
||||
NvU32 *static_notifier_exit_unprotected_sysmem_cpu;
|
||||
uvm_gpu_address_t static_notifier_entry_unprotected_sysmem_gpu_va;
|
||||
uvm_gpu_address_t static_notifier_exit_unprotected_sysmem_gpu_va;
|
||||
|
||||
// Explicit location for push launch tag used by WLC.
|
||||
// Encryption auth tags have to be located in unprotected sysmem.
|
||||
void *launch_auth_tag_cpu;
|
||||
NvU64 launch_auth_tag_gpu_va;
|
||||
} conf_computing;
|
||||
|
||||
// RM channel information
|
||||
union
|
||||
{
|
||||
@@ -337,6 +452,73 @@ struct uvm_channel_manager_struct
|
||||
// Create a channel manager for the GPU
|
||||
NV_STATUS uvm_channel_manager_create(uvm_gpu_t *gpu, uvm_channel_manager_t **manager_out);
|
||||
|
||||
static bool uvm_channel_pool_is_ce(uvm_channel_pool_t *pool);
|
||||
|
||||
// A channel is secure if it has HW encryption capabilities.
//
// Secure channels are treated differently in the UVM driver. Each secure
// channel has a unique CSL context associated with it, has relatively
// restrictive reservation policies (in comparison with non-secure channels),
// is requested to be allocated differently by RM, etc.
|
||||
static bool uvm_channel_pool_is_secure(uvm_channel_pool_t *pool)
|
||||
{
|
||||
return pool->secure;
|
||||
}
|
||||
|
||||
static bool uvm_channel_is_secure(uvm_channel_t *channel)
|
||||
{
|
||||
return uvm_channel_pool_is_secure(channel->pool);
|
||||
}
|
||||
|
||||
static bool uvm_channel_pool_is_sec2(uvm_channel_pool_t *pool)
|
||||
{
|
||||
UVM_ASSERT(pool->pool_type < UVM_CHANNEL_POOL_TYPE_MASK);
|
||||
|
||||
return (pool->pool_type == UVM_CHANNEL_POOL_TYPE_SEC2);
|
||||
}
|
||||
|
||||
static bool uvm_channel_pool_is_secure_ce(uvm_channel_pool_t *pool)
|
||||
{
|
||||
return uvm_channel_pool_is_secure(pool) && uvm_channel_pool_is_ce(pool);
|
||||
}
|
||||
|
||||
static bool uvm_channel_pool_is_wlc(uvm_channel_pool_t *pool)
|
||||
{
|
||||
UVM_ASSERT(pool->pool_type < UVM_CHANNEL_POOL_TYPE_MASK);
|
||||
|
||||
return (pool->pool_type == UVM_CHANNEL_POOL_TYPE_WLC);
|
||||
}
|
||||
|
||||
static bool uvm_channel_pool_is_lcic(uvm_channel_pool_t *pool)
|
||||
{
|
||||
UVM_ASSERT(pool->pool_type < UVM_CHANNEL_POOL_TYPE_MASK);
|
||||
|
||||
return (pool->pool_type == UVM_CHANNEL_POOL_TYPE_LCIC);
|
||||
}
|
||||
|
||||
static bool uvm_channel_is_sec2(uvm_channel_t *channel)
|
||||
{
|
||||
return uvm_channel_pool_is_sec2(channel->pool);
|
||||
}
|
||||
|
||||
static bool uvm_channel_is_secure_ce(uvm_channel_t *channel)
|
||||
{
|
||||
return uvm_channel_pool_is_secure_ce(channel->pool);
|
||||
}
|
||||
|
||||
static bool uvm_channel_is_wlc(uvm_channel_t *channel)
|
||||
{
|
||||
return uvm_channel_pool_is_wlc(channel->pool);
|
||||
}
|
||||
|
||||
static bool uvm_channel_is_lcic(uvm_channel_t *channel)
|
||||
{
|
||||
return uvm_channel_pool_is_lcic(channel->pool);
|
||||
}
|
||||
|
||||
bool uvm_channel_type_requires_secure_pool(uvm_gpu_t *gpu, uvm_channel_type_t channel_type);
|
||||
NV_STATUS uvm_channel_secure_init(uvm_gpu_t *gpu, uvm_channel_t *channel);
|
||||
|
||||
static bool uvm_channel_pool_is_proxy(uvm_channel_pool_t *pool)
|
||||
{
|
||||
UVM_ASSERT(pool->pool_type < UVM_CHANNEL_POOL_TYPE_MASK);
|
||||
@@ -352,6 +534,8 @@ static bool uvm_channel_is_proxy(uvm_channel_t *channel)
|
||||
static bool uvm_channel_pool_is_ce(uvm_channel_pool_t *pool)
|
||||
{
|
||||
UVM_ASSERT(pool->pool_type < UVM_CHANNEL_POOL_TYPE_MASK);
|
||||
if (uvm_channel_pool_is_wlc(pool) || uvm_channel_pool_is_lcic(pool))
|
||||
return true;
|
||||
|
||||
return (pool->pool_type == UVM_CHANNEL_POOL_TYPE_CE) || uvm_channel_pool_is_proxy(pool);
|
||||
}
|
||||
@@ -361,6 +545,8 @@ static bool uvm_channel_is_ce(uvm_channel_t *channel)
|
||||
return uvm_channel_pool_is_ce(channel->pool);
|
||||
}
|
||||
|
||||
bool uvm_channel_pool_uses_mutex(uvm_channel_pool_t *pool);
|
||||
|
||||
// Proxy channels are used to push page tree related methods, so their channel
|
||||
// type is UVM_CHANNEL_TYPE_MEMOPS.
|
||||
static uvm_channel_type_t uvm_channel_proxy_channel_type(void)
|
||||
@@ -415,6 +601,13 @@ NvU32 uvm_channel_manager_update_progress(uvm_channel_manager_t *channel_manager
|
||||
// beginning.
|
||||
NV_STATUS uvm_channel_manager_wait(uvm_channel_manager_t *manager);
|
||||
|
||||
// Check if WLC/LCIC mechanism is ready/setup
|
||||
// Should only return false during initialization
|
||||
static bool uvm_channel_manager_is_wlc_ready(uvm_channel_manager_t *manager)
|
||||
{
|
||||
return (manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_WLC] != NULL) &&
|
||||
(manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_LCIC] != NULL);
|
||||
}
|
||||
// Get the GPU VA of semaphore_channel's tracking semaphore within the VA space
|
||||
// associated with access_channel.
|
||||
//
|
||||
|
||||
@@ -60,6 +60,11 @@ static NV_STATUS test_ordering(uvm_va_space_t *va_space)
|
||||
gpu = uvm_va_space_find_first_gpu(va_space);
|
||||
TEST_CHECK_RET(gpu != NULL);
|
||||
|
||||
// TODO: Bug 3839176: the test is waived on Confidential Computing because
|
||||
// it assumes that GPU can access system memory without using encryption.
|
||||
if (uvm_conf_computing_mode_enabled(gpu))
|
||||
return NV_OK;
|
||||
|
||||
status = uvm_rm_mem_alloc_and_map_all(gpu, UVM_RM_MEM_TYPE_SYS, buffer_size, 0, &mem);
|
||||
TEST_CHECK_GOTO(status == NV_OK, done);
|
||||
|
||||
@@ -69,7 +74,7 @@ static NV_STATUS test_ordering(uvm_va_space_t *va_space)
|
||||
status = uvm_push_begin(gpu->channel_manager, UVM_CHANNEL_TYPE_GPU_TO_CPU, &push, "Initial memset");
|
||||
TEST_CHECK_GOTO(status == NV_OK, done);
|
||||
|
||||
gpu_va = uvm_rm_mem_get_gpu_va(mem, gpu, uvm_channel_is_proxy(push.channel));
|
||||
gpu_va = uvm_rm_mem_get_gpu_va(mem, gpu, uvm_channel_is_proxy(push.channel)).address;
|
||||
|
||||
// Semaphore release as part of uvm_push_end() will do the membar
|
||||
uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
|
||||
@@ -104,7 +109,7 @@ static NV_STATUS test_ordering(uvm_va_space_t *va_space)
|
||||
value + 1);
|
||||
TEST_CHECK_GOTO(status == NV_OK, done);
|
||||
|
||||
gpu_va_base = uvm_rm_mem_get_gpu_va(mem, gpu, uvm_channel_is_proxy(push.channel));
|
||||
gpu_va_base = uvm_rm_mem_get_gpu_va(mem, gpu, uvm_channel_is_proxy(push.channel)).address;
|
||||
gpu_va_src = gpu_va_base + (value % values_count) * sizeof(NvU32);
|
||||
gpu_va_dst = gpu_va_base + ((value + 1) % values_count) * sizeof(NvU32);
|
||||
|
||||
@@ -200,6 +205,9 @@ static NV_STATUS uvm_test_rc_for_gpu(uvm_gpu_t *gpu)
|
||||
uvm_for_each_pool(pool, manager) {
|
||||
uvm_channel_t *channel;
|
||||
|
||||
// Skip LCIC channels as those can't accept any pushes
|
||||
if (uvm_channel_pool_is_lcic(pool))
|
||||
continue;
|
||||
uvm_for_each_channel_in_pool(channel, pool) {
|
||||
NvU32 i;
|
||||
for (i = 0; i < 512; ++i) {
|
||||
@@ -341,8 +349,8 @@ static void snapshot_counter(uvm_push_t *push,
|
||||
return;
|
||||
|
||||
is_proxy_channel = uvm_channel_is_proxy(push->channel);
|
||||
counter_gpu_va = uvm_rm_mem_get_gpu_va(counter_mem, gpu, is_proxy_channel);
|
||||
snapshot_gpu_va = uvm_rm_mem_get_gpu_va(snapshot_mem, gpu, is_proxy_channel) + index * 2 * sizeof(NvU32);
|
||||
counter_gpu_va = uvm_rm_mem_get_gpu_va(counter_mem, gpu, is_proxy_channel).address;
|
||||
snapshot_gpu_va = uvm_rm_mem_get_gpu_va(snapshot_mem, gpu, is_proxy_channel).address + index * 2 * sizeof(NvU32);
|
||||
|
||||
// Copy the last and first counter to a snapshot for later verification.
|
||||
|
||||
@@ -367,7 +375,7 @@ static void set_counter(uvm_push_t *push, uvm_rm_mem_t *counter_mem, NvU32 value
|
||||
bool is_proxy_channel;
|
||||
|
||||
is_proxy_channel = uvm_channel_is_proxy(push->channel);
|
||||
counter_gpu_va = uvm_rm_mem_get_gpu_va(counter_mem, gpu, is_proxy_channel);
|
||||
counter_gpu_va = uvm_rm_mem_get_gpu_va(counter_mem, gpu, is_proxy_channel).address;
|
||||
|
||||
gpu->parent->ce_hal->memset_v_4(push, counter_gpu_va, value, count * sizeof(NvU32));
|
||||
}
|
||||
@@ -427,7 +435,7 @@ static void test_memset_rm_mem(uvm_push_t *push, uvm_rm_mem_t *rm_mem, NvU32 val
|
||||
UVM_ASSERT(rm_mem->size % 4 == 0);
|
||||
|
||||
gpu = uvm_push_get_gpu(push);
|
||||
gpu_va = uvm_rm_mem_get_gpu_va(rm_mem, gpu, uvm_channel_is_proxy(push->channel));
|
||||
gpu_va = uvm_rm_mem_get_gpu_va(rm_mem, gpu, uvm_channel_is_proxy(push->channel)).address;
|
||||
|
||||
gpu->parent->ce_hal->memset_v_4(push, gpu_va, value, rm_mem->size);
|
||||
}
|
||||
@@ -672,6 +680,74 @@ done:
|
||||
return status;
|
||||
}
|
||||
|
||||
// The following test is inspired by uvm_push_test.c:test_concurrent_pushes.
|
||||
// This test verifies that concurrent pushes using the same secure channel pool
|
||||
// select different channels.
|
||||
NV_STATUS test_secure_channel_selection(uvm_va_space_t *va_space)
|
||||
{
|
||||
NV_STATUS status = NV_OK;
|
||||
uvm_channel_pool_t *pool;
|
||||
uvm_push_t *pushes;
|
||||
uvm_gpu_t *gpu;
|
||||
NvU32 i;
|
||||
NvU32 num_pushes;
|
||||
|
||||
gpu = uvm_va_space_find_first_gpu(va_space);
|
||||
|
||||
if (!uvm_conf_computing_mode_enabled(gpu))
|
||||
return NV_OK;
|
||||
|
||||
uvm_thread_context_lock_disable_tracking();
|
||||
|
||||
for_each_va_space_gpu(gpu, va_space) {
|
||||
uvm_channel_type_t channel_type;
|
||||
|
||||
for (channel_type = 0; channel_type < UVM_CHANNEL_TYPE_COUNT; channel_type++) {
|
||||
if (!uvm_channel_type_requires_secure_pool(gpu, channel_type))
|
||||
continue;
|
||||
|
||||
pool = gpu->channel_manager->pool_to_use.default_for_type[channel_type];
|
||||
TEST_CHECK_RET(pool != NULL);
|
||||
|
||||
// Skip LCIC channels as those can't accept any pushes
|
||||
if (uvm_channel_pool_is_lcic(pool))
|
||||
continue;
|
||||
|
||||
if (pool->num_channels < 2)
|
||||
continue;
|
||||
|
||||
num_pushes = min(pool->num_channels, (NvU32)UVM_PUSH_MAX_CONCURRENT_PUSHES);
|
||||
|
||||
pushes = uvm_kvmalloc_zero(sizeof(*pushes) * num_pushes);
|
||||
TEST_CHECK_RET(pushes != NULL);
|
||||
|
||||
for (i = 0; i < num_pushes; i++) {
|
||||
uvm_push_t *push = &pushes[i];
|
||||
status = uvm_push_begin(gpu->channel_manager, channel_type, push, "concurrent push %u", i);
|
||||
TEST_NV_CHECK_GOTO(status, error);
|
||||
if (i > 0)
|
||||
TEST_CHECK_GOTO(pushes[i-1].channel != push->channel, error);
|
||||
}
|
||||
for (i = 0; i < num_pushes; i++) {
|
||||
uvm_push_t *push = &pushes[i];
|
||||
status = uvm_push_end_and_wait(push);
|
||||
TEST_NV_CHECK_GOTO(status, error);
|
||||
}
|
||||
|
||||
uvm_kvfree(pushes);
|
||||
}
|
||||
}
|
||||
|
||||
uvm_thread_context_lock_enable_tracking();
|
||||
|
||||
return status;
|
||||
error:
|
||||
uvm_thread_context_lock_enable_tracking();
|
||||
uvm_kvfree(pushes);
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
NV_STATUS test_write_ctrl_gpfifo_noop(uvm_va_space_t *va_space)
|
||||
{
|
||||
uvm_gpu_t *gpu;
|
||||
@@ -683,6 +759,14 @@ NV_STATUS test_write_ctrl_gpfifo_noop(uvm_va_space_t *va_space)
|
||||
uvm_for_each_pool(pool, manager) {
|
||||
uvm_channel_t *channel;
|
||||
|
||||
// Skip LCIC channels as those can't accept any pushes
|
||||
if (uvm_channel_pool_is_lcic(pool))
|
||||
continue;
|
||||
|
||||
// Skip WLC channels as those can't accept ctrl gpfifos
|
||||
// after their schedule is set up
|
||||
if (uvm_channel_pool_is_wlc(pool))
|
||||
continue;
|
||||
uvm_for_each_channel_in_pool(channel, pool) {
|
||||
NvU32 i;
|
||||
|
||||
@@ -714,6 +798,14 @@ NV_STATUS test_write_ctrl_gpfifo_and_pushes(uvm_va_space_t *va_space)
|
||||
uvm_for_each_pool(pool, manager) {
|
||||
uvm_channel_t *channel;
|
||||
|
||||
// Skip LCIC channels as those can't accept any pushes
|
||||
if (uvm_channel_pool_is_lcic(pool))
|
||||
continue;
|
||||
|
||||
// Skip WLC channels as those can't accept ctrl gpfifos
|
||||
// after their schedule is set up
|
||||
if (uvm_channel_pool_is_wlc(pool))
|
||||
continue;
|
||||
uvm_for_each_channel_in_pool(channel, pool) {
|
||||
NvU32 i;
|
||||
uvm_push_t push;
|
||||
@@ -757,6 +849,11 @@ NV_STATUS test_write_ctrl_gpfifo_tight(uvm_va_space_t *va_space)
|
||||
|
||||
gpu = uvm_va_space_find_first_gpu(va_space);
|
||||
|
||||
// TODO: Bug 3839176: the test is waived on Confidential Computing because
|
||||
// it assumes that GPU can access system memory without using encryption.
|
||||
if (uvm_conf_computing_mode_enabled(gpu))
|
||||
return NV_OK;
|
||||
|
||||
for_each_va_space_gpu(gpu, va_space) {
|
||||
uvm_channel_manager_t *manager = gpu->channel_manager;
|
||||
|
||||
@@ -850,6 +947,9 @@ static NV_STATUS test_channel_pushbuffer_extension_base(uvm_va_space_t *va_space
|
||||
uvm_for_each_pool(pool, manager) {
|
||||
uvm_channel_t *channel;
|
||||
|
||||
// Skip LCIC channels as those can't accept any pushes
|
||||
if (uvm_channel_pool_is_lcic(pool))
|
||||
continue;
|
||||
uvm_for_each_channel_in_pool(channel, pool) {
|
||||
NvU32 i;
|
||||
uvm_push_t push;
|
||||
@@ -897,6 +997,10 @@ NV_STATUS uvm_test_channel_sanity(UVM_TEST_CHANNEL_SANITY_PARAMS *params, struct
|
||||
if (status != NV_OK)
|
||||
goto done;
|
||||
|
||||
status = test_secure_channel_selection(va_space);
|
||||
if (status != NV_OK)
|
||||
goto done;
|
||||
|
||||
// The following tests have side effects, they reset the GPU's
|
||||
// channel_manager.
|
||||
status = test_channel_pushbuffer_extension_base(va_space);
|
||||
@@ -937,12 +1041,18 @@ static NV_STATUS uvm_test_channel_stress_stream(uvm_va_space_t *va_space,
|
||||
uvm_mutex_lock(&g_uvm_global.global_lock);
|
||||
uvm_va_space_down_read_rm(va_space);
|
||||
|
||||
// TODO: Bug 3839176: the test is waived on Confidential Computing because
|
||||
// it assumes that GPU can access system memory without using encryption.
|
||||
if (uvm_conf_computing_mode_enabled(uvm_va_space_find_first_gpu(va_space)))
|
||||
goto done;
|
||||
|
||||
status = stress_test_all_gpus_in_va(va_space,
|
||||
params->num_streams,
|
||||
params->iterations,
|
||||
params->seed,
|
||||
params->verbose);
|
||||
|
||||
done:
|
||||
uvm_va_space_up_read_rm(va_space);
|
||||
uvm_mutex_unlock(&g_uvm_global.global_lock);
|
||||
|
||||
|
||||
@@ -211,6 +211,11 @@ static inline NvBool uvm_uuid_is_cpu(const NvProcessorUuid *uuid)
|
||||
{
|
||||
return memcmp(uuid, &NV_PROCESSOR_UUID_CPU_DEFAULT, sizeof(*uuid)) == 0;
|
||||
}
|
||||
#define UVM_SIZE_1KB (1024ULL)
|
||||
#define UVM_SIZE_1MB (1024 * UVM_SIZE_1KB)
|
||||
#define UVM_SIZE_1GB (1024 * UVM_SIZE_1MB)
|
||||
#define UVM_SIZE_1TB (1024 * UVM_SIZE_1GB)
|
||||
#define UVM_SIZE_1PB (1024 * UVM_SIZE_1TB)
|
||||
|
||||
#define UVM_ALIGN_DOWN(x, a) ({ \
|
||||
typeof(x) _a = a; \
|
||||
@@ -352,6 +357,7 @@ typedef enum
|
||||
UVM_FD_UNINITIALIZED,
|
||||
UVM_FD_INITIALIZING,
|
||||
UVM_FD_VA_SPACE,
|
||||
UVM_FD_MM,
|
||||
UVM_FD_COUNT
|
||||
} uvm_fd_type_t;
|
||||
|
||||
@@ -388,6 +394,10 @@ bool uvm_file_is_nvidia_uvm(struct file *filp);
|
||||
// NULL returns the value of the pointer.
|
||||
uvm_fd_type_t uvm_fd_type(struct file *filp, void **ptr_val);
|
||||
|
||||
// Returns the pointer stored in filp->private_data if the type
|
||||
// matches, otherwise returns NULL.
|
||||
void *uvm_fd_get_type(struct file *filp, uvm_fd_type_t type);
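
// Hypothetical usage sketch of uvm_fd_get_type(), not part of this change
// (the wrapper name and its error convention are assumptions, and
// uvm_va_space_t is assumed to be visible at this point): fetch the VA space
// only once the fd has fully transitioned to UVM_FD_VA_SPACE; for any other
// fd type uvm_fd_get_type() returns NULL and the caller bails out.
static NV_STATUS example_get_va_space(struct file *filp, uvm_va_space_t **va_space_out)
{
    uvm_va_space_t *va_space = uvm_fd_get_type(filp, UVM_FD_VA_SPACE);

    if (!va_space)
        return NV_ERR_INVALID_ARGUMENT;

    *va_space_out = va_space;
    return NV_OK;
}
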
// Reads the first word in the supplied struct page.
|
||||
static inline void uvm_touch_page(struct page *page)
|
||||
{
|
||||
@@ -400,4 +410,7 @@ static inline void uvm_touch_page(struct page *page)
|
||||
kunmap(page);
|
||||
}
|
||||
|
||||
// Return true if the VMA is one used by UVM managed allocations.
|
||||
bool uvm_vma_is_managed(struct vm_area_struct *vma);
|
||||
|
||||
#endif /* _UVM_COMMON_H */
|
||||
|
||||
446
kernel-open/nvidia-uvm/uvm_conf_computing.c
Normal file
@@ -0,0 +1,446 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2021 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
deal in the Software without restriction, including without limitation the
|
||||
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
sell copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
DEALINGS IN THE SOFTWARE.
|
||||
|
||||
*******************************************************************************/
|
||||
|
||||
#include "uvm_common.h"
|
||||
#include "uvm_global.h"
|
||||
#include "uvm_conf_computing.h"
|
||||
#include "uvm_kvmalloc.h"
|
||||
#include "uvm_gpu.h"
|
||||
#include "uvm_mem.h"
|
||||
#include "uvm_processors.h"
|
||||
#include "uvm_tracker.h"
|
||||
#include "nv_uvm_interface.h"
|
||||
#include "uvm_va_block.h"
|
||||
|
||||
|
||||
static UvmGpuConfComputeMode uvm_conf_computing_get_mode(const uvm_parent_gpu_t *parent)
|
||||
{
|
||||
return parent->rm_info.gpuConfComputeCaps.mode;
|
||||
}
|
||||
|
||||
bool uvm_conf_computing_mode_enabled_parent(const uvm_parent_gpu_t *parent)
|
||||
{
|
||||
return uvm_conf_computing_get_mode(parent) != UVM_GPU_CONF_COMPUTE_MODE_NONE;
|
||||
}
|
||||
|
||||
bool uvm_conf_computing_mode_enabled(const uvm_gpu_t *gpu)
|
||||
{
|
||||
return uvm_conf_computing_mode_enabled_parent(gpu->parent);
|
||||
}
|
||||
|
||||
bool uvm_conf_computing_mode_is_hcc(const uvm_gpu_t *gpu)
|
||||
{
|
||||
return uvm_conf_computing_get_mode(gpu->parent) == UVM_GPU_CONF_COMPUTE_MODE_HCC;
|
||||
}
|
||||
|
||||
NV_STATUS uvm_conf_computing_init_parent_gpu(const uvm_parent_gpu_t *parent)
|
||||
{
|
||||
UvmGpuConfComputeMode cc, sys_cc;
|
||||
uvm_gpu_t *first;
|
||||
|
||||
uvm_assert_mutex_locked(&g_uvm_global.global_lock);
|
||||
|
||||
// TODO: Bug 2844714.
|
||||
// Since we have no routine to traverse parent gpus,
|
||||
// find first child GPU and get its parent.
|
||||
first = uvm_global_processor_mask_find_first_gpu(&g_uvm_global.retained_gpus);
|
||||
if (!first)
|
||||
return NV_OK;
|
||||
|
||||
sys_cc = uvm_conf_computing_get_mode(first->parent);
|
||||
cc = uvm_conf_computing_get_mode(parent);
|
||||
|
||||
return cc == sys_cc ? NV_OK : NV_ERR_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
static void dma_buffer_destroy_locked(uvm_conf_computing_dma_buffer_pool_t *dma_buffer_pool,
|
||||
uvm_conf_computing_dma_buffer_t *dma_buffer)
|
||||
{
|
||||
uvm_assert_mutex_locked(&dma_buffer_pool->lock);
|
||||
|
||||
list_del(&dma_buffer->node);
|
||||
uvm_tracker_wait_deinit(&dma_buffer->tracker);
|
||||
|
||||
uvm_mem_free(dma_buffer->alloc);
|
||||
uvm_mem_free(dma_buffer->auth_tag);
|
||||
uvm_kvfree(dma_buffer);
|
||||
}
|
||||
|
||||
static uvm_gpu_t *dma_buffer_pool_to_gpu(uvm_conf_computing_dma_buffer_pool_t *dma_buffer_pool)
|
||||
{
|
||||
return container_of(dma_buffer_pool, uvm_gpu_t, conf_computing.dma_buffer_pool);
|
||||
}
|
||||
|
||||
// Allocate and map a new DMA stage buffer to CPU and GPU (VA)
|
||||
static NV_STATUS dma_buffer_create(uvm_conf_computing_dma_buffer_pool_t *dma_buffer_pool,
|
||||
uvm_conf_computing_dma_buffer_t **dma_buffer_out)
|
||||
{
|
||||
uvm_gpu_t *dma_owner;
|
||||
uvm_conf_computing_dma_buffer_t *dma_buffer;
|
||||
uvm_mem_t *alloc = NULL;
|
||||
NV_STATUS status = NV_OK;
|
||||
size_t auth_tags_size = (UVM_CONF_COMPUTING_DMA_BUFFER_SIZE / PAGE_SIZE) * UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
|
||||
|
||||
dma_buffer = uvm_kvmalloc_zero(sizeof(*dma_buffer));
|
||||
if (!dma_buffer)
|
||||
return NV_ERR_NO_MEMORY;
|
||||
|
||||
dma_owner = dma_buffer_pool_to_gpu(dma_buffer_pool);
|
||||
uvm_tracker_init(&dma_buffer->tracker);
|
||||
INIT_LIST_HEAD(&dma_buffer->node);
|
||||
|
||||
status = uvm_mem_alloc_sysmem_dma_and_map_cpu_kernel(UVM_CONF_COMPUTING_DMA_BUFFER_SIZE, dma_owner, NULL, &alloc);
|
||||
if (status != NV_OK)
|
||||
goto err;
|
||||
|
||||
dma_buffer->alloc = alloc;
|
||||
|
||||
status = uvm_mem_map_gpu_kernel(alloc, dma_owner);
|
||||
if (status != NV_OK)
|
||||
goto err;
|
||||
|
||||
status = uvm_mem_alloc_sysmem_dma_and_map_cpu_kernel(auth_tags_size, dma_owner, NULL, &alloc);
|
||||
if (status != NV_OK)
|
||||
goto err;
|
||||
|
||||
dma_buffer->auth_tag = alloc;
|
||||
|
||||
status = uvm_mem_map_gpu_kernel(alloc, dma_owner);
|
||||
if (status != NV_OK)
|
||||
goto err;
|
||||
|
||||
*dma_buffer_out = dma_buffer;
|
||||
|
||||
return status;
|
||||
|
||||
err:
|
||||
dma_buffer_destroy_locked(dma_buffer_pool, dma_buffer);
|
||||
return status;
|
||||
}
|
||||
|
||||
void uvm_conf_computing_dma_buffer_pool_sync(uvm_conf_computing_dma_buffer_pool_t *dma_buffer_pool)
|
||||
{
|
||||
uvm_conf_computing_dma_buffer_t *dma_buffer;
|
||||
|
||||
if (dma_buffer_pool->num_dma_buffers == 0)
|
||||
return;
|
||||
|
||||
uvm_mutex_lock(&dma_buffer_pool->lock);
|
||||
list_for_each_entry(dma_buffer, &dma_buffer_pool->free_dma_buffers, node)
|
||||
uvm_tracker_wait(&dma_buffer->tracker);
|
||||
uvm_mutex_unlock(&dma_buffer_pool->lock);
|
||||
}
|
||||
|
||||
static void conf_computing_dma_buffer_pool_deinit(uvm_conf_computing_dma_buffer_pool_t *dma_buffer_pool)
|
||||
{
|
||||
uvm_conf_computing_dma_buffer_t *dma_buffer;
|
||||
uvm_conf_computing_dma_buffer_t *next_buff;
|
||||
|
||||
if (dma_buffer_pool->num_dma_buffers == 0)
|
||||
return;
|
||||
|
||||
    // Because the pool is torn down at the same time the GPU is unregistered,
    // the lock is required only to quiet assertions, not for functional
    // reasons (see dma_buffer_destroy_locked()).
    uvm_mutex_lock(&dma_buffer_pool->lock);
|
||||
|
||||
list_for_each_entry_safe(dma_buffer, next_buff, &dma_buffer_pool->free_dma_buffers, node) {
|
||||
dma_buffer_destroy_locked(dma_buffer_pool, dma_buffer);
|
||||
dma_buffer_pool->num_dma_buffers--;
|
||||
}
|
||||
|
||||
UVM_ASSERT(dma_buffer_pool->num_dma_buffers == 0);
|
||||
UVM_ASSERT(list_empty(&dma_buffer_pool->free_dma_buffers));
|
||||
uvm_mutex_unlock(&dma_buffer_pool->lock);
|
||||
}
|
||||
|
||||
static void dma_buffer_pool_add(uvm_conf_computing_dma_buffer_pool_t *dma_buffer_pool,
|
||||
uvm_conf_computing_dma_buffer_t *dma_buffer)
|
||||
{
|
||||
uvm_assert_mutex_locked(&dma_buffer_pool->lock);
|
||||
list_add_tail(&dma_buffer->node, &dma_buffer_pool->free_dma_buffers);
|
||||
}
|
||||
|
||||
static NV_STATUS conf_computing_dma_buffer_pool_init(uvm_conf_computing_dma_buffer_pool_t *dma_buffer_pool)
|
||||
{
|
||||
size_t i;
|
||||
uvm_gpu_t *gpu;
|
||||
size_t num_dma_buffers = 32;
|
||||
NV_STATUS status = NV_OK;
|
||||
|
||||
UVM_ASSERT(dma_buffer_pool->num_dma_buffers == 0);
|
||||
|
||||
gpu = dma_buffer_pool_to_gpu(dma_buffer_pool);
|
||||
|
||||
UVM_ASSERT(uvm_conf_computing_mode_enabled(gpu));
|
||||
|
||||
INIT_LIST_HEAD(&dma_buffer_pool->free_dma_buffers);
|
||||
uvm_mutex_init(&dma_buffer_pool->lock, UVM_LOCK_ORDER_CONF_COMPUTING_DMA_BUFFER_POOL);
|
||||
dma_buffer_pool->num_dma_buffers = num_dma_buffers;
|
||||
|
||||
uvm_mutex_lock(&dma_buffer_pool->lock);
|
||||
for (i = 0; i < num_dma_buffers; i++) {
|
||||
uvm_conf_computing_dma_buffer_t *dma_buffer;
|
||||
|
||||
status = dma_buffer_create(dma_buffer_pool, &dma_buffer);
|
||||
if (status != NV_OK)
|
||||
break;
|
||||
|
||||
dma_buffer_pool_add(dma_buffer_pool, dma_buffer);
|
||||
}
|
||||
uvm_mutex_unlock(&dma_buffer_pool->lock);
|
||||
|
||||
if (i < num_dma_buffers)
|
||||
conf_computing_dma_buffer_pool_deinit(dma_buffer_pool);
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
static NV_STATUS dma_buffer_pool_expand_locked(uvm_conf_computing_dma_buffer_pool_t *dma_buffer_pool)
|
||||
{
|
||||
size_t i;
|
||||
uvm_gpu_t *gpu;
|
||||
size_t nb_to_alloc;
|
||||
NV_STATUS status = NV_OK;
|
||||
UVM_ASSERT(dma_buffer_pool->num_dma_buffers > 0);
|
||||
|
||||
gpu = dma_buffer_pool_to_gpu(dma_buffer_pool);
|
||||
nb_to_alloc = dma_buffer_pool->num_dma_buffers;
|
||||
for (i = 0; i < nb_to_alloc; ++i) {
|
||||
uvm_conf_computing_dma_buffer_t *dma_buffer;
|
||||
|
||||
status = dma_buffer_create(dma_buffer_pool, &dma_buffer);
|
||||
if (status != NV_OK)
|
||||
break;
|
||||
|
||||
dma_buffer_pool_add(dma_buffer_pool, dma_buffer);
|
||||
}
|
||||
|
||||
dma_buffer_pool->num_dma_buffers += i;
|
||||
|
||||
if (i == 0)
|
||||
return status;
|
||||
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
NV_STATUS uvm_conf_computing_dma_buffer_alloc(uvm_conf_computing_dma_buffer_pool_t *dma_buffer_pool,
|
||||
uvm_conf_computing_dma_buffer_t **dma_buffer_out,
|
||||
uvm_tracker_t *out_tracker)
|
||||
{
|
||||
uvm_conf_computing_dma_buffer_t *dma_buffer = NULL;
|
||||
NV_STATUS status;
|
||||
|
||||
UVM_ASSERT(dma_buffer_pool->num_dma_buffers > 0);
|
||||
|
||||
// TODO: Bug 3385623: Heuristically expand DMA memory pool
|
||||
uvm_mutex_lock(&dma_buffer_pool->lock);
|
||||
if (list_empty(&dma_buffer_pool->free_dma_buffers)) {
|
||||
status = dma_buffer_pool_expand_locked(dma_buffer_pool);
|
||||
|
||||
if (status != NV_OK) {
|
||||
uvm_mutex_unlock(&dma_buffer_pool->lock);
|
||||
return status;
|
||||
}
|
||||
}
|
||||
|
||||
// We're guaranteed that at least one DMA stage buffer is available at this
|
||||
// point.
|
||||
dma_buffer = list_first_entry(&dma_buffer_pool->free_dma_buffers, uvm_conf_computing_dma_buffer_t, node);
|
||||
list_del_init(&dma_buffer->node);
|
||||
uvm_mutex_unlock(&dma_buffer_pool->lock);
|
||||
|
||||
status = uvm_tracker_wait_for_other_gpus(&dma_buffer->tracker, dma_buffer->alloc->dma_owner);
|
||||
if (status != NV_OK)
|
||||
goto error;
|
||||
|
||||
if (out_tracker)
|
||||
status = uvm_tracker_add_tracker_safe(out_tracker, &dma_buffer->tracker);
|
||||
else
|
||||
status = uvm_tracker_wait(&dma_buffer->tracker);
|
||||
|
||||
if (status != NV_OK)
|
||||
goto error;
|
||||
|
||||
uvm_page_mask_zero(&dma_buffer->encrypted_page_mask);
|
||||
*dma_buffer_out = dma_buffer;
|
||||
|
||||
return status;
|
||||
|
||||
error:
|
||||
uvm_tracker_deinit(&dma_buffer->tracker);
|
||||
uvm_conf_computing_dma_buffer_free(dma_buffer_pool, dma_buffer, NULL);
|
||||
return status;
|
||||
}
|
||||
|
||||
void uvm_conf_computing_dma_buffer_free(uvm_conf_computing_dma_buffer_pool_t *dma_buffer_pool,
|
||||
uvm_conf_computing_dma_buffer_t *dma_buffer,
|
||||
uvm_tracker_t *tracker)
|
||||
{
|
||||
|
||||
NV_STATUS status;
|
||||
|
||||
if (!dma_buffer)
|
||||
return;
|
||||
|
||||
UVM_ASSERT(dma_buffer_pool->num_dma_buffers > 0);
|
||||
|
||||
uvm_tracker_remove_completed(&dma_buffer->tracker);
|
||||
if (tracker) {
|
||||
uvm_tracker_remove_completed(tracker);
|
||||
status = uvm_tracker_add_tracker_safe(&dma_buffer->tracker, tracker);
|
||||
if (status != NV_OK)
|
||||
UVM_ASSERT(status == uvm_global_get_status());
|
||||
}
|
||||
|
||||
uvm_mutex_lock(&dma_buffer_pool->lock);
|
||||
dma_buffer_pool_add(dma_buffer_pool, dma_buffer);
|
||||
uvm_mutex_unlock(&dma_buffer_pool->lock);
|
||||
}
|
||||
|
||||
static void dummy_iv_mem_deinit(uvm_gpu_t *gpu)
|
||||
{
|
||||
uvm_mem_free(gpu->conf_computing.iv_mem);
|
||||
}
|
||||
|
||||
static NV_STATUS dummy_iv_mem_init(uvm_gpu_t *gpu)
|
||||
{
|
||||
NV_STATUS status;
|
||||
|
||||
if (!uvm_conf_computing_mode_is_hcc(gpu))
|
||||
return NV_OK;
|
||||
|
||||
status = uvm_mem_alloc_sysmem_dma(sizeof(UvmCslIv), gpu, NULL, &gpu->conf_computing.iv_mem);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
status = uvm_mem_map_gpu_kernel(gpu->conf_computing.iv_mem, gpu);
|
||||
if (status != NV_OK)
|
||||
goto error;
|
||||
|
||||
return NV_OK;
|
||||
|
||||
error:
|
||||
dummy_iv_mem_deinit(gpu);
|
||||
return status;
|
||||
}
|
||||
|
||||
NV_STATUS uvm_conf_computing_gpu_init(uvm_gpu_t *gpu)
|
||||
{
|
||||
NV_STATUS status;
|
||||
|
||||
if (!uvm_conf_computing_mode_enabled(gpu))
|
||||
return NV_OK;
|
||||
|
||||
status = conf_computing_dma_buffer_pool_init(&gpu->conf_computing.dma_buffer_pool);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
status = dummy_iv_mem_init(gpu);
|
||||
if (status != NV_OK)
|
||||
goto error;
|
||||
|
||||
return NV_OK;
|
||||
|
||||
error:
|
||||
uvm_conf_computing_gpu_deinit(gpu);
|
||||
return status;
|
||||
}
|
||||
|
||||
void uvm_conf_computing_gpu_deinit(uvm_gpu_t *gpu)
|
||||
{
|
||||
dummy_iv_mem_deinit(gpu);
|
||||
conf_computing_dma_buffer_pool_deinit(&gpu->conf_computing.dma_buffer_pool);
|
||||
}
|
||||
|
||||
void uvm_conf_computing_log_gpu_encryption(uvm_channel_t *channel, UvmCslIv *iv)
|
||||
{
|
||||
NV_STATUS status;
|
||||
|
||||
uvm_mutex_lock(&channel->csl.ctx_lock);
|
||||
status = nvUvmInterfaceCslLogDeviceEncryption(&channel->csl.ctx, iv);
|
||||
uvm_mutex_unlock(&channel->csl.ctx_lock);
|
||||
|
||||
    // nvUvmInterfaceCslLogDeviceEncryption fails when a 64-bit encryption
    // counter overflows. This is not supposed to happen on CC.
|
||||
UVM_ASSERT(status == NV_OK);
|
||||
}
|
||||
|
||||
void uvm_conf_computing_acquire_encryption_iv(uvm_channel_t *channel, UvmCslIv *iv)
|
||||
{
|
||||
NV_STATUS status;
|
||||
|
||||
uvm_mutex_lock(&channel->csl.ctx_lock);
|
||||
status = nvUvmInterfaceCslAcquireEncryptionIv(&channel->csl.ctx, iv);
|
||||
uvm_mutex_unlock(&channel->csl.ctx_lock);
|
||||
|
||||
    // nvUvmInterfaceCslAcquireEncryptionIv fails when a 64-bit encryption
    // counter overflows. This is not supposed to happen on CC.
|
||||
UVM_ASSERT(status == NV_OK);
|
||||
}
|
||||
|
||||
void uvm_conf_computing_cpu_encrypt(uvm_channel_t *channel,
|
||||
void *dst_cipher,
|
||||
const void *src_plain,
|
||||
UvmCslIv *encrypt_iv,
|
||||
size_t size,
|
||||
void *auth_tag_buffer)
|
||||
{
|
||||
NV_STATUS status;
|
||||
|
||||
UVM_ASSERT(size);
|
||||
|
||||
uvm_mutex_lock(&channel->csl.ctx_lock);
|
||||
status = nvUvmInterfaceCslEncrypt(&channel->csl.ctx,
|
||||
size,
|
||||
(NvU8 const *) src_plain,
|
||||
encrypt_iv,
|
||||
(NvU8 *) dst_cipher,
|
||||
(NvU8 *) auth_tag_buffer);
|
||||
uvm_mutex_unlock(&channel->csl.ctx_lock);
|
||||
|
||||
// nvUvmInterfaceCslEncrypt fails when a 64-bit encryption counter
|
||||
// overflows. This is not supposed to happen on CC.
|
||||
UVM_ASSERT(status == NV_OK);
|
||||
}
|
||||
|
||||
NV_STATUS uvm_conf_computing_cpu_decrypt(uvm_channel_t *channel,
|
||||
void *dst_plain,
|
||||
const void *src_cipher,
|
||||
const UvmCslIv *src_iv,
|
||||
size_t size,
|
||||
const void *auth_tag_buffer)
|
||||
{
|
||||
NV_STATUS status;
|
||||
|
||||
uvm_mutex_lock(&channel->csl.ctx_lock);
|
||||
status = nvUvmInterfaceCslDecrypt(&channel->csl.ctx,
|
||||
size,
|
||||
(const NvU8 *) src_cipher,
|
||||
src_iv,
|
||||
(NvU8 *) dst_plain,
|
||||
(const NvU8 *) auth_tag_buffer);
|
||||
uvm_mutex_unlock(&channel->csl.ctx_lock);
|
||||
|
||||
return status;
|
||||
}
|
||||
178
kernel-open/nvidia-uvm/uvm_conf_computing.h
Normal file
@@ -0,0 +1,178 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2021-2023 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
deal in the Software without restriction, including without limitation the
|
||||
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
sell copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
DEALINGS IN THE SOFTWARE.
|
||||
|
||||
*******************************************************************************/
|
||||
|
||||
#ifndef __UVM_CONF_COMPUTING_H__
|
||||
#define __UVM_CONF_COMPUTING_H__
|
||||
|
||||
#include "nv_uvm_types.h"
|
||||
#include "uvm_forward_decl.h"
|
||||
#include "uvm_lock.h"
|
||||
#include "uvm_tracker.h"
|
||||
#include "uvm_va_block_types.h"
|
||||
|
||||
#include "linux/list.h"
|
||||
|
||||
#define UVM_CONF_COMPUTING_AUTH_TAG_SIZE (UVM_CSL_CRYPT_AUTH_TAG_SIZE_BYTES)
|
||||
|
||||
// An authentication tag pointer is required by HW to be 16-byte aligned.
#define UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT 16

// An IV pointer is required by HW to be 16-byte aligned.
//
// Use sizeof(UvmCslIv) to refer to the IV size.
#define UVM_CONF_COMPUTING_IV_ALIGNMENT 16

// SEC2 decrypt operation buffers are required to be 16-byte aligned. CE
// encrypt/decrypt buffers can be unaligned if the buffer lies in a single 32B
// segment. Otherwise, they need to be 32B aligned.
#define UVM_CONF_COMPUTING_BUF_ALIGNMENT 32
|
||||
|
||||
#define UVM_CONF_COMPUTING_DMA_BUFFER_SIZE UVM_VA_BLOCK_SIZE
|
||||
|
||||
// SEC2 supports at most a stream of 64 entries in the method stream for
// signing. Each entry is made of the method address and method data, so the
// maximum buffer size is UVM_METHOD_SIZE * 2 * 64 = 512 bytes. UVM, however,
// won't use that many entries: in the worst case it pushes either a semaphore
// release or a decrypt. A SEC2 semaphore release uses 6 1U entries, whereas a
// SEC2 decrypt uses 10 1U entries. For 10 entries,
// UVM_METHOD_SIZE * 2 * 10 = 80.
#define UVM_CONF_COMPUTING_SIGN_BUF_MAX_SIZE 80
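
// Illustrative compile-time check, an assumption rather than part of this
// change: with UVM_METHOD_SIZE taken as the 4-byte method word referenced in
// the comment above, the worst-case SEC2 push of 10 (address, data) entries
// must fit in the signing buffer.
static inline void example_sign_buf_size_check(void)
{
    BUILD_BUG_ON(UVM_METHOD_SIZE * 2 * 10 > UVM_CONF_COMPUTING_SIGN_BUF_MAX_SIZE);
}
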
// All GPUs derive confidential computing status from their parent.
|
||||
// By current policy all parent GPUs have identical confidential
|
||||
// computing status.
|
||||
NV_STATUS uvm_conf_computing_init_parent_gpu(const uvm_parent_gpu_t *parent);
|
||||
bool uvm_conf_computing_mode_enabled_parent(const uvm_parent_gpu_t *parent);
|
||||
bool uvm_conf_computing_mode_enabled(const uvm_gpu_t *gpu);
|
||||
bool uvm_conf_computing_mode_is_hcc(const uvm_gpu_t *gpu);
|
||||
|
||||
typedef struct
|
||||
{
|
||||
// List of free DMA buffers (uvm_conf_computing_dma_buffer_t).
|
||||
// A free DMA buffer can be grabbed anytime, though the tracker
|
||||
// inside it may still have pending work.
|
||||
struct list_head free_dma_buffers;
|
||||
|
||||
// Used to grow the pool when full.
|
||||
size_t num_dma_buffers;
|
||||
|
||||
// Lock protecting the dma_buffer_pool
|
||||
uvm_mutex_t lock;
|
||||
} uvm_conf_computing_dma_buffer_pool_t;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
// Backing DMA allocation
|
||||
uvm_mem_t *alloc;
|
||||
|
||||
// Used internally by the pool management code to track the state of
|
||||
// a free buffer.
|
||||
uvm_tracker_t tracker;
|
||||
|
||||
// When the DMA buffer is used as the destination of a GPU encryption, SEC2
|
||||
// writes the authentication tag here. Later when the buffer is decrypted
|
||||
// on the CPU the authentication tag is used again (read) for CSL to verify
|
||||
// the authenticity. The allocation is big enough for one authentication
|
||||
// tag per PAGE_SIZE page in the alloc buffer.
|
||||
uvm_mem_t *auth_tag;
|
||||
|
||||
    // CSL supports out-of-order decryption; the decrypt IV is used similarly
    // to the authentication tag. The allocation is big enough for one IV per
    // PAGE_SIZE page in the alloc buffer. The granularity between the decrypt
    // IV and authentication tag must match.
    UvmCslIv decrypt_iv[(UVM_CONF_COMPUTING_DMA_BUFFER_SIZE / PAGE_SIZE)];
|
||||
|
||||
// Bitmap of the encrypted pages in the backing allocation
|
||||
uvm_page_mask_t encrypted_page_mask;
|
||||
|
||||
// See uvm_conf_computing_dma_pool lists
|
||||
struct list_head node;
|
||||
} uvm_conf_computing_dma_buffer_t;
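
// Illustrative helper, not part of this change: the sizing comments above
// imply one authentication tag per PAGE_SIZE page, laid out back to back.
// uvm_mem_get_cpu_addr_kernel() is assumed to be the existing uvm_mem
// accessor for the CPU kernel mapping of an allocation.
static void *example_dma_buffer_auth_tag_cpu_addr(uvm_conf_computing_dma_buffer_t *dma_buffer, size_t page_index)
{
    NvU8 *base = (NvU8 *)uvm_mem_get_cpu_addr_kernel(dma_buffer->auth_tag);

    return base + page_index * UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
}
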
// Retrieve a DMA buffer from the given DMA allocation pool.
|
||||
// NV_OK Stage buffer successfully retrieved
|
||||
// NV_ERR_NO_MEMORY No free DMA buffers are available for grab, and
|
||||
// expanding the memory pool to get new ones failed.
|
||||
//
|
||||
// out_dma_buffer is only valid if NV_OK is returned. The caller is responsible
|
||||
// for calling uvm_conf_computing_dma_buffer_free once the operations on this
|
||||
// buffer are done.
|
||||
// When out_tracker is passed to the function, the buffer's dependencies are
|
||||
// added to the tracker. The caller is guaranteed that all pending tracker
|
||||
// entries come from the same GPU as the pool's owner. Before being able to use
|
||||
// the DMA buffer, the caller is responsible for either acquiring or waiting
|
||||
// on out_tracker. If out_tracker is NULL, the wait happens in the allocation
|
||||
// itself.
|
||||
// Upon success the encrypted_page_mask is cleared as part of the allocation.
|
||||
NV_STATUS uvm_conf_computing_dma_buffer_alloc(uvm_conf_computing_dma_buffer_pool_t *dma_buffer_pool,
|
||||
uvm_conf_computing_dma_buffer_t **out_dma_buffer,
|
||||
uvm_tracker_t *out_tracker);
|
||||
|
||||
// Free a DMA buffer to the DMA allocation pool. All DMA buffers must be freed
|
||||
// prior to GPU deinit.
|
||||
//
|
||||
// The tracker is optional and a NULL tracker indicates that no new operation
|
||||
// has been pushed for the buffer. A non-NULL tracker indicates any additional
|
||||
// pending operations on the buffer pushed by the caller that need to be
|
||||
// synchronized before freeing or re-using the buffer.
|
||||
void uvm_conf_computing_dma_buffer_free(uvm_conf_computing_dma_buffer_pool_t *dma_buffer_pool,
|
||||
uvm_conf_computing_dma_buffer_t *dma_buffer,
|
||||
uvm_tracker_t *tracker);
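
// Hedged usage sketch of the alloc/free pair above, not part of this change
// (`gpu` and the GPU work that consumes the buffer are assumed to exist in
// the caller): grab a staging buffer, track the work that uses it, then hand
// it back together with that tracker so the next user can wait on it.
static NV_STATUS example_dma_buffer_round_trip(uvm_gpu_t *gpu)
{
    uvm_conf_computing_dma_buffer_t *dma_buffer;
    uvm_tracker_t tracker;
    NV_STATUS status;

    // Passing a NULL tracker to the alloc makes it wait for the buffer's
    // previous users before returning.
    status = uvm_conf_computing_dma_buffer_alloc(&gpu->conf_computing.dma_buffer_pool, &dma_buffer, NULL);
    if (status != NV_OK)
        return status;

    uvm_tracker_init(&tracker);

    // ... push GPU work that targets dma_buffer->alloc and add the resulting
    // entries to &tracker ...

    // Hand the buffer back along with the tracker so its next user can wait
    // on the work pushed above.
    uvm_conf_computing_dma_buffer_free(&gpu->conf_computing.dma_buffer_pool, dma_buffer, &tracker);
    uvm_tracker_deinit(&tracker);

    return NV_OK;
}
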
// Synchronize trackers in all entries in the GPU's DMA pool
|
||||
void uvm_conf_computing_dma_buffer_pool_sync(uvm_conf_computing_dma_buffer_pool_t *dma_buffer_pool);
|
||||
|
||||
|
||||
// Initialization and deinitialization of Confidential Computing data structures
|
||||
// for the given GPU.
|
||||
NV_STATUS uvm_conf_computing_gpu_init(uvm_gpu_t *gpu);
|
||||
void uvm_conf_computing_gpu_deinit(uvm_gpu_t *gpu);
|
||||
|
||||
// Logs encryption information from the GPU and returns the IV.
|
||||
void uvm_conf_computing_log_gpu_encryption(uvm_channel_t *channel, UvmCslIv *iv);
|
||||
|
||||
// Acquires next CPU encryption IV and returns it.
|
||||
void uvm_conf_computing_acquire_encryption_iv(uvm_channel_t *channel, UvmCslIv *iv);
|
||||
|
||||
// CPU side encryption helper with an explicit IV, which is obtained from
// uvm_conf_computing_acquire_encryption_iv(). Without an explicit IV the
// function uses the next IV in order. Encrypts data in src_plain and writes
// the cipher text in dst_cipher. src_plain and dst_cipher can't overlap.
// The IV is invalidated and can't be used again after this operation.
|
||||
void uvm_conf_computing_cpu_encrypt(uvm_channel_t *channel,
|
||||
void *dst_cipher,
|
||||
const void *src_plain,
|
||||
UvmCslIv *encrypt_iv,
|
||||
size_t size,
|
||||
void *auth_tag_buffer);
|
||||
|
||||
// CPU side decryption helper. Decrypts data from src_cipher and writes the
// plain text in dst_plain. src_cipher and dst_plain can't overlap. The IV
// obtained from uvm_conf_computing_log_gpu_encryption() needs to be passed in
// as src_iv.
|
||||
NV_STATUS uvm_conf_computing_cpu_decrypt(uvm_channel_t *channel,
|
||||
void *dst_plain,
|
||||
const void *src_cipher,
|
||||
const UvmCslIv *src_iv,
|
||||
size_t size,
|
||||
const void *auth_tag_buffer);
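
// Hedged usage sketch of the CPU-side helpers above, not part of this change
// (the channel and buffer pointers are placeholders supplied by the caller):
// reserve an IV up front, then produce the ciphertext and its authentication
// tag in one call.
static void example_cpu_encrypt(uvm_channel_t *channel,
                                void *dst_cipher,
                                const void *src_plain,
                                size_t size,
                                void *auth_tag_buffer)
{
    UvmCslIv iv;

    uvm_conf_computing_acquire_encryption_iv(channel, &iv);
    uvm_conf_computing_cpu_encrypt(channel, dst_cipher, src_plain, &iv, size, auth_tag_buffer);
}
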
#endif // __UVM_CONF_COMPUTING_H__
|
||||
@@ -37,6 +37,7 @@ typedef struct uvm_ce_hal_struct uvm_ce_hal_t;
|
||||
typedef struct uvm_arch_hal_struct uvm_arch_hal_t;
|
||||
typedef struct uvm_fault_buffer_hal_struct uvm_fault_buffer_hal_t;
|
||||
typedef struct uvm_access_counter_buffer_hal_struct uvm_access_counter_buffer_hal_t;
|
||||
typedef struct uvm_sec2_hal_struct uvm_sec2_hal_t;
|
||||
typedef struct uvm_gpu_semaphore_struct uvm_gpu_semaphore_t;
|
||||
typedef struct uvm_gpu_tracking_semaphore_struct uvm_gpu_tracking_semaphore_t;
|
||||
typedef struct uvm_gpu_semaphore_pool_struct uvm_gpu_semaphore_pool_t;
|
||||
|
||||
@@ -41,6 +41,7 @@
|
||||
#include "uvm_gpu_access_counters.h"
|
||||
#include "uvm_ats.h"
|
||||
#include "uvm_test.h"
|
||||
#include "uvm_conf_computing.h"
|
||||
|
||||
#include "uvm_linux.h"
|
||||
|
||||
@@ -66,21 +67,6 @@ static uvm_user_channel_t *get_user_channel(uvm_rb_tree_node_t *node)
|
||||
return container_of(node, uvm_user_channel_t, instance_ptr.node);
|
||||
}
|
||||
|
||||
static void fill_gpu_info(uvm_parent_gpu_t *parent_gpu, const UvmGpuInfo *gpu_info)
|
||||
{
|
||||
char uuid_buffer[UVM_GPU_UUID_TEXT_BUFFER_LENGTH];
|
||||
|
||||
parent_gpu->rm_info = *gpu_info;
|
||||
|
||||
format_uuid_to_buffer(uuid_buffer, sizeof(uuid_buffer), &parent_gpu->uuid);
|
||||
snprintf(parent_gpu->name,
|
||||
sizeof(parent_gpu->name),
|
||||
"ID %u: %s: %s",
|
||||
uvm_id_value(parent_gpu->id),
|
||||
parent_gpu->rm_info.name,
|
||||
uuid_buffer);
|
||||
}
|
||||
|
||||
static uvm_gpu_link_type_t get_gpu_link_type(UVM_LINK_TYPE link_type)
|
||||
{
|
||||
switch (link_type) {
|
||||
@@ -94,44 +80,68 @@ static uvm_gpu_link_type_t get_gpu_link_type(UVM_LINK_TYPE link_type)
|
||||
return UVM_GPU_LINK_NVLINK_3;
|
||||
case UVM_LINK_TYPE_NVLINK_4:
|
||||
return UVM_GPU_LINK_NVLINK_4;
|
||||
case UVM_LINK_TYPE_C2C:
|
||||
return UVM_GPU_LINK_C2C;
|
||||
default:
|
||||
return UVM_GPU_LINK_INVALID;
|
||||
}
|
||||
}
|
||||
|
||||
static NV_STATUS get_gpu_caps(uvm_parent_gpu_t *parent_gpu)
|
||||
static void fill_gpu_info(uvm_parent_gpu_t *parent_gpu, const UvmGpuInfo *gpu_info)
|
||||
{
|
||||
char uuid_buffer[UVM_GPU_UUID_TEXT_BUFFER_LENGTH];
|
||||
|
||||
parent_gpu->rm_info = *gpu_info;
|
||||
|
||||
parent_gpu->system_bus.link = get_gpu_link_type(gpu_info->sysmemLink);
|
||||
UVM_ASSERT(parent_gpu->system_bus.link != UVM_GPU_LINK_INVALID);
|
||||
|
||||
parent_gpu->system_bus.link_rate_mbyte_per_s = gpu_info->sysmemLinkRateMBps;
|
||||
|
||||
if (gpu_info->systemMemoryWindowSize > 0) {
|
||||
// memory_window_end is inclusive but uvm_gpu_is_coherent() checks
|
||||
// memory_window_end > memory_window_start as its condition.
|
||||
UVM_ASSERT(gpu_info->systemMemoryWindowSize > 1);
|
||||
parent_gpu->system_bus.memory_window_start = gpu_info->systemMemoryWindowStart;
|
||||
parent_gpu->system_bus.memory_window_end = gpu_info->systemMemoryWindowStart +
|
||||
gpu_info->systemMemoryWindowSize - 1;
|
||||
}
|
||||
|
||||
parent_gpu->nvswitch_info.is_nvswitch_connected = gpu_info->connectedToSwitch;
|
||||
|
||||
// nvswitch is routed via physical pages, where the upper 13-bits of the
|
||||
// 47-bit address space holds the routing information for each peer.
|
||||
// Currently, this is limited to a 16GB framebuffer window size.
|
||||
if (parent_gpu->nvswitch_info.is_nvswitch_connected)
|
||||
parent_gpu->nvswitch_info.fabric_memory_window_start = gpu_info->nvswitchMemoryWindowStart;
|
||||
|
||||
format_uuid_to_buffer(uuid_buffer, sizeof(uuid_buffer), &parent_gpu->uuid);
|
||||
snprintf(parent_gpu->name,
|
||||
sizeof(parent_gpu->name),
|
||||
"ID %u: %s: %s",
|
||||
uvm_id_value(parent_gpu->id),
|
||||
parent_gpu->rm_info.name,
|
||||
uuid_buffer);
|
||||
}
|
||||
|
||||
static NV_STATUS get_gpu_caps(uvm_gpu_t *gpu)
|
||||
{
|
||||
NV_STATUS status;
|
||||
UvmGpuCaps gpu_caps;
|
||||
|
||||
memset(&gpu_caps, 0, sizeof(gpu_caps));
|
||||
|
||||
status = uvm_rm_locked_call(nvUvmInterfaceQueryCaps(parent_gpu->rm_device, &gpu_caps));
|
||||
status = uvm_rm_locked_call(nvUvmInterfaceQueryCaps(uvm_gpu_device_handle(gpu), &gpu_caps));
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
parent_gpu->sysmem_link = get_gpu_link_type(gpu_caps.sysmemLink);
|
||||
UVM_ASSERT(parent_gpu->sysmem_link != UVM_GPU_LINK_INVALID);
|
||||
|
||||
parent_gpu->sysmem_link_rate_mbyte_per_s = gpu_caps.sysmemLinkRateMBps;
|
||||
parent_gpu->nvswitch_info.is_nvswitch_connected = gpu_caps.connectedToSwitch;
|
||||
|
||||
// nvswitch is routed via physical pages, where the upper 13-bits of the
|
||||
// 47-bit address space holds the routing information for each peer.
|
||||
// Currently, this is limited to a 16GB framebuffer window size.
|
||||
if (parent_gpu->nvswitch_info.is_nvswitch_connected)
|
||||
parent_gpu->nvswitch_info.fabric_memory_window_start = gpu_caps.nvswitchMemoryWindowStart;
|
||||
|
||||
if (gpu_caps.numaEnabled) {
|
||||
parent_gpu->numa_info.enabled = true;
|
||||
parent_gpu->numa_info.node_id = gpu_caps.numaNodeId;
|
||||
parent_gpu->numa_info.system_memory_window_start = gpu_caps.systemMemoryWindowStart;
|
||||
parent_gpu->numa_info.system_memory_window_end = gpu_caps.systemMemoryWindowStart +
|
||||
gpu_caps.systemMemoryWindowSize -
|
||||
1;
|
||||
UVM_ASSERT(uvm_gpu_is_coherent(gpu->parent));
|
||||
gpu->mem_info.numa.enabled = true;
|
||||
gpu->mem_info.numa.node_id = gpu_caps.numaNodeId;
|
||||
}
|
||||
else {
|
||||
UVM_ASSERT(!g_uvm_global.ats.enabled);
|
||||
UVM_ASSERT(!uvm_gpu_is_coherent(gpu->parent));
|
||||
}
|
||||
|
||||
return NV_OK;
|
||||
@@ -347,26 +357,30 @@ NvU64 uvm_parent_gpu_canonical_address(uvm_parent_gpu_t *parent_gpu, NvU64 addr)
|
||||
static void gpu_info_print_ce_caps(uvm_gpu_t *gpu, struct seq_file *s)
|
||||
{
|
||||
NvU32 i;
|
||||
UvmGpuCopyEnginesCaps ces_caps;
|
||||
UvmGpuCopyEnginesCaps *ces_caps;
|
||||
NV_STATUS status;
|
||||
|
||||
memset(&ces_caps, 0, sizeof(ces_caps));
|
||||
status = uvm_rm_locked_call(nvUvmInterfaceQueryCopyEnginesCaps(uvm_gpu_device_handle(gpu), &ces_caps));
|
||||
ces_caps = uvm_kvmalloc_zero(sizeof(*ces_caps));
|
||||
if (!ces_caps) {
|
||||
UVM_SEQ_OR_DBG_PRINT(s, "supported_ces: unavailable (no memory)\n");
|
||||
return;
|
||||
}
|
||||
|
||||
status = uvm_rm_locked_call(nvUvmInterfaceQueryCopyEnginesCaps(uvm_gpu_device_handle(gpu), ces_caps));
|
||||
if (status != NV_OK) {
|
||||
UVM_SEQ_OR_DBG_PRINT(s, "supported_ces: unavailable (query failed)\n");
|
||||
return;
|
||||
goto out;
|
||||
}
|
||||
|
||||
UVM_SEQ_OR_DBG_PRINT(s, "supported_ces:\n");
|
||||
for (i = 0; i < UVM_COPY_ENGINE_COUNT_MAX; ++i) {
|
||||
UvmGpuCopyEngineCaps *ce_caps = ces_caps.copyEngineCaps + i;
|
||||
UvmGpuCopyEngineCaps *ce_caps = ces_caps->copyEngineCaps + i;
|
||||
|
||||
if (!ce_caps->supported)
|
||||
continue;
|
||||
|
||||
UVM_SEQ_OR_DBG_PRINT(s, " ce %u pce mask 0x%08x grce %u shared %u sysmem read %u sysmem write %u sysmem %u nvlink p2p %u "
|
||||
"p2p %u\n",
|
||||
UVM_SEQ_OR_DBG_PRINT(s, " ce %u pce mask 0x%08x grce %u shared %u sysmem read %u sysmem write %u sysmem %u "
|
||||
"nvlink p2p %u p2p %u\n",
|
||||
i,
|
||||
ce_caps->cePceMask,
|
||||
ce_caps->grce,
|
||||
@@ -377,6 +391,9 @@ static void gpu_info_print_ce_caps(uvm_gpu_t *gpu, struct seq_file *s)
|
||||
ce_caps->nvlinkP2p,
|
||||
ce_caps->p2p);
|
||||
}
|
||||
|
||||
out:
|
||||
uvm_kvfree(ces_caps);
|
||||
}
|
||||
|
||||
static const char *uvm_gpu_virt_type_string(UVM_VIRT_MODE virtMode)
|
||||
@@ -394,7 +411,7 @@ static const char *uvm_gpu_virt_type_string(UVM_VIRT_MODE virtMode)
|
||||
|
||||
static const char *uvm_gpu_link_type_string(uvm_gpu_link_type_t link_type)
|
||||
{
|
||||
BUILD_BUG_ON(UVM_GPU_LINK_MAX != 6);
|
||||
BUILD_BUG_ON(UVM_GPU_LINK_MAX != 7);
|
||||
|
||||
switch (link_type) {
|
||||
UVM_ENUM_STRING_CASE(UVM_GPU_LINK_INVALID);
|
||||
@@ -403,6 +420,7 @@ static const char *uvm_gpu_link_type_string(uvm_gpu_link_type_t link_type)
|
||||
UVM_ENUM_STRING_CASE(UVM_GPU_LINK_NVLINK_2);
|
||||
UVM_ENUM_STRING_CASE(UVM_GPU_LINK_NVLINK_3);
|
||||
UVM_ENUM_STRING_CASE(UVM_GPU_LINK_NVLINK_4);
|
||||
UVM_ENUM_STRING_CASE(UVM_GPU_LINK_C2C);
|
||||
UVM_ENUM_STRING_DEFAULT();
|
||||
}
|
||||
}
|
||||
@@ -410,7 +428,6 @@ static const char *uvm_gpu_link_type_string(uvm_gpu_link_type_t link_type)
|
||||
static void gpu_info_print_common(uvm_gpu_t *gpu, struct seq_file *s)
|
||||
{
|
||||
const UvmGpuInfo *gpu_info = &gpu->parent->rm_info;
|
||||
uvm_numa_info_t *numa_info = &gpu->parent->numa_info;
|
||||
NvU64 num_pages_in;
|
||||
NvU64 num_pages_out;
|
||||
NvU64 mapped_cpu_pages_size;
|
||||
@@ -429,9 +446,9 @@ static void gpu_info_print_common(uvm_gpu_t *gpu, struct seq_file *s)
|
||||
return;
|
||||
|
||||
UVM_SEQ_OR_DBG_PRINT(s, "CPU link type %s\n",
|
||||
uvm_gpu_link_type_string(gpu->parent->sysmem_link));
|
||||
uvm_gpu_link_type_string(gpu->parent->system_bus.link));
|
||||
UVM_SEQ_OR_DBG_PRINT(s, "CPU link bandwidth %uMBps\n",
|
||||
gpu->parent->sysmem_link_rate_mbyte_per_s);
|
||||
gpu->parent->system_bus.link_rate_mbyte_per_s);
|
||||
|
||||
UVM_SEQ_OR_DBG_PRINT(s, "architecture 0x%X\n", gpu_info->gpuArch);
|
||||
UVM_SEQ_OR_DBG_PRINT(s, "implementation 0x%X\n", gpu_info->gpuImplementation);
|
||||
@@ -453,13 +470,13 @@ static void gpu_info_print_common(uvm_gpu_t *gpu, struct seq_file *s)
|
||||
gpu->mem_info.max_allocatable_address,
|
||||
gpu->mem_info.max_allocatable_address / (1024 * 1024));
|
||||
|
||||
if (numa_info->enabled) {
|
||||
NvU64 window_size = numa_info->system_memory_window_end - numa_info->system_memory_window_start + 1;
|
||||
UVM_SEQ_OR_DBG_PRINT(s, "numa_node_id %u\n", numa_info->node_id);
|
||||
UVM_SEQ_OR_DBG_PRINT(s, "system_memory_window_start 0x%llx\n",
|
||||
numa_info->system_memory_window_start);
|
||||
UVM_SEQ_OR_DBG_PRINT(s, "system_memory_window_end 0x%llx\n",
|
||||
numa_info->system_memory_window_end);
|
||||
if (gpu->mem_info.numa.enabled) {
|
||||
NvU64 window_size = gpu->parent->system_bus.memory_window_end - gpu->parent->system_bus.memory_window_start + 1;
|
||||
UVM_SEQ_OR_DBG_PRINT(s, "numa_node_id %u\n", uvm_gpu_numa_node(gpu));
|
||||
UVM_SEQ_OR_DBG_PRINT(s, "memory_window_start 0x%llx\n",
|
||||
gpu->parent->system_bus.memory_window_start);
|
||||
UVM_SEQ_OR_DBG_PRINT(s, "memory_window_end 0x%llx\n",
|
||||
gpu->parent->system_bus.memory_window_end);
|
||||
UVM_SEQ_OR_DBG_PRINT(s, "system_memory_window_size 0x%llx (%llu MBs)\n",
|
||||
window_size,
|
||||
window_size / (1024 * 1024));
|
||||
@@ -550,6 +567,10 @@ static void gpu_info_print_common(uvm_gpu_t *gpu, struct seq_file *s)
|
||||
|
||||
gpu_info_print_ce_caps(gpu, s);
|
||||
|
||||
if (uvm_conf_computing_mode_enabled(gpu)) {
|
||||
UVM_SEQ_OR_DBG_PRINT(s, "dma_buffer_pool_num_buffers %lu\n",
|
||||
gpu->conf_computing.dma_buffer_pool.num_dma_buffers);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
@@ -843,7 +864,7 @@ static void deinit_procfs_peer_cap_files(uvm_gpu_peer_t *peer_caps)
|
||||
proc_remove(peer_caps->procfs.peer_file[1]);
|
||||
}
|
||||
|
||||
static NV_STATUS init_semaphore_pool(uvm_gpu_t *gpu)
|
||||
static NV_STATUS init_semaphore_pools(uvm_gpu_t *gpu)
|
||||
{
|
||||
NV_STATUS status;
|
||||
uvm_gpu_t *other_gpu;
|
||||
@@ -852,7 +873,17 @@ static NV_STATUS init_semaphore_pool(uvm_gpu_t *gpu)
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
    // When the Confidential Computing feature is enabled, a separate secure
    // pool is created that holds pages allocated in the CPR of vidmem.
    if (uvm_conf_computing_mode_enabled(gpu)) {
|
||||
status = uvm_gpu_semaphore_secure_pool_create(gpu, &gpu->secure_semaphore_pool);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
}
|
||||
|
||||
for_each_global_gpu(other_gpu) {
|
||||
if (uvm_conf_computing_mode_enabled(gpu))
|
||||
break;
|
||||
if (other_gpu == gpu)
|
||||
continue;
|
||||
status = uvm_gpu_semaphore_pool_map_gpu(other_gpu->semaphore_pool, gpu);
|
||||
@@ -863,7 +894,7 @@ static NV_STATUS init_semaphore_pool(uvm_gpu_t *gpu)
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
static void deinit_semaphore_pool(uvm_gpu_t *gpu)
|
||||
static void deinit_semaphore_pools(uvm_gpu_t *gpu)
|
||||
{
|
||||
uvm_gpu_t *other_gpu;
|
||||
|
||||
@@ -874,6 +905,7 @@ static void deinit_semaphore_pool(uvm_gpu_t *gpu)
|
||||
}
|
||||
|
||||
uvm_gpu_semaphore_pool_destroy(gpu->semaphore_pool);
|
||||
uvm_gpu_semaphore_pool_destroy(gpu->secure_semaphore_pool);
|
||||
}
|
||||
|
||||
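/*
 * Illustrative sketch (not part of this change): the expected pairing of the
 * renamed pool helpers in the GPU lifecycle, per the init_gpu()/deinit_gpu()
 * hunks later in this diff. The wrapper function name is hypothetical and
 * error handling is elided.
 */
static NV_STATUS gpu_semaphore_pools_lifecycle_sketch(uvm_gpu_t *gpu)
{
    // init_gpu() now calls init_semaphore_pools(), which creates the regular
    // pool and, when Confidential Computing is enabled, the separate secure
    // pool backed by vidmem in the CPR.
    NV_STATUS status = init_semaphore_pools(gpu);
    if (status != NV_OK)
        return status;

    // ... channel, address space and procfs initialization ...

    // deinit_gpu() tears both pools down with deinit_semaphore_pools(), which
    // destroys gpu->semaphore_pool and gpu->secure_semaphore_pool.
    deinit_semaphore_pools(gpu);
    return NV_OK;
}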
static NV_STATUS find_unused_global_gpu_id(uvm_parent_gpu_t *parent_gpu, uvm_global_gpu_id_t *out_id)
|
||||
@@ -1067,6 +1099,13 @@ static NV_STATUS init_parent_gpu(uvm_parent_gpu_t *parent_gpu,
|
||||
return status;
|
||||
}
|
||||
|
||||
status = uvm_conf_computing_init_parent_gpu(parent_gpu);
|
||||
if (status != NV_OK) {
|
||||
UVM_ERR_PRINT("Confidential computing: %s, GPU %s\n",
|
||||
nvstatusToString(status), parent_gpu->name);
|
||||
return status;
|
||||
}
|
||||
|
||||
parent_gpu->pci_dev = gpu_platform_info->pci_dev;
|
||||
parent_gpu->closest_cpu_numa_node = dev_to_node(&parent_gpu->pci_dev->dev);
|
||||
parent_gpu->dma_addressable_start = gpu_platform_info->dma_addressable_start;
|
||||
@@ -1102,12 +1141,6 @@ static NV_STATUS init_parent_gpu(uvm_parent_gpu_t *parent_gpu,
|
||||
|
||||
uvm_mmu_init_gpu_chunk_sizes(parent_gpu);
|
||||
|
||||
status = get_gpu_caps(parent_gpu);
|
||||
if (status != NV_OK) {
|
||||
UVM_ERR_PRINT("Failed to get GPU caps: %s, GPU %s\n", nvstatusToString(status), parent_gpu->name);
|
||||
return status;
|
||||
}
|
||||
|
||||
status = uvm_ats_add_gpu(parent_gpu);
|
||||
if (status != NV_OK) {
|
||||
UVM_ERR_PRINT("uvm_ats_add_gpu failed: %s, GPU %s\n", nvstatusToString(status), parent_gpu->name);
|
||||
@@ -1166,6 +1199,12 @@ static NV_STATUS init_gpu(uvm_gpu_t *gpu, const UvmGpuInfo *gpu_info)
|
||||
return status;
|
||||
}
|
||||
|
||||
status = get_gpu_caps(gpu);
|
||||
if (status != NV_OK) {
|
||||
UVM_ERR_PRINT("Failed to get GPU caps: %s, GPU %s\n", nvstatusToString(status), uvm_gpu_name(gpu));
|
||||
return status;
|
||||
}
|
||||
|
||||
uvm_mmu_init_gpu_peer_addresses(gpu);
|
||||
|
||||
status = alloc_and_init_address_space(gpu);
|
||||
@@ -1198,7 +1237,7 @@ static NV_STATUS init_gpu(uvm_gpu_t *gpu, const UvmGpuInfo *gpu_info)
|
||||
return status;
|
||||
}
|
||||
|
||||
status = init_semaphore_pool(gpu);
|
||||
status = init_semaphore_pools(gpu);
|
||||
if (status != NV_OK) {
|
||||
UVM_ERR_PRINT("Failed to initialize the semaphore pool: %s, GPU %s\n",
|
||||
nvstatusToString(status),
|
||||
@@ -1228,6 +1267,14 @@ static NV_STATUS init_gpu(uvm_gpu_t *gpu, const UvmGpuInfo *gpu_info)
|
||||
return status;
|
||||
}
|
||||
|
||||
status = uvm_conf_computing_gpu_init(gpu);
|
||||
if (status != NV_OK) {
|
||||
UVM_ERR_PRINT("Failed to initialize Confidential Compute: %s for GPU %s\n",
|
||||
nvstatusToString(status),
|
||||
uvm_gpu_name(gpu));
|
||||
return status;
|
||||
}
|
||||
|
||||
status = init_procfs_files(gpu);
|
||||
if (status != NV_OK) {
|
||||
UVM_ERR_PRINT("Failed to init procfs files: %s, GPU %s\n", nvstatusToString(status), uvm_gpu_name(gpu));
|
||||
@@ -1403,6 +1450,8 @@ static void remove_gpus_from_gpu(uvm_gpu_t *gpu)
|
||||
// Sync all trackers in PMM
|
||||
uvm_pmm_gpu_sync(&gpu->pmm);
|
||||
|
||||
// Sync all trackers in the GPU's DMA allocation pool
|
||||
uvm_conf_computing_dma_buffer_pool_sync(&gpu->conf_computing.dma_buffer_pool);
|
||||
}
|
||||
|
||||
// Remove all references to the given GPU from its parent, since it is being
|
||||
@@ -1485,7 +1534,7 @@ static void deinit_gpu(uvm_gpu_t *gpu)
|
||||
// pain during development.
|
||||
deconfigure_address_space(gpu);
|
||||
|
||||
deinit_semaphore_pool(gpu);
|
||||
deinit_semaphore_pools(gpu);
|
||||
|
||||
uvm_pmm_sysmem_mappings_deinit(&gpu->pmm_reverse_sysmem_mappings);
|
||||
|
||||
@@ -1536,6 +1585,13 @@ static void remove_gpu(uvm_gpu_t *gpu)
|
||||
if (free_parent)
|
||||
destroy_nvlink_peers(gpu);
|
||||
|
||||
// uvm_mem_free and other uvm_mem APIs invoked by the Confidential Compute
|
||||
// deinitialization must be called before the GPU is removed from the global
|
||||
// table.
|
||||
//
|
||||
// TODO: Bug 2008200: Add and remove the GPU in a more reasonable spot.
|
||||
uvm_conf_computing_gpu_deinit(gpu);
|
||||
|
||||
// TODO: Bug 2844714: If the parent is not being freed, the following
|
||||
// gpu_table_lock is only needed to protect concurrent
|
||||
// find_first_valid_gpu() in BH from the __clear_bit here. After
|
||||
@@ -2213,9 +2269,12 @@ static NV_STATUS init_peer_access(uvm_gpu_t *gpu0,
|
||||
{
|
||||
NV_STATUS status;
|
||||
|
||||
UVM_ASSERT(p2p_caps_params->p2pLink != UVM_LINK_TYPE_C2C);
|
||||
|
||||
// check for peer-to-peer compatibility (PCI-E or NvLink).
|
||||
peer_caps->link_type = get_gpu_link_type(p2p_caps_params->p2pLink);
|
||||
if (peer_caps->link_type == UVM_GPU_LINK_INVALID
|
||||
|| peer_caps->link_type == UVM_GPU_LINK_C2C
|
||||
)
|
||||
return NV_ERR_NOT_SUPPORTED;
|
||||
|
||||
@@ -2225,8 +2284,8 @@ static NV_STATUS init_peer_access(uvm_gpu_t *gpu0,
|
||||
peer_caps->is_indirect_peer = (p2p_caps_params->indirectAccess == NV_TRUE);
|
||||
|
||||
if (peer_caps->is_indirect_peer) {
|
||||
UVM_ASSERT(gpu0->parent->numa_info.enabled);
|
||||
UVM_ASSERT(gpu1->parent->numa_info.enabled);
|
||||
UVM_ASSERT(gpu0->mem_info.numa.enabled);
|
||||
UVM_ASSERT(gpu1->mem_info.numa.enabled);
|
||||
|
||||
status = uvm_pmm_gpu_indirect_peer_init(&gpu0->pmm, gpu1);
|
||||
if (status != NV_OK)
|
||||
@@ -2415,8 +2474,7 @@ static NV_STATUS discover_nvlink_peers(uvm_gpu_t *gpu)
|
||||
|
||||
// Indirect peers are only supported when onlined as NUMA nodes, because
|
||||
// we want to use vm_insert_page and dma_map_page.
|
||||
if (p2p_caps_params.indirectAccess &&
|
||||
(!gpu->parent->numa_info.enabled || !other_gpu->parent->numa_info.enabled))
|
||||
if (p2p_caps_params.indirectAccess && (!gpu->mem_info.numa.enabled || !other_gpu->mem_info.numa.enabled))
|
||||
continue;
|
||||
|
||||
status = enable_nvlink_peer_access(gpu, other_gpu, &p2p_caps_params);
|
||||
@@ -2601,6 +2659,9 @@ uvm_aperture_t uvm_gpu_page_tree_init_location(const uvm_gpu_t *gpu)
|
||||
if (uvm_gpu_is_virt_mode_sriov_heavy(gpu))
|
||||
return UVM_APERTURE_VID;
|
||||
|
||||
if (uvm_conf_computing_mode_enabled(gpu))
|
||||
return UVM_APERTURE_VID;
|
||||
|
||||
return UVM_APERTURE_DEFAULT;
|
||||
}
|
||||
|
||||
|
||||
@@ -46,6 +46,7 @@
|
||||
#include "uvm_rb_tree.h"
|
||||
#include "uvm_perf_prefetch.h"
|
||||
#include "nv-kthread-q.h"
|
||||
#include "uvm_conf_computing.h"
|
||||
|
||||
// Buffer length to store uvm gpu id, RM device name and gpu uuid.
|
||||
#define UVM_GPU_NICE_NAME_BUFFER_LENGTH (sizeof("ID 999: : ") + \
|
||||
@@ -133,6 +134,12 @@ struct uvm_service_block_context_struct
|
||||
|
||||
// This is set if the page migrated to/from the GPU and CPU.
|
||||
bool did_migrate;
|
||||
|
||||
// Sequence number used to start a mmu notifier read side critical
|
||||
// section.
|
||||
unsigned long notifier_seq;
|
||||
|
||||
struct vm_fault *vmf;
|
||||
} cpu_fault;
|
||||
|
||||
//
|
||||
@@ -168,6 +175,31 @@ struct uvm_service_block_context_struct
|
||||
uvm_perf_prefetch_bitmap_tree_t prefetch_bitmap_tree;
|
||||
};
|
||||
|
||||
typedef struct
{
    // Mask of read faulted pages in a UVM_VA_BLOCK_SIZE aligned region of a SAM
    // VMA. Used for batching ATS faults in a vma.
    uvm_page_mask_t read_fault_mask;

    // Mask of write faulted pages in a UVM_VA_BLOCK_SIZE aligned region of a
    // SAM VMA. Used for batching ATS faults in a vma.
    uvm_page_mask_t write_fault_mask;

    // Mask of successfully serviced pages in a UVM_VA_BLOCK_SIZE aligned region
    // of a SAM VMA. Used to return ATS fault status.
    uvm_page_mask_t faults_serviced_mask;

    // Mask of successfully serviced read faults on pages in write_fault_mask.
    uvm_page_mask_t reads_serviced_mask;

    // Temporary mask used for uvm_page_mask_or_equal. This is used since
    // bitmap_or_equal() isn't present in all linux kernel versions.
    uvm_page_mask_t tmp_mask;

    // Client type of the service requestor.
    uvm_fault_client_type_t client_type;
} uvm_ats_fault_context_t;

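/*
 * Illustrative sketch (not part of this change): how a single ATS fault is
 * expected to be batched through uvm_ats_fault_context_t before calling
 * uvm_ats_service_faults(). It mirrors the non-replayable fault path later in
 * this diff; the helper name is hypothetical.
 */
static void ats_context_record_fault_sketch(uvm_ats_fault_context_t *ats_context,
                                            NvU64 fault_address,
                                            uvm_fault_access_type_t access_type)
{
    // Faults are grouped per UVM_VA_BLOCK_SIZE-aligned region of the VMA.
    NvU64 base = UVM_VA_BLOCK_ALIGN_DOWN(fault_address);
    uvm_page_index_t page_index = (fault_address - base) / PAGE_SIZE;

    uvm_page_mask_zero(&ats_context->read_fault_mask);
    uvm_page_mask_zero(&ats_context->write_fault_mask);

    // Write and atomic faults go to write_fault_mask, everything else to
    // read_fault_mask; uvm_ats_service_faults() reports results through
    // faults_serviced_mask and reads_serviced_mask.
    if (access_type >= UVM_FAULT_ACCESS_TYPE_WRITE)
        uvm_page_mask_set(&ats_context->write_fault_mask, page_index);
    else
        uvm_page_mask_set(&ats_context->read_fault_mask, page_index);
}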
struct uvm_fault_service_batch_context_struct
|
||||
{
|
||||
// Array of elements fetched from the GPU fault buffer. The number of
|
||||
@@ -200,6 +232,8 @@ struct uvm_fault_service_batch_context_struct
|
||||
|
||||
NvU32 num_replays;
|
||||
|
||||
uvm_ats_fault_context_t ats_context;
|
||||
|
||||
// Unique id (per-GPU) generated for tools events recording
|
||||
NvU32 batch_id;
|
||||
|
||||
@@ -338,6 +372,9 @@ typedef struct
|
||||
// Unique id (per-GPU) generated for tools events recording
|
||||
NvU32 batch_id;
|
||||
|
||||
// Information required to service ATS faults.
|
||||
uvm_ats_fault_context_t ats_context;
|
||||
|
||||
// Information required to invalidate stale ATS PTEs from the GPU TLBs
|
||||
uvm_ats_fault_invalidate_t ats_invalidate;
|
||||
} non_replayable;
|
||||
@@ -349,22 +386,6 @@ typedef struct
|
||||
NvU64 disable_prefetch_faults_timestamp;
|
||||
} uvm_fault_buffer_info_t;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
// True if the platform supports HW coherence (P9) and RM has exposed the
|
||||
// GPU's memory as a NUMA node to the kernel.
|
||||
bool enabled;
|
||||
|
||||
// Range in the system physical address space where the memory of this GPU
|
||||
// is mapped
|
||||
NvU64 system_memory_window_start;
|
||||
NvU64 system_memory_window_end;
|
||||
|
||||
NvU64 memblock_size;
|
||||
|
||||
unsigned node_id;
|
||||
} uvm_numa_info_t;
|
||||
|
||||
struct uvm_access_counter_service_batch_context_struct
|
||||
{
|
||||
uvm_access_counter_buffer_entry_t *notification_cache;
|
||||
@@ -502,6 +523,10 @@ typedef struct
|
||||
|
||||
// Page tables with the mapping.
|
||||
uvm_page_table_range_vec_t *range_vec;
|
||||
|
||||
// Used during init to indicate whether the mapping has been fully
|
||||
// initialized.
|
||||
bool ready;
|
||||
} uvm_gpu_identity_mapping_t;
|
||||
|
||||
// Root chunk mapping
|
||||
@@ -524,6 +549,7 @@ typedef enum
|
||||
UVM_GPU_LINK_NVLINK_2,
|
||||
UVM_GPU_LINK_NVLINK_3,
|
||||
UVM_GPU_LINK_NVLINK_4,
|
||||
UVM_GPU_LINK_C2C,
|
||||
UVM_GPU_LINK_MAX
|
||||
} uvm_gpu_link_type_t;
|
||||
|
||||
@@ -581,6 +607,14 @@ struct uvm_gpu_struct
|
||||
// Max (inclusive) physical address of this GPU's memory that the driver
|
||||
// can allocate through PMM (PMA).
|
||||
NvU64 max_allocatable_address;
|
||||
|
||||
struct
|
||||
{
|
||||
// True if the platform supports HW coherence and the GPU's memory
|
||||
// is exposed as a NUMA node to the kernel.
|
||||
bool enabled;
|
||||
unsigned int node_id;
|
||||
} numa;
|
||||
} mem_info;
|
||||
|
||||
struct
|
||||
@@ -637,6 +671,8 @@ struct uvm_gpu_struct
|
||||
|
||||
uvm_gpu_semaphore_pool_t *semaphore_pool;
|
||||
|
||||
uvm_gpu_semaphore_pool_t *secure_semaphore_pool;
|
||||
|
||||
uvm_channel_manager_t *channel_manager;
|
||||
|
||||
uvm_pmm_gpu_t pmm;
|
||||
@@ -696,6 +732,25 @@ struct uvm_gpu_struct
|
||||
// mappings (instead of kernel), and it is used in most configurations.
|
||||
uvm_pmm_sysmem_mappings_t pmm_reverse_sysmem_mappings;
|
||||
|
||||
struct
|
||||
{
|
||||
uvm_conf_computing_dma_buffer_pool_t dma_buffer_pool;
|
||||
|
||||
// Dummy memory used to store the IV contents during CE encryption.
|
||||
// This memory location is also only available after CE channels
|
||||
// because we use them to write PTEs for allocations such as this one.
|
||||
// This location is used when a physical addressing for the IV buffer
|
||||
// is required. See uvm_hal_hopper_ce_encrypt().
|
||||
uvm_mem_t *iv_mem;
|
||||
|
||||
// Dummy memory used to store the IV contents during CE encryption.
|
||||
// Because of the limitations of `iv_mem', and the need to have such
|
||||
// buffer at channel initialization, we use an RM allocation.
|
||||
// This location is used when a virtual addressing for the IV buffer
|
||||
// is required. See uvm_hal_hopper_ce_encrypt().
|
||||
uvm_rm_mem_t *iv_rm_mem;
|
||||
} conf_computing;
|
||||
|
||||
// ECC handling
|
||||
// In order to trap ECC errors as soon as possible the driver has the hw
|
||||
// interrupt register mapped directly. If an ECC interrupt is ever noticed
|
||||
@@ -833,6 +888,10 @@ struct uvm_parent_gpu_struct
|
||||
uvm_arch_hal_t *arch_hal;
|
||||
uvm_fault_buffer_hal_t *fault_buffer_hal;
|
||||
uvm_access_counter_buffer_hal_t *access_counter_buffer_hal;
|
||||
uvm_sec2_hal_t *sec2_hal;
|
||||
|
||||
// Whether CE supports physical addressing mode for writes to vidmem
|
||||
bool ce_phys_vidmem_write_supported;
|
||||
|
||||
uvm_gpu_peer_copy_mode_t peer_copy_mode;
|
||||
|
||||
@@ -954,9 +1013,6 @@ struct uvm_parent_gpu_struct
|
||||
// Fault buffer info. This is only valid if supports_replayable_faults is set to true
|
||||
uvm_fault_buffer_info_t fault_buffer_info;
|
||||
|
||||
// NUMA info, mainly for ATS
|
||||
uvm_numa_info_t numa_info;
|
||||
|
||||
// PMM lazy free processing queue.
|
||||
// TODO: Bug 3881835: revisit whether to use nv_kthread_q_t or workqueue.
|
||||
nv_kthread_q_t lazy_free_q;
|
||||
@@ -1049,8 +1105,22 @@ struct uvm_parent_gpu_struct
        NvU64 fabric_memory_window_start;
    } nvswitch_info;

    uvm_gpu_link_type_t sysmem_link;
    NvU32 sysmem_link_rate_mbyte_per_s;
    struct
    {
        // Note that this represents the link to system memory, not the link the
        // system used to discover the GPU. There are some cases such as NVLINK2
        // where the GPU is still on the PCIe bus, but it accesses memory over
        // this link rather than PCIe.
        uvm_gpu_link_type_t link;
        NvU32 link_rate_mbyte_per_s;

        // Range in the system physical address space where the memory of this
        // GPU is exposed as coherent. memory_window_end is inclusive.
        // memory_window_start == memory_window_end indicates that no window is
        // present (coherence is not supported).
        NvU64 memory_window_start;
        NvU64 memory_window_end;
    } system_bus;
};

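/*
 * Illustrative summary (not part of this change): where the removed
 * uvm_numa_info_t and sysmem_link fields moved in this refactor, based on the
 * hunks in this diff.
 *
 *   parent->numa_info.enabled                    -> gpu->mem_info.numa.enabled
 *   parent->numa_info.node_id                    -> gpu->mem_info.numa.node_id (via uvm_gpu_numa_node())
 *   parent->numa_info.system_memory_window_start -> parent->system_bus.memory_window_start
 *   parent->numa_info.system_memory_window_end   -> parent->system_bus.memory_window_end
 *   parent->sysmem_link                          -> parent->system_bus.link
 *   parent->sysmem_link_rate_mbyte_per_s         -> parent->system_bus.link_rate_mbyte_per_s
 *
 * A minimal coherence check written only in terms of the new fields (the
 * driver's own helper for this is uvm_gpu_is_coherent() further below):
 */
static inline bool parent_gpu_has_coherent_window_sketch(const uvm_parent_gpu_t *parent_gpu)
{
    // An empty window (start == end) means coherent access to this GPU's
    // memory over the system bus is not available.
    return parent_gpu->system_bus.memory_window_end > parent_gpu->system_bus.memory_window_start;
}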
static const char *uvm_gpu_name(uvm_gpu_t *gpu)
|
||||
@@ -1146,23 +1216,20 @@ NV_STATUS uvm_gpu_init_va_space(uvm_va_space_t *va_space);

void uvm_gpu_exit_va_space(uvm_va_space_t *va_space);

static uvm_numa_info_t *uvm_gpu_numa_info(uvm_gpu_t *gpu)
static unsigned int uvm_gpu_numa_node(uvm_gpu_t *gpu)
{
    UVM_ASSERT(gpu->parent->numa_info.enabled);

    return &gpu->parent->numa_info;
    UVM_ASSERT(gpu->mem_info.numa.enabled);
    return gpu->mem_info.numa.node_id;
}

static uvm_gpu_phys_address_t uvm_gpu_page_to_phys_address(uvm_gpu_t *gpu, struct page *page)
{
    uvm_numa_info_t *numa_info = uvm_gpu_numa_info(gpu);

    unsigned long sys_addr = page_to_pfn(page) << PAGE_SHIFT;
    unsigned long gpu_offset = sys_addr - numa_info->system_memory_window_start;
    unsigned long gpu_offset = sys_addr - gpu->parent->system_bus.memory_window_start;

    UVM_ASSERT(page_to_nid(page) == numa_info->node_id);
    UVM_ASSERT(sys_addr >= numa_info->system_memory_window_start);
    UVM_ASSERT(sys_addr + PAGE_SIZE - 1 <= numa_info->system_memory_window_end);
    UVM_ASSERT(page_to_nid(page) == uvm_gpu_numa_node(gpu));
    UVM_ASSERT(sys_addr >= gpu->parent->system_bus.memory_window_start);
    UVM_ASSERT(sys_addr + PAGE_SIZE - 1 <= gpu->parent->system_bus.memory_window_end);

    return uvm_gpu_phys_address(UVM_APERTURE_VID, gpu_offset);
}

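/*
 * Illustrative sketch (not part of this change): using the new
 * uvm_gpu_numa_node() helper to allocate a CPU-visible page on the NUMA node
 * that backs this GPU's coherently exposed memory. alloc_pages_node() is the
 * standard kernel allocator; its use here is an example, not something this
 * diff adds, and the function name is hypothetical.
 */
static struct page *alloc_page_on_gpu_node_sketch(uvm_gpu_t *gpu)
{
    if (!gpu->mem_info.numa.enabled)
        return NULL;

    // Pages from this node are expected to fall inside
    // parent->system_bus.memory_window_start/end, so they can be converted
    // with uvm_gpu_page_to_phys_address() above.
    return alloc_pages_node(uvm_gpu_numa_node(gpu), GFP_KERNEL, 0);
}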
@@ -1270,8 +1337,8 @@ static bool uvm_gpus_are_indirect_peers(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
|
||||
uvm_gpu_peer_t *peer_caps = uvm_gpu_peer_caps(gpu0, gpu1);
|
||||
|
||||
if (peer_caps->link_type != UVM_GPU_LINK_INVALID && peer_caps->is_indirect_peer) {
|
||||
UVM_ASSERT(gpu0->parent->numa_info.enabled);
|
||||
UVM_ASSERT(gpu1->parent->numa_info.enabled);
|
||||
UVM_ASSERT(gpu0->mem_info.numa.enabled);
|
||||
UVM_ASSERT(gpu1->mem_info.numa.enabled);
|
||||
UVM_ASSERT(peer_caps->link_type != UVM_GPU_LINK_PCIE);
|
||||
UVM_ASSERT(!uvm_gpus_are_nvswitch_connected(gpu0, gpu1));
|
||||
return true;
|
||||
@@ -1291,6 +1358,9 @@ static uvm_gpu_address_t uvm_gpu_address_virtual_from_vidmem_phys(uvm_gpu_t *gpu
|
||||
UVM_ASSERT(uvm_mmu_gpu_needs_static_vidmem_mapping(gpu) || uvm_mmu_gpu_needs_dynamic_vidmem_mapping(gpu));
|
||||
UVM_ASSERT(pa <= gpu->mem_info.max_allocatable_address);
|
||||
|
||||
if (uvm_mmu_gpu_needs_static_vidmem_mapping(gpu))
|
||||
UVM_ASSERT(gpu->static_flat_mapping.ready);
|
||||
|
||||
return uvm_gpu_address_virtual(gpu->parent->flat_vidmem_va_base + pa);
|
||||
}
|
||||
|
||||
@@ -1308,6 +1378,23 @@ static uvm_gpu_address_t uvm_gpu_address_virtual_from_sysmem_phys(uvm_gpu_t *gpu
|
||||
return uvm_gpu_address_virtual(gpu->parent->flat_sysmem_va_base + pa);
|
||||
}
|
||||
|
||||
// Given a GPU or CPU physical address (not peer), retrieve an address suitable
|
||||
// for CE access.
|
||||
static uvm_gpu_address_t uvm_gpu_address_copy(uvm_gpu_t *gpu, uvm_gpu_phys_address_t phys_addr)
|
||||
{
|
||||
UVM_ASSERT(phys_addr.aperture == UVM_APERTURE_VID || phys_addr.aperture == UVM_APERTURE_SYS);
|
||||
|
||||
if (phys_addr.aperture == UVM_APERTURE_VID) {
|
||||
if (uvm_mmu_gpu_needs_static_vidmem_mapping(gpu) || uvm_mmu_gpu_needs_dynamic_vidmem_mapping(gpu))
|
||||
return uvm_gpu_address_virtual_from_vidmem_phys(gpu, phys_addr.address);
|
||||
}
|
||||
else if (uvm_mmu_gpu_needs_dynamic_sysmem_mapping(gpu)) {
|
||||
return uvm_gpu_address_virtual_from_sysmem_phys(gpu, phys_addr.address);
|
||||
}
|
||||
|
||||
return uvm_gpu_address_from_phys(phys_addr);
|
||||
}
|
||||
|
||||
static uvm_gpu_identity_mapping_t *uvm_gpu_get_peer_mapping(uvm_gpu_t *gpu, uvm_gpu_id_t peer_id)
|
||||
{
|
||||
return &gpu->peer_mappings[uvm_id_gpu_index(peer_id)];
|
||||
@@ -1383,6 +1470,11 @@ bool uvm_gpu_can_address_kernel(uvm_gpu_t *gpu, NvU64 addr, NvU64 size);
|
||||
// addresses.
|
||||
NvU64 uvm_parent_gpu_canonical_address(uvm_parent_gpu_t *parent_gpu, NvU64 addr);
|
||||
|
||||
static bool uvm_gpu_is_coherent(const uvm_parent_gpu_t *parent_gpu)
|
||||
{
|
||||
return parent_gpu->system_bus.memory_window_end > parent_gpu->system_bus.memory_window_start;
|
||||
}
|
||||
|
||||
static bool uvm_gpu_has_pushbuffer_segments(uvm_gpu_t *gpu)
|
||||
{
|
||||
return gpu->parent->max_host_va > (1ull << 40);
|
||||
@@ -1446,6 +1538,7 @@ typedef enum
{
    UVM_GPU_BUFFER_FLUSH_MODE_CACHED_PUT,
    UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT,
    UVM_GPU_BUFFER_FLUSH_MODE_WAIT_UPDATE_PUT,
} uvm_gpu_buffer_flush_mode_t;

#endif // __UVM_GPU_H__

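/*
 * Illustrative sketch (not part of this change): how a caller is expected to
 * choose the new UVM_GPU_BUFFER_FLUSH_MODE_WAIT_UPDATE_PUT value, based on the
 * fault-buffer hunks later in this diff. The function name and parameters are
 * hypothetical.
 */
static uvm_gpu_buffer_flush_mode_t pick_flush_mode_sketch(bool need_put_update,
                                                          bool wait_for_prior_replays)
{
    // CACHED_PUT: reuse the previously cached PUT pointer.
    if (!need_put_update)
        return UVM_GPU_BUFFER_FLUSH_MODE_CACHED_PUT;

    // WAIT_UPDATE_PUT additionally waits on the replay tracker before
    // re-reading PUT, so stale entries from prior replays are drained first
    // (uvm_gpu_fault_buffer_flush() now uses it). The access counters path
    // never uses it and asserts flush_mode != WAIT_UPDATE_PUT.
    return wait_for_prior_replays ? UVM_GPU_BUFFER_FLUSH_MODE_WAIT_UPDATE_PUT :
                                    UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT;
}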
@@ -210,13 +210,13 @@ static NV_STATUS config_granularity_to_bytes(UVM_ACCESS_COUNTER_GRANULARITY gran
        *bytes = 64 * 1024ULL;
        break;
    case UVM_ACCESS_COUNTER_GRANULARITY_2M:
        *bytes = 2 * 1024 * 1024ULL;
        *bytes = 2 * UVM_SIZE_1MB;
        break;
    case UVM_ACCESS_COUNTER_GRANULARITY_16M:
        *bytes = 16 * 1024 * 1024ULL;
        *bytes = 16 * UVM_SIZE_1MB;
        break;
    case UVM_ACCESS_COUNTER_GRANULARITY_16G:
        *bytes = 16 * 1024 * 1024 * 1024ULL;
        *bytes = 16 * UVM_SIZE_1GB;
        break;
    default:
        return NV_ERR_INVALID_ARGUMENT;
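/*
 * Illustrative sketch (not part of this change): the values
 * config_granularity_to_bytes() should produce with the UVM_SIZE_* constants.
 * The *_SKETCH macro definitions are assumptions shown only to make the
 * arithmetic explicit; the real UVM_SIZE_1MB/UVM_SIZE_1GB definitions are not
 * part of this hunk.
 */
#define UVM_SIZE_1MB_SKETCH (1024ULL * 1024)
#define UVM_SIZE_1GB_SKETCH (1024ULL * 1024 * 1024)

static void config_granularity_example_sketch(void)
{
    NvU64 bytes;

    // 2M granularity: 2 * UVM_SIZE_1MB == 2097152 bytes, identical to the old
    // 2 * 1024 * 1024ULL expression.
    if (config_granularity_to_bytes(UVM_ACCESS_COUNTER_GRANULARITY_2M, &bytes) == NV_OK)
        UVM_ASSERT(bytes == 2 * UVM_SIZE_1MB_SKETCH);

    // 16G granularity: 16 * UVM_SIZE_1GB == 17179869184 bytes.
    if (config_granularity_to_bytes(UVM_ACCESS_COUNTER_GRANULARITY_16G, &bytes) == NV_OK)
        UVM_ASSERT(bytes == 16 * UVM_SIZE_1GB_SKETCH);
}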
@@ -404,7 +404,8 @@ NV_STATUS uvm_gpu_init_access_counters(uvm_parent_gpu_t *parent_gpu)
|
||||
UVM_ASSERT(parent_gpu->access_counter_buffer_hal != NULL);
|
||||
|
||||
status = uvm_rm_locked_call(nvUvmInterfaceInitAccessCntrInfo(parent_gpu->rm_device,
|
||||
&access_counters->rm_info));
|
||||
&access_counters->rm_info,
|
||||
0));
|
||||
if (status != NV_OK) {
|
||||
UVM_ERR_PRINT("Failed to init notify buffer info from RM: %s, GPU %s\n",
|
||||
nvstatusToString(status),
|
||||
@@ -707,6 +708,7 @@ static void access_counter_buffer_flush_locked(uvm_gpu_t *gpu, uvm_gpu_buffer_fl
|
||||
UVM_ASSERT(gpu->parent->access_counters_supported);
|
||||
|
||||
// Read PUT pointer from the GPU if requested
|
||||
UVM_ASSERT(flush_mode != UVM_GPU_BUFFER_FLUSH_MODE_WAIT_UPDATE_PUT);
|
||||
if (flush_mode == UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT)
|
||||
access_counters->cached_put = UVM_GPU_READ_ONCE(*access_counters->rm_info.pAccessCntrBufferPut);
|
||||
|
||||
@@ -1198,6 +1200,11 @@ static NV_STATUS service_phys_single_va_block(uvm_gpu_t *gpu,
|
||||
service_context->num_retries = 0;
|
||||
service_context->block_context.mm = mm;
|
||||
|
||||
if (uvm_va_block_is_hmm(va_block)) {
|
||||
uvm_hmm_service_context_init(service_context);
|
||||
uvm_hmm_migrate_begin_wait(va_block);
|
||||
}
|
||||
|
||||
uvm_mutex_lock(&va_block->lock);
|
||||
|
||||
reverse_mappings_to_va_block_page_mask(va_block, reverse_mappings, num_reverse_mappings, accessed_pages);
|
||||
@@ -1211,6 +1218,9 @@ static NV_STATUS service_phys_single_va_block(uvm_gpu_t *gpu,
|
||||
|
||||
uvm_mutex_unlock(&va_block->lock);
|
||||
|
||||
if (uvm_va_block_is_hmm(va_block))
|
||||
uvm_hmm_migrate_finish(va_block);
|
||||
|
||||
if (status == NV_OK)
|
||||
*out_flags |= UVM_ACCESS_COUNTER_ACTION_CLEAR;
|
||||
}
|
||||
|
||||
@@ -85,76 +85,86 @@ static void uvm_gpu_replayable_faults_intr_enable(uvm_parent_gpu_t *parent_gpu);
|
||||
|
||||
static unsigned schedule_replayable_faults_handler(uvm_parent_gpu_t *parent_gpu)
|
||||
{
|
||||
uvm_assert_spinlock_locked(&parent_gpu->isr.interrupts_lock);
|
||||
|
||||
if (parent_gpu->isr.is_suspended)
|
||||
return 0;
|
||||
|
||||
// handling gets set to false for all handlers during removal, so quit if
|
||||
// the GPU is in the process of being removed.
|
||||
if (parent_gpu->isr.replayable_faults.handling) {
|
||||
if (!parent_gpu->isr.replayable_faults.handling)
|
||||
return 0;
|
||||
|
||||
// Use raw call instead of UVM helper. Ownership will be recorded in the
|
||||
// bottom half. See comment replayable_faults_isr_bottom_half().
|
||||
if (down_trylock(&parent_gpu->isr.replayable_faults.service_lock.sem) == 0) {
|
||||
if (uvm_gpu_replayable_faults_pending(parent_gpu)) {
|
||||
nv_kref_get(&parent_gpu->gpu_kref);
|
||||
// Use raw call instead of UVM helper. Ownership will be recorded in the
|
||||
// bottom half. See comment replayable_faults_isr_bottom_half().
|
||||
if (down_trylock(&parent_gpu->isr.replayable_faults.service_lock.sem) != 0)
|
||||
return 0;
|
||||
|
||||
// Interrupts need to be disabled here to avoid an interrupt
|
||||
// storm
|
||||
uvm_gpu_replayable_faults_intr_disable(parent_gpu);
|
||||
|
||||
// Schedule a bottom half, but do *not* release the GPU ISR
|
||||
// lock. The bottom half releases the GPU ISR lock as part of
|
||||
// its cleanup.
|
||||
nv_kthread_q_schedule_q_item(&parent_gpu->isr.bottom_half_q,
|
||||
&parent_gpu->isr.replayable_faults.bottom_half_q_item);
|
||||
return 1;
|
||||
}
|
||||
else {
|
||||
up(&parent_gpu->isr.replayable_faults.service_lock.sem);
|
||||
}
|
||||
}
|
||||
if (!uvm_gpu_replayable_faults_pending(parent_gpu)) {
|
||||
up(&parent_gpu->isr.replayable_faults.service_lock.sem);
|
||||
return 0;
|
||||
}
|
||||
|
||||
return 0;
|
||||
nv_kref_get(&parent_gpu->gpu_kref);
|
||||
|
||||
// Interrupts need to be disabled here to avoid an interrupt storm
|
||||
uvm_gpu_replayable_faults_intr_disable(parent_gpu);
|
||||
|
||||
// Schedule a bottom half, but do *not* release the GPU ISR lock. The bottom
|
||||
// half releases the GPU ISR lock as part of its cleanup.
|
||||
nv_kthread_q_schedule_q_item(&parent_gpu->isr.bottom_half_q,
|
||||
&parent_gpu->isr.replayable_faults.bottom_half_q_item);
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
static unsigned schedule_non_replayable_faults_handler(uvm_parent_gpu_t *parent_gpu)
|
||||
{
|
||||
bool scheduled;
|
||||
|
||||
if (parent_gpu->isr.is_suspended)
|
||||
return 0;
|
||||
|
||||
// handling gets set to false for all handlers during removal, so quit if
|
||||
// the GPU is in the process of being removed.
|
||||
if (parent_gpu->isr.non_replayable_faults.handling) {
|
||||
// Non-replayable_faults are stored in a synchronized circular queue
|
||||
// shared by RM/UVM. Therefore, we can query the number of pending
|
||||
// faults. This type of faults are not replayed and since RM advances
|
||||
// GET to PUT when copying the fault packets to the queue, no further
|
||||
// interrupts will be triggered by the gpu and faults may stay
|
||||
// unserviced. Therefore, if there is a fault in the queue, we schedule
|
||||
// a bottom half unconditionally.
|
||||
if (uvm_gpu_non_replayable_faults_pending(parent_gpu)) {
|
||||
bool scheduled;
|
||||
nv_kref_get(&parent_gpu->gpu_kref);
|
||||
if (!parent_gpu->isr.non_replayable_faults.handling)
|
||||
return 0;
|
||||
|
||||
scheduled = nv_kthread_q_schedule_q_item(&parent_gpu->isr.bottom_half_q,
|
||||
&parent_gpu->isr.non_replayable_faults.bottom_half_q_item) != 0;
|
||||
// Non-replayable_faults are stored in a synchronized circular queue
|
||||
// shared by RM/UVM. Therefore, we can query the number of pending
|
||||
// faults. This type of faults are not replayed and since RM advances
|
||||
// GET to PUT when copying the fault packets to the queue, no further
|
||||
// interrupts will be triggered by the gpu and faults may stay
|
||||
// unserviced. Therefore, if there is a fault in the queue, we schedule
|
||||
// a bottom half unconditionally.
|
||||
if (!uvm_gpu_non_replayable_faults_pending(parent_gpu))
|
||||
return 0;
|
||||
|
||||
// If the q_item did not get scheduled because it was already
|
||||
// queued, that instance will handle the pending faults. Just
|
||||
// drop the GPU kref.
|
||||
if (!scheduled)
|
||||
uvm_parent_gpu_kref_put(parent_gpu);
|
||||
nv_kref_get(&parent_gpu->gpu_kref);
|
||||
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
scheduled = nv_kthread_q_schedule_q_item(&parent_gpu->isr.bottom_half_q,
|
||||
&parent_gpu->isr.non_replayable_faults.bottom_half_q_item) != 0;
|
||||
|
||||
return 0;
|
||||
// If the q_item did not get scheduled because it was already
|
||||
// queued, that instance will handle the pending faults. Just
|
||||
// drop the GPU kref.
|
||||
if (!scheduled)
|
||||
uvm_parent_gpu_kref_put(parent_gpu);
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
static unsigned schedule_access_counters_handler(uvm_parent_gpu_t *parent_gpu)
|
||||
{
|
||||
uvm_assert_spinlock_locked(&parent_gpu->isr.interrupts_lock);
|
||||
|
||||
if (parent_gpu->isr.is_suspended)
|
||||
return 0;
|
||||
|
||||
if (!parent_gpu->isr.access_counters.handling_ref_count)
|
||||
return 0;
|
||||
|
||||
if (down_trylock(&parent_gpu->isr.access_counters.service_lock.sem))
|
||||
if (down_trylock(&parent_gpu->isr.access_counters.service_lock.sem) != 0)
|
||||
return 0;
|
||||
|
||||
if (!uvm_gpu_access_counters_pending(parent_gpu)) {
|
||||
@@ -199,7 +209,7 @@ static NV_STATUS uvm_isr_top_half(const NvProcessorUuid *gpu_uuid)
|
||||
{
|
||||
uvm_parent_gpu_t *parent_gpu;
|
||||
unsigned num_handlers_scheduled = 0;
|
||||
NV_STATUS status;
|
||||
NV_STATUS status = NV_OK;
|
||||
|
||||
if (!in_interrupt() && in_atomic()) {
|
||||
// Early-out if we're not in interrupt context, but memory allocations
|
||||
@@ -238,18 +248,15 @@ static NV_STATUS uvm_isr_top_half(const NvProcessorUuid *gpu_uuid)
|
||||
|
||||
++parent_gpu->isr.interrupt_count;
|
||||
|
||||
if (parent_gpu->isr.is_suspended) {
|
||||
status = NV_ERR_NO_INTR_PENDING;
|
||||
}
|
||||
else {
|
||||
num_handlers_scheduled += schedule_replayable_faults_handler(parent_gpu);
|
||||
num_handlers_scheduled += schedule_non_replayable_faults_handler(parent_gpu);
|
||||
num_handlers_scheduled += schedule_access_counters_handler(parent_gpu);
|
||||
num_handlers_scheduled += schedule_replayable_faults_handler(parent_gpu);
|
||||
num_handlers_scheduled += schedule_non_replayable_faults_handler(parent_gpu);
|
||||
num_handlers_scheduled += schedule_access_counters_handler(parent_gpu);
|
||||
|
||||
if (num_handlers_scheduled == 0)
|
||||
status = NV_WARN_MORE_PROCESSING_REQUIRED;
|
||||
if (num_handlers_scheduled == 0) {
|
||||
if (parent_gpu->isr.is_suspended)
|
||||
status = NV_ERR_NO_INTR_PENDING;
|
||||
else
|
||||
status = NV_OK;
|
||||
status = NV_WARN_MORE_PROCESSING_REQUIRED;
|
||||
}
|
||||
|
||||
uvm_spin_unlock_irqrestore(&parent_gpu->isr.interrupts_lock);
|
||||
@@ -511,6 +518,9 @@ static void replayable_faults_isr_bottom_half(void *args)
|
||||
uvm_gpu_replayable_faults_isr_unlock(parent_gpu);
|
||||
|
||||
put_kref:
|
||||
// It is OK to drop a reference on the parent GPU if a bottom half has
|
||||
// been retriggered within uvm_gpu_replayable_faults_isr_unlock, because the
|
||||
// rescheduling added an additional reference.
|
||||
uvm_parent_gpu_kref_put(parent_gpu);
|
||||
}
|
||||
|
||||
@@ -591,6 +601,51 @@ static void access_counters_isr_bottom_half_entry(void *args)
|
||||
UVM_ENTRY_VOID(access_counters_isr_bottom_half(args));
|
||||
}
|
||||
|
||||
static void replayable_faults_retrigger_bottom_half(uvm_parent_gpu_t *parent_gpu)
{
    bool retrigger = false;

    // When Confidential Computing is enabled, UVM does not (indirectly) trigger
    // the replayable fault interrupt by updating GET. This is because, in this
    // configuration, GET is a dummy register used to inform GSP-RM (the owner
    // of the HW replayable fault buffer) of the latest entry consumed by the
    // UVM driver. The real GET register is owned by GSP-RM.
    //
    // The retriggering of a replayable faults bottom half therefore happens
    // manually, by scheduling a bottom half for later if there is any pending
    // work in the fault buffer accessible by UVM. The retriggering addresses
    // two problematic scenarios caused by GET updates not setting any
    // interrupt:
    //
    // (1) UVM didn't process all the entries up to cached PUT
    //
    // (2) UVM did process all the entries up to cached PUT, but GSP-RM
    //     added new entries such that cached PUT is out-of-date
    //
    // In both cases, re-enablement of interrupts would have caused the
    // replayable fault to be triggered in a non-CC setup, because the updated
    // value of GET is different from PUT. But this is not the case in
    // Confidential Computing, so a bottom half needs to be manually scheduled
    // in order to ensure that all faults are serviced.
    //
    // While in the typical case the retriggering happens within a replayable
    // fault bottom half, it can also happen within a non-interrupt path such as
    // uvm_gpu_fault_buffer_flush.
    if (uvm_conf_computing_mode_enabled_parent(parent_gpu))
        retrigger = true;

    if (!retrigger)
        return;

    uvm_spin_lock_irqsave(&parent_gpu->isr.interrupts_lock);

    // If there is pending work, schedule a replayable faults bottom
    // half. It is valid for a bottom half (q_item) to reschedule itself.
    (void) schedule_replayable_faults_handler(parent_gpu);

    uvm_spin_unlock_irqrestore(&parent_gpu->isr.interrupts_lock);
}

void uvm_gpu_replayable_faults_isr_lock(uvm_parent_gpu_t *parent_gpu)
|
||||
{
|
||||
UVM_ASSERT(nv_kref_read(&parent_gpu->gpu_kref) > 0);
|
||||
@@ -632,9 +687,9 @@ void uvm_gpu_replayable_faults_isr_unlock(uvm_parent_gpu_t *parent_gpu)
|
||||
// service_lock mutex is released.
|
||||
|
||||
if (parent_gpu->isr.replayable_faults.handling) {
|
||||
// Turn page fault interrupts back on, unless remove_gpu() has already removed this GPU
|
||||
// from the GPU table. remove_gpu() indicates that situation by setting
|
||||
// gpu->replayable_faults.handling to false.
|
||||
// Turn page fault interrupts back on, unless remove_gpu() has already
|
||||
// removed this GPU from the GPU table. remove_gpu() indicates that
|
||||
// situation by setting gpu->replayable_faults.handling to false.
|
||||
//
|
||||
// This path can only be taken from the bottom half. User threads
|
||||
// calling this function must have previously retained the GPU, so they
|
||||
@@ -671,6 +726,8 @@ void uvm_gpu_replayable_faults_isr_unlock(uvm_parent_gpu_t *parent_gpu)
|
||||
uvm_up_out_of_order(&parent_gpu->isr.replayable_faults.service_lock);
|
||||
|
||||
uvm_spin_unlock_irqrestore(&parent_gpu->isr.interrupts_lock);
|
||||
|
||||
replayable_faults_retrigger_bottom_half(parent_gpu);
|
||||
}
|
||||
|
||||
void uvm_gpu_non_replayable_faults_isr_lock(uvm_parent_gpu_t *parent_gpu)
|
||||
|
||||
@@ -435,6 +435,11 @@ static NV_STATUS service_managed_fault_in_block(uvm_gpu_t *gpu,
|
||||
service_context->operation = UVM_SERVICE_OPERATION_NON_REPLAYABLE_FAULTS;
|
||||
service_context->num_retries = 0;
|
||||
|
||||
if (uvm_va_block_is_hmm(va_block)) {
|
||||
uvm_hmm_service_context_init(service_context);
|
||||
uvm_hmm_migrate_begin_wait(va_block);
|
||||
}
|
||||
|
||||
uvm_mutex_lock(&va_block->lock);
|
||||
|
||||
status = UVM_VA_BLOCK_RETRY_LOCKED(va_block, &va_block_retry,
|
||||
@@ -449,6 +454,9 @@ static NV_STATUS service_managed_fault_in_block(uvm_gpu_t *gpu,
|
||||
|
||||
uvm_mutex_unlock(&va_block->lock);
|
||||
|
||||
if (uvm_va_block_is_hmm(va_block))
|
||||
uvm_hmm_migrate_finish(va_block);
|
||||
|
||||
return status == NV_OK? tracker_status: status;
|
||||
}
|
||||
|
||||
@@ -512,6 +520,14 @@ static void schedule_kill_channel(uvm_gpu_t *gpu,
|
||||
&user_channel->kill_channel.kill_channel_q_item);
|
||||
}
|
||||
|
||||
static void service_fault_fatal(uvm_fault_buffer_entry_t *fault_entry, NV_STATUS status)
|
||||
{
|
||||
UVM_ASSERT(fault_entry->fault_access_type != UVM_FAULT_ACCESS_TYPE_PREFETCH);
|
||||
|
||||
fault_entry->is_fatal = true;
|
||||
fault_entry->fatal_reason = uvm_tools_status_to_fatal_fault_reason(status);
|
||||
}
|
||||
|
||||
static NV_STATUS service_non_managed_fault(uvm_gpu_va_space_t *gpu_va_space,
|
||||
struct mm_struct *mm,
|
||||
uvm_fault_buffer_entry_t *fault_entry,
|
||||
@@ -521,6 +537,7 @@ static NV_STATUS service_non_managed_fault(uvm_gpu_va_space_t *gpu_va_space,
|
||||
uvm_non_replayable_fault_buffer_info_t *non_replayable_faults = &gpu->parent->fault_buffer_info.non_replayable;
|
||||
uvm_ats_fault_invalidate_t *ats_invalidate = &non_replayable_faults->ats_invalidate;
|
||||
NV_STATUS status = lookup_status;
|
||||
NV_STATUS fatal_fault_status = NV_ERR_INVALID_ADDRESS;
|
||||
|
||||
UVM_ASSERT(!fault_entry->is_fatal);
|
||||
|
||||
@@ -537,27 +554,63 @@ static NV_STATUS service_non_managed_fault(uvm_gpu_va_space_t *gpu_va_space,
|
||||
return status;
|
||||
|
||||
if (uvm_ats_can_service_faults(gpu_va_space, mm)) {
|
||||
struct vm_area_struct *vma;
|
||||
uvm_va_range_t *va_range_next;
|
||||
NvU64 fault_address = fault_entry->fault_address;
|
||||
uvm_fault_access_type_t fault_access_type = fault_entry->fault_access_type;
|
||||
uvm_ats_fault_context_t *ats_context = &non_replayable_faults->ats_context;
|
||||
|
||||
uvm_page_mask_zero(&ats_context->read_fault_mask);
|
||||
uvm_page_mask_zero(&ats_context->write_fault_mask);
|
||||
|
||||
ats_context->client_type = UVM_FAULT_CLIENT_TYPE_HUB;
|
||||
|
||||
ats_invalidate->write_faults_in_batch = false;
|
||||
|
||||
// The VA isn't managed. See if ATS knows about it.
|
||||
status = uvm_ats_service_fault_entry(gpu_va_space, fault_entry, ats_invalidate);
|
||||
va_range_next = uvm_va_space_iter_first(gpu_va_space->va_space, fault_entry->fault_address, ~0ULL);
|
||||
|
||||
// Invalidate ATS TLB entries if needed
|
||||
if (status == NV_OK) {
|
||||
status = uvm_ats_invalidate_tlbs(gpu_va_space,
|
||||
ats_invalidate,
|
||||
&non_replayable_faults->fault_service_tracker);
|
||||
// The VA isn't managed. See if ATS knows about it.
|
||||
vma = find_vma_intersection(mm, fault_address, fault_address + 1);
|
||||
if (!vma || uvm_ats_check_in_gmmu_region(gpu_va_space->va_space, fault_address, va_range_next)) {
|
||||
|
||||
// Do not return error due to logical errors in the application
|
||||
status = NV_OK;
|
||||
}
|
||||
else {
|
||||
NvU64 base = UVM_VA_BLOCK_ALIGN_DOWN(fault_address);
|
||||
uvm_page_mask_t *faults_serviced_mask = &ats_context->faults_serviced_mask;
|
||||
uvm_page_index_t page_index = (fault_address - base) / PAGE_SIZE;
|
||||
uvm_page_mask_t *fault_mask = (fault_access_type >= UVM_FAULT_ACCESS_TYPE_WRITE) ?
|
||||
&ats_context->write_fault_mask :
|
||||
&ats_context->read_fault_mask;
|
||||
|
||||
uvm_page_mask_set(fault_mask, page_index);
|
||||
|
||||
status = uvm_ats_service_faults(gpu_va_space, vma, base, ats_context);
|
||||
if (status == NV_OK) {
|
||||
// Invalidate ATS TLB entries if needed
|
||||
if (uvm_page_mask_test(faults_serviced_mask, page_index)) {
|
||||
status = uvm_ats_invalidate_tlbs(gpu_va_space,
|
||||
ats_invalidate,
|
||||
&non_replayable_faults->fault_service_tracker);
|
||||
fatal_fault_status = NV_OK;
|
||||
}
|
||||
}
|
||||
else {
|
||||
fatal_fault_status = status;
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
UVM_ASSERT(fault_entry->fault_access_type != UVM_FAULT_ACCESS_TYPE_PREFETCH);
|
||||
fault_entry->is_fatal = true;
|
||||
fault_entry->fatal_reason = uvm_tools_status_to_fatal_fault_reason(status);
|
||||
fatal_fault_status = status;
|
||||
|
||||
// Do not return error due to logical errors in the application
|
||||
status = NV_OK;
|
||||
}
|
||||
|
||||
if (fatal_fault_status != NV_OK)
|
||||
service_fault_fatal(fault_entry, fatal_fault_status);
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
@@ -670,6 +723,8 @@ void uvm_gpu_service_non_replayable_fault_buffer(uvm_gpu_t *gpu)
|
||||
// Unlike replayable faults, we do not batch up and preprocess
|
||||
// non-replayable faults since getting multiple faults on the same
|
||||
// memory region is not very likely
|
||||
//
|
||||
// TODO: Bug 2103669: [UVM/ATS] Optimize ATS fault servicing
|
||||
for (i = 0; i < cached_faults; ++i) {
|
||||
status = service_fault(gpu, &gpu->parent->fault_buffer_info.non_replayable.fault_cache[i]);
|
||||
if (status != NV_OK)
|
||||
|
||||
@@ -23,6 +23,7 @@
|
||||
|
||||
#include "linux/sort.h"
|
||||
#include "nv_uvm_interface.h"
|
||||
#include "uvm_common.h"
|
||||
#include "uvm_linux.h"
|
||||
#include "uvm_global.h"
|
||||
#include "uvm_gpu_replayable_faults.h"
|
||||
@@ -296,6 +297,19 @@ void uvm_gpu_fault_buffer_deinit(uvm_parent_gpu_t *parent_gpu)
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: Bug 4098289: this function can be removed, and the calls to it replaced
|
||||
// with calls to uvm_conf_computing_mode_enabled_parent, once UVM ownership is
|
||||
// dictated by Confidential Computing enablement. Currently we support a
|
||||
// non-production scenario in which Confidential Computing is enabled, but
|
||||
// UVM still owns the replayable fault buffer.
|
||||
bool uvm_parent_gpu_replayable_fault_buffer_is_uvm_owned(uvm_parent_gpu_t *parent_gpu)
|
||||
{
|
||||
if (uvm_conf_computing_mode_enabled_parent(parent_gpu))
|
||||
return parent_gpu->fault_buffer_info.rm_info.replayable.bUvmOwnsHwFaultBuffer;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool uvm_gpu_replayable_faults_pending(uvm_parent_gpu_t *parent_gpu)
|
||||
{
|
||||
uvm_replayable_fault_buffer_info_t *replayable_faults = &parent_gpu->fault_buffer_info.replayable;
|
||||
@@ -529,6 +543,35 @@ static void write_get(uvm_parent_gpu_t *parent_gpu, NvU32 get)
|
||||
parent_gpu->fault_buffer_hal->write_get(parent_gpu, get);
|
||||
}
|
||||
|
||||
static NV_STATUS hw_fault_buffer_flush_locked(uvm_parent_gpu_t *parent_gpu)
{
    NV_STATUS status = NV_OK;

    // When Confidential Computing is enabled, GSP-RM owns the HW replayable
    // fault buffer. Flushing the fault buffer implies flushing both the HW
    // buffer (using an RM API) and the SW buffer accessible by UVM (the
    // "shadow" buffer).
    //
    // The HW buffer needs to be flushed first. This is because, once that
    // flush completes, any faults that were present in the HW buffer when
    // fault_buffer_flush_locked is called are now either flushed from the HW
    // buffer, or are present in the shadow buffer and are about to be discarded
    // too.
    if (!uvm_conf_computing_mode_enabled_parent(parent_gpu))
        return NV_OK;

    // nvUvmInterfaceFlushReplayableFaultBuffer relies on the caller to ensure
    // serialization for a given GPU.
    UVM_ASSERT(uvm_sem_is_locked(&parent_gpu->isr.replayable_faults.service_lock));

    // Flush the HW replayable buffer owned by GSP-RM.
    status = nvUvmInterfaceFlushReplayableFaultBuffer(parent_gpu->rm_device);

    UVM_ASSERT(status == NV_OK);

    return status;
}

static NV_STATUS fault_buffer_flush_locked(uvm_gpu_t *gpu,
|
||||
uvm_gpu_buffer_flush_mode_t flush_mode,
|
||||
uvm_fault_replay_type_t fault_replay,
|
||||
@@ -537,23 +580,37 @@ static NV_STATUS fault_buffer_flush_locked(uvm_gpu_t *gpu,
|
||||
NvU32 get;
|
||||
NvU32 put;
|
||||
uvm_spin_loop_t spin;
|
||||
uvm_replayable_fault_buffer_info_t *replayable_faults = &gpu->parent->fault_buffer_info.replayable;
|
||||
uvm_parent_gpu_t *parent_gpu = gpu->parent;
|
||||
uvm_replayable_fault_buffer_info_t *replayable_faults = &parent_gpu->fault_buffer_info.replayable;
|
||||
NV_STATUS status;
|
||||
|
||||
UVM_ASSERT(uvm_sem_is_locked(&gpu->parent->isr.replayable_faults.service_lock));
|
||||
UVM_ASSERT(gpu->parent->replayable_faults_supported);
|
||||
UVM_ASSERT(uvm_sem_is_locked(&parent_gpu->isr.replayable_faults.service_lock));
|
||||
UVM_ASSERT(parent_gpu->replayable_faults_supported);
|
||||
|
||||
// Wait for the prior replay to flush out old fault messages
|
||||
if (flush_mode == UVM_GPU_BUFFER_FLUSH_MODE_WAIT_UPDATE_PUT) {
|
||||
status = uvm_tracker_wait(&replayable_faults->replay_tracker);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
}
|
||||
|
||||
// Read PUT pointer from the GPU if requested
|
||||
if (flush_mode == UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT)
|
||||
replayable_faults->cached_put = gpu->parent->fault_buffer_hal->read_put(gpu->parent);
|
||||
if (flush_mode == UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT || flush_mode == UVM_GPU_BUFFER_FLUSH_MODE_WAIT_UPDATE_PUT) {
|
||||
status = hw_fault_buffer_flush_locked(parent_gpu);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
replayable_faults->cached_put = parent_gpu->fault_buffer_hal->read_put(parent_gpu);
|
||||
}
|
||||
|
||||
get = replayable_faults->cached_get;
|
||||
put = replayable_faults->cached_put;
|
||||
|
||||
while (get != put) {
|
||||
// Wait until valid bit is set
|
||||
UVM_SPIN_WHILE(!gpu->parent->fault_buffer_hal->entry_is_valid(gpu->parent, get), &spin);
|
||||
UVM_SPIN_WHILE(!parent_gpu->fault_buffer_hal->entry_is_valid(parent_gpu, get), &spin);
|
||||
|
||||
gpu->parent->fault_buffer_hal->entry_clear_valid(gpu->parent, get);
|
||||
parent_gpu->fault_buffer_hal->entry_clear_valid(parent_gpu, get);
|
||||
++get;
|
||||
if (get == replayable_faults->max_faults)
|
||||
get = 0;
|
||||
@@ -575,7 +632,7 @@ NV_STATUS uvm_gpu_fault_buffer_flush(uvm_gpu_t *gpu)
|
||||
uvm_gpu_replayable_faults_isr_lock(gpu->parent);
|
||||
|
||||
status = fault_buffer_flush_locked(gpu,
|
||||
UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT,
|
||||
UVM_GPU_BUFFER_FLUSH_MODE_WAIT_UPDATE_PUT,
|
||||
UVM_FAULT_REPLAY_TYPE_START,
|
||||
NULL);
|
||||
|
||||
@@ -956,6 +1013,10 @@ static NV_STATUS translate_instance_ptrs(uvm_gpu_t *gpu,
|
||||
// If the channel is gone then we're looking at a stale fault entry.
|
||||
// The fault must have been resolved already (serviced or
|
||||
// cancelled), so we can just flush the fault buffer.
|
||||
//
|
||||
// No need to use UVM_GPU_BUFFER_FLUSH_MODE_WAIT_UPDATE_PUT since
|
||||
// there was a context preemption for the entries we want to flush,
|
||||
// meaning PUT must reflect them.
|
||||
status = fault_buffer_flush_locked(gpu,
|
||||
UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT,
|
||||
UVM_FAULT_REPLAY_TYPE_START,
|
||||
@@ -1047,7 +1108,67 @@ static bool check_fault_entry_duplicate(const uvm_fault_buffer_entry_t *current_
|
||||
return is_duplicate;
|
||||
}
|
||||
|
||||
static void fault_entry_duplicate_flags(uvm_fault_buffer_entry_t *current_entry,
|
||||
static void update_batch_and_notify_fault(uvm_gpu_t *gpu,
|
||||
uvm_fault_service_batch_context_t *batch_context,
|
||||
uvm_va_block_t *va_block,
|
||||
uvm_processor_id_t preferred_location,
|
||||
uvm_fault_buffer_entry_t *current_entry,
|
||||
bool is_duplicate)
|
||||
{
|
||||
if (is_duplicate)
|
||||
batch_context->num_duplicate_faults += current_entry->num_instances;
|
||||
else
|
||||
batch_context->num_duplicate_faults += current_entry->num_instances - 1;
|
||||
|
||||
uvm_perf_event_notify_gpu_fault(¤t_entry->va_space->perf_events,
|
||||
va_block,
|
||||
gpu->id,
|
||||
preferred_location,
|
||||
current_entry,
|
||||
batch_context->batch_id,
|
||||
is_duplicate);
|
||||
}
|
||||
|
||||
static void mark_fault_invalid_prefetch(uvm_fault_service_batch_context_t *batch_context,
|
||||
uvm_fault_buffer_entry_t *fault_entry)
|
||||
{
|
||||
fault_entry->is_invalid_prefetch = true;
|
||||
|
||||
// For block faults, the following counter might be updated more than once
|
||||
// for the same fault if block_context->num_retries > 0. As a result, this
|
||||
// counter might be higher than the actual count. In order for this counter
|
||||
// to be always accurate, block_context needs to passed down the stack from
|
||||
// all callers. But since num_retries > 0 case is uncommon and imprecise
|
||||
// invalid_prefetch counter doesn't affect functionality (other than
|
||||
// disabling prefetching if the counter indicates lots of invalid prefetch
|
||||
// faults), this is ok.
|
||||
batch_context->num_invalid_prefetch_faults += fault_entry->num_instances;
|
||||
}
|
||||
|
||||
static void mark_fault_throttled(uvm_fault_service_batch_context_t *batch_context,
|
||||
uvm_fault_buffer_entry_t *fault_entry)
|
||||
{
|
||||
fault_entry->is_throttled = true;
|
||||
batch_context->has_throttled_faults = true;
|
||||
}
|
||||
|
||||
static void mark_fault_fatal(uvm_fault_service_batch_context_t *batch_context,
|
||||
uvm_fault_buffer_entry_t *fault_entry,
|
||||
UvmEventFatalReason fatal_reason,
|
||||
uvm_fault_cancel_va_mode_t cancel_va_mode)
|
||||
{
|
||||
uvm_fault_utlb_info_t *utlb = &batch_context->utlbs[fault_entry->fault_source.utlb_id];
|
||||
|
||||
fault_entry->is_fatal = true;
|
||||
fault_entry->fatal_reason = fatal_reason;
|
||||
fault_entry->replayable.cancel_va_mode = cancel_va_mode;
|
||||
|
||||
utlb->has_fatal_faults = true;
|
||||
batch_context->has_fatal_faults = true;
|
||||
}
|
||||
|
||||
static void fault_entry_duplicate_flags(uvm_fault_service_batch_context_t *batch_context,
|
||||
uvm_fault_buffer_entry_t *current_entry,
|
||||
const uvm_fault_buffer_entry_t *previous_entry)
|
||||
{
|
||||
UVM_ASSERT(previous_entry);
|
||||
@@ -1056,37 +1177,11 @@ static void fault_entry_duplicate_flags(uvm_fault_buffer_entry_t *current_entry,
|
||||
// Propagate the is_invalid_prefetch flag across all prefetch faults
|
||||
// on the page
|
||||
if (previous_entry->is_invalid_prefetch)
|
||||
current_entry->is_invalid_prefetch = true;
|
||||
mark_fault_invalid_prefetch(batch_context, current_entry);
|
||||
|
||||
// If a page is throttled, all faults on the page must be skipped
|
||||
if (previous_entry->is_throttled)
|
||||
current_entry->is_throttled = true;
|
||||
}
|
||||
|
||||
static void update_batch_context(uvm_fault_service_batch_context_t *batch_context,
|
||||
uvm_fault_buffer_entry_t *current_entry,
|
||||
const uvm_fault_buffer_entry_t *previous_entry)
|
||||
{
|
||||
bool is_duplicate = check_fault_entry_duplicate(current_entry, previous_entry);
|
||||
uvm_fault_utlb_info_t *utlb = &batch_context->utlbs[current_entry->fault_source.utlb_id];
|
||||
|
||||
UVM_ASSERT(utlb->num_pending_faults > 0);
|
||||
|
||||
if (is_duplicate)
|
||||
batch_context->num_duplicate_faults += current_entry->num_instances;
|
||||
else
|
||||
batch_context->num_duplicate_faults += current_entry->num_instances - 1;
|
||||
|
||||
if (current_entry->is_invalid_prefetch)
|
||||
batch_context->num_invalid_prefetch_faults += current_entry->num_instances;
|
||||
|
||||
if (current_entry->is_fatal) {
|
||||
utlb->has_fatal_faults = true;
|
||||
batch_context->has_fatal_faults = true;
|
||||
}
|
||||
|
||||
if (current_entry->is_throttled)
|
||||
batch_context->has_throttled_faults = true;
|
||||
mark_fault_throttled(batch_context, current_entry);
|
||||
}
|
||||
|
||||
// This function computes the maximum access type that can be serviced for the
|
||||
@@ -1109,12 +1204,17 @@ static void update_batch_context(uvm_fault_service_batch_context_t *batch_contex
|
||||
// Return values:
|
||||
// - service_access_type: highest access type that can be serviced.
|
||||
static uvm_fault_access_type_t check_fault_access_permissions(uvm_gpu_t *gpu,
|
||||
uvm_fault_service_batch_context_t *batch_context,
|
||||
uvm_va_block_t *va_block,
|
||||
uvm_va_block_context_t *va_block_context,
|
||||
uvm_service_block_context_t *service_block_context,
|
||||
uvm_fault_buffer_entry_t *fault_entry,
|
||||
bool allow_migration)
|
||||
{
|
||||
NV_STATUS perm_status;
|
||||
UvmEventFatalReason fatal_reason;
|
||||
uvm_fault_cancel_va_mode_t cancel_va_mode;
|
||||
uvm_fault_access_type_t ret = UVM_FAULT_ACCESS_TYPE_COUNT;
|
||||
uvm_va_block_context_t *va_block_context = &service_block_context->block_context;
|
||||
|
||||
perm_status = uvm_va_block_check_logical_permissions(va_block,
|
||||
va_block_context,
|
||||
@@ -1127,16 +1227,20 @@ static uvm_fault_access_type_t check_fault_access_permissions(uvm_gpu_t *gpu,
|
||||
return fault_entry->fault_access_type;
|
||||
|
||||
if (fault_entry->fault_access_type == UVM_FAULT_ACCESS_TYPE_PREFETCH) {
|
||||
fault_entry->is_invalid_prefetch = true;
|
||||
return UVM_FAULT_ACCESS_TYPE_COUNT;
|
||||
// Only update the count the first time since logical permissions cannot
|
||||
// change while we hold the VA space lock
|
||||
// TODO: Bug 1750144: That might not be true with HMM.
|
||||
if (service_block_context->num_retries == 0)
|
||||
mark_fault_invalid_prefetch(batch_context, fault_entry);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
// At this point we know that some fault instances cannot be serviced
|
||||
fault_entry->is_fatal = true;
|
||||
fault_entry->fatal_reason = uvm_tools_status_to_fatal_fault_reason(perm_status);
|
||||
fatal_reason = uvm_tools_status_to_fatal_fault_reason(perm_status);
|
||||
|
||||
if (fault_entry->fault_access_type > UVM_FAULT_ACCESS_TYPE_READ) {
|
||||
fault_entry->replayable.cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_WRITE_AND_ATOMIC;
|
||||
cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_WRITE_AND_ATOMIC;
|
||||
|
||||
// If there are pending read accesses on the same page, we have to
|
||||
// service them before we can cancel the write/atomic faults. So we
|
||||
@@ -1149,19 +1253,23 @@ static uvm_fault_access_type_t check_fault_access_permissions(uvm_gpu_t *gpu,
|
||||
fault_entry->fault_address),
|
||||
UVM_FAULT_ACCESS_TYPE_READ,
|
||||
allow_migration);
|
||||
if (perm_status == NV_OK)
|
||||
return UVM_FAULT_ACCESS_TYPE_READ;
|
||||
|
||||
// If that didn't succeed, cancel all faults
|
||||
fault_entry->replayable.cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_ALL;
|
||||
fault_entry->fatal_reason = uvm_tools_status_to_fatal_fault_reason(perm_status);
|
||||
if (perm_status == NV_OK) {
|
||||
ret = UVM_FAULT_ACCESS_TYPE_READ;
|
||||
}
|
||||
else {
|
||||
// Read accesses didn't succeed, cancel all faults
|
||||
cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_ALL;
|
||||
fatal_reason = uvm_tools_status_to_fatal_fault_reason(perm_status);
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
fault_entry->replayable.cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_ALL;
|
||||
cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_ALL;
|
||||
}
|
||||
|
||||
return UVM_FAULT_ACCESS_TYPE_COUNT;
|
||||
mark_fault_fatal(batch_context, fault_entry, fatal_reason, cancel_va_mode);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
// We notify the fault event for all faults within the block so that the
|
||||
@@ -1258,24 +1366,26 @@ static NV_STATUS service_fault_batch_block_locked(uvm_gpu_t *gpu,
|
||||
is_duplicate = check_fault_entry_duplicate(current_entry, previous_entry);
|
||||
}
|
||||
|
||||
// Only update counters the first time since logical permissions cannot
|
||||
// change while we hold the VA space lock.
|
||||
// TODO: Bug 1750144: That might not be true with HMM.
|
||||
if (block_context->num_retries == 0) {
|
||||
uvm_perf_event_notify_gpu_fault(&va_space->perf_events,
|
||||
va_block,
|
||||
gpu->id,
|
||||
block_context->block_context.policy->preferred_location,
|
||||
current_entry,
|
||||
batch_context->batch_id,
|
||||
is_duplicate);
|
||||
update_batch_and_notify_fault(gpu,
batch_context,
va_block,
block_context->block_context.policy->preferred_location,
current_entry,
is_duplicate);
}

// Service the most intrusive fault per page, only. Waive the rest
if (is_duplicate) {
fault_entry_duplicate_flags(current_entry, previous_entry);
fault_entry_duplicate_flags(batch_context, current_entry, previous_entry);

// The previous fault was non-fatal so the page has been already
// serviced
if (!previous_entry->is_fatal)
goto next;
continue;
}

// Ensure that the migratability iterator covers the current fault
@@ -1286,15 +1396,16 @@ static NV_STATUS service_fault_batch_block_locked(uvm_gpu_t *gpu,
UVM_ASSERT(iter.start <= current_entry->fault_address && iter.end >= current_entry->fault_address);

service_access_type = check_fault_access_permissions(gpu,
batch_context,
va_block,
&block_context->block_context,
block_context,
current_entry,
iter.migratable);

// Do not exit early due to logical errors such as access permission
// violation.
if (service_access_type == UVM_FAULT_ACCESS_TYPE_COUNT)
goto next;
continue;

if (service_access_type != current_entry->fault_access_type) {
// Some of the fault instances cannot be serviced due to invalid
@@ -1313,15 +1424,21 @@ static NV_STATUS service_fault_batch_block_locked(uvm_gpu_t *gpu,
page_index,
gpu->id,
uvm_fault_access_type_to_prot(service_access_type)))
goto next;
continue;

thrashing_hint = uvm_perf_thrashing_get_hint(va_block, current_entry->fault_address, gpu->id);
if (thrashing_hint.type == UVM_PERF_THRASHING_HINT_TYPE_THROTTLE) {
// Throttling is implemented by sleeping in the fault handler on
// the CPU and by continuing to process faults on other pages on
// the GPU
current_entry->is_throttled = true;
goto next;
//
// Only update the flag the first time since logical permissions
// cannot change while we hold the VA space lock.
// TODO: Bug 1750144: That might not be true with HMM.
if (block_context->num_retries == 0)
mark_fault_throttled(batch_context, current_entry);

continue;
}
else if (thrashing_hint.type == UVM_PERF_THRASHING_HINT_TYPE_PIN) {
if (block_context->thrashing_pin_count++ == 0)
@@ -1361,13 +1478,6 @@ static NV_STATUS service_fault_batch_block_locked(uvm_gpu_t *gpu,
first_page_index = page_index;
if (page_index > last_page_index)
last_page_index = page_index;

next:
// Only update counters the first time since logical permissions cannot
// change while we hold the VA space lock
// TODO: Bug 1750144: That might not be true with HMM.
if (block_context->num_retries == 0)
update_batch_context(batch_context, current_entry, previous_entry);
}

// Apply the changes computed in the fault service block context, if there
@@ -1408,6 +1518,9 @@ static NV_STATUS service_fault_batch_block(uvm_gpu_t *gpu,
fault_block_context->operation = UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS;
fault_block_context->num_retries = 0;

if (uvm_va_block_is_hmm(va_block))
uvm_hmm_migrate_begin_wait(va_block);

uvm_mutex_lock(&va_block->lock);

status = UVM_VA_BLOCK_RETRY_LOCKED(va_block, &va_block_retry,
@@ -1422,6 +1535,9 @@ static NV_STATUS service_fault_batch_block(uvm_gpu_t *gpu,

uvm_mutex_unlock(&va_block->lock);

if (uvm_va_block_is_hmm(va_block))
uvm_hmm_migrate_finish(va_block);

return status == NV_OK? tracker_status: status;
}

@@ -1435,54 +1551,11 @@ typedef enum
FAULT_SERVICE_MODE_CANCEL,
} fault_service_mode_t;

static NV_STATUS service_fault_batch_ats(uvm_gpu_va_space_t *gpu_va_space,
struct mm_struct *mm,
uvm_fault_service_batch_context_t *batch_context,
NvU32 first_fault_index,
NvU32 *block_faults)
{
NV_STATUS status;
uvm_gpu_t *gpu = gpu_va_space->gpu;
uvm_ats_fault_invalidate_t *ats_invalidate = &gpu->parent->fault_buffer_info.replayable.ats_invalidate;
uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[first_fault_index];
const uvm_fault_buffer_entry_t *previous_entry = first_fault_index > 0 ?
batch_context->ordered_fault_cache[first_fault_index - 1] : NULL;
bool is_duplicate = check_fault_entry_duplicate(current_entry, previous_entry);

if (is_duplicate)
fault_entry_duplicate_flags(current_entry, previous_entry);

// Generate fault events for all fault packets
uvm_perf_event_notify_gpu_fault(&current_entry->va_space->perf_events,
NULL,
gpu->id,
UVM_ID_INVALID,
current_entry,
batch_context->batch_id,
is_duplicate);

// The VA isn't managed. See if ATS knows about it, unless it is a
// duplicate and the previous fault was non-fatal so the page has
// already been serviced
//
// TODO: Bug 2103669: Service more than one ATS fault at a time so we
// don't do an unconditional VA range lookup for every ATS fault.
if (!is_duplicate || previous_entry->is_fatal)
status = uvm_ats_service_fault_entry(gpu_va_space, current_entry, ats_invalidate);
else
status = NV_OK;

(*block_faults)++;

update_batch_context(batch_context, current_entry, previous_entry);

return status;
}

static void service_fault_batch_fatal(uvm_gpu_t *gpu,
uvm_fault_service_batch_context_t *batch_context,
NvU32 first_fault_index,
NV_STATUS status,
uvm_fault_cancel_va_mode_t cancel_va_mode,
NvU32 *block_faults)
{
uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[first_fault_index];
@@ -1491,60 +1564,337 @@ static void service_fault_batch_fatal(uvm_gpu_t *gpu,
bool is_duplicate = check_fault_entry_duplicate(current_entry, previous_entry);

if (is_duplicate)
fault_entry_duplicate_flags(current_entry, previous_entry);
fault_entry_duplicate_flags(batch_context, current_entry, previous_entry);

// The VA block cannot be found, set the fatal fault flag,
// unless it is a prefetch fault
if (current_entry->fault_access_type == UVM_FAULT_ACCESS_TYPE_PREFETCH) {
current_entry->is_invalid_prefetch = true;
}
else {
current_entry->is_fatal = true;
current_entry->fatal_reason = uvm_tools_status_to_fatal_fault_reason(status);
current_entry->replayable.cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_ALL;
}

update_batch_context(batch_context, current_entry, previous_entry);

uvm_perf_event_notify_gpu_fault(&current_entry->va_space->perf_events,
NULL,
gpu->id,
UVM_ID_INVALID,
current_entry,
batch_context->batch_id,
is_duplicate);
if (current_entry->fault_access_type == UVM_FAULT_ACCESS_TYPE_PREFETCH)
mark_fault_invalid_prefetch(batch_context, current_entry);
else
mark_fault_fatal(batch_context, current_entry, uvm_tools_status_to_fatal_fault_reason(status), cancel_va_mode);

(*block_faults)++;
}

static void service_fault_batch_fatal_notify(uvm_gpu_t *gpu,
uvm_fault_service_batch_context_t *batch_context,
NvU32 first_fault_index,
NV_STATUS status,
uvm_fault_cancel_va_mode_t cancel_va_mode,
NvU32 *block_faults)
{
uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[first_fault_index];
const uvm_fault_buffer_entry_t *previous_entry = first_fault_index > 0 ?
batch_context->ordered_fault_cache[first_fault_index - 1] : NULL;
bool is_duplicate = check_fault_entry_duplicate(current_entry, previous_entry);

service_fault_batch_fatal(gpu, batch_context, first_fault_index, status, cancel_va_mode, block_faults);

update_batch_and_notify_fault(gpu, batch_context, NULL, UVM_ID_INVALID, current_entry, is_duplicate);
}

static NV_STATUS service_fault_batch_ats_sub_vma(uvm_gpu_va_space_t *gpu_va_space,
struct vm_area_struct *vma,
NvU64 base,
uvm_fault_service_batch_context_t *batch_context,
NvU32 fault_index_start,
NvU32 fault_index_end,
NvU32 *block_faults)
{
NvU32 i;
NV_STATUS status = NV_OK;
uvm_gpu_t *gpu = gpu_va_space->gpu;
uvm_ats_fault_context_t *ats_context = &batch_context->ats_context;
const uvm_page_mask_t *read_fault_mask = &ats_context->read_fault_mask;
const uvm_page_mask_t *write_fault_mask = &ats_context->write_fault_mask;
const uvm_page_mask_t *faults_serviced_mask = &ats_context->faults_serviced_mask;
const uvm_page_mask_t *reads_serviced_mask = &ats_context->reads_serviced_mask;
uvm_page_mask_t *tmp_mask = &ats_context->tmp_mask;

UVM_ASSERT(vma);

ats_context->client_type = UVM_FAULT_CLIENT_TYPE_GPC;

uvm_page_mask_or(tmp_mask, write_fault_mask, read_fault_mask);

status = uvm_ats_service_faults(gpu_va_space, vma, base, &batch_context->ats_context);

UVM_ASSERT(uvm_page_mask_subset(faults_serviced_mask, tmp_mask));

if ((status != NV_OK) || uvm_page_mask_equal(faults_serviced_mask, tmp_mask)) {
(*block_faults) += (fault_index_end - fault_index_start);
return status;
}

// Check faults_serviced_mask and reads_serviced_mask for precise fault
// attribution after calling the ATS servicing routine. The
// errors returned from ATS servicing routine should only be
// global errors such as OOM or ECC. uvm_gpu_service_replayable_faults()
// handles global errors by calling cancel_fault_batch(). Precise
// attribution isn't currently supported in such cases.
//
// Precise fault attribution for global errors can be handled by
// servicing one fault at a time until fault servicing encounters an
// error.
// TODO: Bug 3989244: Precise ATS fault attribution for global errors.
for (i = fault_index_start; i < fault_index_end; i++) {
uvm_page_index_t page_index;
uvm_fault_cancel_va_mode_t cancel_va_mode;
uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[i];
uvm_fault_access_type_t access_type = current_entry->fault_access_type;

page_index = (current_entry->fault_address - base) / PAGE_SIZE;

if (uvm_page_mask_test(faults_serviced_mask, page_index)) {
(*block_faults)++;
continue;
}

if (access_type <= UVM_FAULT_ACCESS_TYPE_READ) {
cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_ALL;
}
else if (access_type >= UVM_FAULT_ACCESS_TYPE_WRITE) {
if (uvm_fault_access_type_mask_test(current_entry->access_type_mask, UVM_FAULT_ACCESS_TYPE_READ) &&
!uvm_page_mask_test(reads_serviced_mask, page_index))
cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_ALL;
else
cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_WRITE_AND_ATOMIC;
}

service_fault_batch_fatal(gpu, batch_context, i, NV_ERR_INVALID_ADDRESS, cancel_va_mode, block_faults);
}

return status;
}

static void start_new_sub_batch(NvU64 *sub_batch_base,
NvU64 address,
NvU32 *sub_batch_fault_index,
NvU32 fault_index,
uvm_ats_fault_context_t *ats_context)
{
uvm_page_mask_zero(&ats_context->read_fault_mask);
uvm_page_mask_zero(&ats_context->write_fault_mask);

*sub_batch_fault_index = fault_index;
*sub_batch_base = UVM_VA_BLOCK_ALIGN_DOWN(address);
}

static NV_STATUS service_fault_batch_ats_sub(uvm_gpu_va_space_t *gpu_va_space,
struct vm_area_struct *vma,
uvm_fault_service_batch_context_t *batch_context,
NvU32 fault_index,
NvU64 outer,
NvU32 *block_faults)
{
NV_STATUS status = NV_OK;
NvU32 i = fault_index;
NvU32 sub_batch_fault_index;
NvU64 sub_batch_base;
uvm_fault_buffer_entry_t *previous_entry = NULL;
uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[i];
uvm_ats_fault_context_t *ats_context = &batch_context->ats_context;
uvm_page_mask_t *read_fault_mask = &ats_context->read_fault_mask;
uvm_page_mask_t *write_fault_mask = &ats_context->write_fault_mask;
uvm_gpu_t *gpu = gpu_va_space->gpu;
bool replay_per_va_block =
(gpu->parent->fault_buffer_info.replayable.replay_policy == UVM_PERF_FAULT_REPLAY_POLICY_BLOCK);

UVM_ASSERT(vma);

outer = min(outer, (NvU64) vma->vm_end);

start_new_sub_batch(&sub_batch_base, current_entry->fault_address, &sub_batch_fault_index, i, ats_context);

do {
uvm_page_index_t page_index;
NvU64 fault_address = current_entry->fault_address;
uvm_fault_access_type_t access_type = current_entry->fault_access_type;
bool is_duplicate = check_fault_entry_duplicate(current_entry, previous_entry);

i++;

update_batch_and_notify_fault(gpu_va_space->gpu,
batch_context,
NULL,
UVM_ID_INVALID,
current_entry,
is_duplicate);

// End of sub-batch. Service faults gathered so far.
if (fault_address >= (sub_batch_base + UVM_VA_BLOCK_SIZE)) {
UVM_ASSERT(!uvm_page_mask_empty(read_fault_mask) || !uvm_page_mask_empty(write_fault_mask));

status = service_fault_batch_ats_sub_vma(gpu_va_space,
vma,
sub_batch_base,
batch_context,
sub_batch_fault_index,
i - 1,
block_faults);
if (status != NV_OK || replay_per_va_block)
break;

start_new_sub_batch(&sub_batch_base, fault_address, &sub_batch_fault_index, i - 1, ats_context);
}

page_index = (fault_address - sub_batch_base) / PAGE_SIZE;

if ((access_type <= UVM_FAULT_ACCESS_TYPE_READ) ||
uvm_fault_access_type_mask_test(current_entry->access_type_mask, UVM_FAULT_ACCESS_TYPE_READ))
uvm_page_mask_set(read_fault_mask, page_index);

if (access_type >= UVM_FAULT_ACCESS_TYPE_WRITE)
uvm_page_mask_set(write_fault_mask, page_index);

previous_entry = current_entry;
current_entry = i < batch_context->num_coalesced_faults ? batch_context->ordered_fault_cache[i] : NULL;

} while (current_entry &&
(current_entry->fault_address < outer) &&
(previous_entry->va_space == current_entry->va_space));

// Service the last sub-batch.
if ((status == NV_OK) && (!uvm_page_mask_empty(read_fault_mask) || !uvm_page_mask_empty(write_fault_mask))) {
status = service_fault_batch_ats_sub_vma(gpu_va_space,
vma,
sub_batch_base,
batch_context,
sub_batch_fault_index,
i,
block_faults);
}

return status;
}

static NV_STATUS service_fault_batch_ats(uvm_gpu_va_space_t *gpu_va_space,
struct mm_struct *mm,
uvm_fault_service_batch_context_t *batch_context,
NvU32 first_fault_index,
NvU64 outer,
NvU32 *block_faults)
{
NvU32 i;
NV_STATUS status = NV_OK;

for (i = first_fault_index; i < batch_context->num_coalesced_faults;) {
uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[i];
const uvm_fault_buffer_entry_t *previous_entry = i > first_fault_index ?
batch_context->ordered_fault_cache[i - 1] : NULL;
NvU64 fault_address = current_entry->fault_address;
struct vm_area_struct *vma;
NvU32 num_faults_before = (*block_faults);

if (previous_entry && (previous_entry->va_space != current_entry->va_space))
break;

if (fault_address >= outer)
break;

vma = find_vma_intersection(mm, fault_address, fault_address + 1);
if (!vma) {
// Since a vma wasn't found, cancel all accesses on the page since
// cancelling write and atomic accesses will not cancel pending read
// faults and this can lead to a deadlock since read faults need to
// be serviced first before cancelling write faults.
service_fault_batch_fatal_notify(gpu_va_space->gpu,
batch_context,
i,
NV_ERR_INVALID_ADDRESS,
UVM_FAULT_CANCEL_VA_MODE_ALL,
block_faults);

// Do not fail due to logical errors.
status = NV_OK;

break;
}

status = service_fault_batch_ats_sub(gpu_va_space, vma, batch_context, i, outer, block_faults);
if (status != NV_OK)
break;

i += ((*block_faults) - num_faults_before);
}

return status;
}

static NV_STATUS service_fault_batch_dispatch(uvm_va_space_t *va_space,
uvm_gpu_va_space_t *gpu_va_space,
uvm_fault_service_batch_context_t *batch_context,
NvU32 first_fault_index,
NvU32 *block_faults)
NvU32 fault_index,
NvU32 *block_faults,
bool replay_per_va_block)
{
NV_STATUS status;
uvm_va_range_t *va_range;
uvm_va_range_t *va_range = NULL;
uvm_va_range_t *va_range_next = NULL;
uvm_va_block_t *va_block;
uvm_gpu_t *gpu = gpu_va_space->gpu;
uvm_va_block_context_t *va_block_context =
&gpu->parent->fault_buffer_info.replayable.block_service_context.block_context;
uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[first_fault_index];
uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[fault_index];
struct mm_struct *mm = va_block_context->mm;
NvU64 fault_address = current_entry->fault_address;

(*block_faults) = 0;

va_range = uvm_va_range_find(va_space, fault_address);
va_range_next = uvm_va_space_iter_first(va_space, fault_address, ~0ULL);
if (va_range_next && (fault_address >= va_range_next->node.start)) {
UVM_ASSERT(fault_address < va_range_next->node.end);

va_range = va_range_next;
va_range_next = uvm_va_space_iter_next(va_range_next, ~0ULL);
}

status = uvm_va_block_find_create_in_range(va_space, va_range, fault_address, va_block_context, &va_block);
if (status == NV_OK) {
status = service_fault_batch_block(gpu, va_block, batch_context, first_fault_index, block_faults);
status = service_fault_batch_block(gpu, va_block, batch_context, fault_index, block_faults);
}
else if ((status == NV_ERR_INVALID_ADDRESS) && uvm_ats_can_service_faults(gpu_va_space, mm)) {
status = service_fault_batch_ats(gpu_va_space, mm, batch_context, first_fault_index, block_faults);
NvU64 outer = ~0ULL;

UVM_ASSERT(replay_per_va_block ==
(gpu->parent->fault_buffer_info.replayable.replay_policy == UVM_PERF_FAULT_REPLAY_POLICY_BLOCK));

// Limit outer to the minimum of next va_range.start and first
// fault_address' next UVM_GMMU_ATS_GRANULARITY alignment so that it's
// enough to check whether the first fault in this dispatch belongs to a
// GMMU region.
if (va_range_next) {
outer = min(va_range_next->node.start,
UVM_ALIGN_DOWN(fault_address + UVM_GMMU_ATS_GRANULARITY, UVM_GMMU_ATS_GRANULARITY));
}

// ATS lookups are disabled on all addresses within the same
// UVM_GMMU_ATS_GRANULARITY as existing GMMU mappings (see documentation
// in uvm_mmu.h). User mode is supposed to reserve VAs as appropriate to
// prevent any system memory allocations from falling within the NO_ATS
// range of other GMMU mappings, so this shouldn't happen during normal
// operation. However, since this scenario may lead to infinite fault
// loops, we handle it by canceling the fault.
if (uvm_ats_check_in_gmmu_region(va_space, fault_address, va_range_next)) {
service_fault_batch_fatal_notify(gpu,
batch_context,
fault_index,
NV_ERR_INVALID_ADDRESS,
UVM_FAULT_CANCEL_VA_MODE_ALL,
block_faults);

// Do not fail due to logical errors
status = NV_OK;
}
else {
status = service_fault_batch_ats(gpu_va_space, mm, batch_context, fault_index, outer, block_faults);
}
}
else {
service_fault_batch_fatal(gpu_va_space->gpu, batch_context, first_fault_index, status, block_faults);
service_fault_batch_fatal_notify(gpu,
batch_context,
fault_index,
status,
UVM_FAULT_CANCEL_VA_MODE_ALL,
block_faults);

// Do not fail due to logical errors
status = NV_OK;
@@ -1573,12 +1923,14 @@ static NV_STATUS service_fault_batch(uvm_gpu_t *gpu,
struct mm_struct *mm = NULL;
const bool replay_per_va_block = service_mode != FAULT_SERVICE_MODE_CANCEL &&
gpu->parent->fault_buffer_info.replayable.replay_policy == UVM_PERF_FAULT_REPLAY_POLICY_BLOCK;
uvm_va_block_context_t *va_block_context =
&gpu->parent->fault_buffer_info.replayable.block_service_context.block_context;
uvm_service_block_context_t *service_context =
&gpu->parent->fault_buffer_info.replayable.block_service_context;
uvm_va_block_context_t *va_block_context = &service_context->block_context;

UVM_ASSERT(gpu->parent->replayable_faults_supported);

ats_invalidate->write_faults_in_batch = false;
uvm_hmm_service_context_init(service_context);

for (i = 0; i < batch_context->num_coalesced_faults;) {
NvU32 block_faults;
@@ -1616,7 +1968,7 @@ static NV_STATUS service_fault_batch(uvm_gpu_t *gpu,
gpu_va_space = uvm_gpu_va_space_get_by_parent_gpu(va_space, gpu->parent);
if (uvm_processor_mask_test_and_clear_atomic(&va_space->needs_fault_buffer_flush, gpu->id)) {
status = fault_buffer_flush_locked(gpu,
UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT,
UVM_GPU_BUFFER_FLUSH_MODE_WAIT_UPDATE_PUT,
UVM_FAULT_REPLAY_TYPE_START,
batch_context);
if (status == NV_OK)
@@ -1649,7 +2001,12 @@ static NV_STATUS service_fault_batch(uvm_gpu_t *gpu,
continue;
}

status = service_fault_batch_dispatch(va_space, gpu_va_space, batch_context, i, &block_faults);
status = service_fault_batch_dispatch(va_space,
gpu_va_space,
batch_context,
i,
&block_faults,
replay_per_va_block);
// TODO: Bug 3900733: clean up locking in service_fault_batch().
if (status == NV_WARN_MORE_PROCESSING_REQUIRED) {
uvm_va_space_up_read(va_space);
@@ -2095,7 +2452,11 @@ static NV_STATUS cancel_faults_precise_tlb(uvm_gpu_t *gpu, uvm_fault_service_bat
// arriving. Therefore, in each iteration we just try to cancel faults
// from uTLBs that contained fatal faults in the previous iterations
// and will cause the TLB to stop generating new page faults after the
// following replay with type UVM_FAULT_REPLAY_TYPE_START_ACK_ALL
// following replay with type UVM_FAULT_REPLAY_TYPE_START_ACK_ALL.
//
// No need to use UVM_GPU_BUFFER_FLUSH_MODE_WAIT_UPDATE_PUT since we
// don't care too much about old faults, just new faults from uTLBs
// which faulted before the replay.
status = fault_buffer_flush_locked(gpu,
UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT,
UVM_FAULT_REPLAY_TYPE_START_ACK_ALL,
@@ -2204,6 +2565,8 @@ static void enable_disable_prefetch_faults(uvm_parent_gpu_t *parent_gpu, uvm_fau

// If more than 66% of faults are invalid prefetch accesses, disable
// prefetch faults for a while.
// num_invalid_prefetch_faults may be higher than the actual count. See the
// comment in mark_fault_invalid_prefetch(..).
// Some tests rely on this logic (and ratio) to correctly disable prefetch
// fault reporting. If the logic changes, the tests will have to be changed.
if (parent_gpu->fault_buffer_info.prefetch_faults_enabled &&

@@ -75,4 +75,7 @@ void uvm_gpu_disable_prefetch_faults(uvm_parent_gpu_t *parent_gpu);
// only called from the ISR bottom half
void uvm_gpu_service_replayable_faults(uvm_gpu_t *gpu);

// Returns true if UVM owns the hardware replayable fault buffer
bool uvm_parent_gpu_replayable_fault_buffer_is_uvm_owned(uvm_parent_gpu_t *parent_gpu);

#endif // __UVM_GPU_PAGE_FAULT_H__

@@ -26,6 +26,7 @@
#include "uvm_global.h"
#include "uvm_kvmalloc.h"
#include "uvm_channel.h" // For UVM_GPU_SEMAPHORE_MAX_JUMP
#include "uvm_conf_computing.h"

#define UVM_SEMAPHORE_SIZE 4
#define UVM_SEMAPHORE_PAGE_SIZE PAGE_SIZE
@@ -44,6 +45,9 @@ struct uvm_gpu_semaphore_pool_struct
// List of all the semaphore pages belonging to the pool
struct list_head pages;

// Pages aperture.
uvm_aperture_t aperture;

// Count of free semaphores among all the pages
NvU32 free_semaphores_count;

@@ -66,11 +70,24 @@ struct uvm_gpu_semaphore_pool_page_struct
DECLARE_BITMAP(free_semaphores, UVM_SEMAPHORE_COUNT_PER_PAGE);
};

static bool gpu_semaphore_pool_is_secure(uvm_gpu_semaphore_pool_t *pool)
{
return uvm_conf_computing_mode_enabled(pool->gpu) && (pool->aperture == UVM_APERTURE_VID);
}

static bool gpu_semaphore_is_secure(uvm_gpu_semaphore_t *semaphore)
{
return gpu_semaphore_pool_is_secure(semaphore->page->pool);
}

static NvU32 get_index(uvm_gpu_semaphore_t *semaphore)
{
NvU32 offset;
NvU32 index;

if (gpu_semaphore_is_secure(semaphore))
return semaphore->conf_computing.index;

UVM_ASSERT(semaphore->payload != NULL);
UVM_ASSERT(semaphore->page != NULL);

@@ -118,6 +135,14 @@ static bool is_canary(NvU32 val)
return (val & ~UVM_SEMAPHORE_CANARY_MASK) == UVM_SEMAPHORE_CANARY_BASE;
}

static bool semaphore_uses_canary(uvm_gpu_semaphore_pool_t *pool)
{
// A pool allocated in the CPR of vidmem cannot be read/written from the
// CPU.
return !gpu_semaphore_pool_is_secure(pool) && UVM_IS_DEBUG();
return UVM_IS_DEBUG();
}

// Can the GPU access the semaphore, i.e., can Host/Esched address the semaphore
// pool?
static bool gpu_can_access_semaphore_pool(uvm_gpu_t *gpu, uvm_rm_mem_t *rm_mem)
@@ -125,13 +150,34 @@ static bool gpu_can_access_semaphore_pool(uvm_gpu_t *gpu, uvm_rm_mem_t *rm_mem)
return ((uvm_rm_mem_get_gpu_uvm_va(rm_mem, gpu) + rm_mem->size - 1) < gpu->parent->max_host_va);
}

// Secure semaphore pools are allocated in the CPR of vidmem and only mapped to
// the owning GPU as no other processor have access to it.
static NV_STATUS pool_alloc_secure_page(uvm_gpu_semaphore_pool_t *pool,
uvm_gpu_semaphore_pool_page_t *pool_page,
uvm_rm_mem_type_t memory_type)
{
NV_STATUS status;

UVM_ASSERT(gpu_semaphore_pool_is_secure(pool));
status = uvm_rm_mem_alloc(pool->gpu,
memory_type,
UVM_SEMAPHORE_PAGE_SIZE,
UVM_CONF_COMPUTING_BUF_ALIGNMENT,
&pool_page->memory);

if (status != NV_OK)
return status;

return NV_OK;
}

static NV_STATUS pool_alloc_page(uvm_gpu_semaphore_pool_t *pool)
{
NV_STATUS status;
uvm_gpu_semaphore_pool_page_t *pool_page;
NvU32 *payloads;
size_t i;
uvm_rm_mem_type_t rm_mem_type = UVM_RM_MEM_TYPE_SYS;
uvm_rm_mem_type_t memory_type = (pool->aperture == UVM_APERTURE_SYS) ? UVM_RM_MEM_TYPE_SYS : UVM_RM_MEM_TYPE_GPU;

uvm_assert_mutex_locked(&pool->mutex);

@@ -142,13 +188,24 @@ static NV_STATUS pool_alloc_page(uvm_gpu_semaphore_pool_t *pool)

pool_page->pool = pool;

// Whenever the Confidential Computing feature is enabled, engines can
// access semaphores only in the CPR of vidmem. Mapping to other GPUs is
// also disabled.
if (gpu_semaphore_pool_is_secure(pool)) {
status = pool_alloc_secure_page(pool, pool_page, memory_type);

if (status != NV_OK)
goto error;
}
else {
status = uvm_rm_mem_alloc_and_map_all(pool->gpu,
rm_mem_type,
memory_type,
UVM_SEMAPHORE_PAGE_SIZE,
0,
&pool_page->memory);
if (status != NV_OK)
goto error;
}

// Verify the GPU can access the semaphore pool.
UVM_ASSERT(gpu_can_access_semaphore_pool(pool->gpu, pool_page->memory));
@@ -159,8 +216,7 @@ static NV_STATUS pool_alloc_page(uvm_gpu_semaphore_pool_t *pool)
list_add(&pool_page->all_pages_node, &pool->pages);
pool->free_semaphores_count += UVM_SEMAPHORE_COUNT_PER_PAGE;

// Initialize the semaphore payloads to known values
if (UVM_IS_DEBUG()) {
if (semaphore_uses_canary(pool)) {
payloads = uvm_rm_mem_get_cpu_va(pool_page->memory);
for (i = 0; i < UVM_SEMAPHORE_COUNT_PER_PAGE; i++)
payloads[i] = make_canary(0);
@@ -176,8 +232,6 @@ error:
static void pool_free_page(uvm_gpu_semaphore_pool_page_t *page)
{
uvm_gpu_semaphore_pool_t *pool;
NvU32 *payloads;
size_t i;

UVM_ASSERT(page);
pool = page->pool;
@@ -190,9 +244,9 @@ static void pool_free_page(uvm_gpu_semaphore_pool_page_t *page)
"count: %u\n",
pool->free_semaphores_count);

// Check for semaphore release-after-free
if (UVM_IS_DEBUG()) {
payloads = uvm_rm_mem_get_cpu_va(page->memory);
if (semaphore_uses_canary(pool)) {
size_t i;
NvU32 *payloads = uvm_rm_mem_get_cpu_va(page->memory);
for (i = 0; i < UVM_SEMAPHORE_COUNT_PER_PAGE; i++)
UVM_ASSERT(is_canary(payloads[i]));
}
@@ -223,11 +277,18 @@ NV_STATUS uvm_gpu_semaphore_alloc(uvm_gpu_semaphore_pool_t *pool, uvm_gpu_semaph
if (semaphore_index == UVM_SEMAPHORE_COUNT_PER_PAGE)
continue;

semaphore->payload = (NvU32*)((char*)uvm_rm_mem_get_cpu_va(page->memory) + semaphore_index * UVM_SEMAPHORE_SIZE);
if (gpu_semaphore_pool_is_secure(pool)) {
semaphore->conf_computing.index = semaphore_index;
}
else {
semaphore->payload = (NvU32*)((char*)uvm_rm_mem_get_cpu_va(page->memory) +
semaphore_index * UVM_SEMAPHORE_SIZE);
}

semaphore->page = page;

// Check for semaphore release-after-free
UVM_ASSERT(is_canary(uvm_gpu_semaphore_get_payload(semaphore)));
if (semaphore_uses_canary(pool))
UVM_ASSERT(is_canary(uvm_gpu_semaphore_get_payload(semaphore)));

uvm_gpu_semaphore_set_payload(semaphore, 0);

@@ -266,7 +327,7 @@ void uvm_gpu_semaphore_free(uvm_gpu_semaphore_t *semaphore)

// Write a known value lower than the current payload in an attempt to catch
// release-after-free and acquire-after-free.
if (UVM_IS_DEBUG())
if (semaphore_uses_canary(pool))
uvm_gpu_semaphore_set_payload(semaphore, make_canary(uvm_gpu_semaphore_get_payload(semaphore)));

uvm_mutex_lock(&pool->mutex);
@@ -294,12 +355,26 @@ NV_STATUS uvm_gpu_semaphore_pool_create(uvm_gpu_t *gpu, uvm_gpu_semaphore_pool_t

pool->free_semaphores_count = 0;
pool->gpu = gpu;
pool->aperture = UVM_APERTURE_SYS;

*pool_out = pool;

return NV_OK;
}

NV_STATUS uvm_gpu_semaphore_secure_pool_create(uvm_gpu_t *gpu, uvm_gpu_semaphore_pool_t **pool_out)
{
NV_STATUS status;

UVM_ASSERT(uvm_conf_computing_mode_enabled(gpu));

status = uvm_gpu_semaphore_pool_create(gpu, pool_out);
if (status == NV_OK)
(*pool_out)->aperture = UVM_APERTURE_VID;

return status;
}

void uvm_gpu_semaphore_pool_destroy(uvm_gpu_semaphore_pool_t *pool)
{
uvm_gpu_semaphore_pool_page_t *page;
@@ -375,13 +450,16 @@ NvU64 uvm_gpu_semaphore_get_gpu_proxy_va(uvm_gpu_semaphore_t *semaphore, uvm_gpu
NvU64 uvm_gpu_semaphore_get_gpu_va(uvm_gpu_semaphore_t *semaphore, uvm_gpu_t *gpu, bool is_proxy_va_space)
{
NvU32 index = get_index(semaphore);
NvU64 base_va = uvm_rm_mem_get_gpu_va(semaphore->page->memory, gpu, is_proxy_va_space);
NvU64 base_va = uvm_rm_mem_get_gpu_va(semaphore->page->memory, gpu, is_proxy_va_space).address;

return base_va + UVM_SEMAPHORE_SIZE * index;
}

NvU32 uvm_gpu_semaphore_get_payload(uvm_gpu_semaphore_t *semaphore)
{
if (gpu_semaphore_is_secure(semaphore))
return UVM_GPU_READ_ONCE(semaphore->conf_computing.cached_payload);

return UVM_GPU_READ_ONCE(*semaphore->payload);
}

@@ -398,6 +476,10 @@ void uvm_gpu_semaphore_set_payload(uvm_gpu_semaphore_t *semaphore, NvU32 payload
// being optimized out on non-SMP configs (we need them for interacting with
// the GPU correctly even on non-SMP).
mb();

if (gpu_semaphore_is_secure(semaphore))
UVM_GPU_WRITE_ONCE(semaphore->conf_computing.cached_payload, payload);
else
UVM_GPU_WRITE_ONCE(*semaphore->payload, payload);
}

@@ -425,9 +507,22 @@ static bool tracking_semaphore_check_gpu(uvm_gpu_tracking_semaphore_t *tracking_
return true;
}

bool tracking_semaphore_uses_mutex(uvm_gpu_tracking_semaphore_t *tracking_semaphore)
{
uvm_gpu_t *gpu = tracking_semaphore->semaphore.page->pool->gpu;

UVM_ASSERT(tracking_semaphore_check_gpu(tracking_semaphore));
if (uvm_conf_computing_mode_enabled(gpu))
return true;

return false;
}


NV_STATUS uvm_gpu_tracking_semaphore_alloc(uvm_gpu_semaphore_pool_t *pool, uvm_gpu_tracking_semaphore_t *tracking_sem)
{
NV_STATUS status;
uvm_lock_order_t order = UVM_LOCK_ORDER_LEAF;

memset(tracking_sem, 0, sizeof(*tracking_sem));

@@ -437,7 +532,14 @@ NV_STATUS uvm_gpu_tracking_semaphore_alloc(uvm_gpu_semaphore_pool_t *pool, uvm_g

UVM_ASSERT(uvm_gpu_semaphore_get_payload(&tracking_sem->semaphore) == 0);

uvm_spin_lock_init(&tracking_sem->lock, UVM_LOCK_ORDER_LEAF);
if (uvm_conf_computing_mode_enabled(pool->gpu))
order = UVM_LOCK_ORDER_SECURE_SEMAPHORE;

if (tracking_semaphore_uses_mutex(tracking_sem))
uvm_mutex_init(&tracking_sem->m_lock, order);
else
uvm_spin_lock_init(&tracking_sem->s_lock, order);

atomic64_set(&tracking_sem->completed_value, 0);
tracking_sem->queued_value = 0;

@@ -449,15 +551,117 @@ void uvm_gpu_tracking_semaphore_free(uvm_gpu_tracking_semaphore_t *tracking_sem)
uvm_gpu_semaphore_free(&tracking_sem->semaphore);
}

static bool should_skip_secure_semaphore_update(NvU32 last_observed_notifier, NvU32 gpu_notifier)
{
// No new value, or the GPU is currently writing the new encrypted material
// and no change in value would still result in corrupted data.
return (last_observed_notifier == gpu_notifier) || (gpu_notifier % 2);
}

static void uvm_gpu_semaphore_encrypted_payload_update(uvm_channel_t *channel, uvm_gpu_semaphore_t *semaphore)
{
UvmCslIv local_iv;
NvU32 local_payload;
NvU32 new_sem_value;
NvU32 gpu_notifier;
NvU32 last_observed_notifier;
NvU32 new_gpu_notifier = 0;
NvU32 iv_index = 0;

// A channel can have multiple entries pending and the tracking semaphore
// update of each entry can race with this function. Since the semaphore
// needs to be updated to release a used entry, we never need more
// than 'num_gpfifo_entries' re-tries.
unsigned tries_left = channel->num_gpfifo_entries;
NV_STATUS status = NV_OK;
NvU8 local_auth_tag[UVM_CONF_COMPUTING_AUTH_TAG_SIZE];
UvmCslIv *ivs_cpu_addr = semaphore->conf_computing.ivs;
void *auth_tag_cpu_addr = uvm_rm_mem_get_cpu_va(semaphore->conf_computing.auth_tag);
NvU32 *gpu_notifier_cpu_addr = (NvU32 *)uvm_rm_mem_get_cpu_va(semaphore->conf_computing.notifier);
NvU32 *payload_cpu_addr = (NvU32 *)uvm_rm_mem_get_cpu_va(semaphore->conf_computing.encrypted_payload);

UVM_ASSERT(uvm_channel_is_secure_ce(channel));

last_observed_notifier = semaphore->conf_computing.last_observed_notifier;
gpu_notifier = UVM_READ_ONCE(*gpu_notifier_cpu_addr);
UVM_ASSERT(last_observed_notifier <= gpu_notifier);

if (should_skip_secure_semaphore_update(last_observed_notifier, gpu_notifier))
return;

do {
gpu_notifier = UVM_READ_ONCE(*gpu_notifier_cpu_addr);

// Odd notifier value means there's an update in progress.
if (gpu_notifier % 2)
continue;

// Make sure no memory accesses happen before we read the notifier
smp_mb__after_atomic();

iv_index = (gpu_notifier / 2) % channel->num_gpfifo_entries;
memcpy(local_auth_tag, auth_tag_cpu_addr, sizeof(local_auth_tag));
local_payload = UVM_READ_ONCE(*payload_cpu_addr);
memcpy(&local_iv, &ivs_cpu_addr[iv_index], sizeof(local_iv));

// Make sure the second read of notifier happens after
// all memory accesses.
smp_mb__before_atomic();
new_gpu_notifier = UVM_READ_ONCE(*gpu_notifier_cpu_addr);
tries_left--;
} while ((tries_left > 0) && ((gpu_notifier != new_gpu_notifier) || (gpu_notifier % 2)));

if (!tries_left) {
status = NV_ERR_INVALID_STATE;
goto error;
}

if (gpu_notifier == new_gpu_notifier) {
status = uvm_conf_computing_cpu_decrypt(channel,
&new_sem_value,
&local_payload,
&local_iv,
sizeof(new_sem_value),
&local_auth_tag);

if (status != NV_OK)
goto error;

uvm_gpu_semaphore_set_payload(semaphore, new_sem_value);
UVM_WRITE_ONCE(semaphore->conf_computing.last_observed_notifier, new_gpu_notifier);
}

return;

error:
// Decryption failure is a fatal error as well as running out of try left.
// Upon testing, all decryption happened within one try, anything that
// would require ten retry would be considered active tampering with the
// data structures.
uvm_global_set_fatal_error(status);
}

static NvU64 update_completed_value_locked(uvm_gpu_tracking_semaphore_t *tracking_semaphore)
{
NvU64 old_value = atomic64_read(&tracking_semaphore->completed_value);
// The semaphore value is the bottom 32 bits of completed_value
NvU32 old_sem_value = (NvU32)old_value;
NvU32 new_sem_value = uvm_gpu_semaphore_get_payload(&tracking_semaphore->semaphore);
NvU32 new_sem_value;
NvU64 new_value;

uvm_assert_spinlock_locked(&tracking_semaphore->lock);
if (tracking_semaphore_uses_mutex(tracking_semaphore))
uvm_assert_mutex_locked(&tracking_semaphore->m_lock);
else
uvm_assert_spinlock_locked(&tracking_semaphore->s_lock);

if (tracking_semaphore->semaphore.conf_computing.encrypted_payload) {
// TODO: Bug 4008734: [UVM][HCC] Extend secure tracking semaphore
// mechanism to all semaphore
uvm_channel_t *channel = container_of(tracking_semaphore, uvm_channel_t, tracking_sem);
uvm_gpu_semaphore_encrypted_payload_update(channel, &tracking_semaphore->semaphore);
}

new_sem_value = uvm_gpu_semaphore_get_payload(&tracking_semaphore->semaphore);

// The following logic to update the completed value is very subtle, it
// helps to read https://www.kernel.org/doc/Documentation/memory-barriers.txt
@@ -466,7 +670,7 @@ static NvU64 update_completed_value_locked(uvm_gpu_tracking_semaphore_t *trackin
if (old_sem_value == new_sem_value) {
// No progress since the last update.
// No additional memory barrier required in this case as completed_value
// is always updated under the spinlock that this thread just acquired.
// is always updated under the lock that this thread just acquired.
// That guarantees full ordering with all the accesses the thread that
// updated completed_value did under the lock including the GPU
// semaphore read.
@@ -493,7 +697,7 @@ static NvU64 update_completed_value_locked(uvm_gpu_tracking_semaphore_t *trackin
(NvU64)(uintptr_t)tracking_semaphore->semaphore.payload,
old_value, new_value);

// Use an atomic write even though the spinlock is held so that the value can
// Use an atomic write even though the lock is held so that the value can
// be (carefully) read atomically outside of the lock.
//
// atomic64_set() on its own doesn't imply any memory barriers and we need
@@ -521,9 +725,9 @@ static NvU64 update_completed_value_locked(uvm_gpu_tracking_semaphore_t *trackin
// guarantees that no accesses will be ordered above the atomic (and hence
// the GPU semaphore read).
//
// Notably the soon following uvm_spin_unlock() is a release barrier that
// allows later memory accesses to be reordered above it and hence doesn't
// provide the necessary ordering with the GPU semaphore read.
// Notably the soon following unlock is a release barrier that allows later
// memory accesses to be reordered above it and hence doesn't provide the
// necessary ordering with the GPU semaphore read.
//
// Also notably this would still need to be handled if we ever switch to
// atomic64_set_release() and atomic64_read_acquire() for accessing
@@ -540,11 +744,17 @@ NvU64 uvm_gpu_tracking_semaphore_update_completed_value(uvm_gpu_tracking_semapho
// Check that the GPU which owns the semaphore is still present
UVM_ASSERT(tracking_semaphore_check_gpu(tracking_semaphore));

uvm_spin_lock(&tracking_semaphore->lock);
if (tracking_semaphore_uses_mutex(tracking_semaphore))
uvm_mutex_lock(&tracking_semaphore->m_lock);
else
uvm_spin_lock(&tracking_semaphore->s_lock);

completed = update_completed_value_locked(tracking_semaphore);

uvm_spin_unlock(&tracking_semaphore->lock);
if (tracking_semaphore_uses_mutex(tracking_semaphore))
uvm_mutex_unlock(&tracking_semaphore->m_lock);
else
uvm_spin_unlock(&tracking_semaphore->s_lock);

return completed;
}

@@ -47,6 +47,16 @@ struct uvm_gpu_semaphore_struct

// Pointer to the memory location
NvU32 *payload;
struct {
NvU16 index;
NvU32 cached_payload;
uvm_rm_mem_t *encrypted_payload;
uvm_rm_mem_t *notifier;
uvm_rm_mem_t *auth_tag;
UvmCslIv *ivs;
NvU32 last_pushed_notifier;
NvU32 last_observed_notifier;
} conf_computing;
};

// A primitive used for tracking progress of the GPU
@@ -67,7 +77,10 @@ struct uvm_gpu_tracking_semaphore_struct
atomic64_t completed_value;

// Lock protecting updates to the completed_value
uvm_spinlock_t lock;
union {
uvm_spinlock_t s_lock;
uvm_mutex_t m_lock;
};

// Last queued value
// All accesses to the queued value should be handled by the user of the GPU
@@ -78,6 +91,12 @@ struct uvm_gpu_tracking_semaphore_struct
// Create a semaphore pool for a GPU.
NV_STATUS uvm_gpu_semaphore_pool_create(uvm_gpu_t *gpu, uvm_gpu_semaphore_pool_t **pool_out);

// When the Confidential Computing feature is enabled, pools associated with
// secure CE channels are allocated in the CPR of vidmem and as such have
// all the associated access restrictions. Because of this, they're called
// secure pools and secure semaphores are allocated out of said secure pools.
NV_STATUS uvm_gpu_semaphore_secure_pool_create(uvm_gpu_t *gpu, uvm_gpu_semaphore_pool_t **pool_out);

// Destroy a semaphore pool
// Locking:
// - Global lock needs to be held in read mode (for unmapping from all GPUs)
@@ -90,6 +109,9 @@ void uvm_gpu_semaphore_pool_destroy(uvm_gpu_semaphore_pool_t *pool);
// Allocate a semaphore from the pool.
// The semaphore will be mapped on all GPUs currently registered with the UVM
// driver, and on all new GPUs which will be registered in the future.
// Unless the Confidential Computing feature is enabled and the pool is a
// secure pool. In this case, it is only mapped to the GPU that holds the
// allocation.
// The mappings are added to UVM's internal address space, and (in SR-IOV heavy)
// to the proxy address space.
//

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2022 NVIDIA Corporation
Copyright (c) 2015-2023 NVIDIA Corporation

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -53,6 +53,7 @@ MODULE_PARM_DESC(uvm_downgrade_force_membar_sys, "Force all TLB invalidation dow
#define ARCH_OP_COUNT (sizeof(uvm_arch_hal_t) / sizeof(void *))
#define FAULT_BUFFER_OP_COUNT (sizeof(uvm_fault_buffer_hal_t) / sizeof(void *))
#define ACCESS_COUNTER_BUFFER_OP_COUNT (sizeof(uvm_access_counter_buffer_hal_t) / sizeof(void *))
#define SEC2_OP_COUNT (sizeof(uvm_sec2_hal_t) / sizeof(void *))

// Table for copy engine functions.
// Each entry is associated with a copy engine class through the 'class' field.
@@ -73,6 +74,7 @@ static uvm_hal_class_ops_t ce_table[] =
.offset_in_out = uvm_hal_maxwell_ce_offset_in_out,
.phys_mode = uvm_hal_maxwell_ce_phys_mode,
.plc_mode = uvm_hal_maxwell_ce_plc_mode,
.memcopy_copy_type = uvm_hal_maxwell_ce_memcopy_copy_type,
.memcopy_is_valid = uvm_hal_ce_memcopy_is_valid_stub,
.memcopy_patch_src = uvm_hal_ce_memcopy_patch_src_stub,
.memcopy = uvm_hal_maxwell_ce_memcopy,
@@ -82,6 +84,8 @@ static uvm_hal_class_ops_t ce_table[] =
.memset_4 = uvm_hal_maxwell_ce_memset_4,
.memset_8 = uvm_hal_maxwell_ce_memset_8,
.memset_v_4 = uvm_hal_maxwell_ce_memset_v_4,
.encrypt = uvm_hal_maxwell_ce_encrypt_unsupported,
.decrypt = uvm_hal_maxwell_ce_decrypt_unsupported,
}
},
{
@@ -149,11 +153,14 @@ static uvm_hal_class_ops_t ce_table[] =
.semaphore_reduction_inc = uvm_hal_hopper_ce_semaphore_reduction_inc,
.offset_out = uvm_hal_hopper_ce_offset_out,
.offset_in_out = uvm_hal_hopper_ce_offset_in_out,
.memcopy_copy_type = uvm_hal_hopper_ce_memcopy_copy_type,
.memset_1 = uvm_hal_hopper_ce_memset_1,
.memset_4 = uvm_hal_hopper_ce_memset_4,
.memset_8 = uvm_hal_hopper_ce_memset_8,
.memcopy_is_valid = uvm_hal_hopper_ce_memcopy_is_valid,
.memset_is_valid = uvm_hal_hopper_ce_memset_is_valid,
.encrypt = uvm_hal_hopper_ce_encrypt,
.decrypt = uvm_hal_hopper_ce_decrypt,
},
},
};
@@ -371,6 +378,7 @@ static uvm_hal_class_ops_t fault_buffer_table[] =
.entry_clear_valid = uvm_hal_maxwell_fault_buffer_entry_clear_valid_unsupported,
.entry_size = uvm_hal_maxwell_fault_buffer_entry_size_unsupported,
.parse_non_replayable_entry = uvm_hal_maxwell_fault_buffer_parse_non_replayable_entry_unsupported,
.get_fault_type = uvm_hal_maxwell_fault_buffer_get_fault_type_unsupported,
}
},
{
@@ -392,6 +400,7 @@ static uvm_hal_class_ops_t fault_buffer_table[] =
.entry_is_valid = uvm_hal_pascal_fault_buffer_entry_is_valid,
.entry_clear_valid = uvm_hal_pascal_fault_buffer_entry_clear_valid,
.entry_size = uvm_hal_pascal_fault_buffer_entry_size,
.get_fault_type = uvm_hal_pascal_fault_buffer_get_fault_type,
}
},
{
@@ -404,6 +413,7 @@ static uvm_hal_class_ops_t fault_buffer_table[] =
.get_ve_id = uvm_hal_volta_fault_buffer_get_ve_id,
.parse_entry = uvm_hal_volta_fault_buffer_parse_entry,
.parse_non_replayable_entry = uvm_hal_volta_fault_buffer_parse_non_replayable_entry,
.get_fault_type = uvm_hal_volta_fault_buffer_get_fault_type,
}
},
{
@@ -495,6 +505,59 @@ static uvm_hal_class_ops_t access_counter_buffer_table[] =
},
};

static uvm_hal_class_ops_t sec2_table[] =
{
{
.id = NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_GM000,
.u.sec2_ops = {
.init = uvm_hal_maxwell_sec2_init_noop,
.decrypt = uvm_hal_maxwell_sec2_decrypt_unsupported,
.semaphore_release = uvm_hal_maxwell_sec2_semaphore_release_unsupported,
.semaphore_timestamp = uvm_hal_maxwell_sec2_semaphore_timestamp_unsupported,
}
},
{
.id = NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_GM200,
.parent_id = NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_GM000,
.u.sec2_ops = {}
},
{
.id = NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_GP100,
.parent_id = NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_GM200,
.u.sec2_ops = {}
},
{
.id = NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_GV100,
.parent_id = NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_GP100,
.u.sec2_ops = {}
},
{
.id = NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_TU100,
.parent_id = NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_GV100,
.u.sec2_ops = {}
},
{
.id = NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_GA100,
.parent_id = NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_TU100,
.u.sec2_ops = {}
},
{
.id = NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_AD100,
.parent_id = NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_GA100,
.u.sec2_ops = {}
},
{
.id = NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_GH100,
.parent_id = NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_AD100,
.u.sec2_ops = {
.init = uvm_hal_hopper_sec2_init,
.semaphore_release = uvm_hal_hopper_sec2_semaphore_release,
.semaphore_timestamp = uvm_hal_hopper_sec2_semaphore_timestamp_unsupported,
.decrypt = uvm_hal_hopper_sec2_decrypt,
}
},
};

static inline uvm_hal_class_ops_t *ops_find_by_id(uvm_hal_class_ops_t *table, NvU32 row_count, NvU32 id)
{
NvLength i;
@@ -598,6 +661,15 @@ NV_STATUS uvm_hal_init_table(void)
return status;
}

status = ops_init_from_parent(sec2_table,
ARRAY_SIZE(sec2_table),
SEC2_OP_COUNT,
offsetof(uvm_hal_class_ops_t, u.sec2_ops));
if (status != NV_OK) {
UVM_ERR_PRINT("ops_init_from_parent(sec2_table) failed: %s\n", nvstatusToString(status));
return status;
}

return NV_OK;
}

@@ -648,6 +720,14 @@ NV_STATUS uvm_hal_init_gpu(uvm_parent_gpu_t *parent_gpu)

parent_gpu->access_counter_buffer_hal = &class_ops->u.access_counter_buffer_ops;

class_ops = ops_find_by_id(sec2_table, ARRAY_SIZE(sec2_table), gpu_info->gpuArch);
if (class_ops == NULL) {
UVM_ERR_PRINT("SEC2 HAL not found, GPU %s, arch: 0x%X\n", parent_gpu->name, gpu_info->gpuArch);
return NV_ERR_INVALID_CLASS;
}

parent_gpu->sec2_hal = &class_ops->u.sec2_ops;

return NV_OK;
}

@@ -658,6 +738,9 @@ static void hal_override_properties(uvm_parent_gpu_t *parent_gpu)
// TODO: Bug 200692962: Add support for access counters in vGPU
if (parent_gpu->virt_mode != UVM_VIRT_MODE_NONE)
parent_gpu->access_counters_supported = false;
// Access counters are not supported in CC.
else if (uvm_conf_computing_mode_enabled_parent(parent_gpu))
parent_gpu->access_counters_supported = false;
}

void uvm_hal_init_properties(uvm_parent_gpu_t *parent_gpu)
@@ -711,7 +794,7 @@ uvm_membar_t uvm_hal_downgrade_membar_type(uvm_gpu_t *gpu, bool is_local_vidmem)
// memory, including those from other processors like the CPU or peer GPUs,
// must come through this GPU's L2. In all current architectures, MEMBAR_GPU
// is sufficient to resolve ordering at the L2 level.
if (is_local_vidmem && !gpu->parent->numa_info.enabled && !uvm_downgrade_force_membar_sys)
if (is_local_vidmem && !uvm_gpu_is_coherent(gpu->parent) && !uvm_downgrade_force_membar_sys)
return UVM_MEMBAR_GPU;

// If the mapped memory was remote, or if a coherence protocol can cache
@@ -895,7 +978,7 @@ void uvm_hal_ce_memcopy_patch_src_stub(uvm_push_t *push, uvm_gpu_address_t *src)
{
}

bool uvm_hal_ce_memset_is_valid_stub(uvm_push_t *push, uvm_gpu_address_t dst, size_t element_size)
bool uvm_hal_ce_memset_is_valid_stub(uvm_push_t *push, uvm_gpu_address_t dst, size_t num_elements, size_t element_size)
{
return true;
}

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2022 NVIDIA Corporation
Copyright (c) 2015-2023 NVIDIA Corporation

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -40,6 +40,8 @@ typedef void (*uvm_hal_init_t)(uvm_push_t *push);
void uvm_hal_maxwell_ce_init(uvm_push_t *push);
void uvm_hal_maxwell_host_init_noop(uvm_push_t *push);
void uvm_hal_pascal_host_init(uvm_push_t *push);
void uvm_hal_maxwell_sec2_init_noop(uvm_push_t *push);
void uvm_hal_hopper_sec2_init(uvm_push_t *push);

// Host method validation
typedef bool (*uvm_hal_host_method_is_valid)(uvm_push_t *push, NvU32 method_address, NvU32 method_data);
@@ -207,9 +209,11 @@ void uvm_hal_hopper_host_tlb_invalidate_test(uvm_push_t *push,
typedef void (*uvm_hal_semaphore_release_t)(uvm_push_t *push, NvU64 gpu_va, NvU32 payload);
void uvm_hal_maxwell_host_semaphore_release(uvm_push_t *push, NvU64 gpu_va, NvU32 payload);
void uvm_hal_maxwell_ce_semaphore_release(uvm_push_t *push, NvU64 gpu_va, NvU32 payload);
void uvm_hal_maxwell_sec2_semaphore_release_unsupported(uvm_push_t *push, NvU64 gpu_va, NvU32 payload);
void uvm_hal_pascal_ce_semaphore_release(uvm_push_t *push, NvU64 gpu_va, NvU32 payload);
void uvm_hal_volta_ce_semaphore_release(uvm_push_t *push, NvU64 gpu_va, NvU32 payload);
void uvm_hal_turing_host_semaphore_release(uvm_push_t *push, NvU64 gpu_va, NvU32 payload);
void uvm_hal_hopper_sec2_semaphore_release(uvm_push_t *push, NvU64 gpu_va, NvU32 payload);
void uvm_hal_hopper_ce_semaphore_release(uvm_push_t *push, NvU64 gpu_va, NvU32 payload);
void uvm_hal_hopper_host_semaphore_release(uvm_push_t *push, NvU64 gpu_va, NvU32 payload);

@@ -228,15 +232,30 @@ void uvm_hal_maxwell_host_semaphore_timestamp(uvm_push_t *push, NvU64 gpu_va);
void uvm_hal_volta_host_semaphore_timestamp(uvm_push_t *push, NvU64 gpu_va);
void uvm_hal_hopper_host_semaphore_timestamp(uvm_push_t *push, NvU64 gpu_va);

void uvm_hal_maxwell_sec2_semaphore_timestamp_unsupported(uvm_push_t *push, NvU64 gpu_va);
void uvm_hal_hopper_sec2_semaphore_timestamp_unsupported(uvm_push_t *push, NvU64 gpu_va);

typedef void (*uvm_hal_semaphore_acquire_t)(uvm_push_t *push, NvU64 gpu_va, NvU32 payload);
void uvm_hal_maxwell_host_semaphore_acquire(uvm_push_t *push, NvU64 gpu_va, NvU32 payload);
void uvm_hal_turing_host_semaphore_acquire(uvm_push_t *push, NvU64 gpu_va, NvU32 payload);
void uvm_hal_hopper_host_semaphore_acquire(uvm_push_t *push, NvU64 gpu_va, NvU32 payload);

typedef void (*uvm_hal_host_set_gpfifo_entry_t)(NvU64 *fifo_entry, NvU64 pushbuffer_va, NvU32 pushbuffer_length);
void uvm_hal_maxwell_host_set_gpfifo_entry(NvU64 *fifo_entry, NvU64 pushbuffer_va, NvU32 pushbuffer_length);
void uvm_hal_turing_host_set_gpfifo_entry(NvU64 *fifo_entry, NvU64 pushbuffer_va, NvU32 pushbuffer_length);
void uvm_hal_hopper_host_set_gpfifo_entry(NvU64 *fifo_entry, NvU64 pushbuffer_va, NvU32 pushbuffer_length);
typedef void (*uvm_hal_host_set_gpfifo_entry_t)(NvU64 *fifo_entry,
NvU64 pushbuffer_va,
NvU32 pushbuffer_length,
uvm_gpfifo_sync_t sync_flag);
void uvm_hal_maxwell_host_set_gpfifo_entry(NvU64 *fifo_entry,
NvU64 pushbuffer_va,
NvU32 pushbuffer_length,
uvm_gpfifo_sync_t sync_flag);
void uvm_hal_turing_host_set_gpfifo_entry(NvU64 *fifo_entry,
NvU64 pushbuffer_va,
NvU32 pushbuffer_length,
uvm_gpfifo_sync_t sync_flag);
void uvm_hal_hopper_host_set_gpfifo_entry(NvU64 *fifo_entry,
NvU64 pushbuffer_va,
NvU32 pushbuffer_length,
uvm_gpfifo_sync_t sync_flag);

typedef void (*uvm_hal_host_set_gpfifo_noop_t)(NvU64 *fifo_entry);
|
||||
void uvm_hal_maxwell_host_set_gpfifo_noop(NvU64 *fifo_entry);
|
||||
@@ -273,6 +292,10 @@ typedef NvU32 (*uvm_hal_ce_plc_mode_t)(void);
|
||||
NvU32 uvm_hal_maxwell_ce_plc_mode(void);
|
||||
NvU32 uvm_hal_ampere_ce_plc_mode_c7b5(void);
|
||||
|
||||
typedef NvU32 (*uvm_hal_ce_memcopy_type_t)(uvm_push_t *push, uvm_gpu_address_t dst, uvm_gpu_address_t src);
|
||||
NvU32 uvm_hal_maxwell_ce_memcopy_copy_type(uvm_push_t *push, uvm_gpu_address_t dst, uvm_gpu_address_t src);
|
||||
NvU32 uvm_hal_hopper_ce_memcopy_copy_type(uvm_push_t *push, uvm_gpu_address_t dst, uvm_gpu_address_t src);
|
||||
|
||||
// CE method validation
|
||||
typedef bool (*uvm_hal_ce_method_is_valid)(uvm_push_t *push, NvU32 method_address, NvU32 method_data);
|
||||
bool uvm_hal_ampere_ce_method_is_valid_c6b5(uvm_push_t *push, NvU32 method_address, NvU32 method_data);
|
||||
@@ -309,10 +332,19 @@ void uvm_hal_maxwell_ce_memcopy_v_to_v(uvm_push_t *push, NvU64 dst, NvU64 src, s
|
||||
// The validation happens at the start of the memset (uvm_hal_memset_*_t)
|
||||
// execution. Use uvm_hal_ce_memset_is_valid_stub to skip the validation for
|
||||
// a given architecture.
|
||||
typedef bool (*uvm_hal_ce_memset_is_valid)(uvm_push_t *push, uvm_gpu_address_t dst, size_t element_size);
|
||||
bool uvm_hal_ce_memset_is_valid_stub(uvm_push_t *push, uvm_gpu_address_t dst, size_t element_size);
|
||||
bool uvm_hal_ampere_ce_memset_is_valid_c6b5(uvm_push_t *push, uvm_gpu_address_t dst, size_t element_size);
|
||||
bool uvm_hal_hopper_ce_memset_is_valid(uvm_push_t *push, uvm_gpu_address_t dst, size_t element_size);
|
||||
typedef bool (*uvm_hal_ce_memset_is_valid)(uvm_push_t *push,
|
||||
uvm_gpu_address_t dst,
|
||||
size_t num_elements,
|
||||
size_t element_size);
|
||||
bool uvm_hal_ce_memset_is_valid_stub(uvm_push_t *push, uvm_gpu_address_t dst, size_t num_elements, size_t element_size);
|
||||
bool uvm_hal_ampere_ce_memset_is_valid_c6b5(uvm_push_t *push,
|
||||
uvm_gpu_address_t dst,
|
||||
size_t num_elements,
|
||||
size_t element_size);
|
||||
bool uvm_hal_hopper_ce_memset_is_valid(uvm_push_t *push,
|
||||
uvm_gpu_address_t dst,
|
||||
size_t num_elements,
|
||||
size_t element_size);
|
||||
|
||||
// Memset size bytes at dst to a given N-byte input value.
|
||||
//
|
||||
@@ -342,6 +374,54 @@ void uvm_hal_hopper_ce_memset_1(uvm_push_t *push, uvm_gpu_address_t dst, NvU8 va
|
||||
void uvm_hal_hopper_ce_memset_4(uvm_push_t *push, uvm_gpu_address_t dst, NvU32 value, size_t size);
|
||||
void uvm_hal_hopper_ce_memset_8(uvm_push_t *push, uvm_gpu_address_t dst, NvU64 value, size_t size);
|
||||
|
||||
// Encrypts the contents of the source buffer into the destination buffer, up to
// the given size. The authentication tag of the encrypted contents is written
// to auth_tag, so it can be verified later on by a decrypt operation.
//
// The addressing modes of the destination and authentication tag addresses
// should match. If the addressing mode is physical, then the address apertures
// should also match.
typedef void (*uvm_hal_ce_encrypt_t)(uvm_push_t *push,
                                     uvm_gpu_address_t dst,
                                     uvm_gpu_address_t src,
                                     NvU32 size,
                                     uvm_gpu_address_t auth_tag);

// Decrypts the contents of the source buffer into the destination buffer, up to
// the given size. The method also verifies the integrity of the encrypted
// buffer by calculating its authentication tag, and comparing it with the one
// provided as argument.
//
// The addressing modes of the source and authentication tag addresses should
// match. If the addressing mode is physical, then the address apertures should
// also match.
typedef void (*uvm_hal_ce_decrypt_t)(uvm_push_t *push,
                                     uvm_gpu_address_t dst,
                                     uvm_gpu_address_t src,
                                     NvU32 size,
                                     uvm_gpu_address_t auth_tag);
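// A minimal usage sketch, assuming a reserved push on a GPU with Confidential
// Computing enabled and addresses prepared by the caller. The wrapper name
// ce_encrypt_example() is hypothetical; only the encrypt() entry point and its
// signature come from this header.
static void ce_encrypt_example(uvm_push_t *push,
                               uvm_gpu_address_t dst,
                               uvm_gpu_address_t src,
                               NvU32 size,
                               uvm_gpu_address_t auth_tag)
{
    uvm_gpu_t *gpu = uvm_push_get_gpu(push);

    // The destination and the authentication tag must use the same addressing
    // mode (and, if physical, the same aperture).
    UVM_ASSERT(dst.is_virtual == auth_tag.is_virtual);

    // Encrypt src into dst; the tag for the ciphertext is written to auth_tag
    // so that a later decrypt of dst can verify its integrity.
    gpu->parent->ce_hal->encrypt(push, dst, src, size, auth_tag);
}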
void uvm_hal_maxwell_ce_encrypt_unsupported(uvm_push_t *push,
|
||||
uvm_gpu_address_t dst,
|
||||
uvm_gpu_address_t src,
|
||||
NvU32 size,
|
||||
uvm_gpu_address_t auth_tag);
|
||||
void uvm_hal_maxwell_ce_decrypt_unsupported(uvm_push_t *push,
|
||||
uvm_gpu_address_t dst,
|
||||
uvm_gpu_address_t src,
|
||||
NvU32 size,
|
||||
uvm_gpu_address_t auth_tag);
|
||||
void uvm_hal_hopper_ce_encrypt(uvm_push_t *push,
|
||||
uvm_gpu_address_t dst,
|
||||
uvm_gpu_address_t src,
|
||||
NvU32 size,
|
||||
uvm_gpu_address_t auth_tag);
|
||||
void uvm_hal_hopper_ce_decrypt(uvm_push_t *push,
|
||||
uvm_gpu_address_t dst,
|
||||
uvm_gpu_address_t src,
|
||||
NvU32 size,
|
||||
uvm_gpu_address_t auth_tag);
|
||||
|
||||
// Increments the semaphore by 1, or resets to 0 if the incremented value would
|
||||
// exceed the payload.
|
||||
//
|
||||
@@ -414,6 +494,7 @@ typedef bool (*uvm_hal_fault_buffer_entry_is_valid_t)(uvm_parent_gpu_t *parent_g
|
||||
typedef void (*uvm_hal_fault_buffer_entry_clear_valid_t)(uvm_parent_gpu_t *parent_gpu, NvU32 index);
|
||||
typedef NvU32 (*uvm_hal_fault_buffer_entry_size_t)(uvm_parent_gpu_t *parent_gpu);
|
||||
typedef void (*uvm_hal_fault_buffer_replay_t)(uvm_push_t *push, uvm_fault_replay_type_t type);
|
||||
typedef uvm_fault_type_t (*uvm_hal_fault_buffer_get_fault_type_t)(const NvU32 *fault_entry);
|
||||
typedef void (*uvm_hal_fault_cancel_global_t)(uvm_push_t *push, uvm_gpu_phys_address_t instance_ptr);
|
||||
typedef void (*uvm_hal_fault_cancel_targeted_t)(uvm_push_t *push,
|
||||
uvm_gpu_phys_address_t instance_ptr,
|
||||
@@ -430,6 +511,8 @@ NvU8 uvm_hal_maxwell_fault_buffer_get_ve_id_unsupported(NvU16 mmu_engine_id, uvm
|
||||
void uvm_hal_maxwell_fault_buffer_parse_entry_unsupported(uvm_parent_gpu_t *parent_gpu,
|
||||
NvU32 index,
|
||||
uvm_fault_buffer_entry_t *buffer_entry);
|
||||
uvm_fault_type_t uvm_hal_maxwell_fault_buffer_get_fault_type_unsupported(const NvU32 *fault_entry);
|
||||
|
||||
void uvm_hal_pascal_enable_replayable_faults(uvm_parent_gpu_t *parent_gpu);
|
||||
void uvm_hal_pascal_disable_replayable_faults(uvm_parent_gpu_t *parent_gpu);
|
||||
void uvm_hal_pascal_clear_replayable_faults(uvm_parent_gpu_t *parent_gpu, NvU32 get);
|
||||
@@ -439,6 +522,8 @@ void uvm_hal_pascal_fault_buffer_write_get(uvm_parent_gpu_t *parent_gpu, NvU32 i
|
||||
void uvm_hal_pascal_fault_buffer_parse_entry(uvm_parent_gpu_t *parent_gpu,
|
||||
NvU32 index,
|
||||
uvm_fault_buffer_entry_t *buffer_entry);
|
||||
uvm_fault_type_t uvm_hal_pascal_fault_buffer_get_fault_type(const NvU32 *fault_entry);
|
||||
|
||||
NvU32 uvm_hal_volta_fault_buffer_read_put(uvm_parent_gpu_t *parent_gpu);
|
||||
NvU32 uvm_hal_volta_fault_buffer_read_get(uvm_parent_gpu_t *parent_gpu);
|
||||
void uvm_hal_volta_fault_buffer_write_get(uvm_parent_gpu_t *parent_gpu, NvU32 index);
|
||||
@@ -446,6 +531,8 @@ NvU8 uvm_hal_volta_fault_buffer_get_ve_id(NvU16 mmu_engine_id, uvm_mmu_engine_ty
|
||||
void uvm_hal_volta_fault_buffer_parse_entry(uvm_parent_gpu_t *parent_gpu,
|
||||
NvU32 index,
|
||||
uvm_fault_buffer_entry_t *buffer_entry);
|
||||
uvm_fault_type_t uvm_hal_volta_fault_buffer_get_fault_type(const NvU32 *fault_entry);
|
||||
|
||||
void uvm_hal_turing_disable_replayable_faults(uvm_parent_gpu_t *parent_gpu);
|
||||
void uvm_hal_turing_clear_replayable_faults(uvm_parent_gpu_t *parent_gpu, NvU32 get);
|
||||
NvU8 uvm_hal_hopper_fault_buffer_get_ve_id(NvU16 mmu_engine_id, uvm_mmu_engine_type_t mmu_engine_type);
|
||||
@@ -586,6 +673,28 @@ void uvm_hal_volta_access_counter_clear_targeted(uvm_push_t *push,
|
||||
void uvm_hal_turing_disable_access_counter_notifications(uvm_parent_gpu_t *parent_gpu);
|
||||
void uvm_hal_turing_clear_access_counter_notifications(uvm_parent_gpu_t *parent_gpu, NvU32 get);
|
||||
|
||||
// The source and destination addresses must be 16-byte aligned. Note that the
// best performance is achieved with 256-byte alignment. The decrypt size must
// be larger than 0, and a multiple of 4 bytes.
//
// The authentication tag address must also be 16-byte aligned.
// The authentication tag buffer size is UVM_CONF_COMPUTING_AUTH_TAG_SIZE bytes
// defined in uvm_conf_computing.h.
//
// Decrypts the src buffer into the dst buffer of the given size.
// The method also verifies integrity of the src buffer by calculating its
// authentication tag and comparing it with the provided one.
//
// Note: SEC2 does not support encryption.
typedef void (*uvm_hal_sec2_decrypt_t)(uvm_push_t *push, NvU64 dst_va, NvU64 src_va, NvU32 size, NvU64 auth_tag_va);
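// A minimal usage sketch, assuming a SEC2-capable GPU whose sec2_hal table is
// populated like the other per-engine HAL tables, and buffers that already
// satisfy the alignment rules above. The wrapper name sec2_decrypt_example()
// is hypothetical.
static void sec2_decrypt_example(uvm_push_t *push,
                                 NvU64 dst_va,
                                 NvU64 src_va,
                                 NvU32 size,
                                 NvU64 auth_tag_va)
{
    uvm_gpu_t *gpu = uvm_push_get_gpu(push);

    // 16-byte aligned buffers, non-zero size in multiples of 4 bytes.
    UVM_ASSERT(IS_ALIGNED(dst_va, 16) && IS_ALIGNED(src_va, 16) && IS_ALIGNED(auth_tag_va, 16));
    UVM_ASSERT(size > 0 && size % 4 == 0);

    gpu->parent->sec2_hal->decrypt(push, dst_va, src_va, size, auth_tag_va);
}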
void uvm_hal_maxwell_sec2_decrypt_unsupported(uvm_push_t *push,
|
||||
NvU64 dst_va,
|
||||
NvU64 src_va,
|
||||
NvU32 size,
|
||||
NvU64 auth_tag_va);
|
||||
void uvm_hal_hopper_sec2_decrypt(uvm_push_t *push, NvU64 dst_va, NvU64 src_va, NvU32 size, NvU64 auth_tag_va);
|
||||
|
||||
struct uvm_host_hal_struct
|
||||
{
|
||||
uvm_hal_init_t init;
|
||||
@@ -629,6 +738,7 @@ struct uvm_ce_hal_struct
|
||||
uvm_hal_ce_offset_in_out_t offset_in_out;
|
||||
uvm_hal_ce_phys_mode_t phys_mode;
|
||||
uvm_hal_ce_plc_mode_t plc_mode;
|
||||
uvm_hal_ce_memcopy_type_t memcopy_copy_type;
|
||||
uvm_hal_ce_memcopy_is_valid memcopy_is_valid;
|
||||
uvm_hal_ce_memcopy_patch_src memcopy_patch_src;
|
||||
uvm_hal_memcopy_t memcopy;
|
||||
@@ -639,6 +749,8 @@ struct uvm_ce_hal_struct
|
||||
uvm_hal_memset_8_t memset_8;
|
||||
uvm_hal_memset_v_4_t memset_v_4;
|
||||
uvm_hal_semaphore_reduction_inc_t semaphore_reduction_inc;
|
||||
uvm_hal_ce_encrypt_t encrypt;
|
||||
uvm_hal_ce_decrypt_t decrypt;
|
||||
};
|
||||
|
||||
struct uvm_arch_hal_struct
|
||||
@@ -665,6 +777,7 @@ struct uvm_fault_buffer_hal_struct
|
||||
uvm_hal_fault_buffer_entry_clear_valid_t entry_clear_valid;
|
||||
uvm_hal_fault_buffer_entry_size_t entry_size;
|
||||
uvm_hal_fault_buffer_parse_non_replayable_entry_t parse_non_replayable_entry;
|
||||
uvm_hal_fault_buffer_get_fault_type_t get_fault_type;
|
||||
};
|
||||
|
||||
struct uvm_access_counter_buffer_hal_struct
|
||||
@@ -678,6 +791,14 @@ struct uvm_access_counter_buffer_hal_struct
|
||||
uvm_hal_access_counter_buffer_entry_size_t entry_size;
|
||||
};
|
||||
|
||||
struct uvm_sec2_hal_struct
|
||||
{
|
||||
uvm_hal_init_t init;
|
||||
uvm_hal_sec2_decrypt_t decrypt;
|
||||
uvm_hal_semaphore_release_t semaphore_release;
|
||||
uvm_hal_semaphore_timestamp_t semaphore_timestamp;
|
||||
};
|
||||
|
||||
typedef struct
|
||||
{
|
||||
// id is either a hardware class or GPU architecture
|
||||
@@ -700,6 +821,8 @@ typedef struct
|
||||
// access_counter_buffer_ops: id is an architecture
|
||||
uvm_access_counter_buffer_hal_t access_counter_buffer_ops;
|
||||
|
||||
// sec2_ops: id is an architecture
|
||||
uvm_sec2_hal_t sec2_ops;
|
||||
} u;
|
||||
} uvm_hal_class_ops_t;
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2016-2019 NVIDIA Corporation
|
||||
Copyright (c) 2016-2023 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
@@ -111,6 +111,11 @@ typedef struct
|
||||
|
||||
    // Whether the address is virtual
    bool is_virtual;

    // Whether the address resides in a non-protected memory region when the
    // Confidential Computing feature is enabled. Default is protected.
    // Ignored if the feature is disabled and should not be used.
    bool is_unprotected;
} uvm_gpu_address_t;
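// A minimal usage sketch, assuming the virtual-address constructor declared
// just below (uvm_gpu_address_virtual()) and an allocation whose backing
// storage is unprotected sysmem. The field only matters when Confidential
// Computing is enabled and is ignored otherwise.
//
//     uvm_gpu_address_t dst = uvm_gpu_address_virtual(dst_va);
//
//     dst.is_unprotected = true;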
// Create a virtual GPU address
|
||||
@@ -258,8 +263,8 @@ typedef enum
|
||||
UVM_FAULT_CANCEL_VA_MODE_COUNT,
|
||||
} uvm_fault_cancel_va_mode_t;
|
||||
|
||||
// Types of faults that can show up in the fault buffer. Non-UVM related faults are grouped in FATAL category
|
||||
// since we don't care about the specific type
|
||||
// Types of faults that can show up in the fault buffer. Non-UVM related faults
|
||||
// are grouped in FATAL category since we don't care about the specific type.
|
||||
typedef enum
|
||||
{
|
||||
UVM_FAULT_TYPE_INVALID_PDE = 0,
|
||||
@@ -272,7 +277,8 @@ typedef enum
|
||||
// READ to WRITE-ONLY (ATS)
|
||||
UVM_FAULT_TYPE_READ,
|
||||
|
||||
// The next values are considered fatal and are not handled by the UVM driver
|
||||
// The next values are considered fatal and are not handled by the UVM
|
||||
// driver
|
||||
UVM_FAULT_TYPE_FATAL,
|
||||
|
||||
// Values required for tools
|
||||
@@ -311,10 +317,24 @@ typedef enum
|
||||
UVM_MMU_ENGINE_TYPE_COUNT,
|
||||
} uvm_mmu_engine_type_t;
|
||||
|
||||
typedef enum
{
    // Allow entry to be fetched before the previous entry finishes ESCHED
    // execution.
    UVM_GPFIFO_SYNC_PROCEED = 0,

    // Fetch of this entry has to wait until the previous entry has finished
    // executing by ESCHED.
    // For a complete engine sync the previous entry needs to include
    // WAIT_FOR_IDLE command or other engine synchronization.
    UVM_GPFIFO_SYNC_WAIT,
} uvm_gpfifo_sync_t;
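// A minimal usage sketch, assuming the host HAL table exposes a
// set_gpfifo_entry() member matching the uvm_hal_host_set_gpfifo_entry_t
// typedef in uvm_hal.h; the local names are hypothetical.
//
//     // Let ESCHED prefetch this entry while the previous one executes:
//     gpu->parent->host_hal->set_gpfifo_entry(fifo_entry,
//                                             pushbuffer_va,
//                                             pushbuffer_length,
//                                             UVM_GPFIFO_SYNC_PROCEED);
//
//     // Or require the previous entry to finish executing first:
//     gpu->parent->host_hal->set_gpfifo_entry(fifo_entry,
//                                             pushbuffer_va,
//                                             pushbuffer_length,
//                                             UVM_GPFIFO_SYNC_WAIT);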
const char *uvm_mmu_engine_type_string(uvm_mmu_engine_type_t mmu_engine_type);
|
||||
|
||||
// HW unit that triggered the fault. We include the fields required for fault cancelling. Including more information
|
||||
// might be useful for performance heuristics in the future
|
||||
// HW unit that triggered the fault. We include the fields required for fault
|
||||
// cancelling. Including more information might be useful for performance
|
||||
// heuristics in the future.
|
||||
typedef struct
|
||||
{
|
||||
uvm_fault_client_type_t client_type : order_base_2(UVM_FAULT_CLIENT_TYPE_COUNT) + 1;
|
||||
@@ -429,7 +449,8 @@ typedef enum
|
||||
// Completes when all fault replays are in-flight
|
||||
UVM_FAULT_REPLAY_TYPE_START = 0,
|
||||
|
||||
// Completes when all faulting accesses have been correctly translated or faulted again
|
||||
// Completes when all faulting accesses have been correctly translated or
|
||||
// faulted again
|
||||
UVM_FAULT_REPLAY_TYPE_START_ACK_ALL,
|
||||
|
||||
UVM_FAULT_REPLAY_TYPE_MAX
|
||||
@@ -467,18 +488,18 @@ struct uvm_access_counter_buffer_entry_struct
|
||||
{
|
||||
struct
|
||||
{
|
||||
// Instance pointer of one of the channels in the TSG that triggered the
|
||||
// notification
|
||||
// Instance pointer of one of the channels in the TSG that triggered
|
||||
// the notification.
|
||||
uvm_gpu_phys_address_t instance_ptr;
|
||||
|
||||
uvm_mmu_engine_type_t mmu_engine_type;
|
||||
|
||||
NvU32 mmu_engine_id;
|
||||
|
||||
// Identifier of the subcontext that performed the memory accesses that
|
||||
// triggered the notification. This value, combined with the instance_ptr,
|
||||
// is needed to obtain the GPU VA space of the process that triggered the
|
||||
// notification.
|
||||
// Identifier of the subcontext that performed the memory accesses
|
||||
// that triggered the notification. This value, combined with the
|
||||
// instance_ptr, is needed to obtain the GPU VA space of the process
|
||||
// that triggered the notification.
|
||||
NvU32 ve_id;
|
||||
|
||||
// VA space for the address that triggered the notification
|
||||
@@ -524,8 +545,8 @@ static uvm_prot_t uvm_fault_access_type_to_prot(uvm_fault_access_type_t access_t
|
||||
return UVM_PROT_READ_WRITE;
|
||||
|
||||
default:
|
||||
// Prefetch faults, if not ignored, are handled like read faults and require
|
||||
// a mapping with, at least, READ_ONLY access permission
|
||||
// Prefetch faults, if not ignored, are handled like read faults and
|
||||
// require a mapping with, at least, READ_ONLY access permission.
|
||||
return UVM_PROT_READ_ONLY;
|
||||
}
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
@@ -84,25 +84,6 @@ typedef struct
|
||||
NvU64 addr,
|
||||
uvm_va_block_t **va_block_ptr);
|
||||
|
||||
// Find an existing HMM va_block when processing a CPU fault and try to
|
||||
// isolate and lock the faulting page.
|
||||
// Return NV_ERR_INVALID_ADDRESS if the block is not found,
|
||||
// NV_ERR_BUSY_RETRY if the page could not be locked, and
|
||||
// NV_OK if the block is found and the page is locked. Also,
|
||||
// uvm_hmm_cpu_fault_finish() must be called if NV_OK is returned.
|
||||
// Locking: This must be called with the vma->vm_mm locked and the va_space
|
||||
// read locked.
|
||||
NV_STATUS uvm_hmm_va_block_cpu_find(uvm_va_space_t *va_space,
|
||||
uvm_service_block_context_t *service_context,
|
||||
struct vm_fault *vmf,
|
||||
uvm_va_block_t **va_block_ptr);
|
||||
|
||||
// This must be called after uvm_va_block_cpu_fault() if
|
||||
// uvm_hmm_va_block_cpu_find() returns NV_OK.
|
||||
// Locking: This must be called with the vma->vm_mm locked and the va_space
|
||||
// read locked.
|
||||
void uvm_hmm_cpu_fault_finish(uvm_service_block_context_t *service_context);
|
||||
|
||||
// Find or create a new HMM va_block.
|
||||
//
|
||||
// Return NV_ERR_INVALID_ADDRESS if there is no VMA associated with the
|
||||
@@ -136,6 +117,26 @@ typedef struct
|
||||
uvm_va_block_context_t *va_block_context,
|
||||
uvm_va_block_region_t region);
|
||||
|
||||
// Initialize the HMM portion of the service_context.
|
||||
// This should be called one time before any retry loops calling
|
||||
// uvm_va_block_service_locked().
|
||||
void uvm_hmm_service_context_init(uvm_service_block_context_t *service_context);
|
||||
|
||||
// Begin a migration critical section. When calling into the kernel it is
// sometimes necessary to drop the va_block lock. This function returns
// NV_OK when no other thread has started a migration critical section.
// Otherwise, it returns NV_ERR_BUSY_RETRY and threads should then retry
// this function to begin a critical section.
// Locking: va_block lock must not be held.
NV_STATUS uvm_hmm_migrate_begin(uvm_va_block_t *va_block);

// Same as uvm_hmm_migrate_begin() but waits if required before beginning a
// critical section.
void uvm_hmm_migrate_begin_wait(uvm_va_block_t *va_block);

// Finish a migration critical section.
void uvm_hmm_migrate_finish(uvm_va_block_t *va_block);
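// A minimal sketch of the retry pattern described above, assuming a caller
// that migrates a single va_block; the wrapper name hmm_migrate_example() and
// the elided migration body are hypothetical.
static NV_STATUS hmm_migrate_example(uvm_va_block_t *va_block)
{
    // Must be called without the va_block lock held.
    NV_STATUS status = uvm_hmm_migrate_begin(va_block);

    if (status == NV_ERR_BUSY_RETRY) {
        // Either retry uvm_hmm_migrate_begin() later, or block until the
        // critical section can be entered:
        uvm_hmm_migrate_begin_wait(va_block);
    }
    else if (status != NV_OK) {
        return status;
    }

    // ... migrate pages while inside the critical section ...

    uvm_hmm_migrate_finish(va_block);

    return NV_OK;
}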
// Find or create a HMM va_block and mark it so the next va_block split
|
||||
// will fail for testing purposes.
|
||||
// Locking: This function must be called with mm retained and locked for
|
||||
@@ -314,6 +315,11 @@ typedef struct
|
||||
uvm_migrate_mode_t mode,
|
||||
uvm_tracker_t *out_tracker);
|
||||
|
||||
// Evicts all va_blocks in the va_space to the CPU. Unlike the
|
||||
// other va_block eviction functions this is based on virtual
|
||||
// address and therefore takes mmap_lock for read.
|
||||
void uvm_hmm_evict_va_blocks(uvm_va_space_t *va_space);
|
||||
|
||||
// This sets the va_block_context->hmm.src_pfns[] to the ZONE_DEVICE private
|
||||
// PFN for the GPU chunk memory.
|
||||
NV_STATUS uvm_hmm_va_block_evict_chunk_prep(uvm_va_block_t *va_block,
|
||||
@@ -345,13 +351,13 @@ typedef struct
|
||||
const uvm_page_mask_t *pages_to_evict,
|
||||
uvm_va_block_region_t region);
|
||||
|
||||
// Migrate a GPU chunk to system memory. This called to remove CPU page
|
||||
// table references to device private struct pages for the given GPU after
|
||||
// all other references in va_blocks have been released and the GPU is
|
||||
// in the process of being removed/torn down. Note that there is no mm,
|
||||
// VMA, va_block or any user channel activity on this GPU.
|
||||
NV_STATUS uvm_hmm_pmm_gpu_evict_chunk(uvm_gpu_t *gpu,
|
||||
uvm_gpu_chunk_t *gpu_chunk);
|
||||
// Migrate a GPU device-private page to system memory. This is
|
||||
// called to remove CPU page table references to device private
|
||||
// struct pages for the given GPU after all other references in
|
||||
// va_blocks have been released and the GPU is in the process of
|
||||
// being removed/torn down. Note that there is no mm, VMA,
|
||||
// va_block or any user channel activity on this GPU.
|
||||
NV_STATUS uvm_hmm_pmm_gpu_evict_pfn(unsigned long pfn);
|
||||
|
||||
// This returns what would be the intersection of va_block start/end and
|
||||
// VMA start/end-1 for the given 'lookup_address' if
|
||||
@@ -432,18 +438,6 @@ typedef struct
|
||||
return NV_ERR_INVALID_ADDRESS;
|
||||
}
|
||||
|
||||
static NV_STATUS uvm_hmm_va_block_cpu_find(uvm_va_space_t *va_space,
|
||||
uvm_service_block_context_t *service_context,
|
||||
struct vm_fault *vmf,
|
||||
uvm_va_block_t **va_block_ptr)
|
||||
{
|
||||
return NV_ERR_INVALID_ADDRESS;
|
||||
}
|
||||
|
||||
static void uvm_hmm_cpu_fault_finish(uvm_service_block_context_t *service_context)
|
||||
{
|
||||
}
|
||||
|
||||
static NV_STATUS uvm_hmm_va_block_find_create(uvm_va_space_t *va_space,
|
||||
NvU64 addr,
|
||||
uvm_va_block_context_t *va_block_context,
|
||||
@@ -464,6 +458,23 @@ typedef struct
|
||||
return true;
|
||||
}
|
||||
|
||||
static void uvm_hmm_service_context_init(uvm_service_block_context_t *service_context)
|
||||
{
|
||||
}
|
||||
|
||||
static NV_STATUS uvm_hmm_migrate_begin(uvm_va_block_t *va_block)
|
||||
{
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
static void uvm_hmm_migrate_begin_wait(uvm_va_block_t *va_block)
|
||||
{
|
||||
}
|
||||
|
||||
static void uvm_hmm_migrate_finish(uvm_va_block_t *va_block)
|
||||
{
|
||||
}
|
||||
|
||||
static NV_STATUS uvm_hmm_test_va_block_inject_split_error(uvm_va_space_t *va_space, NvU64 addr)
|
||||
{
|
||||
return NV_ERR_INVALID_ADDRESS;
|
||||
@@ -586,6 +597,10 @@ typedef struct
|
||||
return NV_ERR_INVALID_ADDRESS;
|
||||
}
|
||||
|
||||
static void uvm_hmm_evict_va_blocks(uvm_va_space_t *va_space)
|
||||
{
|
||||
}
|
||||
|
||||
static NV_STATUS uvm_hmm_va_block_evict_chunk_prep(uvm_va_block_t *va_block,
|
||||
uvm_va_block_context_t *va_block_context,
|
||||
uvm_gpu_chunk_t *gpu_chunk,
|
||||
@@ -612,8 +627,7 @@ typedef struct
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
static NV_STATUS uvm_hmm_pmm_gpu_evict_chunk(uvm_gpu_t *gpu,
|
||||
uvm_gpu_chunk_t *gpu_chunk)
|
||||
static NV_STATUS uvm_hmm_pmm_gpu_evict_pfn(unsigned long pfn)
|
||||
{
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
@@ -49,11 +49,17 @@ void uvm_hal_hopper_arch_init_properties(uvm_parent_gpu_t *parent_gpu)
|
||||
// A single top level PDE on Hopper covers 64 PB and that's the minimum
|
||||
// size that can be used.
|
||||
parent_gpu->rm_va_base = 0;
|
||||
parent_gpu->rm_va_size = 64ull * 1024 * 1024 * 1024 * 1024 * 1024;
|
||||
parent_gpu->rm_va_size = 64 * UVM_SIZE_1PB;
|
||||
|
||||
parent_gpu->uvm_mem_va_base = parent_gpu->rm_va_size + 384ull * 1024 * 1024 * 1024 * 1024;
|
||||
parent_gpu->uvm_mem_va_base = parent_gpu->rm_va_size + 384 * UVM_SIZE_1TB;
|
||||
parent_gpu->uvm_mem_va_size = UVM_MEM_VA_SIZE;
|
||||
|
||||
// See uvm_mmu.h for mapping placement
|
||||
parent_gpu->flat_vidmem_va_base = (64 * UVM_SIZE_1PB) + (8 * UVM_SIZE_1TB);
|
||||
|
||||
// Physical CE writes to vidmem are non-coherent with respect to the CPU on
|
||||
// GH180.
|
||||
parent_gpu->ce_phys_vidmem_write_supported = !uvm_gpu_is_coherent(parent_gpu);
|
||||
|
||||
parent_gpu->peer_copy_mode = g_uvm_global.peer_copy_mode;
|
||||
|
||||
|
||||
@@ -24,6 +24,7 @@
|
||||
#include "uvm_hal.h"
|
||||
#include "uvm_push.h"
|
||||
#include "uvm_mem.h"
|
||||
#include "uvm_conf_computing.h"
|
||||
#include "clc8b5.h"
|
||||
|
||||
static NvU32 ce_aperture(uvm_aperture_t aperture)
|
||||
@@ -97,7 +98,8 @@ void uvm_hal_hopper_ce_semaphore_release(uvm_push_t *push, NvU64 gpu_va, NvU32 p
|
||||
|
||||
NV_PUSH_1U(C8B5, LAUNCH_DMA, hopper_get_flush_value(push) |
|
||||
HWCONST(C8B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, NONE) |
|
||||
HWCONST(C8B5, LAUNCH_DMA, SEMAPHORE_TYPE, RELEASE_ONE_WORD_SEMAPHORE) |
|
||||
HWCONST(C8B5, LAUNCH_DMA, SEMAPHORE_PAYLOAD_SIZE, ONE_WORD) |
|
||||
HWCONST(C8B5, LAUNCH_DMA, SEMAPHORE_TYPE, RELEASE_SEMAPHORE_NO_TIMESTAMP) |
|
||||
launch_dma_plc_mode);
|
||||
}
|
||||
|
||||
@@ -114,7 +116,8 @@ void uvm_hal_hopper_ce_semaphore_reduction_inc(uvm_push_t *push, NvU64 gpu_va, N
|
||||
|
||||
NV_PUSH_1U(C8B5, LAUNCH_DMA, hopper_get_flush_value(push) |
|
||||
HWCONST(C8B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, NONE) |
|
||||
HWCONST(C8B5, LAUNCH_DMA, SEMAPHORE_TYPE, RELEASE_ONE_WORD_SEMAPHORE) |
|
||||
HWCONST(C8B5, LAUNCH_DMA, SEMAPHORE_PAYLOAD_SIZE, ONE_WORD) |
|
||||
HWCONST(C8B5, LAUNCH_DMA, SEMAPHORE_TYPE, RELEASE_SEMAPHORE_NO_TIMESTAMP) |
|
||||
HWCONST(C8B5, LAUNCH_DMA, SEMAPHORE_REDUCTION, INC) |
|
||||
HWCONST(C8B5, LAUNCH_DMA, SEMAPHORE_REDUCTION_SIGN, UNSIGNED) |
|
||||
HWCONST(C8B5, LAUNCH_DMA, SEMAPHORE_REDUCTION_ENABLE, TRUE) |
|
||||
@@ -135,7 +138,8 @@ void uvm_hal_hopper_ce_semaphore_timestamp(uvm_push_t *push, NvU64 gpu_va)
|
||||
|
||||
NV_PUSH_1U(C8B5, LAUNCH_DMA, hopper_get_flush_value(push) |
|
||||
HWCONST(C8B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, NONE) |
|
||||
HWCONST(C8B5, LAUNCH_DMA, SEMAPHORE_TYPE, RELEASE_FOUR_WORD_SEMAPHORE) |
|
||||
HWCONST(C8B5, LAUNCH_DMA, SEMAPHORE_PAYLOAD_SIZE, ONE_WORD) |
|
||||
HWCONST(C8B5, LAUNCH_DMA, SEMAPHORE_TYPE, RELEASE_SEMAPHORE_WITH_TIMESTAMP) |
|
||||
launch_dma_plc_mode);
|
||||
}
|
||||
|
||||
@@ -148,12 +152,46 @@ static NvU32 hopper_memset_push_phys_mode(uvm_push_t *push, uvm_gpu_address_t ds
|
||||
return HWCONST(C8B5, LAUNCH_DMA, DST_TYPE, PHYSICAL);
|
||||
}
|
||||
|
||||
static bool hopper_scrub_enable(uvm_gpu_address_t dst, size_t size)
|
||||
static bool va_is_flat_vidmem(uvm_gpu_t *gpu, NvU64 va)
|
||||
{
|
||||
return !dst.is_virtual &&
|
||||
dst.aperture == UVM_APERTURE_VID &&
|
||||
IS_ALIGNED(dst.address, UVM_PAGE_SIZE_4K) &&
|
||||
IS_ALIGNED(size, UVM_PAGE_SIZE_4K);
|
||||
return (uvm_mmu_gpu_needs_static_vidmem_mapping(gpu) || uvm_mmu_gpu_needs_dynamic_vidmem_mapping(gpu)) &&
|
||||
va >= gpu->parent->flat_vidmem_va_base &&
|
||||
va < gpu->parent->flat_vidmem_va_base + UVM_GPU_MAX_PHYS_MEM;
|
||||
}
|
||||
|
||||
// Return whether a memset should use the fast scrubber. If so, convert dst to
|
||||
// the address needed by the fast scrubber.
|
||||
static bool hopper_scrub_enable(uvm_gpu_t *gpu, uvm_gpu_address_t *dst, size_t size)
|
||||
{
|
||||
if (!IS_ALIGNED(dst->address, UVM_PAGE_SIZE_4K) || !IS_ALIGNED(size, UVM_PAGE_SIZE_4K))
|
||||
return false;
|
||||
|
||||
// When CE physical writes are disallowed, higher layers will convert
|
||||
// physical memsets to virtual using the flat mapping. Those layers are
|
||||
// unaware of the fast scrubber, which is safe to use specifically when CE
|
||||
// physical access is disallowed. Detect such memsets within the flat vidmem
|
||||
// region and convert them back to physical, since the fast scrubber only
|
||||
// works with physical addressing.
|
||||
if (dst->is_virtual && !gpu->parent->ce_phys_vidmem_write_supported && va_is_flat_vidmem(gpu, dst->address)) {
|
||||
*dst = uvm_gpu_address_physical(UVM_APERTURE_VID, dst->address - gpu->parent->flat_vidmem_va_base);
|
||||
return true;
|
||||
}
|
||||
|
||||
return !dst->is_virtual && dst->aperture == UVM_APERTURE_VID;
|
||||
}
|
||||
|
||||
static NvU32 hopper_memset_copy_type(uvm_push_t *push, uvm_gpu_address_t dst)
|
||||
{
|
||||
if (uvm_conf_computing_mode_enabled(uvm_push_get_gpu(push)) && dst.is_unprotected)
|
||||
return HWCONST(C8B5, LAUNCH_DMA, COPY_TYPE, NONPROT2NONPROT);
|
||||
return HWCONST(C8B5, LAUNCH_DMA, COPY_TYPE, DEFAULT);
|
||||
}
|
||||
|
||||
NvU32 uvm_hal_hopper_ce_memcopy_copy_type(uvm_push_t *push, uvm_gpu_address_t dst, uvm_gpu_address_t src)
|
||||
{
|
||||
if (uvm_conf_computing_mode_enabled(uvm_push_get_gpu(push)) && dst.is_unprotected && src.is_unprotected)
|
||||
return HWCONST(C8B5, LAUNCH_DMA, COPY_TYPE, NONPROT2NONPROT);
|
||||
return HWCONST(C8B5, LAUNCH_DMA, COPY_TYPE, DEFAULT);
|
||||
}
|
||||
|
||||
static void hopper_memset_common(uvm_push_t *push,
|
||||
@@ -172,8 +210,10 @@ static void hopper_memset_common(uvm_push_t *push,
|
||||
NvU32 launch_dma_remap_enable;
|
||||
NvU32 launch_dma_scrub_enable;
|
||||
NvU32 flush_value = HWCONST(C8B5, LAUNCH_DMA, FLUSH_ENABLE, FALSE);
|
||||
NvU32 copy_type_value = hopper_memset_copy_type(push, dst);
|
||||
bool is_scrub = hopper_scrub_enable(gpu, &dst, num_elements * memset_element_size);
|
||||
|
||||
UVM_ASSERT_MSG(gpu->parent->ce_hal->memset_is_valid(push, dst, memset_element_size),
|
||||
UVM_ASSERT_MSG(gpu->parent->ce_hal->memset_is_valid(push, dst, num_elements, memset_element_size),
|
||||
"Memset validation failed in channel %s, GPU %s",
|
||||
push->channel->name,
|
||||
uvm_gpu_name(gpu));
|
||||
@@ -186,7 +226,7 @@ static void hopper_memset_common(uvm_push_t *push,
|
||||
else
|
||||
pipelined_value = HWCONST(C8B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, NON_PIPELINED);
|
||||
|
||||
if (memset_element_size == 8 && hopper_scrub_enable(dst, num_elements * memset_element_size)) {
|
||||
if (memset_element_size == 8 && is_scrub) {
|
||||
launch_dma_remap_enable = HWCONST(C8B5, LAUNCH_DMA, REMAP_ENABLE, FALSE);
|
||||
launch_dma_scrub_enable = HWCONST(C8B5, LAUNCH_DMA, MEMORY_SCRUB_ENABLE, TRUE);
|
||||
|
||||
@@ -223,6 +263,7 @@ static void hopper_memset_common(uvm_push_t *push,
|
||||
launch_dma_scrub_enable |
|
||||
launch_dma_dst_type |
|
||||
launch_dma_plc_mode |
|
||||
copy_type_value |
|
||||
pipelined_value);
|
||||
|
||||
dst.address += memset_this_time * memset_element_size;
|
||||
@@ -250,7 +291,7 @@ void uvm_hal_hopper_ce_memset_8(uvm_push_t *push, uvm_gpu_address_t dst, NvU64 v
|
||||
|
||||
void uvm_hal_hopper_ce_memset_1(uvm_push_t *push, uvm_gpu_address_t dst, NvU8 value, size_t size)
|
||||
{
|
||||
if (hopper_scrub_enable(dst, size)) {
|
||||
if (hopper_scrub_enable(uvm_push_get_gpu(push), &dst, size)) {
|
||||
NvU64 value64 = value;
|
||||
|
||||
value64 |= value64 << 8;
|
||||
@@ -274,7 +315,7 @@ void uvm_hal_hopper_ce_memset_4(uvm_push_t *push, uvm_gpu_address_t dst, NvU32 v
|
||||
{
|
||||
UVM_ASSERT_MSG(size % 4 == 0, "size: %zd\n", size);
|
||||
|
||||
if (hopper_scrub_enable(dst, size)) {
|
||||
if (hopper_scrub_enable(uvm_push_get_gpu(push), &dst, size)) {
|
||||
NvU64 value64 = value;
|
||||
|
||||
value64 |= value64 << 32;
|
||||
@@ -294,15 +335,235 @@ void uvm_hal_hopper_ce_memset_4(uvm_push_t *push, uvm_gpu_address_t dst, NvU32 v
|
||||
hopper_memset_common(push, dst, size, 4);
|
||||
}
|
||||
|
||||
bool uvm_hal_hopper_ce_memset_is_valid(uvm_push_t *push, uvm_gpu_address_t dst, size_t element_size)
|
||||
bool uvm_hal_hopper_ce_memset_is_valid(uvm_push_t *push,
|
||||
uvm_gpu_address_t dst,
|
||||
size_t num_elements,
|
||||
size_t element_size)
|
||||
{
|
||||
uvm_gpu_t *gpu = uvm_push_get_gpu(push);
|
||||
|
||||
// In HCC, if a memset uses physical addressing for the destination, then
|
||||
// it must write to (protected) vidmem. If the memset uses virtual
|
||||
// addressing, and the backing storage is not vidmem, the access is only
|
||||
// legal if the copy type is NONPROT2NONPROT, and the destination is
|
||||
// unprotected sysmem, but the validation does not detect it.
|
||||
if (uvm_conf_computing_mode_is_hcc(gpu) && !dst.is_virtual && dst.aperture != UVM_APERTURE_VID)
|
||||
return false;
|
||||
|
||||
if (!gpu->parent->ce_phys_vidmem_write_supported) {
|
||||
size_t size = num_elements * element_size;
|
||||
uvm_gpu_address_t temp = dst;
|
||||
|
||||
// Physical vidmem writes are disallowed, unless using the scrubber
|
||||
if (!dst.is_virtual && dst.aperture == UVM_APERTURE_VID && !hopper_scrub_enable(gpu, &temp, size)) {
|
||||
UVM_ERR_PRINT("Destination address of vidmem memset must be virtual, not physical: {%s, 0x%llx} size %zu\n",
|
||||
uvm_gpu_address_aperture_string(dst),
|
||||
dst.address,
|
||||
size);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool uvm_hal_hopper_ce_memcopy_is_valid(uvm_push_t *push, uvm_gpu_address_t dst, uvm_gpu_address_t src)
|
||||
{
|
||||
uvm_gpu_t *gpu = uvm_push_get_gpu(push);
|
||||
|
||||
if (uvm_conf_computing_mode_is_hcc(gpu)) {
|
||||
// In HCC, if a memcopy uses physical addressing for either the
|
||||
// destination or the source, then the corresponding aperture must be
|
||||
// vidmem. If virtual addressing is used, and the backing storage is
|
||||
// sysmem the access is only legal if the copy type is NONPROT2NONPROT,
|
||||
// but the validation does not detect it. In other words, both the copy
// source and the destination are unprotected sysmem.
|
||||
if (!src.is_virtual && (src.aperture != UVM_APERTURE_VID))
|
||||
return false;
|
||||
|
||||
if (!dst.is_virtual && (dst.aperture != UVM_APERTURE_VID))
|
||||
return false;
|
||||
|
||||
if (dst.is_unprotected != src.is_unprotected)
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!gpu->parent->ce_phys_vidmem_write_supported && !dst.is_virtual && dst.aperture == UVM_APERTURE_VID) {
|
||||
UVM_ERR_PRINT("Destination address of vidmem memcopy must be virtual, not physical: {%s, 0x%llx}\n",
|
||||
uvm_gpu_address_aperture_string(dst),
|
||||
dst.address);
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Specialized version of uvm_hal_volta_ce_memcopy used for encryption and
|
||||
// decryption. Pre-Hopper functionality, such as validation or address patching,
|
||||
// has been removed.
|
||||
static void encrypt_or_decrypt(uvm_push_t *push, uvm_gpu_address_t dst, uvm_gpu_address_t src, NvU32 size)
|
||||
{
|
||||
NvU32 pipelined_value;
|
||||
NvU32 launch_dma_src_dst_type;
|
||||
NvU32 launch_dma_plc_mode;
|
||||
NvU32 flush_value;
|
||||
uvm_gpu_t *gpu = uvm_push_get_gpu(push);
|
||||
|
||||
// HW allows unaligned operations only if the entire buffer is in one 32B
|
||||
// sector. Operations on buffers larger than 32B have to be aligned.
|
||||
if (size > UVM_CONF_COMPUTING_BUF_ALIGNMENT) {
|
||||
UVM_ASSERT(IS_ALIGNED(src.address, UVM_CONF_COMPUTING_BUF_ALIGNMENT));
|
||||
UVM_ASSERT(IS_ALIGNED(dst.address, UVM_CONF_COMPUTING_BUF_ALIGNMENT));
|
||||
}
|
||||
else {
|
||||
UVM_ASSERT((dst.address >> UVM_CONF_COMPUTING_BUF_ALIGNMENT) ==
|
||||
((dst.address + size - 1) >> UVM_CONF_COMPUTING_BUF_ALIGNMENT));
|
||||
UVM_ASSERT((src.address >> UVM_CONF_COMPUTING_BUF_ALIGNMENT) ==
|
||||
((src.address + size - 1) >> UVM_CONF_COMPUTING_BUF_ALIGNMENT));
|
||||
}
|
||||
|
||||
launch_dma_src_dst_type = gpu->parent->ce_hal->phys_mode(push, dst, src);
|
||||
launch_dma_plc_mode = gpu->parent->ce_hal->plc_mode();
|
||||
|
||||
if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED))
|
||||
pipelined_value = HWCONST(C8B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, PIPELINED);
|
||||
else
|
||||
pipelined_value = HWCONST(C8B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, NON_PIPELINED);
|
||||
|
||||
flush_value = hopper_get_flush_value(push);
|
||||
|
||||
gpu->parent->ce_hal->offset_in_out(push, src.address, dst.address);
|
||||
|
||||
NV_PUSH_1U(C8B5, LINE_LENGTH_IN, size);
|
||||
|
||||
NV_PUSH_1U(C8B5, LAUNCH_DMA, HWCONST(C8B5, LAUNCH_DMA, SRC_MEMORY_LAYOUT, PITCH) |
|
||||
HWCONST(C8B5, LAUNCH_DMA, DST_MEMORY_LAYOUT, PITCH) |
|
||||
HWCONST(C8B5, LAUNCH_DMA, MULTI_LINE_ENABLE, FALSE) |
|
||||
HWCONST(C8B5, LAUNCH_DMA, REMAP_ENABLE, FALSE) |
|
||||
HWCONST(C8B5, LAUNCH_DMA, COPY_TYPE, SECURE) |
|
||||
flush_value |
|
||||
launch_dma_src_dst_type |
|
||||
launch_dma_plc_mode |
|
||||
pipelined_value);
|
||||
}
|
||||
|
||||
// The GPU CE encrypt operation requires clients to pass a valid
|
||||
// address where the used IV will be written. But this requirement is
|
||||
// unnecessary, because UVM should instead rely on the CSL
|
||||
// nvUvmInterfaceCslLogDeviceEncryption API to independently track
|
||||
// the expected IV.
|
||||
//
|
||||
// To satisfy the HW requirement the same unprotected sysmem address is
|
||||
// passed to all GPU-side encryptions. This dummy buffer is allocated at
|
||||
// GPU initialization time.
|
||||
static NvU64 encrypt_iv_address(uvm_push_t *push, uvm_gpu_address_t dst)
|
||||
{
|
||||
NvU64 iv_address;
|
||||
uvm_gpu_t *gpu = uvm_push_get_gpu(push);
|
||||
|
||||
// Match addressing mode of destination and IV
|
||||
if (dst.is_virtual) {
|
||||
iv_address = uvm_rm_mem_get_gpu_va(gpu->conf_computing.iv_rm_mem, gpu, false).address;
|
||||
}
|
||||
else {
|
||||
iv_address = uvm_mem_gpu_physical(gpu->conf_computing.iv_mem,
|
||||
gpu,
|
||||
0,
|
||||
gpu->conf_computing.iv_mem->size).address;
|
||||
}
|
||||
|
||||
UVM_ASSERT(IS_ALIGNED(iv_address, UVM_CONF_COMPUTING_IV_ALIGNMENT));
|
||||
|
||||
return iv_address;
|
||||
}
|
||||
|
||||
// TODO: Bug 3842953: adapt CE encrypt/decrypt for p2p encrypted transfers
|
||||
void uvm_hal_hopper_ce_encrypt(uvm_push_t *push,
|
||||
uvm_gpu_address_t dst,
|
||||
uvm_gpu_address_t src,
|
||||
NvU32 size,
|
||||
uvm_gpu_address_t auth_tag)
|
||||
{
|
||||
|
||||
NvU32 auth_tag_address_hi32, auth_tag_address_lo32;
|
||||
NvU64 iv_address;
|
||||
NvU32 iv_address_hi32, iv_address_lo32;
|
||||
uvm_gpu_t *gpu = uvm_push_get_gpu(push);
|
||||
|
||||
UVM_ASSERT(uvm_conf_computing_mode_is_hcc(gpu));
|
||||
UVM_ASSERT(uvm_push_is_fake(push) || uvm_channel_is_secure(push->channel));
|
||||
UVM_ASSERT(IS_ALIGNED(auth_tag.address, UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT));
|
||||
|
||||
if (!src.is_virtual)
|
||||
UVM_ASSERT(src.aperture == UVM_APERTURE_VID);
|
||||
|
||||
// The addressing mode (and aperture, if applicable) of the destination
|
||||
// pointer determines the addressing mode and aperture used by the
|
||||
// encryption to reference the other two addresses written by it:
|
||||
// authentication tag, and IV. If the client passes a sysmem physical
|
||||
// address as destination, then the authentication tag must also be a sysmem
|
||||
// physical address.
|
||||
UVM_ASSERT(dst.is_virtual == auth_tag.is_virtual);
|
||||
|
||||
if (!dst.is_virtual) {
|
||||
UVM_ASSERT(dst.aperture == UVM_APERTURE_SYS);
|
||||
UVM_ASSERT(auth_tag.aperture == UVM_APERTURE_SYS);
|
||||
}
|
||||
|
||||
NV_PUSH_1U(C8B5, SET_SECURE_COPY_MODE, HWCONST(C8B5, SET_SECURE_COPY_MODE, MODE, ENCRYPT));
|
||||
|
||||
auth_tag_address_hi32 = HWVALUE(C8B5, SET_ENCRYPT_AUTH_TAG_ADDR_UPPER, UPPER, NvU64_HI32(auth_tag.address));
|
||||
auth_tag_address_lo32 = HWVALUE(C8B5, SET_ENCRYPT_AUTH_TAG_ADDR_LOWER, LOWER, NvU64_LO32(auth_tag.address));
|
||||
|
||||
iv_address = encrypt_iv_address(push, dst);
|
||||
|
||||
iv_address_hi32 = HWVALUE(C8B5, SET_ENCRYPT_IV_ADDR_UPPER, UPPER, NvU64_HI32(iv_address));
|
||||
iv_address_lo32 = HWVALUE(C8B5, SET_ENCRYPT_IV_ADDR_LOWER, LOWER, NvU64_LO32(iv_address));
|
||||
|
||||
NV_PUSH_4U(C8B5, SET_ENCRYPT_AUTH_TAG_ADDR_UPPER, auth_tag_address_hi32,
|
||||
SET_ENCRYPT_AUTH_TAG_ADDR_LOWER, auth_tag_address_lo32,
|
||||
SET_ENCRYPT_IV_ADDR_UPPER, iv_address_hi32,
|
||||
SET_ENCRYPT_IV_ADDR_LOWER, iv_address_lo32);
|
||||
|
||||
encrypt_or_decrypt(push, dst, src, size);
|
||||
}
|
||||
|
||||
// TODO: Bug 3842953: adapt CE encrypt/decrypt for p2p encrypted transfers
|
||||
void uvm_hal_hopper_ce_decrypt(uvm_push_t *push,
|
||||
uvm_gpu_address_t dst,
|
||||
uvm_gpu_address_t src,
|
||||
NvU32 size,
|
||||
uvm_gpu_address_t auth_tag)
|
||||
{
|
||||
|
||||
NvU32 auth_tag_address_hi32, auth_tag_address_lo32;
|
||||
uvm_gpu_t *gpu = uvm_push_get_gpu(push);
|
||||
|
||||
UVM_ASSERT(uvm_conf_computing_mode_is_hcc(gpu));
|
||||
UVM_ASSERT(!push->channel || uvm_channel_is_secure(push->channel));
|
||||
UVM_ASSERT(IS_ALIGNED(auth_tag.address, UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT));
|
||||
|
||||
// The addressing mode (and aperture, if applicable) of the source and
|
||||
// authentication pointers should match. But unlike in the encryption case,
|
||||
// clients are not forced to pass a valid IV address.
|
||||
UVM_ASSERT(src.is_virtual == auth_tag.is_virtual);
|
||||
|
||||
if (!src.is_virtual) {
|
||||
UVM_ASSERT(src.aperture == UVM_APERTURE_SYS);
|
||||
UVM_ASSERT(auth_tag.aperture == UVM_APERTURE_SYS);
|
||||
}
|
||||
|
||||
if (!dst.is_virtual)
|
||||
UVM_ASSERT(dst.aperture == UVM_APERTURE_VID);
|
||||
|
||||
NV_PUSH_1U(C8B5, SET_SECURE_COPY_MODE, HWCONST(C8B5, SET_SECURE_COPY_MODE, MODE, DECRYPT));
|
||||
|
||||
auth_tag_address_hi32 = HWVALUE(C8B5, SET_DECRYPT_AUTH_TAG_COMPARE_ADDR_UPPER, UPPER, NvU64_HI32(auth_tag.address));
|
||||
auth_tag_address_lo32 = HWVALUE(C8B5, SET_DECRYPT_AUTH_TAG_COMPARE_ADDR_LOWER, LOWER, NvU64_LO32(auth_tag.address));
|
||||
|
||||
NV_PUSH_2U(C8B5, SET_DECRYPT_AUTH_TAG_COMPARE_ADDR_UPPER, auth_tag_address_hi32,
|
||||
SET_DECRYPT_AUTH_TAG_COMPARE_ADDR_LOWER, auth_tag_address_lo32);
|
||||
|
||||
encrypt_or_decrypt(push, dst, src, size);
|
||||
}
|
||||
|
||||
|
||||
@@ -391,10 +391,15 @@ void uvm_hal_hopper_host_set_gpfifo_pushbuffer_segment_base(NvU64 *fifo_entry, N
|
||||
*fifo_entry |= (NvU64)(HWCONST(C86F, GP_ENTRY1, OPCODE, SET_PB_SEGMENT_EXTENDED_BASE)) << 32;
|
||||
}
|
||||
|
||||
void uvm_hal_hopper_host_set_gpfifo_entry(NvU64 *fifo_entry, NvU64 pushbuffer_va, NvU32 pushbuffer_length)
|
||||
void uvm_hal_hopper_host_set_gpfifo_entry(NvU64 *fifo_entry,
|
||||
NvU64 pushbuffer_va,
|
||||
NvU32 pushbuffer_length,
|
||||
uvm_gpfifo_sync_t sync_flag)
|
||||
{
|
||||
NvU64 fifo_entry_value;
|
||||
NvU64 pb_low_bits_mask = (1ull << 40) - 1;
|
||||
const NvU32 sync_value = (sync_flag == UVM_GPFIFO_SYNC_WAIT) ? HWCONST(C86F, GP_ENTRY1, SYNC, WAIT) :
|
||||
HWCONST(C86F, GP_ENTRY1, SYNC, PROCEED);
|
||||
|
||||
UVM_ASSERT(!uvm_global_is_suspended());
|
||||
UVM_ASSERT_MSG(IS_ALIGNED(pushbuffer_va, 4), "pushbuffer va unaligned: %llu\n", pushbuffer_va);
|
||||
@@ -406,7 +411,8 @@ void uvm_hal_hopper_host_set_gpfifo_entry(NvU64 *fifo_entry, NvU64 pushbuffer_va
|
||||
pushbuffer_va &= pb_low_bits_mask;
|
||||
fifo_entry_value = HWVALUE(C86F, GP_ENTRY0, GET, NvU64_LO32(pushbuffer_va) >> 2);
|
||||
fifo_entry_value |= (NvU64)(HWVALUE(C86F, GP_ENTRY1, GET_HI, NvU64_HI32(pushbuffer_va)) |
|
||||
HWVALUE(C86F, GP_ENTRY1, LENGTH, pushbuffer_length >> 2)) << 32;
|
||||
HWVALUE(C86F, GP_ENTRY1, LENGTH, pushbuffer_length >> 2) |
|
||||
sync_value) << 32;
|
||||
|
||||
*fifo_entry = fifo_entry_value;
|
||||
}
|
||||
|
||||
kernel-open/nvidia-uvm/uvm_hopper_sec2.c (new file, 212 lines added)
@@ -0,0 +1,212 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2022-2023 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
deal in the Software without restriction, including without limitation the
|
||||
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
sell copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
DEALINGS IN THE SOFTWARE.
|
||||
|
||||
*******************************************************************************/
|
||||
|
||||
#include "uvm_hal.h"
|
||||
#include "uvm_hal_types.h"
|
||||
#include "uvm_push.h"
|
||||
#include "uvm_push_macros.h"
|
||||
#include "nv_uvm_types.h"
|
||||
#include "nv_uvm_interface.h"
|
||||
#include "clcba2.h"
|
||||
#include "clc86f.h"
|
||||
#include "clb06f.h"
|
||||
|
||||
#define UVM_CSL_SIGN_AUTH_TAG_ALIGNMENT_BYTES (1 << HWSHIFT(CBA2, METHOD_STREAM_AUTH_TAG_ADDR_LO, DATA))
|
||||
|
||||
static void sign_push(uvm_push_t *push, NvU32 *init_method, NvU8 *auth_tag)
|
||||
{
|
||||
NvU32 *sign_input_buf = push->begin + UVM_METHOD_SIZE / sizeof(*push->begin);
|
||||
NvU32 sign_size = 0;
|
||||
NV_STATUS status;
|
||||
|
||||
UVM_ASSERT(init_method < push->next);
|
||||
|
||||
while (init_method < push->next) {
|
||||
NvU8 subch = READ_HWVALUE(*init_method, B06F, DMA, METHOD_SUBCHANNEL);
|
||||
NvU32 count = READ_HWVALUE(*init_method, B06F, DMA, METHOD_COUNT);
|
||||
|
||||
if (subch == UVM_SUBCHANNEL_CBA2) {
|
||||
NvU32 method_addr = READ_HWVALUE(*init_method, B06F, DMA, METHOD_ADDRESS) << 2;
|
||||
|
||||
UVM_ASSERT(count == 1);
|
||||
UVM_ASSERT((sign_size + 2) * UVM_METHOD_SIZE <= UVM_CONF_COMPUTING_SIGN_BUF_MAX_SIZE);
|
||||
|
||||
sign_input_buf[sign_size] = method_addr;
|
||||
sign_input_buf[sign_size + 1] = init_method[1];
|
||||
|
||||
// We consume the method address and the method data from the input,
// so we advance sign_input_buf by 2.
|
||||
sign_size += 2;
|
||||
}
|
||||
|
||||
init_method += (count + 1);
|
||||
}
|
||||
|
||||
UVM_ASSERT(sign_size > 0);
|
||||
|
||||
status = nvUvmInterfaceCslSign(&push->channel->csl.ctx,
|
||||
sign_size * UVM_METHOD_SIZE,
|
||||
(NvU8 *)sign_input_buf,
|
||||
auth_tag);
|
||||
|
||||
UVM_ASSERT_MSG(status == NV_OK,
|
||||
"Failure to sign method stream auth tag, err: %s, GPU: %s.\n",
|
||||
nvstatusToString(status),
|
||||
uvm_gpu_name(uvm_push_get_gpu(push)));
|
||||
}
|
||||
|
||||
void uvm_hal_hopper_sec2_init(uvm_push_t *push)
|
||||
{
|
||||
// Commonly, we would push a SET_OBJECT HOPPER_SEC2_WORK_LAUNCH_A in the
|
||||
// init function. During channel initialization, this method would be sent
|
||||
// to ESCHED to notify the expected SEC2 class ID. ESCHED forwards this
|
||||
// method to the SEC2 engine. SEC2 is not guaranteed to support the
|
||||
// SET_OBJECT method, so we shouldn't submit it.
|
||||
}
|
||||
|
||||
void uvm_hal_hopper_sec2_semaphore_release(uvm_push_t *push, NvU64 gpu_va, NvU32 payload)
|
||||
{
|
||||
NvU32 sem_lo;
|
||||
NvU32 flush_value;
|
||||
NvU8 *sign_auth_tag_ptr;
|
||||
NvU32 sign_auth_tag_addr_lo;
|
||||
uvm_gpu_address_t sign_auth_tag_gpu_va;
|
||||
NvU32 *csl_sign_init = push->next;
|
||||
|
||||
UVM_ASSERT(IS_ALIGNED(NvU64_LO32(gpu_va), 1 << HWSHIFT(CBA2, SEMAPHORE_B, LOWER)));
|
||||
|
||||
sem_lo = READ_HWVALUE(NvU64_LO32(gpu_va), CBA2, SEMAPHORE_B, LOWER);
|
||||
|
||||
flush_value = uvm_hal_membar_before_semaphore(push) ? HWCONST(CBA2, SEMAPHORE_D, FLUSH_DISABLE, FALSE) :
|
||||
HWCONST(CBA2, SEMAPHORE_D, FLUSH_DISABLE, TRUE);
|
||||
|
||||
// The push and the method stream signature have the same lifetime, we
|
||||
// reserve space in the pushbuffer to store the signature. After the push is
|
||||
// processed by the GPU, the pushbuffer is recycled entirely, including the
|
||||
// signature buffer.
|
||||
sign_auth_tag_ptr = uvm_push_get_single_inline_buffer(push,
|
||||
UVM_CSL_SIGN_AUTH_TAG_SIZE_BYTES,
|
||||
UVM_CSL_SIGN_AUTH_TAG_ALIGNMENT_BYTES,
|
||||
&sign_auth_tag_gpu_va);
|
||||
|
||||
NV_PUSH_1U(CBA2,
|
||||
METHOD_STREAM_AUTH_TAG_ADDR_HI,
|
||||
HWVALUE(CBA2, METHOD_STREAM_AUTH_TAG_ADDR_HI, DATA, NvU64_HI32(sign_auth_tag_gpu_va.address)));
|
||||
|
||||
sign_auth_tag_addr_lo = READ_HWVALUE(NvU64_LO32(sign_auth_tag_gpu_va.address),
|
||||
CBA2,
|
||||
METHOD_STREAM_AUTH_TAG_ADDR_LO,
|
||||
DATA);
|
||||
NV_PUSH_1U(CBA2,
|
||||
METHOD_STREAM_AUTH_TAG_ADDR_LO,
|
||||
HWVALUE(CBA2, METHOD_STREAM_AUTH_TAG_ADDR_LO, DATA, sign_auth_tag_addr_lo));
|
||||
|
||||
NV_PUSH_1U(CBA2, SEMAPHORE_A, HWVALUE(CBA2, SEMAPHORE_A, UPPER, NvU64_HI32(gpu_va)));
|
||||
NV_PUSH_1U(CBA2, SEMAPHORE_B, HWVALUE(CBA2, SEMAPHORE_B, LOWER, sem_lo));
|
||||
NV_PUSH_1U(CBA2, SET_SEMAPHORE_PAYLOAD_LOWER, payload);
|
||||
|
||||
NV_PUSH_1U(CBA2, SEMAPHORE_D, HWCONST(CBA2, SEMAPHORE_D, TIMESTAMP, DISABLE) |
|
||||
HWCONST(CBA2, SEMAPHORE_D, PAYLOAD_SIZE, 32_BIT) |
|
||||
flush_value);
|
||||
|
||||
sign_push(push, csl_sign_init, sign_auth_tag_ptr);
|
||||
}
|
||||
|
||||
void uvm_hal_hopper_sec2_semaphore_timestamp_unsupported(uvm_push_t *push, NvU64 gpu_va)
|
||||
{
|
||||
// TODO: Bug 3804752: [uvm][HCC] Add support for Hopper SEC2 HAL in UVM.
|
||||
// Semaphore_timestamp is not implemented in the SEC2 engine yet. We will
// add support in UVM when it becomes available.
|
||||
uvm_gpu_t *gpu = uvm_push_get_gpu(push);
|
||||
UVM_ASSERT_MSG(false, "SEC2 semaphore_timestamp is not supported on GPU: %s.\n", uvm_gpu_name(gpu));
|
||||
}
|
||||
|
||||
static void execute_with_membar(uvm_push_t *push)
|
||||
{
|
||||
uvm_gpu_t *gpu = uvm_push_get_gpu(push);
|
||||
uvm_membar_t membar = uvm_push_get_and_reset_membar_flag(push);
|
||||
|
||||
NvU32 flush_value = (membar == UVM_MEMBAR_SYS) ? HWCONST(CBA2, EXECUTE, FLUSH_DISABLE, FALSE) :
|
||||
HWCONST(CBA2, EXECUTE, FLUSH_DISABLE, TRUE);
|
||||
|
||||
NV_PUSH_1U(CBA2, EXECUTE, flush_value |
|
||||
HWCONST(CBA2, EXECUTE, NOTIFY, DISABLE));
|
||||
|
||||
if (membar == UVM_MEMBAR_GPU) {
|
||||
gpu->parent->host_hal->wait_for_idle(push);
|
||||
gpu->parent->host_hal->membar_gpu(push);
|
||||
}
|
||||
}
|
||||
|
||||
void uvm_hal_hopper_sec2_decrypt(uvm_push_t *push, NvU64 dst_va, NvU64 src_va, NvU32 size, NvU64 auth_tag_va)
|
||||
{
|
||||
NvU8 *sign_auth_tag_ptr;
|
||||
NvU32 sign_auth_tag_addr_lo;
|
||||
uvm_gpu_address_t sign_auth_tag_gpu_va;
|
||||
NvU32 *csl_sign_init = push->next;
|
||||
|
||||
// Check that the provided alignment matches HW
|
||||
BUILD_BUG_ON(UVM_CONF_COMPUTING_BUF_ALIGNMENT < (1 << HWSHIFT(CBA2, DECRYPT_COPY_DST_ADDR_LO, DATA)));
|
||||
BUILD_BUG_ON(UVM_CONF_COMPUTING_BUF_ALIGNMENT % (1 << HWSHIFT(CBA2, DECRYPT_COPY_DST_ADDR_LO, DATA)) != 0);
|
||||
|
||||
// No overlapping.
|
||||
UVM_ASSERT(!uvm_ranges_overlap(src_va, src_va + size - 1, dst_va, dst_va + size - 1));
|
||||
|
||||
// Alignment requirements.
|
||||
UVM_ASSERT(IS_ALIGNED(NvU64_LO32(src_va), 1 << HWSHIFT(CBA2, DECRYPT_COPY_SRC_ADDR_LO, DATA)));
|
||||
UVM_ASSERT(IS_ALIGNED(NvU64_LO32(dst_va), 1 << HWSHIFT(CBA2, DECRYPT_COPY_DST_ADDR_LO, DATA)));
|
||||
UVM_ASSERT(IS_ALIGNED(NvU64_LO32(auth_tag_va), 1 << HWSHIFT(CBA2, DECRYPT_COPY_AUTH_TAG_ADDR_LO, DATA)));
|
||||
UVM_ASSERT(IS_ALIGNED(size, 1 << HWSHIFT(CBA2, DECRYPT_COPY_SIZE, DATA)));
|
||||
|
||||
// See comments in SEC2 semaphore_release.
|
||||
sign_auth_tag_ptr = uvm_push_get_single_inline_buffer(push,
|
||||
UVM_CSL_SIGN_AUTH_TAG_SIZE_BYTES,
|
||||
UVM_CSL_SIGN_AUTH_TAG_ALIGNMENT_BYTES,
|
||||
&sign_auth_tag_gpu_va);
|
||||
|
||||
NV_PUSH_1U(CBA2, DECRYPT_COPY_SRC_ADDR_HI, NvU64_HI32(src_va));
|
||||
NV_PUSH_1U(CBA2, DECRYPT_COPY_SRC_ADDR_LO, NvU64_LO32(src_va));
|
||||
|
||||
NV_PUSH_1U(CBA2, DECRYPT_COPY_DST_ADDR_HI, NvU64_HI32(dst_va));
|
||||
NV_PUSH_1U(CBA2, DECRYPT_COPY_DST_ADDR_LO, NvU64_LO32(dst_va));
|
||||
|
||||
NV_PUSH_1U(CBA2, DECRYPT_COPY_SIZE, size);
|
||||
NV_PUSH_1U(CBA2, DECRYPT_COPY_AUTH_TAG_ADDR_HI, NvU64_HI32(auth_tag_va));
|
||||
NV_PUSH_1U(CBA2, DECRYPT_COPY_AUTH_TAG_ADDR_LO, NvU64_LO32(auth_tag_va));
|
||||
|
||||
NV_PUSH_1U(CBA2,
|
||||
METHOD_STREAM_AUTH_TAG_ADDR_HI,
|
||||
HWVALUE(CBA2, METHOD_STREAM_AUTH_TAG_ADDR_HI, DATA, NvU64_HI32(sign_auth_tag_gpu_va.address)));
|
||||
|
||||
sign_auth_tag_addr_lo = READ_HWVALUE(NvU64_LO32(sign_auth_tag_gpu_va.address),
|
||||
CBA2,
|
||||
METHOD_STREAM_AUTH_TAG_ADDR_LO,
|
||||
DATA);
|
||||
NV_PUSH_1U(CBA2,
|
||||
METHOD_STREAM_AUTH_TAG_ADDR_LO,
|
||||
HWVALUE(CBA2, METHOD_STREAM_AUTH_TAG_ADDR_LO, DATA, sign_auth_tag_addr_lo));
|
||||
|
||||
execute_with_membar(push);
|
||||
|
||||
sign_push(push, csl_sign_init, sign_auth_tag_ptr);
|
||||
}
|
||||
@@ -1048,6 +1048,41 @@ typedef struct
|
||||
NV_STATUS rmStatus; // OUT
|
||||
} UVM_MAP_EXTERNAL_SPARSE_PARAMS;
|
||||
|
||||
//
// Used to initialise a secondary UVM file-descriptor which holds a
// reference on the memory map to prevent it from being torn down without
// first notifying UVM. This is achieved by preventing mmap() calls on
// the secondary file-descriptor, so that on process exit
// uvm_mm_release() will be called while the memory map is present,
// such that UVM can cleanly shut down the GPU by handling faults
// instead of cancelling them.
//
// This ioctl must be called after the primary file-descriptor has
// been initialised with the UVM_INITIALIZE ioctl. The primary FD
// should be passed in the uvmFd field, and the UVM_MM_INITIALIZE ioctl
// will hold a reference on the primary FD. Therefore uvm_release() is
// guaranteed to be called after uvm_mm_release().
//
// Once this file-descriptor has been closed, the UVM context is
// effectively dead and subsequent operations requiring a memory map
// will fail. Calling UVM_MM_INITIALIZE on a context that has already
// been initialized via any FD will return NV_ERR_INVALID_STATE.
//
// Calling this with a non-UVM file-descriptor in uvmFd will return
// NV_ERR_INVALID_ARGUMENT. Calling this on the same file-descriptor
// as UVM_INITIALIZE, or more than once on the same FD, will return
// NV_ERR_IN_USE.
//
// Not all platforms require this secondary file-descriptor. On those
// platforms NV_WARN_NOTHING_TO_DO will be returned and users may
// close the file-descriptor at any time.
#define UVM_MM_INITIALIZE UVM_IOCTL_BASE(75)
typedef struct
{
    NvS32     uvmFd;    // IN
    NV_STATUS rmStatus; // OUT
} UVM_MM_INITIALIZE_PARAMS;
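// A minimal user-space sketch of the intended call sequence, assuming the
// usual /dev/nvidia-uvm device node and the UVM_INITIALIZE ioctl referenced
// above; error checking is elided and the variable names are hypothetical.
//
//     int uvm_fd = open("/dev/nvidia-uvm", O_RDWR | O_CLOEXEC);
//     int mm_fd  = open("/dev/nvidia-uvm", O_RDWR | O_CLOEXEC);
//
//     UVM_INITIALIZE_PARAMS init_params = {0};
//     ioctl(uvm_fd, UVM_INITIALIZE, &init_params);      // primary FD first
//
//     UVM_MM_INITIALIZE_PARAMS mm_params = { .uvmFd = uvm_fd };
//     ioctl(mm_fd, UVM_MM_INITIALIZE, &mm_params);
//     // mm_params.rmStatus is NV_OK on success, or NV_WARN_NOTHING_TO_DO on
//     // platforms that do not need the secondary file-descriptor.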
//
|
||||
// Temporary ioctls which should be removed before UVM 8 release
|
||||
// Number backwards from 2047 - highest custom ioctl function number
|
||||
|
||||
@@ -540,6 +540,7 @@ typedef struct
|
||||
#endif // NV_IS_EXPORT_SYMBOL_PRESENT_int_active_memcg
|
||||
|
||||
#if defined(NVCPU_X86) || defined(NVCPU_X86_64)
|
||||
#include <asm/pgtable.h>
|
||||
#include <asm/pgtable_types.h>
|
||||
#endif
|
||||
|
||||
@@ -547,6 +548,27 @@ typedef struct
|
||||
#define PAGE_KERNEL_NOENC PAGE_KERNEL
|
||||
#endif
|
||||
|
||||
// uvm_pgprot_decrypted is a GPL-aware version of pgprot_decrypted that returns
|
||||
// the given input when UVM cannot use GPL symbols, or pgprot_decrypted is not
|
||||
// defined. Otherwise, the function is equivalent to pgprot_decrypted. UVM only
|
||||
// depends on pgprot_decrypted when the driver is allowed to use GPL symbols:
|
||||
// both AMD's SEV and Intel's TDX are only supported in conjunction with OpenRM.
|
||||
//
|
||||
// It is safe to invoke uvm_pgprot_decrypted in KVM + AMD SEV-SNP guests, even
|
||||
// if the call is not required, because pgprot_decrypted(PAGE_KERNEL_NOENC) ==
|
||||
// PAGE_KERNEL_NOENC.
|
||||
//
|
||||
// pgprot_decrypted was added by commit 21729f81ce8a ("x86/mm: Provide general
|
||||
// kernel support for memory encryption") in v4.14 (2017-07-18)
|
||||
static inline pgprot_t uvm_pgprot_decrypted(pgprot_t prot)
|
||||
{
|
||||
#if defined(pgprot_decrypted)
|
||||
return pgprot_decrypted(prot);
|
||||
#endif
|
||||
|
||||
return prot;
|
||||
}
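// A minimal usage sketch, assuming a caller that maps unprotected sysmem pages
// into the kernel; the surrounding vmap() call is illustrative only.
//
//     pgprot_t prot = uvm_pgprot_decrypted(PAGE_KERNEL_NOENC);
//     void *cpu_va = vmap(pages, num_pages, VM_MAP, prot);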
// Commit 1dff8083a024650c75a9c961c38082473ceae8cf (v4.7).
|
||||
//
|
||||
// Archs with CONFIG_MMU should have their own page.h, and can't include
|
||||
|
||||
@@ -27,7 +27,7 @@
|
||||
|
||||
const char *uvm_lock_order_to_string(uvm_lock_order_t lock_order)
|
||||
{
|
||||
BUILD_BUG_ON(UVM_LOCK_ORDER_COUNT != 26);
|
||||
BUILD_BUG_ON(UVM_LOCK_ORDER_COUNT != 33);
|
||||
|
||||
switch (lock_order) {
|
||||
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_INVALID);
|
||||
@@ -43,18 +43,25 @@ const char *uvm_lock_order_to_string(uvm_lock_order_t lock_order)
|
||||
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_GPU_SEMAPHORE_POOL);
|
||||
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_RM_API);
|
||||
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_RM_GPUS);
|
||||
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_VA_BLOCK_MIGRATE);
|
||||
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_VA_BLOCK);
|
||||
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_CONF_COMPUTING_DMA_BUFFER_POOL);
|
||||
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_CHUNK_MAPPING);
|
||||
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_PAGE_TREE);
|
||||
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_CSL_PUSH);
|
||||
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_CSL_WLC_PUSH);
|
||||
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_CSL_SEC2_PUSH);
|
||||
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_PUSH);
|
||||
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_PMM);
|
||||
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_PMM_PMA);
|
||||
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_PMM_ROOT_CHUNK);
|
||||
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_CHANNEL);
|
||||
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_WLC_CHANNEL);
|
||||
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_TOOLS_VA_SPACE_LIST);
|
||||
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_VA_SPACE_EVENTS);
|
||||
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_VA_SPACE_TOOLS);
|
||||
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_SEMA_POOL_TRACKER);
|
||||
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_SECURE_SEMAPHORE);
|
||||
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_LEAF);
|
||||
UVM_ENUM_STRING_DEFAULT();
|
||||
}
|
||||
|
||||
@@ -279,6 +279,14 @@
|
||||
// Operations not allowed while holding the lock:
|
||||
// - GPU memory allocation which can evict memory (would require nesting
|
||||
// block locks)
|
||||
// - GPU DMA Allocation pool lock (gpu->conf_computing.dma_buffer_pool.lock)
|
||||
// Order: UVM_LOCK_ORDER_CONF_COMPUTING_DMA_BUFFER_POOL
|
||||
// Exclusive lock (mutex)
|
||||
//
|
||||
// Protects:
|
||||
// - Protect the state of the uvm_conf_computing_dma_buffer_pool_t
|
||||
// when the Confidential Computing feature is enabled on the system.
|
||||
//
|
||||
// - Chunk mapping lock (gpu->root_chunk_mappings.bitlocks and
|
||||
// gpu->sysmem_mappings.bitlock)
|
||||
// Order: UVM_LOCK_ORDER_CHUNK_MAPPING
|
||||
@@ -313,6 +321,58 @@
|
||||
// Operations not allowed while holding this lock
|
||||
// - GPU memory allocation which can evict
|
||||
//
|
||||
// - Secure channel CSL channel pool semaphore
|
||||
// Order: UVM_LOCK_ORDER_CSL_PUSH
|
||||
// Semaphore per secure channel pool
|
||||
//
|
||||
// The semaphore controls concurrent pushes to secure channels. Secure work
|
||||
// submission depends on channel availability in GPFIFO entries (as in any
|
||||
// other channel type) but also on channel locking. Each secure channel has a
|
||||
// lock to enforce ordering of pushes. The channel's CSL lock is taken on
|
||||
// channel reservation until uvm_push_end. Secure channels are stateful
|
||||
// channels and the CSL lock protects their CSL state/context.
|
||||
//
|
||||
// Operations allowed while holding this lock
|
||||
// - Pushing work to CE secure channels
|
||||
//
|
||||
// - WLC CSL channel pool semaphore
|
||||
// Order: UVM_LOCK_ORDER_CSL_WLC_PUSH
|
||||
// Semaphore per WLC channel pool
|
||||
//
|
||||
// The semaphore controls concurrent pushes to WLC channels. WLC work
|
||||
// submission depends on channel availability in GPFIFO entries (as in any
|
||||
// other channel type) but also on channel locking. Each WLC channel has a
|
||||
// lock to enforce ordering of pushes. The channel's CSL lock is taken on
|
||||
// channel reservation until uvm_push_end. WLC channels are stateful
|
||||
// channels and the CSL lock protects their CSL state/context.
|
||||
//
|
||||
// This lock ORDER is different and sits below generic secure channel CSL
|
||||
// lock and above SEC2 CSL lock. This reflects the dual nature of WLC
|
||||
// channels; they use SEC2 indirect work launch during initialization,
|
||||
// and after their schedule is initialized they provide indirect launch
|
||||
// functionality to other CE channels.
|
||||
//
|
||||
// Operations allowed while holding this lock
|
||||
// - Pushing work to WLC channels
|
||||
//
|
||||
// - SEC2 CSL channel pool semaphore
|
||||
// Order: UVM_LOCK_ORDER_SEC2_CSL_PUSH
|
||||
// Semaphore per SEC2 channel pool
|
||||
//
|
||||
// The semaphore controls concurrent pushes to SEC2 channels. SEC2 work
|
||||
// submission depends on channel availability in GPFIFO entries (as in any
|
||||
// other channel type) but also on channel locking. Each SEC2 channel has a
|
||||
// lock to enforce ordering of pushes. The channel's CSL lock is taken on
|
||||
// channel reservation until uvm_push_end. SEC2 channels are stateful
|
||||
// channels and the CSL lock protects their CSL state/context.
|
||||
//
|
||||
// This lock ORDER is different and lower than the generic secure channel
|
||||
// lock to allow secure work submission to use a SEC2 channel to submit
|
||||
// work before releasing the CSL lock of the originating secure channel.
|
||||
//
|
||||
// Operations allowed while holding this lock
|
||||
// - Pushing work to SEC2 channels
|
||||
//
|
||||
// - Concurrent push semaphore
|
||||
// Order: UVM_LOCK_ORDER_PUSH
|
||||
// Semaphore (uvm_semaphore_t)
|
||||
@@ -346,6 +406,15 @@
|
||||
// channel pool lock documentation contains the guidelines about which lock
|
||||
// type (mutex or spinlock) to use.
|
||||
//
|
||||
// - WLC Channel lock
|
||||
// Order: UVM_LOCK_ORDER_WLC_CHANNEL
|
||||
// Spinlock (uvm_spinlock_t)
|
||||
//
|
||||
// Lock protecting the state of WLC channels in a channel pool. This lock
|
||||
// is separate from the above generic channel lock to allow for indirect
|
||||
// worklaunch pushes while holding the main channel lock.
|
||||
// (WLC pushes don't need any of the pushbuffer locks described above)
|
||||
//
|
||||
// - Tools global VA space list lock (g_tools_va_space_list_lock)
|
||||
// Order: UVM_LOCK_ORDER_TOOLS_VA_SPACE_LIST
|
||||
// Reader/writer lock (rw_semaphore)
|
||||
@@ -366,6 +435,12 @@
|
||||
// events come from perf events, both VA_SPACE_EVENTS and VA_SPACE_TOOLS
|
||||
// must be taken to register/report some tools events.
|
||||
//
|
||||
// - Tracking semaphores
|
||||
// Order: UVM_LOCK_ORDER_SECURE_SEMAPHORE
|
||||
// When the Confidential Computing feature is enabled, CE semaphores are
// encrypted, and decrypting the payload requires taking the CSL lock
// (UVM_LOCK_ORDER_LEAF).
|
||||
//
|
||||
// - Leaf locks
|
||||
// Order: UVM_LOCK_ORDER_LEAF
|
||||
//
|
||||
@@ -390,18 +465,25 @@ typedef enum
|
||||
UVM_LOCK_ORDER_GPU_SEMAPHORE_POOL,
|
||||
UVM_LOCK_ORDER_RM_API,
|
||||
UVM_LOCK_ORDER_RM_GPUS,
|
||||
UVM_LOCK_ORDER_VA_BLOCK_MIGRATE,
|
||||
UVM_LOCK_ORDER_VA_BLOCK,
|
||||
UVM_LOCK_ORDER_CONF_COMPUTING_DMA_BUFFER_POOL,
|
||||
UVM_LOCK_ORDER_CHUNK_MAPPING,
|
||||
UVM_LOCK_ORDER_PAGE_TREE,
|
||||
UVM_LOCK_ORDER_CSL_PUSH,
|
||||
UVM_LOCK_ORDER_CSL_WLC_PUSH,
|
||||
UVM_LOCK_ORDER_CSL_SEC2_PUSH,
|
||||
UVM_LOCK_ORDER_PUSH,
|
||||
UVM_LOCK_ORDER_PMM,
|
||||
UVM_LOCK_ORDER_PMM_PMA,
|
||||
UVM_LOCK_ORDER_PMM_ROOT_CHUNK,
|
||||
UVM_LOCK_ORDER_CHANNEL,
|
||||
UVM_LOCK_ORDER_WLC_CHANNEL,
|
||||
UVM_LOCK_ORDER_TOOLS_VA_SPACE_LIST,
|
||||
UVM_LOCK_ORDER_VA_SPACE_EVENTS,
|
||||
UVM_LOCK_ORDER_VA_SPACE_TOOLS,
|
||||
UVM_LOCK_ORDER_SEMA_POOL_TRACKER,
|
||||
UVM_LOCK_ORDER_SECURE_SEMAPHORE,
|
||||
UVM_LOCK_ORDER_LEAF,
|
||||
UVM_LOCK_ORDER_COUNT,
|
||||
} uvm_lock_order_t;
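// Illustrative sketch (an assumption, not part of this change): the
// UVM_ENUM_STRING_CASE()/UVM_ENUM_STRING_DEFAULT() helpers used by
// uvm_lock_order_to_string() above are presumed to be simple
// stringification wrappers, roughly:
//
//     #define UVM_ENUM_STRING_CASE(e)     case e: return #e
//     #define UVM_ENUM_STRING_DEFAULT()   default: return "?"
//
// With helpers of that shape, adding a lock order takes one new enum value
// plus one new case line, and the BUILD_BUG_ON() against
// UVM_LOCK_ORDER_COUNT catches a missed update at compile time.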
|
||||
@@ -914,9 +996,10 @@ typedef struct
|
||||
// be the same as the string passed to "spinlock".
|
||||
// See uvm_spin_lock() and uvm_spin_unlock() below as examples.
|
||||
//
|
||||
#define uvm_assert_spinlock_locked(spinlock) ({ \
|
||||
typeof(spinlock) _lock_ = (spinlock); \
|
||||
UVM_ASSERT(spin_is_locked(&_lock_->lock) && uvm_check_locked(_lock_, UVM_LOCK_FLAGS_MODE_EXCLUSIVE)); \
|
||||
#define uvm_assert_spinlock_locked(spinlock) ({ \
|
||||
typeof(spinlock) _lock_ = (spinlock); \
|
||||
UVM_ASSERT(spin_is_locked(&_lock_->lock)); \
|
||||
UVM_ASSERT(uvm_check_locked(_lock_, UVM_LOCK_FLAGS_MODE_EXCLUSIVE)); \
|
||||
})
|
||||
|
||||
#define uvm_assert_spinlock_unlocked(spinlock) UVM_ASSERT(!spin_is_locked(&(spinlock)->lock))
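// Illustrative usage sketch (assumption, using a hypothetical lock): the
// assertion macros above are meant to be paired with uvm_spin_lock() and
// uvm_spin_unlock(), for example:
//
//     uvm_spinlock_t lock;    // hypothetical lock, for illustration only
//
//     uvm_spin_lock(&lock);
//     uvm_assert_spinlock_locked(&lock);   // both checks hold here
//     /* touch state protected by the lock */
//     uvm_spin_unlock(&lock);
//     uvm_assert_spinlock_unlocked(&lock);
//
// Splitting the combined UVM_ASSERT() into two, as done above, makes it
// clear whether the raw spinlock state or the recorded ownership tracking
// is what failed.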
|
||||
|
||||
@@ -33,11 +33,13 @@ void uvm_hal_maxwell_arch_init_properties(uvm_parent_gpu_t *parent_gpu)
|
||||
// space for UVM internal mappings.
|
||||
// A single top level PDE covers 64 or 128 MB on Maxwell so 128 GB is fine to use.
|
||||
parent_gpu->rm_va_base = 0;
|
||||
parent_gpu->rm_va_size = 128ull * 1024 * 1024 * 1024;
|
||||
parent_gpu->rm_va_size = 128 * UVM_SIZE_1GB;
|
||||
|
||||
parent_gpu->uvm_mem_va_base = 768ull * 1024 * 1024 * 1024;
|
||||
parent_gpu->uvm_mem_va_base = 768 * UVM_SIZE_1GB;
|
||||
parent_gpu->uvm_mem_va_size = UVM_MEM_VA_SIZE;
|
||||
|
||||
parent_gpu->ce_phys_vidmem_write_supported = true;
|
||||
|
||||
// We don't have a compelling use case in UVM-Lite for direct peer
|
||||
// migrations between GPUs, so don't bother setting them up.
|
||||
parent_gpu->peer_copy_mode = UVM_GPU_PEER_COPY_MODE_UNSUPPORTED;
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2021-2022 NVIDIA Corporation
|
||||
Copyright (c) 2021-2023 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
@@ -29,10 +29,10 @@
|
||||
void uvm_hal_maxwell_ce_init(uvm_push_t *push)
|
||||
{
|
||||
// Notably this sends SET_OBJECT with the CE class on subchannel 0 instead
|
||||
// of the recommended by HW subchannel 4 (subchannel 4 is recommended to
|
||||
// of the recommended by HW subchannel 4 (subchannel 4 is required to
|
||||
// match CE usage on GRCE). For the UVM driver using subchannel 0 has the
|
||||
// benefit of also verifying that we ended up on the right PBDMA though as
|
||||
// SET_OBJECT with CE class on subchannel 0 would fail on GRCE.
|
||||
// benefit of also verifying that we ended up on the right CE engine type
|
||||
// though as SET_OBJECT with CE class on subchannel 0 would fail on GRCE.
|
||||
NV_PUSH_1U(B06F, SET_OBJECT, uvm_push_get_gpu(push)->parent->rm_info.ceClass);
|
||||
}
|
||||
|
||||
@@ -185,6 +185,12 @@ NvU32 uvm_hal_maxwell_ce_plc_mode(void)
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Noop, since COPY_TYPE doesn't exist in Maxwell.
|
||||
NvU32 uvm_hal_maxwell_ce_memcopy_copy_type(uvm_push_t *push, uvm_gpu_address_t dst, uvm_gpu_address_t src)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
void uvm_hal_maxwell_ce_memcopy(uvm_push_t *push, uvm_gpu_address_t dst, uvm_gpu_address_t src, size_t size)
|
||||
{
|
||||
// If >4GB copies ever become an important use case, this function should
|
||||
@@ -195,6 +201,7 @@ void uvm_hal_maxwell_ce_memcopy(uvm_push_t *push, uvm_gpu_address_t dst, uvm_gpu
|
||||
NvU32 pipelined_value;
|
||||
NvU32 launch_dma_src_dst_type;
|
||||
NvU32 launch_dma_plc_mode;
|
||||
NvU32 copy_type_value;
|
||||
|
||||
UVM_ASSERT_MSG(gpu->parent->ce_hal->memcopy_is_valid(push, dst, src),
|
||||
"Memcopy validation failed in channel %s, GPU %s.\n",
|
||||
@@ -205,6 +212,7 @@ void uvm_hal_maxwell_ce_memcopy(uvm_push_t *push, uvm_gpu_address_t dst, uvm_gpu
|
||||
|
||||
launch_dma_src_dst_type = gpu->parent->ce_hal->phys_mode(push, dst, src);
|
||||
launch_dma_plc_mode = gpu->parent->ce_hal->plc_mode();
|
||||
copy_type_value = gpu->parent->ce_hal->memcopy_copy_type(push, dst, src);
|
||||
|
||||
if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED))
|
||||
pipelined_value = HWCONST(B0B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, PIPELINED);
|
||||
@@ -226,6 +234,7 @@ void uvm_hal_maxwell_ce_memcopy(uvm_push_t *push, uvm_gpu_address_t dst, uvm_gpu
|
||||
HWCONST(B0B5, LAUNCH_DMA, FLUSH_ENABLE, FALSE) |
|
||||
launch_dma_src_dst_type |
|
||||
launch_dma_plc_mode |
|
||||
copy_type_value |
|
||||
pipelined_value);
|
||||
|
||||
pipelined_value = HWCONST(B0B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, PIPELINED);
|
||||
@@ -266,7 +275,7 @@ static void memset_common(uvm_push_t *push, uvm_gpu_address_t dst, size_t size,
|
||||
NvU32 launch_dma_dst_type;
|
||||
NvU32 launch_dma_plc_mode;
|
||||
|
||||
UVM_ASSERT_MSG(gpu->parent->ce_hal->memset_is_valid(push, dst, memset_element_size),
|
||||
UVM_ASSERT_MSG(gpu->parent->ce_hal->memset_is_valid(push, dst, size, memset_element_size),
|
||||
"Memset validation failed in channel %s, GPU %s.\n",
|
||||
push->channel->name,
|
||||
uvm_gpu_name(gpu));
|
||||
@@ -352,3 +361,24 @@ void uvm_hal_maxwell_ce_memset_v_4(uvm_push_t *push, NvU64 dst_va, NvU32 value,
|
||||
uvm_push_get_gpu(push)->parent->ce_hal->memset_4(push, uvm_gpu_address_virtual(dst_va), value, size);
|
||||
}
|
||||
|
||||
void uvm_hal_maxwell_ce_encrypt_unsupported(uvm_push_t *push,
|
||||
uvm_gpu_address_t dst,
|
||||
uvm_gpu_address_t src,
|
||||
NvU32 size,
|
||||
uvm_gpu_address_t auth_tag)
|
||||
{
|
||||
uvm_gpu_t *gpu = uvm_push_get_gpu(push);
|
||||
|
||||
UVM_ASSERT_MSG(false, "CE encrypt is not supported on GPU: %s.\n", uvm_gpu_name(gpu));
|
||||
}
|
||||
|
||||
void uvm_hal_maxwell_ce_decrypt_unsupported(uvm_push_t *push,
|
||||
uvm_gpu_address_t dst,
|
||||
uvm_gpu_address_t src,
|
||||
NvU32 size,
|
||||
uvm_gpu_address_t auth_tag)
|
||||
{
|
||||
uvm_gpu_t *gpu = uvm_push_get_gpu(push);
|
||||
|
||||
UVM_ASSERT_MSG(false, "CE decrypt is not supported on GPU: %s.\n", uvm_gpu_name(gpu));
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2021 NVIDIA Corporation
|
||||
Copyright (c) 2021-2023 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
@@ -62,6 +62,12 @@ NvU8 uvm_hal_maxwell_fault_buffer_get_ve_id_unsupported(NvU16 mmu_engine_id, uvm
|
||||
return 0;
|
||||
}
|
||||
|
||||
uvm_fault_type_t uvm_hal_maxwell_fault_buffer_get_fault_type_unsupported(const NvU32 *fault_entry)
|
||||
{
|
||||
UVM_ASSERT_MSG(false, "fault_buffer_get_fault_type is not supported.\n");
|
||||
return UVM_FAULT_TYPE_COUNT;
|
||||
}
|
||||
|
||||
void uvm_hal_maxwell_fault_buffer_parse_entry_unsupported(uvm_parent_gpu_t *parent_gpu,
|
||||
NvU32 index,
|
||||
uvm_fault_buffer_entry_t *buffer_entry)
|
||||
|
||||
@@ -217,9 +217,14 @@ void uvm_hal_maxwell_host_semaphore_timestamp(uvm_push_t *push, NvU64 gpu_va)
|
||||
HWCONST(A16F, SEMAPHORED, RELEASE_WFI, DIS));
|
||||
}
|
||||
|
||||
void uvm_hal_maxwell_host_set_gpfifo_entry(NvU64 *fifo_entry, NvU64 pushbuffer_va, NvU32 pushbuffer_length)
|
||||
void uvm_hal_maxwell_host_set_gpfifo_entry(NvU64 *fifo_entry,
|
||||
NvU64 pushbuffer_va,
|
||||
NvU32 pushbuffer_length,
|
||||
uvm_gpfifo_sync_t sync_flag)
|
||||
{
|
||||
NvU64 fifo_entry_value;
|
||||
const NvU32 sync_value = (sync_flag == UVM_GPFIFO_SYNC_WAIT) ? HWCONST(A16F, GP_ENTRY1, SYNC, WAIT) :
|
||||
HWCONST(A16F, GP_ENTRY1, SYNC, PROCEED);
|
||||
|
||||
UVM_ASSERT(!uvm_global_is_suspended());
|
||||
UVM_ASSERT_MSG(pushbuffer_va % 4 == 0, "pushbuffer va unaligned: %llu\n", pushbuffer_va);
|
||||
@@ -228,7 +233,8 @@ void uvm_hal_maxwell_host_set_gpfifo_entry(NvU64 *fifo_entry, NvU64 pushbuffer_v
|
||||
fifo_entry_value = HWVALUE(A16F, GP_ENTRY0, GET, NvU64_LO32(pushbuffer_va) >> 2);
|
||||
fifo_entry_value |= (NvU64)(HWVALUE(A16F, GP_ENTRY1, GET_HI, NvU64_HI32(pushbuffer_va)) |
|
||||
HWVALUE(A16F, GP_ENTRY1, LENGTH, pushbuffer_length >> 2) |
|
||||
HWCONST(A16F, GP_ENTRY1, PRIV, KERNEL)) << 32;
|
||||
HWCONST(A16F, GP_ENTRY1, PRIV, KERNEL) |
|
||||
sync_value) << 32;
|
||||
|
||||
*fifo_entry = fifo_entry_value;
|
||||
}
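// Illustrative usage sketch (assumption, not part of this change): with the
// added sync_flag parameter, a caller that wants the host to wait on the
// preceding semaphore before fetching this pushbuffer could do something
// like:
//
//     NvU64 entry;
//     gpu->parent->host_hal->set_gpfifo_entry(&entry,
//                                             pushbuffer_va,
//                                             pushbuffer_length,
//                                             UVM_GPFIFO_SYNC_WAIT);
//
// while regular submissions keep the prior behavior by passing
// UVM_GPFIFO_SYNC_PROCEED, which encodes SYNC as PROCEED in GP_ENTRY1.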
|
||||
|
||||
53
kernel-open/nvidia-uvm/uvm_maxwell_sec2.c
Normal file
@@ -0,0 +1,53 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2021-2023 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
deal in the Software without restriction, including without limitation the
|
||||
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
sell copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
DEALINGS IN THE SOFTWARE.
|
||||
|
||||
*******************************************************************************/
|
||||
|
||||
#include "uvm_gpu.h"
|
||||
#include "uvm_hal.h"
|
||||
#include "uvm_push.h"
|
||||
#include "uvm_common.h"
|
||||
|
||||
void uvm_hal_maxwell_sec2_init_noop(uvm_push_t *push)
|
||||
{
|
||||
}
|
||||
|
||||
void uvm_hal_maxwell_sec2_semaphore_release_unsupported(uvm_push_t *push, NvU64 gpu_va, NvU32 payload)
|
||||
{
|
||||
uvm_gpu_t *gpu = uvm_push_get_gpu(push);
|
||||
UVM_ASSERT_MSG(false, "SEC2 semaphore_release is not supported on GPU: %s.\n", uvm_gpu_name(gpu));
|
||||
}
|
||||
|
||||
void uvm_hal_maxwell_sec2_semaphore_timestamp_unsupported(uvm_push_t *push, NvU64 gpu_va)
|
||||
{
|
||||
uvm_gpu_t *gpu = uvm_push_get_gpu(push);
|
||||
UVM_ASSERT_MSG(false, "SEC2 semaphore_timestamp is not supported on GPU: %s.\n", uvm_gpu_name(gpu));
|
||||
}
|
||||
|
||||
void uvm_hal_maxwell_sec2_decrypt_unsupported(uvm_push_t *push,
|
||||
NvU64 dst_va,
|
||||
NvU64 src_va,
|
||||
NvU32 size,
|
||||
NvU64 auth_tag_va)
|
||||
{
|
||||
uvm_gpu_t *gpu = uvm_push_get_gpu(push);
|
||||
UVM_ASSERT_MSG(false, "SEC2 decrypt is not supported on GPU: %s.\n", uvm_gpu_name(gpu));
|
||||
}
|
||||
@@ -22,6 +22,7 @@
|
||||
*******************************************************************************/
|
||||
|
||||
#include "uvm_mem.h"
|
||||
#include "uvm_hal_types.h"
|
||||
#include "uvm_mmu.h"
|
||||
#include "uvm_processors.h"
|
||||
#include "uvm_va_space.h"
|
||||
@@ -67,26 +68,15 @@ static bool vidmem_can_be_mapped(uvm_mem_t *vidmem, bool is_user_space)
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool sysmem_can_be_mapped(uvm_mem_t *sysmem)
|
||||
{
|
||||
UVM_ASSERT(uvm_mem_is_sysmem(sysmem));
|
||||
|
||||
// If SEV is enabled, only unprotected memory can be mapped
|
||||
if (g_uvm_global.sev_enabled)
|
||||
return uvm_mem_is_sysmem_dma(sysmem);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool mem_can_be_mapped_on_cpu(uvm_mem_t *mem, bool is_user_space)
|
||||
{
|
||||
if (uvm_mem_is_sysmem(mem))
|
||||
return sysmem_can_be_mapped(mem);
|
||||
return true;
|
||||
|
||||
if (!vidmem_can_be_mapped(mem, is_user_space))
|
||||
return false;
|
||||
|
||||
return mem->backing_gpu->parent->numa_info.enabled && PAGE_ALIGNED(mem->chunk_size);
|
||||
return mem->backing_gpu->mem_info.numa.enabled && PAGE_ALIGNED(mem->chunk_size);
|
||||
}
|
||||
|
||||
static bool mem_can_be_mapped_on_cpu_kernel(uvm_mem_t *mem)
|
||||
@@ -99,10 +89,21 @@ static bool mem_can_be_mapped_on_cpu_user(uvm_mem_t *mem)
|
||||
return mem_can_be_mapped_on_cpu(mem, true);
|
||||
}
|
||||
|
||||
static bool sysmem_can_be_mapped_on_gpu(uvm_mem_t *sysmem)
|
||||
{
|
||||
UVM_ASSERT(uvm_mem_is_sysmem(sysmem));
|
||||
|
||||
// If SEV is enabled, only unprotected memory can be mapped
|
||||
if (g_uvm_global.sev_enabled)
|
||||
return uvm_mem_is_sysmem_dma(sysmem);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool mem_can_be_mapped_on_gpu(uvm_mem_t *mem, uvm_gpu_t *gpu, bool is_user_space)
|
||||
{
|
||||
if (uvm_mem_is_sysmem(mem))
|
||||
return sysmem_can_be_mapped(mem);
|
||||
return sysmem_can_be_mapped_on_gpu(mem);
|
||||
|
||||
if (!vidmem_can_be_mapped(mem, is_user_space))
|
||||
return false;
|
||||
@@ -312,7 +313,7 @@ static NvU32 mem_pick_chunk_size(uvm_mem_t *mem)
|
||||
|
||||
// When UVM_PAGE_SIZE_DEFAULT is used on NUMA-enabled GPUs, we force
|
||||
// chunk_size to be PAGE_SIZE at least, to allow CPU mappings.
|
||||
if (mem->backing_gpu->parent->numa_info.enabled)
|
||||
if (mem->backing_gpu->mem_info.numa.enabled)
|
||||
chunk_size = max(chunk_size, (NvU32)PAGE_SIZE);
|
||||
|
||||
return chunk_size;
|
||||
@@ -449,6 +450,9 @@ static gfp_t sysmem_allocation_gfp_flags(int order, bool zero)
|
||||
return gfp_flags;
|
||||
}
|
||||
|
||||
// This allocation is a non-protected memory allocation under Confidential
|
||||
// Computing.
|
||||
//
|
||||
// There is a tighter coupling between allocation and mapping because of the
|
||||
// allocator UVM must use. Hence, this function does the equivalent of
|
||||
// uvm_mem_map_gpu_phys().
|
||||
@@ -523,9 +527,10 @@ static NV_STATUS mem_alloc_sysmem_chunks(uvm_mem_t *mem, gfp_t gfp_flags)
|
||||
|
||||
// In case of failure, the caller is required to handle cleanup by calling
|
||||
// uvm_mem_free
|
||||
static NV_STATUS mem_alloc_vidmem_chunks(uvm_mem_t *mem, bool zero, bool is_protected)
|
||||
static NV_STATUS mem_alloc_vidmem_chunks(uvm_mem_t *mem, bool zero, bool is_unprotected)
|
||||
{
|
||||
NV_STATUS status;
|
||||
uvm_pmm_gpu_memory_type_t mem_type;
|
||||
|
||||
UVM_ASSERT(uvm_mem_is_vidmem(mem));
|
||||
|
||||
@@ -542,14 +547,23 @@ static NV_STATUS mem_alloc_vidmem_chunks(uvm_mem_t *mem, bool zero, bool is_prot
|
||||
if (!mem->vidmem.chunks)
|
||||
return NV_ERR_NO_MEMORY;
|
||||
|
||||
status = uvm_pmm_gpu_alloc_kernel(&mem->backing_gpu->pmm,
|
||||
mem->chunks_count,
|
||||
mem->chunk_size,
|
||||
UVM_PMM_ALLOC_FLAGS_NONE,
|
||||
mem->vidmem.chunks,
|
||||
NULL);
|
||||
// When CC is disabled the behavior is identical to that of PMM, and the
|
||||
// protection flag is ignored (squashed by PMM internally).
|
||||
if (is_unprotected)
|
||||
mem_type = UVM_PMM_GPU_MEMORY_TYPE_KERNEL_UNPROTECTED;
|
||||
else
|
||||
mem_type = UVM_PMM_GPU_MEMORY_TYPE_KERNEL_PROTECTED;
|
||||
|
||||
status = uvm_pmm_gpu_alloc(&mem->backing_gpu->pmm,
|
||||
mem->chunks_count,
|
||||
mem->chunk_size,
|
||||
mem_type,
|
||||
UVM_PMM_ALLOC_FLAGS_NONE,
|
||||
mem->vidmem.chunks,
|
||||
NULL);
|
||||
|
||||
if (status != NV_OK) {
|
||||
UVM_ERR_PRINT("pmm_gpu_alloc(count=%zd, size=0x%x) failed: %s\n",
|
||||
UVM_ERR_PRINT("uvm_pmm_gpu_alloc (count=%zd, size=0x%x) failed: %s\n",
|
||||
mem->chunks_count,
|
||||
mem->chunk_size,
|
||||
nvstatusToString(status));
|
||||
@@ -559,7 +573,7 @@ static NV_STATUS mem_alloc_vidmem_chunks(uvm_mem_t *mem, bool zero, bool is_prot
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
static NV_STATUS mem_alloc_chunks(uvm_mem_t *mem, struct mm_struct *mm, bool zero, bool is_protected)
|
||||
static NV_STATUS mem_alloc_chunks(uvm_mem_t *mem, struct mm_struct *mm, bool zero, bool is_unprotected)
|
||||
{
|
||||
if (uvm_mem_is_sysmem(mem)) {
|
||||
gfp_t gfp_flags;
|
||||
@@ -581,7 +595,7 @@ static NV_STATUS mem_alloc_chunks(uvm_mem_t *mem, struct mm_struct *mm, bool zer
|
||||
return status;
|
||||
}
|
||||
|
||||
return mem_alloc_vidmem_chunks(mem, zero, is_protected);
|
||||
return mem_alloc_vidmem_chunks(mem, zero, is_unprotected);
|
||||
}
|
||||
|
||||
NV_STATUS uvm_mem_map_kernel(uvm_mem_t *mem, const uvm_global_processor_mask_t *mask)
|
||||
@@ -611,7 +625,7 @@ NV_STATUS uvm_mem_alloc(const uvm_mem_alloc_params_t *params, uvm_mem_t **mem_ou
|
||||
NV_STATUS status;
|
||||
NvU64 physical_size;
|
||||
uvm_mem_t *mem = NULL;
|
||||
bool is_protected = false;
|
||||
bool is_unprotected = false;
|
||||
|
||||
UVM_ASSERT(params->size > 0);
|
||||
|
||||
@@ -633,7 +647,12 @@ NV_STATUS uvm_mem_alloc(const uvm_mem_alloc_params_t *params, uvm_mem_t **mem_ou
|
||||
physical_size = UVM_ALIGN_UP(mem->size, mem->chunk_size);
|
||||
mem->chunks_count = physical_size / mem->chunk_size;
|
||||
|
||||
status = mem_alloc_chunks(mem, params->mm, params->zero, is_protected);
|
||||
if (params->is_unprotected)
|
||||
UVM_ASSERT(uvm_mem_is_vidmem(mem));
|
||||
|
||||
is_unprotected = params->is_unprotected;
|
||||
|
||||
status = mem_alloc_chunks(mem, params->mm, params->zero, is_unprotected);
|
||||
if (status != NV_OK)
|
||||
goto error;
|
||||
|
||||
@@ -718,8 +737,8 @@ static NV_STATUS mem_map_cpu_to_sysmem_kernel(uvm_mem_t *mem)
|
||||
pages[page_index] = mem_cpu_page(mem, page_index * PAGE_SIZE);
|
||||
}
|
||||
|
||||
if (g_uvm_global.sev_enabled)
|
||||
prot = PAGE_KERNEL_NOENC;
|
||||
if (g_uvm_global.sev_enabled && uvm_mem_is_sysmem_dma(mem))
|
||||
prot = uvm_pgprot_decrypted(PAGE_KERNEL_NOENC);
|
||||
|
||||
mem->kernel.cpu_addr = vmap(pages, num_pages, VM_MAP, prot);
|
||||
|
||||
@@ -982,7 +1001,7 @@ uvm_gpu_address_t uvm_mem_gpu_address_copy(uvm_mem_t *mem, uvm_gpu_t *accessing_
|
||||
UVM_ASSERT(uvm_mem_is_physically_contiguous(mem, offset, size));
|
||||
|
||||
if (uvm_mem_is_sysmem(mem) || uvm_mem_is_local_vidmem(mem, accessing_gpu))
|
||||
return uvm_mem_gpu_address_physical(mem, accessing_gpu, offset, size);
|
||||
return uvm_gpu_address_copy(accessing_gpu, uvm_mem_gpu_physical(mem, accessing_gpu, offset, size));
|
||||
|
||||
// Peer GPUs may need to use some form of translation (identity mappings,
|
||||
// indirect peers) to copy.
|
||||
@@ -1041,6 +1060,16 @@ static NV_STATUS mem_map_gpu(uvm_mem_t *mem,
|
||||
page_size = mem_pick_gpu_page_size(mem, gpu, tree);
|
||||
UVM_ASSERT_MSG(uvm_mmu_page_size_supported(tree, page_size), "page_size 0x%x\n", page_size);
|
||||
|
||||
// When the Confidential Computing feature is enabled, DMA allocations are
|
||||
// mostly allocated and managed by a per-GPU DMA buffer pool
|
||||
// (uvm_conf_computing_dma_buffer_pool_t). Because we would typically
|
||||
// already hold the DMA_BUFFER_POOL lock at this time, we cannot hold
|
||||
// the block lock. Allocate PTEs without eviction in this context.
|
||||
//
|
||||
// See uvm_pmm_gpu_alloc()
|
||||
if (uvm_mem_is_sysmem_dma(mem))
|
||||
pmm_flags = UVM_PMM_ALLOC_FLAGS_NONE;
|
||||
|
||||
status = uvm_page_table_range_vec_create(tree,
|
||||
gpu_va,
|
||||
uvm_mem_physical_size(mem),
|
||||
@@ -1205,7 +1234,7 @@ void uvm_mem_unmap_gpu_kernel(uvm_mem_t *mem, uvm_gpu_t *gpu)
|
||||
static bool mem_can_be_phys_mapped_on_gpu(uvm_mem_t *mem, uvm_gpu_t *gpu)
|
||||
{
|
||||
if (uvm_mem_is_sysmem(mem))
|
||||
return sysmem_can_be_mapped(mem);
|
||||
return sysmem_can_be_mapped_on_gpu(mem);
|
||||
else
|
||||
return uvm_mem_is_local_vidmem(mem, gpu);
|
||||
}
|
||||
@@ -1306,10 +1335,16 @@ NvU64 uvm_mem_get_gpu_va_kernel(uvm_mem_t *mem, uvm_gpu_t *gpu)
|
||||
|
||||
uvm_gpu_address_t uvm_mem_gpu_address_virtual_kernel(uvm_mem_t *mem, uvm_gpu_t *gpu)
|
||||
{
|
||||
return uvm_gpu_address_virtual(uvm_mem_get_gpu_va_kernel(mem, gpu));
|
||||
uvm_gpu_address_t addr = uvm_gpu_address_virtual(uvm_mem_get_gpu_va_kernel(mem, gpu));
|
||||
if (uvm_conf_computing_mode_enabled(gpu) && mem->dma_owner)
|
||||
addr.is_unprotected = true;
|
||||
return addr;
|
||||
}
|
||||
|
||||
uvm_gpu_address_t uvm_mem_gpu_address_physical(uvm_mem_t *mem, uvm_gpu_t *gpu, NvU64 offset, NvU64 size)
|
||||
{
|
||||
return uvm_gpu_address_from_phys(uvm_mem_gpu_physical(mem, gpu, offset, size));
|
||||
uvm_gpu_address_t addr = uvm_gpu_address_from_phys(uvm_mem_gpu_physical(mem, gpu, offset, size));
|
||||
if (uvm_conf_computing_mode_enabled(gpu) && mem->dma_owner)
|
||||
addr.is_unprotected = true;
|
||||
return addr;
|
||||
}
|
||||
|
||||
@@ -86,7 +86,7 @@
|
||||
// The size of the VA used for mapping uvm_mem_t allocations
|
||||
// 128 GBs should be plenty for internal allocations and fits easily on all
|
||||
// supported architectures.
|
||||
#define UVM_MEM_VA_SIZE (128ull * 1024 * 1024 * 1024)
|
||||
#define UVM_MEM_VA_SIZE (128 * UVM_SIZE_1GB)
|
||||
|
||||
typedef struct
|
||||
{
|
||||
@@ -128,6 +128,11 @@ typedef struct
|
||||
// has to be aligned to PAGE_SIZE.
|
||||
NvU32 page_size;
|
||||
|
||||
// The protection flag is only observed for vidmem allocations when CC is
|
||||
// enabled. If set to true, the allocation returns unprotected vidmem;
|
||||
// otherwise, the allocation returns protected vidmem.
|
||||
bool is_unprotected;
|
||||
|
||||
// If true, the allocation is zeroed (scrubbed).
|
||||
bool zero;
|
||||
} uvm_mem_alloc_params_t;
|
||||
@@ -161,6 +166,8 @@ struct uvm_mem_struct
|
||||
// lifetime of the GPU. For CPU allocations there is no lifetime limitation.
|
||||
uvm_gpu_t *backing_gpu;
|
||||
|
||||
// For Confidential Computing, the accessing GPU needs to be known at alloc
|
||||
// time for sysmem allocations.
|
||||
uvm_gpu_t *dma_owner;
|
||||
|
||||
union
|
||||
@@ -385,6 +392,12 @@ static NV_STATUS uvm_mem_alloc_vidmem(NvU64 size, uvm_gpu_t *gpu, uvm_mem_t **me
|
||||
return uvm_mem_alloc(&params, mem_out);
|
||||
}
|
||||
|
||||
// Helper for allocating protected vidmem with the default page size
|
||||
static NV_STATUS uvm_mem_alloc_vidmem_protected(NvU64 size, uvm_gpu_t *gpu, uvm_mem_t **mem_out)
|
||||
{
|
||||
return uvm_mem_alloc_vidmem(size, gpu, mem_out);
|
||||
}
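// Illustrative sketch (assumption, not part of this change): an unprotected
// counterpart would simply set the new is_unprotected flag before calling
// uvm_mem_alloc(), for example:
//
//     static NV_STATUS uvm_mem_alloc_vidmem_unprotected(NvU64 size, uvm_gpu_t *gpu, uvm_mem_t **mem_out)
//     {
//         uvm_mem_alloc_params_t params = { 0 };
//
//         params.size = size;
//         params.backing_gpu = gpu;
//         params.page_size = UVM_PAGE_SIZE_DEFAULT;
//         params.is_unprotected = true;
//
//         return uvm_mem_alloc(&params, mem_out);
//     }
//
// When Confidential Computing is disabled the flag is ignored, so such a
// helper would behave exactly like uvm_mem_alloc_vidmem().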
|
||||
|
||||
// Helper for allocating sysmem and mapping it on the CPU
|
||||
static NV_STATUS uvm_mem_alloc_sysmem_and_map_cpu_kernel(NvU64 size, struct mm_struct *mm, uvm_mem_t **mem_out)
|
||||
{
|
||||
|
||||
@@ -25,6 +25,7 @@
|
||||
#include "uvm_kvmalloc.h"
|
||||
#include "uvm_mem.h"
|
||||
#include "uvm_push.h"
|
||||
#include "uvm_conf_computing.h"
|
||||
#include "uvm_test.h"
|
||||
#include "uvm_test_ioctl.h"
|
||||
#include "uvm_va_space.h"
|
||||
@@ -80,17 +81,14 @@ static NV_STATUS check_accessible_from_gpu(uvm_gpu_t *gpu, uvm_mem_t *mem)
|
||||
for (offset = 0; offset < verif_size; offset += mem->chunk_size) {
|
||||
uvm_gpu_address_t sys_mem_gpu_address, mem_gpu_address;
|
||||
size_t size_this_time = min((NvU64)mem->chunk_size, verif_size - offset);
|
||||
bool should_use_pa;
|
||||
|
||||
TEST_NV_CHECK_GOTO(uvm_push_begin(gpu->channel_manager, UVM_CHANNEL_TYPE_CPU_TO_GPU, &push, " "), done);
|
||||
|
||||
sys_mem_gpu_address = uvm_mem_gpu_address_virtual_kernel(sys_mem, gpu);
|
||||
sys_mem_gpu_address.address += offset;
|
||||
|
||||
should_use_pa = uvm_channel_is_privileged(push.channel);
|
||||
|
||||
if (should_use_pa) {
|
||||
mem_gpu_address = uvm_mem_gpu_address_physical(mem, gpu, offset, size_this_time);
|
||||
if (uvm_channel_is_privileged(push.channel)) {
|
||||
mem_gpu_address = uvm_mem_gpu_address_copy(mem, gpu, offset, size_this_time);
|
||||
}
|
||||
else {
|
||||
mem_gpu_address = uvm_mem_gpu_address_virtual_kernel(mem, gpu);
|
||||
@@ -130,7 +128,7 @@ static NV_STATUS check_accessible_from_gpu(uvm_gpu_t *gpu, uvm_mem_t *mem)
|
||||
mem_gpu_address.address += offset;
|
||||
|
||||
if (uvm_channel_is_privileged(push.channel)) {
|
||||
sys_mem_gpu_address = uvm_mem_gpu_address_physical(sys_mem, gpu, offset, size_this_time);
|
||||
sys_mem_gpu_address = uvm_mem_gpu_address_copy(sys_mem, gpu, offset, size_this_time);
|
||||
}
|
||||
else {
|
||||
sys_mem_gpu_address = uvm_mem_gpu_address_virtual_kernel(sys_mem, gpu);
|
||||
@@ -212,7 +210,7 @@ static NV_STATUS test_map_cpu(uvm_mem_t *mem)
|
||||
char *cpu_addr;
|
||||
|
||||
if (uvm_mem_is_vidmem(mem))
|
||||
UVM_ASSERT(mem->backing_gpu->parent->numa_info.enabled);
|
||||
UVM_ASSERT(mem->backing_gpu->mem_info.numa.enabled);
|
||||
|
||||
// Map
|
||||
TEST_NV_CHECK_RET(uvm_mem_map_cpu_kernel(mem));
|
||||
@@ -315,7 +313,7 @@ static NV_STATUS test_alloc_vidmem(uvm_gpu_t *gpu, NvU32 page_size, size_t size,
|
||||
TEST_CHECK_GOTO(status == NV_OK, error);
|
||||
|
||||
if (page_size == UVM_PAGE_SIZE_DEFAULT) {
|
||||
if (gpu->parent->numa_info.enabled)
|
||||
if (gpu->mem_info.numa.enabled)
|
||||
TEST_CHECK_GOTO(mem->chunk_size >= PAGE_SIZE && mem->chunk_size <= max(size, (size_t)PAGE_SIZE), error);
|
||||
else
|
||||
TEST_CHECK_GOTO(mem->chunk_size == UVM_PAGE_SIZE_4K || mem->chunk_size <= size, error);
|
||||
@@ -323,7 +321,7 @@ static NV_STATUS test_alloc_vidmem(uvm_gpu_t *gpu, NvU32 page_size, size_t size,
|
||||
|
||||
TEST_NV_CHECK_GOTO(test_map_gpu(mem, gpu), error);
|
||||
|
||||
if (gpu->parent->numa_info.enabled && (page_size == UVM_PAGE_SIZE_DEFAULT || page_size >= PAGE_SIZE))
|
||||
if (gpu->mem_info.numa.enabled && (page_size == UVM_PAGE_SIZE_DEFAULT || page_size >= PAGE_SIZE))
|
||||
TEST_CHECK_GOTO(test_map_cpu(mem) == NV_OK, error);
|
||||
|
||||
*mem_out = mem;
|
||||
@@ -371,6 +369,11 @@ static NV_STATUS test_all(uvm_va_space_t *va_space)
|
||||
int i;
|
||||
|
||||
|
||||
// TODO: Bug 3839176: the test is waived on Confidential Computing because
|
||||
// it assumes that the GPU can access system memory without using encryption.
|
||||
if (uvm_conf_computing_mode_enabled(uvm_va_space_find_first_gpu(va_space)))
|
||||
return NV_OK;
|
||||
|
||||
gpu_count = uvm_processor_mask_get_gpu_count(&va_space->registered_gpus);
|
||||
|
||||
// +1 for the CPU
|
||||
@@ -469,7 +472,7 @@ static NV_STATUS test_basic_vidmem(uvm_gpu_t *gpu)
|
||||
page_sizes &= UVM_CHUNK_SIZES_MASK;
|
||||
for_each_page_size(page_size, page_sizes) {
|
||||
TEST_CHECK_GOTO(uvm_mem_alloc_vidmem(page_size - 1, gpu, &mem) == NV_OK, done);
|
||||
if (gpu->parent->numa_info.enabled)
|
||||
if (gpu->mem_info.numa.enabled)
|
||||
TEST_CHECK_GOTO(mem->chunk_size >= PAGE_SIZE && mem->chunk_size <= max(page_size, (NvU32)PAGE_SIZE), done);
|
||||
else
|
||||
TEST_CHECK_GOTO(mem->chunk_size < page_size || page_size == smallest_page_size, done);
|
||||
@@ -477,7 +480,7 @@ static NV_STATUS test_basic_vidmem(uvm_gpu_t *gpu)
|
||||
mem = NULL;
|
||||
|
||||
TEST_CHECK_GOTO(uvm_mem_alloc_vidmem(page_size, gpu, &mem) == NV_OK, done);
|
||||
if (gpu->parent->numa_info.enabled)
|
||||
if (gpu->mem_info.numa.enabled)
|
||||
TEST_CHECK_GOTO(mem->chunk_size == max(page_size, (NvU32)PAGE_SIZE), done);
|
||||
else
|
||||
TEST_CHECK_GOTO(mem->chunk_size == page_size, done);
|
||||
@@ -493,6 +496,41 @@ done:
|
||||
return status;
|
||||
}
|
||||
|
||||
static NV_STATUS test_basic_vidmem_unprotected(uvm_gpu_t *gpu)
|
||||
{
|
||||
NV_STATUS status = NV_OK;
|
||||
uvm_mem_t *mem = NULL;
|
||||
|
||||
uvm_mem_alloc_params_t params = { 0 };
|
||||
params.size = UVM_PAGE_SIZE_4K;
|
||||
params.backing_gpu = gpu;
|
||||
params.page_size = UVM_PAGE_SIZE_4K;
|
||||
|
||||
// If CC is enabled, the protection flag is observed. Because currently all
|
||||
// vidmem is in the protected region, the allocation should succeed.
|
||||
//
|
||||
// If CC is disabled, the protection flag is ignored.
|
||||
params.is_unprotected = false;
|
||||
TEST_NV_CHECK_RET(uvm_mem_alloc(&params, &mem));
|
||||
|
||||
uvm_mem_free(mem);
|
||||
mem = NULL;
|
||||
|
||||
// If CC is enabled, the allocation should fail because currently the
|
||||
// unprotected region is empty.
|
||||
//
|
||||
// If CC is disabled, the behavior should be identical to that of a
|
||||
// protected allocation.
|
||||
params.is_unprotected = true;
|
||||
if (uvm_conf_computing_mode_enabled(gpu))
|
||||
TEST_CHECK_RET(uvm_mem_alloc(&params, &mem) == NV_ERR_NO_MEMORY);
|
||||
else
|
||||
TEST_NV_CHECK_RET(uvm_mem_alloc(&params, &mem));
|
||||
|
||||
uvm_mem_free(mem);
|
||||
return status;
|
||||
}
|
||||
|
||||
static NV_STATUS test_basic_sysmem(void)
|
||||
{
|
||||
NV_STATUS status = NV_OK;
|
||||
@@ -531,15 +569,54 @@ done:
|
||||
return status;
|
||||
}
|
||||
|
||||
static NV_STATUS test_basic_dma_pool(uvm_gpu_t *gpu)
|
||||
{
|
||||
size_t i, j;
|
||||
size_t num_buffers;
|
||||
NV_STATUS status = NV_OK;
|
||||
uvm_conf_computing_dma_buffer_t **dma_buffers;
|
||||
|
||||
// If the Confidential Computing feature is disabled, the DMA buffers
|
||||
// pool is not initialized.
|
||||
if (!uvm_conf_computing_mode_enabled(gpu))
|
||||
return NV_OK;
|
||||
|
||||
// We're going to reclaim one more buffer than the pool has, triggering
// one expansion.
|
||||
num_buffers = gpu->conf_computing.dma_buffer_pool.num_dma_buffers + 1;
|
||||
dma_buffers = uvm_kvmalloc_zero(sizeof(*dma_buffers) * num_buffers);
|
||||
if (dma_buffers == NULL)
|
||||
return NV_ERR_NO_MEMORY;
|
||||
|
||||
for (i = 0; i < num_buffers; ++i) {
|
||||
status = uvm_conf_computing_dma_buffer_alloc(&gpu->conf_computing.dma_buffer_pool, &dma_buffers[i], NULL);
|
||||
if (status != NV_OK)
|
||||
break;
|
||||
}
|
||||
|
||||
TEST_CHECK_GOTO(gpu->conf_computing.dma_buffer_pool.num_dma_buffers >= num_buffers, done);
|
||||
TEST_CHECK_GOTO(i == num_buffers, done);
|
||||
|
||||
done:
|
||||
j = i;
|
||||
for (i = 0; i < j; ++i)
|
||||
uvm_conf_computing_dma_buffer_free(&gpu->conf_computing.dma_buffer_pool, dma_buffers[i], NULL);
|
||||
|
||||
uvm_kvfree(dma_buffers);
|
||||
return status;
|
||||
}
|
||||
|
||||
static NV_STATUS test_basic(uvm_va_space_t *va_space)
|
||||
{
|
||||
uvm_gpu_t *gpu;
|
||||
|
||||
TEST_CHECK_RET(test_basic_sysmem() == NV_OK);
|
||||
TEST_NV_CHECK_RET(test_basic_sysmem());
|
||||
|
||||
for_each_va_space_gpu(gpu, va_space) {
|
||||
TEST_CHECK_RET(test_basic_vidmem(gpu) == NV_OK);
|
||||
TEST_CHECK_RET(test_basic_sysmem_dma(gpu) == NV_OK);
|
||||
TEST_NV_CHECK_RET(test_basic_vidmem(gpu));
|
||||
TEST_NV_CHECK_RET(test_basic_sysmem_dma(gpu));
|
||||
TEST_NV_CHECK_RET(test_basic_vidmem_unprotected(gpu));
|
||||
TEST_NV_CHECK_RET(test_basic_dma_pool(gpu));
|
||||
}
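// Illustrative note (assumption about the test macros): TEST_NV_CHECK_RET()
// is taken to be the NV_STATUS-aware variant of
// TEST_CHECK_RET(expr == NV_OK), conceptually something like:
//
//     #define TEST_NV_CHECK_RET(call)                                     \
//         do {                                                            \
//             NV_STATUS _status = (call);                                 \
//             if (_status != NV_OK) {                                     \
//                 UVM_ERR_PRINT("%s failed: %s\n", #call,                 \
//                               nvstatusToString(_status));               \
//                 return _status;                                         \
//             }                                                           \
//         } while (0)
//
// which is why the boolean-style checks above were converted: failures now
// report the NV_STATUS string rather than a bare true/false result.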
|
||||
|
||||
return NV_OK;
|
||||
|
||||
@@ -933,20 +933,15 @@ NV_STATUS uvm_api_migrate(UVM_MIGRATE_PARAMS *params, struct file *filp)
|
||||
tracker_ptr = &tracker;
|
||||
|
||||
if (params->length > 0) {
|
||||
status = uvm_api_range_type_check(va_space, mm, params->base, params->length);
|
||||
if (status == NV_OK) {
|
||||
status = uvm_migrate(va_space,
|
||||
mm,
|
||||
params->base,
|
||||
params->length,
|
||||
(dest_gpu ? dest_gpu->id : UVM_ID_CPU),
|
||||
params->flags,
|
||||
uvm_va_space_iter_first(va_space,
|
||||
params->base,
|
||||
params->base),
|
||||
tracker_ptr);
|
||||
uvm_api_range_type_t type;
|
||||
|
||||
type = uvm_api_range_type_check(va_space, mm, params->base, params->length);
|
||||
if (type == UVM_API_RANGE_TYPE_INVALID) {
|
||||
status = NV_ERR_INVALID_ADDRESS;
|
||||
goto done;
|
||||
}
|
||||
else if (status == NV_WARN_NOTHING_TO_DO) {
|
||||
|
||||
if (type == UVM_API_RANGE_TYPE_ATS) {
|
||||
uvm_migrate_args_t uvm_migrate_args =
|
||||
{
|
||||
.va_space = va_space,
|
||||
@@ -964,6 +959,18 @@ NV_STATUS uvm_api_migrate(UVM_MIGRATE_PARAMS *params, struct file *filp)
|
||||
|
||||
status = uvm_migrate_pageable(&uvm_migrate_args);
|
||||
}
|
||||
else {
|
||||
status = uvm_migrate(va_space,
|
||||
mm,
|
||||
params->base,
|
||||
params->length,
|
||||
(dest_gpu ? dest_gpu->id : UVM_ID_CPU),
|
||||
params->flags,
|
||||
uvm_va_space_iter_first(va_space,
|
||||
params->base,
|
||||
params->base),
|
||||
tracker_ptr);
|
||||
}
|
||||
}
|
||||
|
||||
done:
|
||||
|
||||
@@ -63,7 +63,7 @@ static NV_STATUS migrate_vma_page_copy_address(struct page *page,
|
||||
|
||||
if (owning_gpu == copying_gpu) {
|
||||
// Local vidmem address
|
||||
*gpu_addr = uvm_gpu_address_from_phys(uvm_gpu_page_to_phys_address(owning_gpu, page));
|
||||
*gpu_addr = uvm_gpu_address_copy(owning_gpu, uvm_gpu_page_to_phys_address(owning_gpu, page));
|
||||
}
|
||||
else if (direct_peer) {
|
||||
// Direct GPU peer
|
||||
@@ -88,25 +88,13 @@ static NV_STATUS migrate_vma_page_copy_address(struct page *page,
|
||||
|
||||
__set_bit(page_index, state->dma.page_mask);
|
||||
|
||||
*gpu_addr = uvm_gpu_address_physical(UVM_APERTURE_SYS, state->dma.addrs[page_index]);
|
||||
*gpu_addr = uvm_gpu_address_copy(copying_gpu,
|
||||
uvm_gpu_phys_address(UVM_APERTURE_SYS, state->dma.addrs[page_index]));
|
||||
}
|
||||
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
// Return the GPU identified with the given NUMA node id
|
||||
static uvm_gpu_t *get_gpu_from_node_id(uvm_va_space_t *va_space, int node_id)
|
||||
{
|
||||
uvm_gpu_t *gpu;
|
||||
|
||||
for_each_va_space_gpu(gpu, va_space) {
|
||||
if (uvm_gpu_numa_info(gpu)->node_id == node_id)
|
||||
return gpu;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// Create a new push to zero pages on dst_id
|
||||
static NV_STATUS migrate_vma_zero_begin_push(uvm_va_space_t *va_space,
|
||||
uvm_processor_id_t dst_id,
|
||||
@@ -169,7 +157,7 @@ static NV_STATUS migrate_vma_copy_begin_push(uvm_va_space_t *va_space,
|
||||
// NUMA-enabled GPUs can copy to any other NUMA node in the system even if
|
||||
// P2P access has not been explicitly enabled (ie va_space->can_copy_from
|
||||
// is not set).
|
||||
if (!gpu->parent->numa_info.enabled) {
|
||||
if (!gpu->mem_info.numa.enabled) {
|
||||
UVM_ASSERT_MSG(uvm_processor_mask_test(&va_space->can_copy_from[uvm_id_value(gpu->id)], dst_id),
|
||||
"GPU %s dst %s src %s\n",
|
||||
uvm_va_space_processor_name(va_space, gpu->id),
|
||||
@@ -281,7 +269,7 @@ static void migrate_vma_compute_masks(struct vm_area_struct *vma, const unsigned
|
||||
continue;
|
||||
}
|
||||
|
||||
src_gpu = get_gpu_from_node_id(uvm_migrate_args->va_space, src_nid);
|
||||
src_gpu = uvm_va_space_find_gpu_with_memory_node_id(uvm_migrate_args->va_space, src_nid);
|
||||
|
||||
// Already resident on a node with no CPUs that doesn't belong to a
|
||||
// GPU, don't move
|
||||
@@ -980,13 +968,14 @@ NV_STATUS uvm_migrate_pageable(uvm_migrate_args_t *uvm_migrate_args)
|
||||
// wanted to call this function from a bottom half with CPU dst_id.
|
||||
UVM_ASSERT(!(current->flags & PF_KTHREAD));
|
||||
|
||||
if (!nv_numa_node_has_memory(dst_node_id) || get_gpu_from_node_id(va_space, dst_node_id) != NULL)
|
||||
if (!nv_numa_node_has_memory(dst_node_id) ||
|
||||
uvm_va_space_find_gpu_with_memory_node_id(va_space, dst_node_id) != NULL)
|
||||
return NV_ERR_INVALID_ARGUMENT;
|
||||
}
|
||||
else {
|
||||
// Incoming dst_node_id is only valid if dst_id belongs to the CPU. Use
|
||||
// dst_node_id as the GPU node id if dst_id doesn't belong to the CPU.
|
||||
uvm_migrate_args->dst_node_id = uvm_gpu_numa_info(uvm_va_space_get_gpu(va_space, dst_id))->node_id;
|
||||
uvm_migrate_args->dst_node_id = uvm_gpu_numa_node(uvm_va_space_get_gpu(va_space, dst_id));
|
||||
}
|
||||
|
||||
state = kmem_cache_alloc(g_uvm_migrate_vma_state_cache, NV_UVM_GFP_FLAGS);
|
||||
|
||||
@@ -58,11 +58,13 @@ typedef struct
|
||||
#ifdef UVM_MIGRATE_VMA_SUPPORTED
|
||||
#include <linux/migrate.h>
|
||||
|
||||
// The calls to migrate_vma are capped at 32MB to set an upper bound on the
|
||||
// The calls to migrate_vma are capped at 512 pages to set an upper bound on the
|
||||
// amount of metadata that needs to be allocated for the operation. This number
|
||||
// was chosen because performance seems to plateau at this size.
|
||||
#define UVM_MIGRATE_VMA_MAX_SIZE (32UL * 1024 * 1024)
|
||||
#define UVM_MIGRATE_VMA_MAX_PAGES (UVM_MIGRATE_VMA_MAX_SIZE >> PAGE_SHIFT)
|
||||
// was chosen because performance seems to plateau at this size on 64K-pages
|
||||
// kernels. On kernels with PAGE_SIZE == 4K, 512 pages correspond to a 2MB VA block,
|
||||
// which is also a standard size for batch operations.
|
||||
#define UVM_MIGRATE_VMA_MAX_PAGES (512UL)
|
||||
#define UVM_MIGRATE_VMA_MAX_SIZE (UVM_MIGRATE_VMA_MAX_PAGES * PAGE_SIZE)
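// For reference: with PAGE_SIZE == 4K this caps each migrate_vma call at
// 512 * 4K = 2MB, and with 64K pages at 512 * 64K = 32MB, which matches the
// previous fixed 32MB cap on 64K-page kernels.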
|
||||
|
||||
typedef struct
|
||||
{
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2015-2022 NVIDIA Corporation
|
||||
Copyright (c) 2015-2023 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
@@ -34,6 +34,7 @@
|
||||
#include "uvm_mem.h"
|
||||
#include "uvm_va_space.h"
|
||||
|
||||
#include <linux/mm.h>
|
||||
|
||||
// The page tree has 6 levels on Hopper+ GPUs, and the root is never freed by a
|
||||
// normal 'put' operation which leaves a maximum of 5 levels.
|
||||
@@ -102,7 +103,7 @@ static NV_STATUS phys_mem_allocate_sysmem(uvm_page_tree_t *tree, NvLength size,
|
||||
NvU64 dma_addr;
|
||||
unsigned long flags = __GFP_ZERO;
|
||||
uvm_memcg_context_t memcg_context;
|
||||
uvm_va_space_t *va_space;
|
||||
uvm_va_space_t *va_space = NULL;
|
||||
struct mm_struct *mm = NULL;
|
||||
|
||||
if (tree->type == UVM_PAGE_TREE_TYPE_USER && tree->gpu_va_space && UVM_CGROUP_ACCOUNTING_SUPPORTED()) {
|
||||
@@ -244,6 +245,84 @@ static void page_table_range_init(uvm_page_table_range_t *range,
|
||||
dir->ref_count += range->entry_count;
|
||||
}
|
||||
|
||||
static bool uvm_mmu_use_cpu(uvm_page_tree_t *tree)
|
||||
{
|
||||
// When physical CE writes can't be used for vidmem we use a flat virtual
|
||||
// mapping instead. The GPU PTEs for that flat mapping have to be
|
||||
// bootstrapped using the CPU.
|
||||
return tree->location != UVM_APERTURE_SYS &&
|
||||
!tree->gpu->parent->ce_phys_vidmem_write_supported &&
|
||||
!tree->gpu->static_flat_mapping.ready;
|
||||
}
|
||||
|
||||
// uvm_mmu_page_table_page() and the uvm_mmu_page_table_cpu_* family of
|
||||
// functions can only be used when uvm_mmu_use_cpu() returns true, which implies
|
||||
// a coherent system.
|
||||
|
||||
static struct page *uvm_mmu_page_table_page(uvm_gpu_t *gpu, uvm_mmu_page_table_alloc_t *phys_alloc)
|
||||
{
|
||||
// All platforms that require CPU PTE writes for bootstrapping can fit
|
||||
// tables within a page.
|
||||
UVM_ASSERT(phys_alloc->size <= PAGE_SIZE);
|
||||
|
||||
if (phys_alloc->addr.aperture == UVM_APERTURE_SYS)
|
||||
return phys_alloc->handle.page;
|
||||
|
||||
return uvm_gpu_chunk_to_page(&gpu->pmm, phys_alloc->handle.chunk);
|
||||
}
|
||||
|
||||
static void *uvm_mmu_page_table_cpu_map(uvm_gpu_t *gpu, uvm_mmu_page_table_alloc_t *phys_alloc)
|
||||
{
|
||||
struct page *page = uvm_mmu_page_table_page(gpu, phys_alloc);
|
||||
NvU64 page_offset = offset_in_page(phys_alloc->addr.address);
|
||||
return (char *)kmap(page) + page_offset;
|
||||
}
|
||||
|
||||
static void uvm_mmu_page_table_cpu_unmap(uvm_gpu_t *gpu, uvm_mmu_page_table_alloc_t *phys_alloc)
|
||||
{
|
||||
kunmap(uvm_mmu_page_table_page(gpu, phys_alloc));
|
||||
}
|
||||
|
||||
static void uvm_mmu_page_table_cpu_memset_8(uvm_gpu_t *gpu,
|
||||
uvm_mmu_page_table_alloc_t *phys_alloc,
|
||||
NvU32 start_index,
|
||||
NvU64 pattern,
|
||||
NvU32 num_entries)
|
||||
{
|
||||
NvU64 *ptr = uvm_mmu_page_table_cpu_map(gpu, phys_alloc);
|
||||
size_t i;
|
||||
|
||||
UVM_ASSERT(IS_ALIGNED((uintptr_t)ptr, sizeof(*ptr)));
|
||||
UVM_ASSERT((start_index + num_entries) * sizeof(*ptr) <= phys_alloc->size);
|
||||
|
||||
for (i = 0; i < num_entries; i++)
|
||||
ptr[start_index + i] = pattern;
|
||||
|
||||
uvm_mmu_page_table_cpu_unmap(gpu, phys_alloc);
|
||||
}
|
||||
|
||||
static void uvm_mmu_page_table_cpu_memset_16(uvm_gpu_t *gpu,
|
||||
uvm_mmu_page_table_alloc_t *phys_alloc,
|
||||
NvU32 start_index,
|
||||
NvU64 *pattern,
|
||||
NvU32 num_entries)
|
||||
{
|
||||
struct
|
||||
{
|
||||
NvU64 u0, u1;
|
||||
} *ptr;
|
||||
size_t i;
|
||||
|
||||
ptr = uvm_mmu_page_table_cpu_map(gpu, phys_alloc);
|
||||
UVM_ASSERT(IS_ALIGNED((uintptr_t)ptr, sizeof(*ptr)));
|
||||
UVM_ASSERT((start_index + num_entries) * sizeof(*ptr) <= phys_alloc->size);
|
||||
|
||||
for (i = 0; i < num_entries; i++)
|
||||
memcpy(&ptr[start_index + i], pattern, sizeof(*ptr));
|
||||
|
||||
uvm_mmu_page_table_cpu_unmap(gpu, phys_alloc);
|
||||
}
|
||||
|
||||
static void phys_mem_init(uvm_page_tree_t *tree, NvU32 page_size, uvm_page_directory_t *dir, uvm_push_t *push)
|
||||
{
|
||||
NvU64 clear_bits[2];
|
||||
@@ -262,10 +341,19 @@ static void phys_mem_init(uvm_page_tree_t *tree, NvU32 page_size, uvm_page_direc
|
||||
}
|
||||
|
||||
// initialize the memory to a reasonable value
|
||||
tree->gpu->parent->ce_hal->memset_8(push,
|
||||
uvm_gpu_address_from_phys(dir->phys_alloc.addr),
|
||||
if (push) {
|
||||
tree->gpu->parent->ce_hal->memset_8(push,
|
||||
uvm_mmu_gpu_address(tree->gpu, dir->phys_alloc.addr),
|
||||
*clear_bits,
|
||||
dir->phys_alloc.size);
|
||||
}
|
||||
else {
|
||||
uvm_mmu_page_table_cpu_memset_8(tree->gpu,
|
||||
&dir->phys_alloc,
|
||||
0,
|
||||
*clear_bits,
|
||||
dir->phys_alloc.size);
|
||||
dir->phys_alloc.size / sizeof(*clear_bits));
|
||||
}
|
||||
}
|
||||
|
||||
static uvm_page_directory_t *allocate_directory(uvm_page_tree_t *tree,
|
||||
@@ -321,25 +409,44 @@ static inline NvU32 index_to_entry(uvm_mmu_mode_hal_t *hal, NvU32 entry_index, N
|
||||
return hal->entries_per_index(depth) * entry_index + hal->entry_offset(depth, page_size);
|
||||
}
|
||||
|
||||
// pde_fill() populates pde_count PDE entries (starting at start_index) with
|
||||
// the same mapping, i.e., with the same physical address (phys_addr).
|
||||
static void pde_fill(uvm_page_tree_t *tree,
|
||||
NvU32 depth,
|
||||
uvm_mmu_page_table_alloc_t *directory,
|
||||
NvU32 start_index,
|
||||
NvU32 pde_count,
|
||||
uvm_mmu_page_table_alloc_t **phys_addr,
|
||||
uvm_push_t *push)
|
||||
static void pde_fill_cpu(uvm_page_tree_t *tree,
|
||||
NvU32 depth,
|
||||
uvm_mmu_page_table_alloc_t *directory,
|
||||
NvU32 start_index,
|
||||
NvU32 pde_count,
|
||||
uvm_mmu_page_table_alloc_t **phys_addr)
|
||||
{
|
||||
NvU64 pde_data[2], entry_size;
|
||||
uvm_gpu_address_t pde_entry_addr;
|
||||
|
||||
UVM_ASSERT(start_index + pde_count <= uvm_mmu_page_tree_entries(tree, depth, UVM_PAGE_SIZE_AGNOSTIC));
|
||||
UVM_ASSERT(uvm_mmu_use_cpu(tree));
|
||||
entry_size = tree->hal->entry_size(depth);
|
||||
UVM_ASSERT(sizeof(pde_data) >= entry_size);
|
||||
|
||||
tree->hal->make_pde(pde_data, phys_addr, depth);
|
||||
|
||||
if (entry_size == sizeof(pde_data[0]))
|
||||
uvm_mmu_page_table_cpu_memset_8(tree->gpu, directory, start_index, pde_data[0], pde_count);
|
||||
else
|
||||
uvm_mmu_page_table_cpu_memset_16(tree->gpu, directory, start_index, pde_data, pde_count);
|
||||
}
|
||||
|
||||
static void pde_fill_gpu(uvm_page_tree_t *tree,
|
||||
NvU32 depth,
|
||||
uvm_mmu_page_table_alloc_t *directory,
|
||||
NvU32 start_index,
|
||||
NvU32 pde_count,
|
||||
uvm_mmu_page_table_alloc_t **phys_addr,
|
||||
uvm_push_t *push)
|
||||
{
|
||||
NvU64 pde_data[2], entry_size;
|
||||
uvm_gpu_address_t pde_entry_addr = uvm_mmu_gpu_address(tree->gpu, directory->addr);
|
||||
|
||||
UVM_ASSERT(!uvm_mmu_use_cpu(tree));
|
||||
|
||||
entry_size = tree->hal->entry_size(depth);
|
||||
UVM_ASSERT(sizeof(pde_data) >= entry_size);
|
||||
|
||||
tree->hal->make_pde(pde_data, phys_addr, depth);
|
||||
pde_entry_addr = uvm_gpu_address_from_phys(directory->addr);
|
||||
pde_entry_addr.address += start_index * entry_size;
|
||||
|
||||
if (entry_size == sizeof(pde_data[0])) {
|
||||
@@ -386,6 +493,24 @@ static void pde_fill(uvm_page_tree_t *tree,
|
||||
}
|
||||
}
|
||||
|
||||
// pde_fill() populates pde_count PDE entries (starting at start_index) with
|
||||
// the same mapping, i.e., with the same physical address (phys_addr).
|
||||
static void pde_fill(uvm_page_tree_t *tree,
|
||||
NvU32 depth,
|
||||
uvm_mmu_page_table_alloc_t *directory,
|
||||
NvU32 start_index,
|
||||
NvU32 pde_count,
|
||||
uvm_mmu_page_table_alloc_t **phys_addr,
|
||||
uvm_push_t *push)
|
||||
{
|
||||
UVM_ASSERT(start_index + pde_count <= uvm_mmu_page_tree_entries(tree, depth, UVM_PAGE_SIZE_AGNOSTIC));
|
||||
|
||||
if (push)
|
||||
pde_fill_gpu(tree, depth, directory, start_index, pde_count, phys_addr, push);
|
||||
else
|
||||
pde_fill_cpu(tree, depth, directory, start_index, pde_count, phys_addr);
|
||||
}
|
||||
|
||||
static uvm_page_directory_t *host_pde_write(uvm_page_directory_t *dir,
|
||||
uvm_page_directory_t *parent,
|
||||
NvU32 index_in_parent)
|
||||
@@ -426,7 +551,11 @@ static void host_pde_clear(uvm_page_tree_t *tree, uvm_page_directory_t *dir, NvU
|
||||
dir->ref_count--;
|
||||
}
|
||||
|
||||
static void pde_clear(uvm_page_tree_t *tree, uvm_page_directory_t *dir, NvU32 entry_index, NvU32 page_size, void *push)
|
||||
static void pde_clear(uvm_page_tree_t *tree,
|
||||
uvm_page_directory_t *dir,
|
||||
NvU32 entry_index,
|
||||
NvU32 page_size,
|
||||
uvm_push_t *push)
|
||||
{
|
||||
host_pde_clear(tree, dir, entry_index, page_size);
|
||||
pde_write(tree, dir, entry_index, false, push);
|
||||
@@ -492,12 +621,62 @@ static NV_STATUS page_tree_end_and_wait(uvm_page_tree_t *tree, uvm_push_t *push)
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
// initialize new page tables and insert them into the tree
|
||||
static NV_STATUS write_gpu_state(uvm_page_tree_t *tree,
|
||||
NvU32 page_size,
|
||||
NvS32 invalidate_depth,
|
||||
NvU32 used_count,
|
||||
uvm_page_directory_t **dirs_used)
|
||||
static NV_STATUS write_gpu_state_cpu(uvm_page_tree_t *tree,
|
||||
NvU32 page_size,
|
||||
NvS32 invalidate_depth,
|
||||
NvU32 used_count,
|
||||
uvm_page_directory_t **dirs_used)
|
||||
{
|
||||
NvS32 i;
|
||||
uvm_push_t push;
|
||||
NV_STATUS status;
|
||||
|
||||
uvm_assert_mutex_locked(&tree->lock);
|
||||
UVM_ASSERT(uvm_mmu_use_cpu(tree));
|
||||
|
||||
if (used_count == 0)
|
||||
return NV_OK;
|
||||
|
||||
status = uvm_tracker_wait(&tree->tracker);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
for (i = 0; i < used_count; i++)
|
||||
phys_mem_init(tree, page_size, dirs_used[i], NULL);
|
||||
|
||||
// Only a single membar is needed between the memsets of the page tables
|
||||
// and the writes of the PDEs pointing to those page tables.
|
||||
mb();
|
||||
|
||||
// write entries bottom up, so that they are valid once they're inserted
|
||||
// into the tree
|
||||
for (i = used_count - 1; i >= 0; i--)
|
||||
pde_write(tree, dirs_used[i]->host_parent, dirs_used[i]->index_in_parent, false, NULL);
|
||||
|
||||
// A CPU membar is needed between the PDE writes and the subsequent TLB
|
||||
// invalidate. Work submission guarantees such a membar.
|
||||
status = page_tree_begin_acquire(tree, &tree->tracker, &push, "%u dirs", used_count);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
UVM_ASSERT(invalidate_depth >= 0);
|
||||
|
||||
// See the comments in write_gpu_state_gpu()
|
||||
tree->gpu->parent->host_hal->tlb_invalidate_all(&push,
|
||||
uvm_page_tree_pdb(tree)->addr,
|
||||
invalidate_depth,
|
||||
UVM_MEMBAR_NONE);
|
||||
page_tree_end(tree, &push);
|
||||
page_tree_tracker_overwrite_with_push(tree, &push);
|
||||
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
static NV_STATUS write_gpu_state_gpu(uvm_page_tree_t *tree,
|
||||
NvU32 page_size,
|
||||
NvS32 invalidate_depth,
|
||||
NvU32 used_count,
|
||||
uvm_page_directory_t **dirs_used)
|
||||
{
|
||||
NvS32 i;
|
||||
uvm_push_t push;
|
||||
@@ -508,11 +687,12 @@ static NV_STATUS write_gpu_state(uvm_page_tree_t *tree,
|
||||
uvm_membar_t membar_after_writes = UVM_MEMBAR_GPU;
|
||||
|
||||
uvm_assert_mutex_locked(&tree->lock);
|
||||
UVM_ASSERT(!uvm_mmu_use_cpu(tree));
|
||||
|
||||
if (used_count == 0)
|
||||
return NV_OK;
|
||||
|
||||
status = page_tree_begin_acquire(tree, &tree->tracker, &push, "write_gpu_state: %u dirs", used_count);
|
||||
status = page_tree_begin_acquire(tree, &tree->tracker, &push, "%u dirs", used_count);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
@@ -533,15 +713,15 @@ static NV_STATUS write_gpu_state(uvm_page_tree_t *tree,
|
||||
|
||||
// Only a single membar is needed between the memsets of the page tables
|
||||
// and the writes of the PDEs pointing to those page tables.
|
||||
// The membar can be local if all of the page tables and PDEs are in GPU memory,
|
||||
// but must be a sysmembar if any of them are in sysmem.
|
||||
tree->gpu->parent->host_hal->wait_for_idle(&push);
|
||||
uvm_hal_membar(tree->gpu, &push, membar_after_writes);
|
||||
// The membar can be local if all of the page tables and PDEs are in GPU
|
||||
// memory, but must be a sysmembar if any of them are in sysmem.
|
||||
uvm_hal_wfi_membar(&push, membar_after_writes);
|
||||
|
||||
// Reset back to a local membar by default
|
||||
membar_after_writes = UVM_MEMBAR_GPU;
|
||||
|
||||
// write entries bottom up, so that they are valid once they're inserted into the tree
|
||||
// write entries bottom up, so that they are valid once they're inserted
|
||||
// into the tree
|
||||
for (i = used_count - 1; i >= 0; i--) {
|
||||
uvm_page_directory_t *dir = dirs_used[i];
|
||||
|
||||
@@ -553,18 +733,19 @@ static NV_STATUS write_gpu_state(uvm_page_tree_t *tree,
|
||||
|
||||
// If any of the written PDEs is in sysmem, a sysmembar is needed before
|
||||
// the TLB invalidate.
|
||||
// Notably sysmembar is needed even though the writer (CE) and reader (MMU) are
|
||||
// on the same GPU, because CE physical writes take the L2 bypass path.
|
||||
// Notably sysmembar is needed even though the writer (CE) and reader
|
||||
// (MMU) are on the same GPU, because CE physical writes take the L2
|
||||
// bypass path.
|
||||
if (dir->host_parent->phys_alloc.addr.aperture == UVM_APERTURE_SYS)
|
||||
membar_after_writes = UVM_MEMBAR_SYS;
|
||||
}
|
||||
|
||||
tree->gpu->parent->host_hal->wait_for_idle(&push);
|
||||
uvm_hal_membar(tree->gpu, &push, membar_after_writes);
|
||||
uvm_hal_wfi_membar(&push, membar_after_writes);
|
||||
|
||||
UVM_ASSERT(invalidate_depth >= 0);
|
||||
|
||||
// Upgrades don't have to flush out accesses, so no membar is needed on the TLB invalidate.
|
||||
// Upgrades don't have to flush out accesses, so no membar is needed on the
|
||||
// TLB invalidate.
|
||||
tree->gpu->parent->host_hal->tlb_invalidate_all(&push,
|
||||
uvm_page_tree_pdb(tree)->addr,
|
||||
invalidate_depth,
|
||||
@@ -582,6 +763,19 @@ static NV_STATUS write_gpu_state(uvm_page_tree_t *tree,
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
// initialize new page tables and insert them into the tree
|
||||
static NV_STATUS write_gpu_state(uvm_page_tree_t *tree,
|
||||
NvU32 page_size,
|
||||
NvS32 invalidate_depth,
|
||||
NvU32 used_count,
|
||||
uvm_page_directory_t **dirs_used)
|
||||
{
|
||||
if (uvm_mmu_use_cpu(tree))
|
||||
return write_gpu_state_cpu(tree, page_size, invalidate_depth, used_count, dirs_used);
|
||||
else
|
||||
return write_gpu_state_gpu(tree, page_size, invalidate_depth, used_count, dirs_used);
|
||||
}
|
||||
|
||||
static void free_unused_directories(uvm_page_tree_t *tree,
|
||||
NvU32 used_count,
|
||||
uvm_page_directory_t **dirs_used,
|
||||
@@ -633,6 +827,8 @@ static NV_STATUS map_remap_init(uvm_page_tree_t *tree)
|
||||
uvm_pte_batch_t batch;
|
||||
NvU32 entry_size;
|
||||
|
||||
UVM_ASSERT(!uvm_mmu_use_cpu(tree));
|
||||
|
||||
// Allocate the ptes_invalid_4k.
|
||||
status = allocate_page_table(tree, UVM_PAGE_SIZE_4K, &tree->map_remap.ptes_invalid_4k);
|
||||
if (status != NV_OK)
|
||||
@@ -718,6 +914,9 @@ error:
|
||||
//
|
||||
// In SR-IOV heavy the page tree must be in vidmem, to prevent guest drivers
|
||||
// from updating GPU page tables without hypervisor knowledge.
|
||||
// When the Confidential Computing feature is enabled, all kernel
|
||||
// allocations must be made in the CPR of vidmem. This is a hardware security
|
||||
// constraint.
|
||||
// Inputs Outputs
|
||||
// init location | uvm_page_table_location || tree->location | tree->location_sys_fallback
|
||||
// -------------|-------------------------||----------------|----------------
|
||||
@@ -734,7 +933,8 @@ static void page_tree_set_location(uvm_page_tree_t *tree, uvm_aperture_t locatio
|
||||
(location == UVM_APERTURE_DEFAULT),
|
||||
"Invalid location %s (%d)\n", uvm_aperture_string(location), (int)location);
|
||||
|
||||
should_location_be_vidmem = uvm_gpu_is_virt_mode_sriov_heavy(tree->gpu);
|
||||
should_location_be_vidmem = uvm_gpu_is_virt_mode_sriov_heavy(tree->gpu)
|
||||
|| uvm_conf_computing_mode_enabled(tree->gpu);
|
||||
|
||||
// The page tree of a "fake" GPU used during page tree testing can be in
|
||||
// sysmem even if should_location_be_vidmem is true. A fake GPU can be
|
||||
@@ -798,6 +998,11 @@ NV_STATUS uvm_page_tree_init(uvm_gpu_t *gpu,
|
||||
return status;
|
||||
}
|
||||
|
||||
if (uvm_mmu_use_cpu(tree)) {
|
||||
phys_mem_init(tree, UVM_PAGE_SIZE_AGNOSTIC, tree->root, NULL);
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
status = page_tree_begin_acquire(tree, &tree->tracker, &push, "init page tree");
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
@@ -858,7 +1063,7 @@ void uvm_page_tree_put_ptes_async(uvm_page_tree_t *tree, uvm_page_table_range_t
    uvm_page_directory_t *free_queue[MAX_OPERATION_DEPTH];
    uvm_page_directory_t *dir = range->table;
    uvm_push_t push;
    NV_STATUS status;
    NV_STATUS status = NV_OK;
    NvU32 invalidate_depth = 0;

    // The logic of what membar is needed when is pretty subtle, please refer to
@@ -880,34 +1085,58 @@ void uvm_page_tree_put_ptes_async(uvm_page_tree_t *tree, uvm_page_table_range_t
        uvm_membar_t this_membar;

        if (free_count == 0) {
            if (uvm_mmu_use_cpu(tree))
                status = uvm_tracker_wait(&tree->tracker);

            // begin a push which will be submitted before the memory gets freed
            status = page_tree_begin_acquire(tree, &tree->tracker, &push, "put ptes: start: %u, count: %u",
                                             range->start_index, range->entry_count);
            // Failure to get a push can only happen if we've hit a fatal UVM
            // channel error. We can't perform the unmap, so just leave things
            // in place for debug.
            if (status == NV_OK) {
                // Begin a push which will be submitted before the memory gets
                // freed.
                //
                // When writing with the CPU we don't strictly need to begin
                // this push until after the writes are done, but doing it here
                // doesn't hurt and makes the function's logic simpler.
                status = page_tree_begin_acquire(tree,
                                                 &tree->tracker,
                                                 &push,
                                                 "put ptes: start: %u, count: %u",
                                                 range->start_index,
                                                 range->entry_count);
            }

            // Failure to wait for a tracker or get a push can only happen if
            // we've hit a fatal UVM channel error. We can't perform the unmap,
            // so just leave things in place for debug.
            if (status != NV_OK) {
                UVM_ASSERT(status == uvm_global_get_status());
                dir->ref_count += range->entry_count;
                uvm_mutex_unlock(&tree->lock);
                return;
                goto done;
            }
        }

        // All writes can be pipelined as put_ptes() cannot be called with any
        // operations pending on the affected PTEs and PDEs.
        uvm_push_set_flag(&push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
        if (uvm_mmu_use_cpu(tree)) {
            pde_clear(tree, dir->host_parent, dir->index_in_parent, range->page_size, NULL);
        }
        else {
            // All writes can be pipelined as put_ptes() cannot be called with
            // any operations pending on the affected PTEs and PDEs.
            uvm_push_set_flag(&push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);

            // Don't issue any membars as part of the clear, a single membar will be
            // done below before the invalidate.
            uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
            pde_clear(tree, dir->host_parent, dir->index_in_parent, range->page_size, &push);
            // Don't issue any membars as part of the clear, a single membar
            // will be done below before the invalidate.
            uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
            pde_clear(tree, dir->host_parent, dir->index_in_parent, range->page_size, &push);
        }

        invalidate_depth = dir->host_parent->depth;

        // Take the membar with the widest scope of any of the pointed-to PDEs
        this_membar = uvm_hal_downgrade_membar_type(tree->gpu, dir->phys_alloc.addr.aperture == UVM_APERTURE_VID);
        // If we're using the CPU to do the write a SYS membar is required.
        // Otherwise, take the membar with the widest scope of any of the
        // pointed-to PDEs.
        if (uvm_mmu_use_cpu(tree))
            this_membar = UVM_MEMBAR_SYS;
        else
            this_membar = uvm_hal_downgrade_membar_type(tree->gpu, dir->phys_alloc.addr.aperture == UVM_APERTURE_VID);

        membar_after_invalidate = max(membar_after_invalidate, this_membar);

        // If any of the cleared PDEs were in sysmem then a SYS membar is
@@ -923,23 +1152,28 @@ void uvm_page_tree_put_ptes_async(uvm_page_tree_t *tree, uvm_page_table_range_t
        dir = parent;
    }

    if (free_count == 0) {
        uvm_mutex_unlock(&tree->lock);
        return;
    }
    if (free_count == 0)
        goto done;

    if (uvm_mmu_use_cpu(tree))
        mb();
    else
        uvm_hal_wfi_membar(&push, membar_after_pde_clears);

    tree->gpu->parent->host_hal->wait_for_idle(&push);
    uvm_hal_membar(tree->gpu, &push, membar_after_pde_clears);
    tree->gpu->parent->host_hal->tlb_invalidate_all(&push,
                                                    uvm_page_tree_pdb(tree)->addr,
                                                    invalidate_depth,
                                                    membar_after_invalidate);

    // We just did the appropriate membar above, no need for another one in push_end().
    // At least currently as if the L2 bypass path changes to only require a GPU
    // membar between PDE write and TLB invalidate, we'll need to push a
    // sysmembar so the end-of-push semaphore is ordered behind the PDE writes.
    uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
    if (!uvm_mmu_use_cpu(tree)) {
        // We just did the appropriate membar above, no need for another one in
        // push_end(). If the L2 bypass path changes to only require a GPU
        // membar between PDE write and TLB invalidate, we'll need to push a
        // sysmembar so the end-of-push semaphore is ordered behind the PDE
        // writes.
        uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
    }

    page_tree_end(tree, &push);
    page_tree_tracker_overwrite_with_push(tree, &push);

@@ -949,6 +1183,7 @@ void uvm_page_tree_put_ptes_async(uvm_page_tree_t *tree, uvm_page_table_range_t
        uvm_kvfree(free_queue[i]);
    }

done:
    uvm_mutex_unlock(&tree->lock);
}

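The rewritten put_ptes path above replaces its early returns with goto done so the tree lock is released in exactly one place. A generic sketch of that single-exit pattern, using userspace pthread locking and hypothetical example_* names (not driver code), looks like this:

#include <pthread.h>

// Single unlock point via "goto done", as in the hunk above. Illustrative only.
static int example_update(pthread_mutex_t *lock, int *value, int delta)
{
    int status = 0;

    pthread_mutex_lock(lock);

    if (delta == 0) {
        status = -1;    // nothing to do, but still exit through "done"
        goto done;
    }

    *value += delta;

done:
    pthread_mutex_unlock(lock);
    return status;
}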
@@ -1255,19 +1490,22 @@ static NV_STATUS poison_ptes(uvm_page_tree_t *tree,

    UVM_ASSERT(pte_dir->depth == tree->hal->page_table_depth(page_size));

    // The flat mappings should always be set up when executing this path
    UVM_ASSERT(!uvm_mmu_use_cpu(tree));

    status = page_tree_begin_acquire(tree, &tree->tracker, &push, "Poisoning child table of page size %u", page_size);
    if (status != NV_OK)
        return status;

    tree->gpu->parent->ce_hal->memset_8(&push,
                                        uvm_gpu_address_from_phys(pte_dir->phys_alloc.addr),
                                        uvm_mmu_gpu_address(tree->gpu, pte_dir->phys_alloc.addr),
                                        tree->hal->poisoned_pte(),
                                        pte_dir->phys_alloc.size);

    // If both the new PTEs and the parent PDE are in vidmem, then a GPU-
    // local membar is enough to keep the memset of the PTEs ordered with
    // any later write of the PDE. Otherwise we need a sysmembar. See the
    // comments in write_gpu_state.
    // comments in write_gpu_state_gpu.
    if (pte_dir->phys_alloc.addr.aperture == UVM_APERTURE_VID &&
        parent->phys_alloc.addr.aperture == UVM_APERTURE_VID)
        uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU);
@@ -1527,7 +1765,43 @@ NV_STATUS uvm_page_table_range_vec_split_upper(uvm_page_table_range_vec_t *range
    return NV_OK;
}

NV_STATUS uvm_page_table_range_vec_clear_ptes(uvm_page_table_range_vec_t *range_vec, uvm_membar_t tlb_membar)
static NV_STATUS uvm_page_table_range_vec_clear_ptes_cpu(uvm_page_table_range_vec_t *range_vec, uvm_membar_t tlb_membar)
{
    uvm_page_tree_t *tree = range_vec->tree;
    NvU32 entry_size = uvm_mmu_pte_size(tree, range_vec->page_size);
    NvU64 invalid_ptes[2] = {0, 0};
    uvm_push_t push;
    NV_STATUS status;
    size_t i;

    UVM_ASSERT(uvm_mmu_use_cpu(tree));

    for (i = 0; i < range_vec->range_count; ++i) {
        uvm_page_table_range_t *range = &range_vec->ranges[i];
        uvm_mmu_page_table_alloc_t *dir = &range->table->phys_alloc;

        if (entry_size == 8)
            uvm_mmu_page_table_cpu_memset_8(tree->gpu, dir, range->start_index, invalid_ptes[0], range->entry_count);
        else
            uvm_mmu_page_table_cpu_memset_16(tree->gpu, dir, range->start_index, invalid_ptes, range->entry_count);
    }

    // A CPU membar is needed between the PTE writes and the subsequent TLB
    // invalidate. Work submission guarantees such a membar.
    status = page_tree_begin_acquire(tree,
                                     NULL,
                                     &push,
                                     "Invalidating [0x%llx, 0x%llx)",
                                     range_vec->start,
                                     range_vec->start + range_vec->size);
    if (status != NV_OK)
        return status;

    uvm_tlb_batch_single_invalidate(tree, &push, range_vec->start, range_vec->size, range_vec->page_size, tlb_membar);
    return page_tree_end_and_wait(tree, &push);
}

static NV_STATUS uvm_page_table_range_vec_clear_ptes_gpu(uvm_page_table_range_vec_t *range_vec, uvm_membar_t tlb_membar)
{
    NV_STATUS status = NV_OK;
    NV_STATUS tracker_status;
@@ -1545,6 +1819,7 @@ NV_STATUS uvm_page_table_range_vec_clear_ptes(uvm_page_table_range_vec_t *range_
    UVM_ASSERT(range_vec);
    UVM_ASSERT(tree);
    UVM_ASSERT(gpu);
    UVM_ASSERT(!uvm_mmu_use_cpu(tree));

    i = 0;
    while (i < range_vec->range_count) {
@@ -1595,6 +1870,14 @@ done:
    return status;
}

NV_STATUS uvm_page_table_range_vec_clear_ptes(uvm_page_table_range_vec_t *range_vec, uvm_membar_t tlb_membar)
{
    if (uvm_mmu_use_cpu(range_vec->tree))
        return uvm_page_table_range_vec_clear_ptes_cpu(range_vec, tlb_membar);
    else
        return uvm_page_table_range_vec_clear_ptes_gpu(range_vec, tlb_membar);
}

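uvm_page_table_range_vec_clear_ptes() is now a thin dispatcher that routes to the CPU or GPU backend based on uvm_mmu_use_cpu(), so callers never pick a backend themselves. A stripped-down sketch of the same shape, with hypothetical example_* names rather than driver symbols:

typedef int example_status_t;

typedef struct {
    int use_cpu;    // stand-in for the uvm_mmu_use_cpu(tree) predicate
} example_tree_t;

// Backend stubs; a real implementation would write PTEs here.
static example_status_t example_clear_cpu(example_tree_t *tree) { (void)tree; return 0; }
static example_status_t example_clear_gpu(example_tree_t *tree) { (void)tree; return 0; }

// Public entry point keeps a single signature and hides the backend choice.
static example_status_t example_clear(example_tree_t *tree)
{
    if (tree->use_cpu)
        return example_clear_cpu(tree);
    else
        return example_clear_gpu(tree);
}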
void uvm_page_table_range_vec_deinit(uvm_page_table_range_vec_t *range_vec)
{
    size_t i;
@@ -1626,10 +1909,58 @@ void uvm_page_table_range_vec_destroy(uvm_page_table_range_vec_t *range_vec)
    uvm_kvfree(range_vec);
}

NV_STATUS uvm_page_table_range_vec_write_ptes(uvm_page_table_range_vec_t *range_vec,
                                              uvm_membar_t tlb_membar,
                                              uvm_page_table_range_pte_maker_t pte_maker,
                                              void *caller_data)
static NV_STATUS uvm_page_table_range_vec_write_ptes_cpu(uvm_page_table_range_vec_t *range_vec,
                                                         uvm_membar_t tlb_membar,
                                                         uvm_page_table_range_pte_maker_t pte_maker,
                                                         void *caller_data)
{
    NV_STATUS status;
    size_t i;
    uvm_page_tree_t *tree = range_vec->tree;
    NvU32 entry_size = uvm_mmu_pte_size(tree, range_vec->page_size);
    uvm_push_t push;
    NvU64 offset = 0;

    UVM_ASSERT(uvm_mmu_use_cpu(tree));

    // Enforce ordering with prior accesses to the pages being mapped before the
    // mappings are activated.
    mb();

    for (i = 0; i < range_vec->range_count; ++i) {
        uvm_page_table_range_t *range = &range_vec->ranges[i];
        uvm_mmu_page_table_alloc_t *dir = &range->table->phys_alloc;
        NvU32 entry;

        for (entry = range->start_index; entry < range->entry_count; ++entry) {
            NvU64 pte_bits[2] = {pte_maker(range_vec, offset, caller_data), 0};

            if (entry_size == 8)
                uvm_mmu_page_table_cpu_memset_8(tree->gpu, dir, entry, pte_bits[0], 1);
            else
                uvm_mmu_page_table_cpu_memset_16(tree->gpu, dir, entry, pte_bits, 1);

            offset += range_vec->page_size;
        }
    }

    status = page_tree_begin_acquire(tree,
                                     NULL,
                                     &push,
                                     "Invalidating [0x%llx, 0x%llx)",
                                     range_vec->start,
                                     range_vec->start + range_vec->size);
    if (status != NV_OK)
        return status;

    uvm_tlb_batch_single_invalidate(tree, &push, range_vec->start, range_vec->size, range_vec->page_size, tlb_membar);
    return page_tree_end_and_wait(tree, &push);
}

static NV_STATUS uvm_page_table_range_vec_write_ptes_gpu(uvm_page_table_range_vec_t *range_vec,
                                                         uvm_membar_t tlb_membar,
                                                         uvm_page_table_range_pte_maker_t pte_maker,
                                                         void *caller_data)
{
    NV_STATUS status = NV_OK;
    NV_STATUS tracker_status;
@@ -1650,6 +1981,8 @@ NV_STATUS uvm_page_table_range_vec_write_ptes(uvm_page_table_range_vec_t *range_

    NvU32 max_entries_per_push = max_total_entry_size_per_push / entry_size;

    UVM_ASSERT(!uvm_mmu_use_cpu(tree));

    for (i = 0; i < range_vec->range_count; ++i) {
        uvm_page_table_range_t *range = &range_vec->ranges[i];
        NvU64 range_start = range_vec_calc_range_start(range_vec, i);
@@ -1728,6 +2061,17 @@ done:
    return status;
}

NV_STATUS uvm_page_table_range_vec_write_ptes(uvm_page_table_range_vec_t *range_vec,
                                              uvm_membar_t tlb_membar,
                                              uvm_page_table_range_pte_maker_t pte_maker,
                                              void *caller_data)
{
    if (uvm_mmu_use_cpu(range_vec->tree))
        return uvm_page_table_range_vec_write_ptes_cpu(range_vec, tlb_membar, pte_maker, caller_data);
    else
        return uvm_page_table_range_vec_write_ptes_gpu(range_vec, tlb_membar, pte_maker, caller_data);
}

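Both write_ptes backends above drive the same caller-supplied pte_maker callback: it is invoked once per page-sized offset and returns the PTE bits to write. A compact, self-contained sketch of that callback-driven fill follows; the example_* names are hypothetical and the returned values are not real PTE encodings.

#include <stdint.h>
#include <stddef.h>

// Callback invoked for each page-sized offset; returns the PTE value to write.
typedef uint64_t (*example_pte_maker_t)(uint64_t offset, void *caller_data);

static void example_fill_ptes(uint64_t *ptes,
                              size_t count,
                              uint64_t page_size,
                              example_pte_maker_t pte_maker,
                              void *caller_data)
{
    uint64_t offset = 0;
    size_t i;

    for (i = 0; i < count; i++) {
        ptes[i] = pte_maker(offset, caller_data);
        offset += page_size;
    }
}

// Example maker: identity-map a physical region starting at the base passed in
// caller_data, loosely following the identity_mapping_pte_maker() shape below.
static uint64_t example_identity_maker(uint64_t offset, void *caller_data)
{
    uint64_t phys_base = *(const uint64_t *)caller_data;
    return phys_base + offset;    // real PTEs also encode aperture/permission bits
}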
typedef struct identity_mapping_pte_maker_data_struct
|
||||
{
|
||||
NvU64 phys_offset;
|
||||
@@ -1745,13 +2089,12 @@ static NvU64 identity_mapping_pte_maker(uvm_page_table_range_vec_t *range_vec, N
|
||||
}
|
||||
|
||||
static NV_STATUS create_identity_mapping(uvm_gpu_t *gpu,
|
||||
NvU64 base,
|
||||
uvm_gpu_identity_mapping_t *mapping,
|
||||
NvU64 size,
|
||||
uvm_aperture_t aperture,
|
||||
NvU64 phys_offset,
|
||||
NvU32 page_size,
|
||||
uvm_pmm_alloc_flags_t pmm_flags,
|
||||
uvm_page_table_range_vec_t **range_vec)
|
||||
uvm_pmm_alloc_flags_t pmm_flags)
|
||||
{
|
||||
NV_STATUS status;
|
||||
identity_mapping_pte_maker_data_t data =
|
||||
@@ -1761,32 +2104,36 @@ static NV_STATUS create_identity_mapping(uvm_gpu_t *gpu,
|
||||
};
|
||||
|
||||
status = uvm_page_table_range_vec_create(&gpu->address_space_tree,
|
||||
base,
|
||||
mapping->base,
|
||||
size,
|
||||
page_size,
|
||||
pmm_flags,
|
||||
range_vec);
|
||||
&mapping->range_vec);
|
||||
if (status != NV_OK) {
|
||||
UVM_ERR_PRINT("Failed to init range vec for aperture %d identity mapping at [0x%llx, 0x%llx): %s, GPU %s\n",
|
||||
aperture,
|
||||
base,
|
||||
base + size,
|
||||
mapping->base,
|
||||
mapping->base + size,
|
||||
nvstatusToString(status),
|
||||
uvm_gpu_name(gpu));
|
||||
return status;
|
||||
}
|
||||
|
||||
status = uvm_page_table_range_vec_write_ptes(*range_vec, UVM_MEMBAR_NONE, identity_mapping_pte_maker, &data);
|
||||
status = uvm_page_table_range_vec_write_ptes(mapping->range_vec,
|
||||
UVM_MEMBAR_NONE,
|
||||
identity_mapping_pte_maker,
|
||||
&data);
|
||||
if (status != NV_OK) {
|
||||
UVM_ERR_PRINT("Failed to write PTEs for aperture %d identity mapping at [0x%llx, 0x%llx): %s, GPU %s\n",
|
||||
aperture,
|
||||
base,
|
||||
base + size,
|
||||
mapping->base,
|
||||
mapping->base + size,
|
||||
nvstatusToString(status),
|
||||
uvm_gpu_name(gpu));
|
||||
return status;
|
||||
}
|
||||
|
||||
mapping->ready = true;
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
@@ -1795,6 +2142,10 @@ static void destroy_identity_mapping(uvm_gpu_identity_mapping_t *mapping)
|
||||
if (mapping->range_vec == NULL)
|
||||
return;
|
||||
|
||||
// Tell the teardown routines they can't use this mapping as part of their
|
||||
// teardown.
|
||||
mapping->ready = false;
|
||||
|
||||
(void)uvm_page_table_range_vec_clear_ptes(mapping->range_vec, UVM_MEMBAR_SYS);
|
||||
uvm_page_table_range_vec_destroy(mapping->range_vec);
|
||||
mapping->range_vec = NULL;
|
||||
@@ -1802,7 +2153,7 @@ static void destroy_identity_mapping(uvm_gpu_identity_mapping_t *mapping)
|
||||
|
||||
bool uvm_mmu_gpu_needs_static_vidmem_mapping(uvm_gpu_t *gpu)
|
||||
{
|
||||
return false;
|
||||
return !gpu->parent->ce_phys_vidmem_write_supported;
|
||||
}
|
||||
|
||||
bool uvm_mmu_gpu_needs_dynamic_vidmem_mapping(uvm_gpu_t *gpu)
|
||||
@@ -1838,13 +2189,12 @@ NV_STATUS create_static_vidmem_mapping(uvm_gpu_t *gpu)
|
||||
flat_mapping->base = gpu->parent->flat_vidmem_va_base;
|
||||
|
||||
return create_identity_mapping(gpu,
|
||||
flat_mapping->base,
|
||||
flat_mapping,
|
||||
size,
|
||||
aperture,
|
||||
phys_offset,
|
||||
page_size,
|
||||
UVM_PMM_ALLOC_FLAGS_EVICT,
|
||||
&flat_mapping->range_vec);
|
||||
UVM_PMM_ALLOC_FLAGS_EVICT);
|
||||
}
|
||||
|
||||
static void destroy_static_vidmem_mapping(uvm_gpu_t *gpu)
|
||||
@@ -1884,13 +2234,12 @@ NV_STATUS uvm_mmu_create_peer_identity_mappings(uvm_gpu_t *gpu, uvm_gpu_t *peer)
|
||||
UVM_ASSERT(peer_mapping->base);
|
||||
|
||||
return create_identity_mapping(gpu,
|
||||
peer_mapping->base,
|
||||
peer_mapping,
|
||||
size,
|
||||
aperture,
|
||||
phys_offset,
|
||||
page_size,
|
||||
UVM_PMM_ALLOC_FLAGS_EVICT,
|
||||
&peer_mapping->range_vec);
|
||||
UVM_PMM_ALLOC_FLAGS_EVICT);
|
||||
}
|
||||
|
||||
void uvm_mmu_destroy_peer_identity_mappings(uvm_gpu_t *gpu, uvm_gpu_t *peer)
|
||||
@@ -2304,14 +2653,14 @@ static NV_STATUS create_dynamic_sysmem_mapping(uvm_gpu_t *gpu)
|
||||
// SR-IOV each mapping addition adds a lot of overhead due to vGPU plugin
|
||||
// involvement), metadata memory footprint (inversely proportional to the
|
||||
// mapping size), etc.
|
||||
mapping_size = 4ULL * 1024 * 1024 * 1024;
|
||||
mapping_size = 4 * UVM_SIZE_1GB;
|
||||
|
||||
// The mapping size should be at least 1GB, due to bitlock limitations. This
|
||||
// shouldn't be a problem because the expectation is to use 512MB PTEs, and
|
||||
// using a granularity of 1GB already results in allocating a large array of
|
||||
// sysmem mappings with 128K entries.
|
||||
UVM_ASSERT(is_power_of_2(mapping_size));
|
||||
UVM_ASSERT(mapping_size >= 1ULL * 1024 * 1024 * 1024);
|
||||
UVM_ASSERT(mapping_size >= UVM_SIZE_1GB);
|
||||
UVM_ASSERT(mapping_size >= uvm_mmu_biggest_page_size(&gpu->address_space_tree));
|
||||
UVM_ASSERT(mapping_size <= flat_sysmem_va_size);
|
||||
|
||||
@@ -2367,13 +2716,12 @@ NV_STATUS uvm_mmu_sysmem_map(uvm_gpu_t *gpu, NvU64 pa, NvU64 size)
|
||||
sysmem_mapping->base = virtual_address.address;
|
||||
|
||||
status = create_identity_mapping(gpu,
|
||||
sysmem_mapping->base,
|
||||
sysmem_mapping,
|
||||
gpu->sysmem_mappings.mapping_size,
|
||||
UVM_APERTURE_SYS,
|
||||
phys_offset,
|
||||
page_size,
|
||||
pmm_flags,
|
||||
&sysmem_mapping->range_vec);
|
||||
pmm_flags);
|
||||
}
|
||||
|
||||
sysmem_mapping_unlock(gpu, sysmem_mapping);
|
||||
@@ -2394,14 +2742,14 @@ NV_STATUS uvm_mmu_create_flat_mappings(uvm_gpu_t *gpu)
|
||||
{
|
||||
NV_STATUS status;
|
||||
|
||||
status = create_dynamic_sysmem_mapping(gpu);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
status = create_static_vidmem_mapping(gpu);
|
||||
if (status != NV_OK)
|
||||
goto error;
|
||||
|
||||
status = create_dynamic_sysmem_mapping(gpu);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
status = create_dynamic_vidmem_mapping(gpu);
|
||||
if (status != NV_OK)
|
||||
goto error;
|
||||
@@ -2416,8 +2764,16 @@ error:
|
||||
void uvm_mmu_destroy_flat_mappings(uvm_gpu_t *gpu)
|
||||
{
|
||||
destroy_dynamic_vidmem_mapping(gpu);
|
||||
destroy_static_vidmem_mapping(gpu);
|
||||
destroy_dynamic_sysmem_mapping(gpu);
|
||||
destroy_static_vidmem_mapping(gpu);
|
||||
}
|
||||
|
||||
uvm_gpu_address_t uvm_mmu_gpu_address(uvm_gpu_t *gpu, uvm_gpu_phys_address_t phys_addr)
|
||||
{
|
||||
if (phys_addr.aperture == UVM_APERTURE_VID && !gpu->parent->ce_phys_vidmem_write_supported)
|
||||
return uvm_gpu_address_virtual_from_vidmem_phys(gpu, phys_addr.address);
|
||||
|
||||
return uvm_gpu_address_from_phys(phys_addr);
|
||||
}
|
||||
|
||||
NV_STATUS uvm_test_invalidate_tlb(UVM_TEST_INVALIDATE_TLB_PARAMS *params, struct file *filp)
|
||||
|
||||
@@ -50,7 +50,11 @@
//  |                |
//  |   (not used)   |
//  |                |
//  ------------------ 64PB + 8TB
//  ------------------ 64PB + 8TB + 256GB (UVM_GPU_MAX_PHYS_MEM)
//  |     vidmem     |
//  |  flat mapping  | ==> UVM_GPU_MAX_PHYS_MEM
//  |    (256GB)     |
//  ------------------ 64PB + 8TB (flat_vidmem_va_base)
//  |peer ident. maps|
//  |32 * 256GB = 8TB| ==> NV_MAX_DEVICES * UVM_PEER_IDENTITY_VA_SIZE
//  ------------------ 64PB
@@ -105,7 +109,7 @@
//  +----------------+ 0 (rm_va_base)

// Maximum memory of any GPU.
#define UVM_GPU_MAX_PHYS_MEM (256ull * 1024 * 1024 * 1024)
#define UVM_GPU_MAX_PHYS_MEM (256 * UVM_SIZE_1GB)

// The size of VA that should be reserved per peer identity mapping.
// This should be at least the maximum amount of memory of any GPU.
@@ -649,6 +653,14 @@ static uvm_aperture_t uvm_page_table_range_aperture(uvm_page_table_range_t *rang
    return range->table->phys_alloc.addr.aperture;
}

// Given a GPU or CPU physical address that refers to page tables, retrieve an
// address suitable for CE writes to those page tables. This should be used
// instead of uvm_gpu_address_copy because PTE writes are used to bootstrap the
// various flat virtual mappings, so we must ensure that PTE writes work even
// if virtual mappings are required for other accesses. This is only needed when
// CE has system-wide physical addressing restrictions.
uvm_gpu_address_t uvm_mmu_gpu_address(uvm_gpu_t *gpu, uvm_gpu_phys_address_t phys_addr);

NV_STATUS uvm_test_invalidate_tlb(UVM_TEST_INVALIDATE_TLB_PARAMS *params, struct file *filp);

#endif

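The uvm_mmu_gpu_address() declaration above pairs with the definition added to uvm_mmu.c earlier in this change: when CE cannot write physical vidmem, the physical address is rewritten into the flat vidmem virtual alias, otherwise the physical address is used as-is. A hedged, self-contained sketch of that selection with hypothetical example_* types:

#include <stdint.h>
#include <stdbool.h>

typedef enum { EXAMPLE_APERTURE_VID, EXAMPLE_APERTURE_SYS } example_aperture_t;

typedef struct {
    example_aperture_t aperture;
    uint64_t address;
    bool is_virtual;
} example_gpu_address_t;

// Pick an address CE can actually write: fall back to the flat virtual alias of
// vidmem when physical vidmem writes are not supported. Illustrative only.
static example_gpu_address_t example_pt_write_address(bool ce_phys_vidmem_write_supported,
                                                      uint64_t flat_vidmem_va_base,
                                                      example_aperture_t aperture,
                                                      uint64_t phys_address)
{
    example_gpu_address_t addr = { aperture, phys_address, false };

    if (aperture == EXAMPLE_APERTURE_VID && !ce_phys_vidmem_write_supported) {
        addr.address = flat_vidmem_va_base + phys_address;
        addr.is_virtual = true;
    }

    return addr;
}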
@@ -541,7 +541,7 @@ static NV_STATUS allocate_then_free_8_8_64k(uvm_gpu_t *gpu)
|
||||
|
||||
NvLength size = 64 * 1024;
|
||||
NvLength stride = 32 * size;
|
||||
NvLength start = stride * 248 + 256LL * 1024 * 1024 * 1024 + (1LL << 47);
|
||||
NvLength start = (248 * stride) + (256 * UVM_SIZE_1GB) + (128 * UVM_SIZE_1TB);
|
||||
int i;
|
||||
|
||||
MEM_NV_CHECK_RET(test_page_tree_init_kernel(gpu, BIG_PAGE_SIZE_PASCAL, &tree), NV_OK);
|
||||
@@ -662,7 +662,7 @@ static NV_STATUS get_entire_table_4k(uvm_gpu_t *gpu)
|
||||
|
||||
NvU64 start = 1UL << 47;
|
||||
|
||||
NvLength size = 1 << 21;
|
||||
NvLength size = 2 * UVM_SIZE_1MB;
|
||||
|
||||
MEM_NV_CHECK_RET(test_page_tree_init_kernel(gpu, BIG_PAGE_SIZE_PASCAL, &tree), NV_OK);
|
||||
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_4K, start, size, &range), NV_OK);
|
||||
@@ -685,7 +685,7 @@ static NV_STATUS get_entire_table_512m(uvm_gpu_t *gpu)
|
||||
uvm_page_table_range_t range;
|
||||
|
||||
NvU64 start = 1UL << 48;
|
||||
NvLength size = 512UL * 512 * 1024 * 1024;
|
||||
NvLength size = 512UL * UVM_PAGE_SIZE_512M;
|
||||
|
||||
MEM_NV_CHECK_RET(test_page_tree_init_kernel(gpu, BIG_PAGE_SIZE_PASCAL, &tree), NV_OK);
|
||||
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_512M, start, size, &range), NV_OK);
|
||||
@@ -711,7 +711,7 @@ static NV_STATUS split_4k_from_2m(uvm_gpu_t *gpu)
|
||||
uvm_page_table_range_t range_64k;
|
||||
|
||||
NvU64 start = 1UL << 48;
|
||||
NvLength size = 1 << 21;
|
||||
NvLength size = 2 * UVM_SIZE_1MB;
|
||||
|
||||
MEM_NV_CHECK_RET(test_page_tree_init_kernel(gpu, BIG_PAGE_SIZE_PASCAL, &tree), NV_OK);
|
||||
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_2M, start, size, &range_2m), NV_OK);
|
||||
@@ -759,7 +759,7 @@ static NV_STATUS split_2m_from_512m(uvm_gpu_t *gpu)
|
||||
uvm_page_table_range_t range_2m;
|
||||
|
||||
NvU64 start = 1UL << 48;
|
||||
NvLength size = 512UL * 1024 * 1024;
|
||||
NvLength size = UVM_PAGE_SIZE_512M;
|
||||
|
||||
MEM_NV_CHECK_RET(test_page_tree_init_kernel(gpu, BIG_PAGE_SIZE_PASCAL, &tree), NV_OK);
|
||||
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_512M, start, size, &range_512m), NV_OK);
|
||||
@@ -812,7 +812,7 @@ static NV_STATUS get_2gb_range(uvm_gpu_t *gpu)
|
||||
uvm_page_tree_t tree;
|
||||
uvm_page_table_range_t range;
|
||||
|
||||
NvU64 start = 2UL * (1 << 30);
|
||||
NvU64 start = 2 * UVM_SIZE_1GB;
|
||||
NvU64 size = start;
|
||||
|
||||
MEM_NV_CHECK_RET(test_page_tree_init(gpu, BIG_PAGE_SIZE_PASCAL, &tree), NV_OK);
|
||||
|
||||
@@ -49,11 +49,13 @@ void uvm_hal_pascal_arch_init_properties(uvm_parent_gpu_t *parent_gpu)
|
||||
// A single top level PDE on Pascal covers 128 TB and that's the minimum
|
||||
// size that can be used.
|
||||
parent_gpu->rm_va_base = 0;
|
||||
parent_gpu->rm_va_size = 128ull * 1024 * 1024 * 1024 * 1024;
|
||||
parent_gpu->rm_va_size = 128 * UVM_SIZE_1TB;
|
||||
|
||||
parent_gpu->uvm_mem_va_base = 384ull * 1024 * 1024 * 1024 * 1024;
|
||||
parent_gpu->uvm_mem_va_base = 384 * UVM_SIZE_1TB;
|
||||
parent_gpu->uvm_mem_va_size = UVM_MEM_VA_SIZE;
|
||||
|
||||
parent_gpu->ce_phys_vidmem_write_supported = true;
|
||||
|
||||
parent_gpu->peer_copy_mode = UVM_GPU_PEER_COPY_MODE_VIRTUAL;
|
||||
|
||||
// Not all units on Pascal support 49-bit addressing, including those which
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2016-2021 NVIDIA Corporation
|
||||
Copyright (c) 2016-2023 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
@@ -105,7 +105,7 @@ static uvm_fault_access_type_t get_fault_access_type(const NvU32 *fault_entry)
|
||||
return UVM_FAULT_ACCESS_TYPE_COUNT;
|
||||
}
|
||||
|
||||
static uvm_fault_type_t get_fault_type(const NvU32 *fault_entry)
|
||||
uvm_fault_type_t uvm_hal_pascal_fault_buffer_get_fault_type(const NvU32 *fault_entry)
|
||||
{
|
||||
NvU32 hw_fault_type_value = READ_HWVALUE_MW(fault_entry, B069, FAULT_BUF_ENTRY, FAULT_TYPE);
|
||||
|
||||
@@ -197,11 +197,27 @@ static NvU32 *get_fault_buffer_entry(uvm_parent_gpu_t *parent_gpu, NvU32 index)
|
||||
return fault_entry;
|
||||
}
|
||||
|
||||
// When Confidential Computing is enabled, fault entries are encrypted. Each
// fault has (unencrypted) metadata containing the authentication tag, and a
// valid bit that allows UVM to check if an encrypted fault is valid, without
// having to decrypt it first.
static UvmFaultMetadataPacket *get_fault_buffer_entry_metadata(uvm_parent_gpu_t *parent_gpu, NvU32 index)
{
    UvmFaultMetadataPacket *fault_entry_metadata;

    UVM_ASSERT(index < parent_gpu->fault_buffer_info.replayable.max_faults);
    UVM_ASSERT(!uvm_parent_gpu_replayable_fault_buffer_is_uvm_owned(parent_gpu));

    fault_entry_metadata = parent_gpu->fault_buffer_info.rm_info.replayable.bufferMetadata;
    UVM_ASSERT(fault_entry_metadata != NULL);

    return fault_entry_metadata + index;
}

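With the metadata accessor above in place, the valid-bit helpers later in this file consult the unencrypted sideband metadata when the fault buffer is not UVM-owned, and fall back to the hardware VALID field otherwise. A small sketch of that decision; UvmExampleFaultMetadata and the example_* names are hypothetical stand-ins, not the real UvmFaultMetadataPacket layout.

#include <stdbool.h>
#include <stdint.h>

// Hypothetical, simplified metadata record with only the fields this sketch needs.
typedef struct {
    bool valid;
    uint8_t auth_tag[16];
} UvmExampleFaultMetadata;

// Report whether a fault entry is valid without touching its (possibly
// encrypted) payload: trust the hardware bit only when the buffer is
// UVM-owned, otherwise use the unencrypted sideband metadata.
static bool example_fault_entry_is_valid(bool buffer_is_uvm_owned,
                                         bool hw_valid_bit,
                                         const UvmExampleFaultMetadata *metadata)
{
    if (buffer_is_uvm_owned)
        return hw_valid_bit;

    return metadata->valid;
}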
void uvm_hal_pascal_fault_buffer_parse_entry(uvm_parent_gpu_t *parent_gpu,
|
||||
NvU32 index,
|
||||
uvm_fault_buffer_entry_t *buffer_entry)
|
||||
{
|
||||
NV_STATUS status;
|
||||
NvU32 *fault_entry;
|
||||
NvU64 addr_hi, addr_lo;
|
||||
NvU64 timestamp_hi, timestamp_lo;
|
||||
@@ -209,13 +225,12 @@ void uvm_hal_pascal_fault_buffer_parse_entry(uvm_parent_gpu_t *parent_gpu,
|
||||
NvU32 utlb_id;
|
||||
|
||||
BUILD_BUG_ON(NVB069_FAULT_BUF_SIZE > UVM_GPU_MMU_MAX_FAULT_PACKET_SIZE);
|
||||
status = NV_OK;
|
||||
|
||||
fault_entry = get_fault_buffer_entry(parent_gpu, index);
|
||||
|
||||
// Valid bit must be set before this function is called
|
||||
UVM_ASSERT(parent_gpu->fault_buffer_hal->entry_is_valid(parent_gpu, index));
|
||||
|
||||
fault_entry = get_fault_buffer_entry(parent_gpu, index);
|
||||
|
||||
addr_hi = READ_HWVALUE_MW(fault_entry, B069, FAULT_BUF_ENTRY, INST_HI);
|
||||
addr_lo = READ_HWVALUE_MW(fault_entry, B069, FAULT_BUF_ENTRY, INST_LO);
|
||||
buffer_entry->instance_ptr.address = addr_lo + (addr_hi << HWSIZE_MW(B069, FAULT_BUF_ENTRY, INST_LO));
|
||||
@@ -233,7 +248,7 @@ void uvm_hal_pascal_fault_buffer_parse_entry(uvm_parent_gpu_t *parent_gpu,
|
||||
timestamp_lo = READ_HWVALUE_MW(fault_entry, B069, FAULT_BUF_ENTRY, TIMESTAMP_LO);
|
||||
buffer_entry->timestamp = timestamp_lo + (timestamp_hi << HWSIZE_MW(B069, FAULT_BUF_ENTRY, TIMESTAMP_LO));
|
||||
|
||||
buffer_entry->fault_type = get_fault_type(fault_entry);
|
||||
buffer_entry->fault_type = parent_gpu->fault_buffer_hal->get_fault_type(fault_entry);
|
||||
|
||||
buffer_entry->fault_access_type = get_fault_access_type(fault_entry);
|
||||
|
||||
@@ -269,23 +284,39 @@ void uvm_hal_pascal_fault_buffer_parse_entry(uvm_parent_gpu_t *parent_gpu,
|
||||
|
||||
bool uvm_hal_pascal_fault_buffer_entry_is_valid(uvm_parent_gpu_t *parent_gpu, NvU32 index)
|
||||
{
|
||||
NvU32 *fault_entry;
|
||||
bool is_valid;
|
||||
if (uvm_parent_gpu_replayable_fault_buffer_is_uvm_owned(parent_gpu)) {
|
||||
NvU32 *fault_entry = get_fault_buffer_entry(parent_gpu, index);
|
||||
|
||||
fault_entry = get_fault_buffer_entry(parent_gpu, index);
|
||||
return READ_HWVALUE_MW(fault_entry, B069, FAULT_BUF_ENTRY, VALID);
|
||||
}
|
||||
else {
|
||||
// Use the valid bit present in the encryption metadata, which is
|
||||
// unencrypted, instead of the valid bit present in the (encrypted)
|
||||
// fault itself.
|
||||
UvmFaultMetadataPacket *fault_entry_metadata = get_fault_buffer_entry_metadata(parent_gpu, index);
|
||||
|
||||
is_valid = READ_HWVALUE_MW(fault_entry, B069, FAULT_BUF_ENTRY, VALID);
|
||||
return fault_entry_metadata->valid;
|
||||
}
|
||||
|
||||
return is_valid;
|
||||
UVM_ASSERT_MSG(false, "Invalid path");
|
||||
return false;
|
||||
}
|
||||
|
||||
void uvm_hal_pascal_fault_buffer_entry_clear_valid(uvm_parent_gpu_t *parent_gpu, NvU32 index)
|
||||
{
|
||||
NvU32 *fault_entry;
|
||||
if (uvm_parent_gpu_replayable_fault_buffer_is_uvm_owned(parent_gpu)) {
|
||||
NvU32 *fault_entry = get_fault_buffer_entry(parent_gpu, index);
|
||||
|
||||
fault_entry = get_fault_buffer_entry(parent_gpu, index);
|
||||
WRITE_HWCONST_MW(fault_entry, B069, FAULT_BUF_ENTRY, VALID, FALSE);
|
||||
}
|
||||
else {
|
||||
// Use the valid bit present in the encryption metadata, which is
|
||||
// unencrypted, instead of the valid bit present in the (encrypted)
|
||||
// fault itself.
|
||||
UvmFaultMetadataPacket *fault_entry_metadata = get_fault_buffer_entry_metadata(parent_gpu, index);
|
||||
|
||||
WRITE_HWCONST_MW(fault_entry, B069, FAULT_BUF_ENTRY, VALID, FALSE);
|
||||
fault_entry_metadata->valid = false;
|
||||
}
|
||||
}
|
||||
|
||||
NvU32 uvm_hal_pascal_fault_buffer_entry_size(uvm_parent_gpu_t *parent_gpu)
|
||||
|
||||
@@ -22,6 +22,7 @@
|
||||
*******************************************************************************/
|
||||
|
||||
#include "uvm_api.h"
|
||||
#include "uvm_conf_computing.h"
|
||||
#include "uvm_perf_events.h"
|
||||
#include "uvm_perf_module.h"
|
||||
#include "uvm_perf_thrashing.h"
|
||||
@@ -262,6 +263,7 @@ static unsigned uvm_perf_thrashing_pin_threshold = UVM_PERF_THRASHING_PIN_THRESH
|
||||
// detection/prevention parameters
|
||||
#define UVM_PERF_THRASHING_LAPSE_USEC_DEFAULT 500
|
||||
#define UVM_PERF_THRASHING_LAPSE_USEC_DEFAULT_EMULATION (UVM_PERF_THRASHING_LAPSE_USEC_DEFAULT * 800)
|
||||
#define UVM_PERF_THRASHING_LAPSE_USEC_DEFAULT_HCC (UVM_PERF_THRASHING_LAPSE_USEC_DEFAULT * 10)
|
||||
|
||||
// Lapse of time in microseconds that determines if two consecutive events on
|
||||
// the same page can be considered thrashing
|
||||
@@ -532,18 +534,20 @@ static void gpu_thrashing_stats_destroy(uvm_gpu_t *gpu)
|
||||
|
||||
// Get the thrashing detection struct for the given VA space if it exists
|
||||
//
|
||||
// VA space lock needs to be held
|
||||
// The caller must ensure that the va_space cannot be deleted, for the
|
||||
// duration of this call. Holding either the va_block or va_space lock will do
|
||||
// that.
|
||||
static va_space_thrashing_info_t *va_space_thrashing_info_get_or_null(uvm_va_space_t *va_space)
|
||||
{
|
||||
// TODO: Bug 3898454: check locking requirement for UVM-HMM.
|
||||
|
||||
return uvm_perf_module_type_data(va_space->perf_modules_data, UVM_PERF_MODULE_TYPE_THRASHING);
|
||||
}
|
||||
|
||||
// Get the thrashing detection struct for the given VA space. It asserts that
|
||||
// the information has been previously created.
|
||||
//
|
||||
// VA space lock needs to be held
|
||||
// The caller must ensure that the va_space cannot be deleted, for the
|
||||
// duration of this call. Holding either the va_block or va_space lock will do
|
||||
// that.
|
||||
static va_space_thrashing_info_t *va_space_thrashing_info_get(uvm_va_space_t *va_space)
|
||||
{
|
||||
va_space_thrashing_info_t *va_space_thrashing = va_space_thrashing_info_get_or_null(va_space);
|
||||
@@ -1783,22 +1787,6 @@ const uvm_page_mask_t *uvm_perf_thrashing_get_thrashing_pages(uvm_va_block_t *va
|
||||
return &block_thrashing->thrashing_pages;
|
||||
}
|
||||
|
||||
bool uvm_perf_thrashing_is_block_thrashing(uvm_va_block_t *va_block)
|
||||
{
|
||||
uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
|
||||
va_space_thrashing_info_t *va_space_thrashing = va_space_thrashing_info_get(va_space);
|
||||
block_thrashing_info_t *block_thrashing = NULL;
|
||||
|
||||
if (!va_space_thrashing->params.enable)
|
||||
return false;
|
||||
|
||||
block_thrashing = thrashing_info_get(va_block);
|
||||
if (!block_thrashing)
|
||||
return false;
|
||||
|
||||
return block_thrashing->num_thrashing_pages > 0;
|
||||
}
|
||||
|
||||
#define TIMER_GRANULARITY_NS 20000ULL
|
||||
static void thrashing_unpin_pages(struct work_struct *work)
|
||||
{
|
||||
@@ -1854,6 +1842,8 @@ static void thrashing_unpin_pages(struct work_struct *work)
|
||||
break;
|
||||
|
||||
va_block = pinned_page->va_block;
|
||||
if (uvm_va_block_is_hmm(va_block))
|
||||
uvm_hmm_migrate_begin_wait(va_block);
|
||||
uvm_mutex_lock(&va_block->lock);
|
||||
|
||||
// Only operate if the pinned page's tracking state isn't already
|
||||
@@ -1876,6 +1866,8 @@ static void thrashing_unpin_pages(struct work_struct *work)
|
||||
}
|
||||
|
||||
uvm_mutex_unlock(&va_block->lock);
|
||||
if (uvm_va_block_is_hmm(va_block))
|
||||
uvm_hmm_migrate_finish(va_block);
|
||||
kmem_cache_free(g_pinned_page_cache, pinned_page);
|
||||
}
|
||||
|
||||
@@ -1947,12 +1939,24 @@ void uvm_perf_thrashing_unload(uvm_va_space_t *va_space)
|
||||
NV_STATUS uvm_perf_thrashing_register_gpu(uvm_va_space_t *va_space, uvm_gpu_t *gpu)
|
||||
{
|
||||
// If a simulated GPU is registered, re-initialize thrashing parameters in
|
||||
// case they need to be adjusted
|
||||
if (g_uvm_global.num_simulated_devices > 0) {
|
||||
// case they need to be adjusted.
|
||||
bool params_need_readjusting = g_uvm_global.num_simulated_devices > 0;
|
||||
|
||||
// Likewise, when the Confidential Computing feature is enabled, the DMA
|
||||
// path is slower due to cryptographic operations & other associated
|
||||
// overhead. Enforce a larger window to allow the thrashing mitigation
|
||||
// mechanisms to work properly.
|
||||
params_need_readjusting = params_need_readjusting || uvm_conf_computing_mode_enabled(gpu);
|
||||
|
||||
if (params_need_readjusting) {
|
||||
va_space_thrashing_info_t *va_space_thrashing = va_space_thrashing_info_get(va_space);
|
||||
|
||||
if (!va_space_thrashing->params.test_overrides)
|
||||
if (!va_space_thrashing->params.test_overrides) {
|
||||
if (uvm_conf_computing_mode_enabled(gpu))
|
||||
g_uvm_perf_thrashing_lapse_usec = UVM_PERF_THRASHING_LAPSE_USEC_DEFAULT_HCC;
|
||||
|
||||
va_space_thrashing_info_init_params(va_space_thrashing);
|
||||
}
|
||||
}
|
||||
|
||||
return NV_OK;
|
||||
|
||||
@@ -84,9 +84,6 @@ uvm_processor_mask_t *uvm_perf_thrashing_get_thrashing_processors(uvm_va_block_t
|
||||
|
||||
const uvm_page_mask_t *uvm_perf_thrashing_get_thrashing_pages(uvm_va_block_t *va_block);
|
||||
|
||||
// Returns true if any page in the block is thrashing, or false otherwise
|
||||
bool uvm_perf_thrashing_is_block_thrashing(uvm_va_block_t *va_block);
|
||||
|
||||
// Global initialization/cleanup functions
|
||||
NV_STATUS uvm_perf_thrashing_init(void);
|
||||
void uvm_perf_thrashing_exit(void);
|
||||
|
||||
@@ -172,6 +172,7 @@
|
||||
#include "uvm_va_block.h"
|
||||
#include "uvm_test.h"
|
||||
#include "uvm_linux.h"
|
||||
#include "uvm_conf_computing.h"
|
||||
|
||||
static int uvm_global_oversubscription = 1;
|
||||
module_param(uvm_global_oversubscription, int, S_IRUGO);
|
||||
@@ -242,11 +243,13 @@ const char *uvm_pmm_gpu_memory_type_string(uvm_pmm_gpu_memory_type_t type)
|
||||
{
|
||||
switch (type) {
|
||||
UVM_ENUM_STRING_CASE(UVM_PMM_GPU_MEMORY_TYPE_USER);
|
||||
UVM_ENUM_STRING_CASE(UVM_PMM_GPU_MEMORY_TYPE_USER_UNPROTECTED);
|
||||
UVM_ENUM_STRING_CASE(UVM_PMM_GPU_MEMORY_TYPE_KERNEL);
|
||||
UVM_ENUM_STRING_CASE(UVM_PMM_GPU_MEMORY_TYPE_KERNEL_UNPROTECTED);
|
||||
UVM_ENUM_STRING_DEFAULT();
|
||||
}
|
||||
|
||||
BUILD_BUG_ON(UVM_PMM_GPU_MEMORY_TYPE_COUNT != 2);
|
||||
BUILD_BUG_ON(UVM_PMM_GPU_MEMORY_TYPE_COUNT != 4);
|
||||
}
|
||||
|
||||
const char *uvm_pmm_gpu_chunk_state_string(uvm_pmm_gpu_chunk_state_t state)
|
||||
@@ -454,7 +457,19 @@ bool uvm_pmm_gpu_memory_type_is_user(uvm_pmm_gpu_memory_type_t type)
|
||||
UVM_ASSERT(type < UVM_PMM_GPU_MEMORY_TYPE_COUNT);
|
||||
|
||||
switch (type) {
|
||||
case UVM_PMM_GPU_MEMORY_TYPE_USER:
|
||||
case UVM_PMM_GPU_MEMORY_TYPE_USER: // Alias UVM_PMM_GPU_MEMORY_TYPE_USER_PROTECTED
|
||||
case UVM_PMM_GPU_MEMORY_TYPE_USER_UNPROTECTED:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
static bool memory_type_is_protected(uvm_pmm_gpu_memory_type_t type)
|
||||
{
|
||||
switch (type) {
|
||||
case UVM_PMM_GPU_MEMORY_TYPE_USER: // Alias UVM_PMM_GPU_MEMORY_TYPE_USER_PROTECTED
|
||||
case UVM_PMM_GPU_MEMORY_TYPE_KERNEL: // Alias UVM_PMM_GPU_MEMORY_TYPE_KERNEL_PROTECTED:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
@@ -486,11 +501,11 @@ uvm_gpu_t *uvm_gpu_chunk_get_gpu(const uvm_gpu_chunk_t *chunk)
|
||||
struct page *uvm_gpu_chunk_to_page(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
|
||||
{
|
||||
uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
|
||||
NvU64 sys_addr = chunk->address + uvm_gpu_numa_info(gpu)->system_memory_window_start;
|
||||
NvU64 sys_addr = chunk->address + gpu->parent->system_bus.memory_window_start;
|
||||
unsigned long pfn = sys_addr >> PAGE_SHIFT;
|
||||
|
||||
UVM_ASSERT(sys_addr + uvm_gpu_chunk_get_size(chunk) <= uvm_gpu_numa_info(gpu)->system_memory_window_end + 1);
|
||||
UVM_ASSERT(gpu->parent->numa_info.enabled);
|
||||
UVM_ASSERT(sys_addr + uvm_gpu_chunk_get_size(chunk) <= gpu->parent->system_bus.memory_window_end + 1);
|
||||
UVM_ASSERT(gpu->mem_info.numa.enabled);
|
||||
|
||||
return pfn_to_page(pfn);
|
||||
}
|
||||
@@ -520,7 +535,16 @@ void uvm_pmm_gpu_sync(uvm_pmm_gpu_t *pmm)
|
||||
|
||||
static uvm_pmm_gpu_memory_type_t pmm_squash_memory_type(uvm_parent_gpu_t *parent_gpu, uvm_pmm_gpu_memory_type_t type)
|
||||
{
|
||||
return type;
|
||||
if (uvm_conf_computing_mode_enabled_parent(parent_gpu))
|
||||
return type;
|
||||
|
||||
// Enforce the contract that when the Confidential Computing feature is
|
||||
// disabled, all user types are alike, as well as all kernel types,
|
||||
// respectively. See uvm_pmm_gpu_memory_type_t.
|
||||
if (uvm_pmm_gpu_memory_type_is_user(type))
|
||||
return UVM_PMM_GPU_MEMORY_TYPE_USER;
|
||||
|
||||
return UVM_PMM_GPU_MEMORY_TYPE_KERNEL;
|
||||
}
|
||||
|
||||
NV_STATUS uvm_pmm_gpu_alloc(uvm_pmm_gpu_t *pmm,
|
||||
@@ -622,18 +646,6 @@ static NV_STATUS pmm_gpu_alloc_kernel(uvm_pmm_gpu_t *pmm,
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
NV_STATUS uvm_pmm_gpu_alloc_kernel(uvm_pmm_gpu_t *pmm,
|
||||
size_t num_chunks,
|
||||
uvm_chunk_size_t chunk_size,
|
||||
uvm_pmm_alloc_flags_t flags,
|
||||
uvm_gpu_chunk_t **chunks,
|
||||
uvm_tracker_t *out_tracker)
|
||||
{
|
||||
uvm_pmm_gpu_memory_type_t memory_type = UVM_PMM_GPU_MEMORY_TYPE_KERNEL;
|
||||
|
||||
return pmm_gpu_alloc_kernel(pmm, num_chunks, chunk_size, memory_type, flags, chunks, out_tracker);
|
||||
}
|
||||
|
||||
static void chunk_update_lists_locked(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
|
||||
{
|
||||
uvm_gpu_root_chunk_t *root_chunk = root_chunk_from_chunk(pmm, chunk);
|
||||
@@ -1535,7 +1547,7 @@ static bool root_chunk_has_elevated_page(uvm_pmm_gpu_t *pmm, uvm_gpu_root_chunk_
|
||||
uvm_gpu_chunk_t *chunk = &root_chunk->chunk;
|
||||
struct page *page;
|
||||
|
||||
if (!gpu->parent->numa_info.enabled)
|
||||
if (!gpu->mem_info.numa.enabled)
|
||||
return false;
|
||||
|
||||
page = uvm_gpu_chunk_to_page(pmm, chunk);
|
||||
@@ -2155,7 +2167,7 @@ NV_STATUS alloc_root_chunk(uvm_pmm_gpu_t *pmm,
|
||||
// Also, user pages that are about to be overwritten, don't need to be
|
||||
// zeroed, either. Add an interface to uvm_pmm_gpu_alloc for callers to
|
||||
// specify when they don't need zeroed pages.
|
||||
const bool skip_pma_scrubbing = gpu->parent->numa_info.enabled;
|
||||
const bool skip_pma_scrubbing = gpu->mem_info.numa.enabled;
|
||||
UVM_ASSERT(uvm_pmm_gpu_memory_type_is_user(type) || uvm_pmm_gpu_memory_type_is_kernel(type));
|
||||
|
||||
options.flags = UVM_PMA_ALLOCATE_DONT_EVICT;
|
||||
@@ -2168,9 +2180,14 @@ NV_STATUS alloc_root_chunk(uvm_pmm_gpu_t *pmm,
|
||||
|
||||
// TODO: Bug 200480500: Batching is currently disabled on P9. Re-enable
|
||||
// when the performance of best-effort allocations is verified.
|
||||
if (gpu->parent->numa_info.enabled)
|
||||
if (gpu->mem_info.numa.enabled)
|
||||
flags |= UVM_PMM_ALLOC_FLAGS_DONT_BATCH;
|
||||
|
||||
// When the confidential computing feature is enabled, allocate GPU memory
|
||||
// in the protected region, unless specified otherwise.
|
||||
if (uvm_conf_computing_mode_enabled(gpu) && memory_type_is_protected(type))
|
||||
options.flags |= UVM_PMA_ALLOCATE_PROTECTED_REGION;
|
||||
|
||||
if (!gpu->parent->rm_info.isSimulated &&
|
||||
!(options.flags & UVM_PMA_ALLOCATE_PINNED) &&
|
||||
!(flags & UVM_PMM_ALLOC_FLAGS_DONT_BATCH)) {
|
||||
@@ -2424,6 +2441,12 @@ static bool check_chunk(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
|
||||
UVM_ASSERT(uvm_global_id_equal(uvm_global_gpu_id_from_index(chunk->gpu_global_index), gpu->global_id));
|
||||
|
||||
|
||||
// See pmm_squash_memory_type().
|
||||
if (!uvm_conf_computing_mode_enabled(gpu)) {
|
||||
UVM_ASSERT(chunk->type == UVM_PMM_GPU_MEMORY_TYPE_USER ||
|
||||
chunk->type == UVM_PMM_GPU_MEMORY_TYPE_KERNEL);
|
||||
}
|
||||
|
||||
if (chunk->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT)
|
||||
UVM_ASSERT(chunk_size > uvm_chunk_find_first_size(chunk_sizes));
|
||||
|
||||
@@ -2756,6 +2779,11 @@ static NV_STATUS uvm_pmm_gpu_pma_evict_pages(void *void_pmm,
|
||||
UVM_ASSERT(IS_ALIGNED(UVM_CHUNK_SIZE_MAX, page_size));
|
||||
UVM_ASSERT(UVM_CHUNK_SIZE_MAX >= page_size);
|
||||
|
||||
// Currently, when the Confidential Computing feature is enabled, the
|
||||
// entirety of vidmem is protected.
|
||||
if (uvm_conf_computing_mode_enabled(uvm_pmm_to_gpu(pmm)) && (mem_type != UVM_PMA_GPU_MEMORY_TYPE_PROTECTED))
|
||||
return NV_ERR_INVALID_ARGUMENT;
|
||||
|
||||
while (num_pages_left_to_evict > 0) {
|
||||
uvm_gpu_root_chunk_t *root_chunk;
|
||||
uvm_page_index_t page_index;
|
||||
@@ -2856,7 +2884,7 @@ static NV_STATUS uvm_pmm_gpu_pma_evict_pages_wrapper(void *void_pmm,
|
||||
}
|
||||
|
||||
static NV_STATUS uvm_pmm_gpu_pma_evict_pages_wrapper_entry(void *void_pmm,
|
||||
NvU32 page_size,
|
||||
NvU64 page_size,
|
||||
NvU64 *pages,
|
||||
NvU32 num_pages_to_evict,
|
||||
NvU64 phys_start,
|
||||
@@ -3369,9 +3397,20 @@ static void evict_orphan_pages(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
|
||||
}
|
||||
|
||||
if (subchunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED && subchunk->is_referenced) {
|
||||
unsigned long pfn = uvm_pmm_gpu_devmem_get_pfn(pmm, subchunk);
|
||||
|
||||
// TODO: Bug 3368756: add support for large GPU pages.
|
||||
UVM_ASSERT(uvm_gpu_chunk_get_size(subchunk) == PAGE_SIZE);
|
||||
uvm_spin_unlock(&pmm->list_lock);
|
||||
|
||||
uvm_hmm_pmm_gpu_evict_chunk(uvm_pmm_to_gpu(pmm), subchunk);
|
||||
// The above check for subchunk state is racy because the
|
||||
// chunk may be freed after the lock is dropped. It is
|
||||
// still safe to proceed in that case because the struct
|
||||
// page reference will have dropped to zero and cannot
|
||||
// have been re-allocated as this is only called during
|
||||
// GPU teardown. Therefore migrate_device_range() will
|
||||
// simply fail.
|
||||
uvm_hmm_pmm_gpu_evict_pfn(pfn);
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -3379,13 +3418,24 @@ static void evict_orphan_pages(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
|
||||
}
|
||||
}
|
||||
|
||||
void uvm_pmm_gpu_free_orphan_pages(uvm_pmm_gpu_t *pmm)
|
||||
// Free any orphan pages.
|
||||
// This should be called as part of removing a GPU: after all work is stopped
|
||||
// and all va_blocks have been destroyed. There normally won't be any
|
||||
// device private struct page references left but there can be cases after
|
||||
// fork() where a child process still holds a reference. This function searches
|
||||
// for pages that still have a reference and migrates the page to the GPU in
|
||||
// order to release the reference in the CPU page table.
|
||||
static void uvm_pmm_gpu_free_orphan_pages(uvm_pmm_gpu_t *pmm)
|
||||
{
|
||||
size_t i;
|
||||
|
||||
if (!pmm->initialized)
|
||||
return;
|
||||
|
||||
// This is only safe to call during GPU teardown where chunks
|
||||
// cannot be re-allocated.
|
||||
UVM_ASSERT(uvm_gpu_retained_count(uvm_pmm_to_gpu(pmm)) == 0);
|
||||
|
||||
// Scan all the root chunks looking for subchunks which are still
|
||||
// referenced. This is slow, but we only do this when unregistering a GPU
|
||||
// and is not critical for performance.
|
||||
@@ -3429,7 +3479,7 @@ static vm_fault_t devmem_fault(struct vm_fault *vmf)
|
||||
{
|
||||
uvm_va_space_t *va_space = vmf->page->zone_device_data;
|
||||
|
||||
if (!va_space)
|
||||
if (!va_space || va_space->va_space_mm.mm != vmf->vma->vm_mm)
|
||||
return VM_FAULT_SIGBUS;
|
||||
|
||||
return uvm_va_space_cpu_fault_hmm(va_space, vmf->vma, vmf);
|
||||
@@ -3517,6 +3567,10 @@ static NV_STATUS devmem_init(uvm_pmm_gpu_t *pmm)
|
||||
static void devmem_deinit(uvm_pmm_gpu_t *pmm)
|
||||
{
|
||||
}
|
||||
|
||||
static void uvm_pmm_gpu_free_orphan_pages(uvm_pmm_gpu_t *pmm)
|
||||
{
|
||||
}
|
||||
#endif // UVM_IS_CONFIG_HMM()
|
||||
|
||||
static void process_lazy_free(uvm_pmm_gpu_t *pmm)
|
||||
@@ -3551,8 +3605,11 @@ NV_STATUS uvm_pmm_gpu_init(uvm_pmm_gpu_t *pmm)
|
||||
uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
|
||||
const uvm_chunk_sizes_mask_t chunk_size_init[][UVM_PMM_GPU_MEMORY_TYPE_COUNT] =
|
||||
{
|
||||
{ gpu->parent->mmu_user_chunk_sizes, gpu->parent->mmu_kernel_chunk_sizes },
|
||||
{ 0, uvm_mem_kernel_chunk_sizes(gpu)},
|
||||
{ gpu->parent->mmu_user_chunk_sizes,
|
||||
gpu->parent->mmu_user_chunk_sizes,
|
||||
gpu->parent->mmu_kernel_chunk_sizes,
|
||||
gpu->parent->mmu_kernel_chunk_sizes },
|
||||
{ 0, 0, uvm_mem_kernel_chunk_sizes(gpu), uvm_mem_kernel_chunk_sizes(gpu)},
|
||||
};
|
||||
NV_STATUS status = NV_OK;
|
||||
size_t i, j, k;
|
||||
@@ -3597,13 +3654,13 @@ NV_STATUS uvm_pmm_gpu_init(uvm_pmm_gpu_t *pmm)
|
||||
goto cleanup;
|
||||
|
||||
// Assert that max physical address of the GPU is not unreasonably big for
|
||||
// creating the flat array of root chunks. Currently the worst case is a
|
||||
// Maxwell GPU that has 0.5 GB of its physical memory mapped at the 64GB
|
||||
// physical address. 256GB should provide reasonable amount of
|
||||
// future-proofing and results in 128K chunks which is still manageable.
|
||||
UVM_ASSERT_MSG(gpu->mem_info.max_allocatable_address < 256ull * 1024 * 1024 * 1024,
|
||||
"Max physical address over 256GB: %llu\n",
|
||||
gpu->mem_info.max_allocatable_address);
|
||||
// creating the flat array of root chunks. 256GB should provide a reasonable
|
||||
// amount of future-proofing and results in 128K chunks which is still
|
||||
// manageable.
|
||||
UVM_ASSERT_MSG(gpu->mem_info.max_allocatable_address < UVM_GPU_MAX_PHYS_MEM,
|
||||
"Max physical address 0x%llx exceeds limit of 0x%llx\n",
|
||||
gpu->mem_info.max_allocatable_address,
|
||||
UVM_GPU_MAX_PHYS_MEM);
|
||||
|
||||
// Align up the size to have a root chunk for the last part of the FB. PMM
|
||||
// won't be able to allocate it, if it doesn't fit a whole root chunk, but
|
||||
@@ -3686,6 +3743,8 @@ void uvm_pmm_gpu_deinit(uvm_pmm_gpu_t *pmm)
|
||||
return;
|
||||
|
||||
gpu = uvm_pmm_to_gpu(pmm);
|
||||
|
||||
uvm_pmm_gpu_free_orphan_pages(pmm);
|
||||
nv_kthread_q_flush(&gpu->parent->lazy_free_q);
|
||||
UVM_ASSERT(list_empty(&pmm->root_chunks.va_block_lazy_free));
|
||||
release_free_root_chunks(pmm);
|
||||
|
||||
@@ -97,9 +97,18 @@ typedef enum
|
||||
{
|
||||
// Memory type for backing user pages. On Pascal+ it can be evicted.
|
||||
UVM_PMM_GPU_MEMORY_TYPE_USER,
|
||||
// When the Confidential Computing feature is enabled, the protected flavor
|
||||
// allocates memory out of the VPR region. When it's disabled, all flavors
|
||||
// have no effects and are equivalent to the base type.
|
||||
UVM_PMM_GPU_MEMORY_TYPE_USER_PROTECTED = UVM_PMM_GPU_MEMORY_TYPE_USER,
|
||||
UVM_PMM_GPU_MEMORY_TYPE_USER_UNPROTECTED,
|
||||
|
||||
// Memory type for internal UVM allocations. It cannot be evicted.
|
||||
UVM_PMM_GPU_MEMORY_TYPE_KERNEL,
|
||||
// See user types for the behavior description when the Confidential
|
||||
// Computing feature is ON or OFF.
|
||||
UVM_PMM_GPU_MEMORY_TYPE_KERNEL_PROTECTED = UVM_PMM_GPU_MEMORY_TYPE_KERNEL,
|
||||
UVM_PMM_GPU_MEMORY_TYPE_KERNEL_UNPROTECTED,
|
||||
|
||||
// Number of types - MUST BE LAST.
|
||||
UVM_PMM_GPU_MEMORY_TYPE_COUNT
|
||||
@@ -216,15 +225,6 @@ uvm_gpu_id_t uvm_pmm_devmem_page_to_gpu_id(struct page *page);
|
||||
// Return the PFN of the device private struct page for the given GPU chunk.
|
||||
unsigned long uvm_pmm_gpu_devmem_get_pfn(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk);
|
||||
|
||||
// Free any orphan pages.
|
||||
// This should be called as part of removing a GPU: after all work is stopped
|
||||
// and all va_blocks have been destroyed. There normally won't be any
|
||||
// device private struct page references left but there can be cases after
|
||||
// fork() where a child process still holds a reference. This function searches
|
||||
// for pages that still have a reference and migrates the page to the GPU in
|
||||
// order to release the reference in the CPU page table.
|
||||
void uvm_pmm_gpu_free_orphan_pages(uvm_pmm_gpu_t *pmm);
|
||||
|
||||
#endif
|
||||
|
||||
struct uvm_gpu_chunk_struct
|
||||
@@ -468,6 +468,10 @@ struct page *uvm_gpu_chunk_to_page(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk);
|
||||
// node has to be returned to a valid state before calling either of the APIs.
|
||||
//
|
||||
// In case of an error, the chunks array is guaranteed to be cleared.
|
||||
//
|
||||
// If the memory returned by the PMM allocator cannot be physically addressed,
|
||||
// the MMU interface provides user chunk mapping and unmapping functions
|
||||
// (uvm_mmu_chunk_map/unmap) that enable virtual addressing.
|
||||
NV_STATUS uvm_pmm_gpu_alloc(uvm_pmm_gpu_t *pmm,
|
||||
size_t num_chunks,
|
||||
uvm_chunk_size_t chunk_size,
|
||||
@@ -480,21 +484,26 @@ NV_STATUS uvm_pmm_gpu_alloc(uvm_pmm_gpu_t *pmm,
|
||||
//
|
||||
// Internally calls uvm_pmm_gpu_alloc() and sets the state of all chunks to
|
||||
// allocated on success.
|
||||
NV_STATUS uvm_pmm_gpu_alloc_kernel(uvm_pmm_gpu_t *pmm,
|
||||
size_t num_chunks,
|
||||
uvm_chunk_size_t chunk_size,
|
||||
uvm_pmm_alloc_flags_t flags,
|
||||
uvm_gpu_chunk_t **chunks,
|
||||
uvm_tracker_t *out_tracker);
|
||||
//
|
||||
// If Confidential Computing is enabled, this helper allocates protected kernel
|
||||
// memory.
|
||||
static NV_STATUS uvm_pmm_gpu_alloc_kernel(uvm_pmm_gpu_t *pmm,
|
||||
size_t num_chunks,
|
||||
uvm_chunk_size_t chunk_size,
|
||||
uvm_pmm_alloc_flags_t flags,
|
||||
uvm_gpu_chunk_t **chunks,
|
||||
uvm_tracker_t *out_tracker)
|
||||
{
|
||||
return uvm_pmm_gpu_alloc(pmm, num_chunks, chunk_size, UVM_PMM_GPU_MEMORY_TYPE_KERNEL, flags, chunks, out_tracker);
|
||||
}
|
||||
|
||||
// Helper for allocating user memory
|
||||
//
|
||||
// Simple wrapper that just uses UVM_PMM_GPU_MEMORY_TYPE_USER for the memory
|
||||
// type.
|
||||
//
|
||||
// If the memory returned by the PMM allocator cannot be physically addressed,
|
||||
// the MMU interface provides user chunk mapping and unmapping functions
|
||||
// (uvm_mmu_chunk_map/unmap) that enable virtual addressing.
|
||||
// If Confidential Computing is enabled, this helper allocates protected user
|
||||
// memory.
|
||||
static NV_STATUS uvm_pmm_gpu_alloc_user(uvm_pmm_gpu_t *pmm,
|
||||
size_t num_chunks,
|
||||
uvm_chunk_size_t chunk_size,
|
||||
|
||||
@@ -618,7 +618,6 @@ static NV_STATUS cpu_chunk_map_on_cpu(uvm_cpu_chunk_t *chunk, void **cpu_addr)
|
||||
static NV_STATUS test_cpu_chunk_mapping_access(uvm_cpu_chunk_t *chunk, uvm_gpu_t *gpu)
|
||||
{
|
||||
NvU64 dma_addr;
|
||||
uvm_gpu_phys_address_t gpu_phys_addr;
|
||||
uvm_gpu_address_t gpu_addr;
|
||||
uvm_push_t push;
|
||||
NvU32 *cpu_addr;
|
||||
@@ -630,12 +629,7 @@ static NV_STATUS test_cpu_chunk_mapping_access(uvm_cpu_chunk_t *chunk, uvm_gpu_t
|
||||
memset(cpu_addr, 0, chunk_size);
|
||||
|
||||
dma_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent);
|
||||
gpu_phys_addr = uvm_gpu_phys_address(UVM_APERTURE_SYS, dma_addr);
|
||||
|
||||
if (uvm_mmu_gpu_needs_dynamic_sysmem_mapping(gpu))
|
||||
gpu_addr = uvm_gpu_address_virtual_from_sysmem_phys(gpu, gpu_phys_addr.address);
|
||||
else
|
||||
gpu_addr = uvm_gpu_address_from_phys(gpu_phys_addr);
|
||||
gpu_addr = uvm_gpu_address_copy(gpu, uvm_gpu_phys_address(UVM_APERTURE_SYS, dma_addr));
|
||||
|
||||
TEST_NV_CHECK_GOTO(uvm_push_begin_acquire(gpu->channel_manager,
|
||||
UVM_CHANNEL_TYPE_GPU_TO_CPU,
|
||||
|
||||
@@ -100,6 +100,22 @@ typedef enum
|
||||
// It is duplicated because we do not want to expose it as an API.
|
||||
static uvm_pmm_gpu_memory_type_t pmm_squash_memory_type(uvm_parent_gpu_t *parent_gpu, uvm_pmm_gpu_memory_type_t type)
|
||||
{
|
||||
if (uvm_conf_computing_mode_enabled_parent(parent_gpu))
|
||||
return type;
|
||||
|
||||
// Enforce the contract that when the Confidential Computing feature is
|
||||
// disabled, all user types are alike, as well as all kernel types,
|
||||
// respectively. See uvm_pmm_gpu_memory_type_t.
|
||||
switch (type) {
|
||||
case UVM_PMM_GPU_MEMORY_TYPE_USER: // Alias UVM_PMM_GPU_MEMORY_TYPE_USER_PROTECTED
|
||||
case UVM_PMM_GPU_MEMORY_TYPE_USER_UNPROTECTED:
|
||||
return UVM_PMM_GPU_MEMORY_TYPE_USER;
|
||||
case UVM_PMM_GPU_MEMORY_TYPE_KERNEL: // Alias UVM_PMM_GPU_MEMORY_TYPE_KERNEL_PROTECTED
|
||||
case UVM_PMM_GPU_MEMORY_TYPE_KERNEL_UNPROTECTED:
|
||||
return UVM_PMM_GPU_MEMORY_TYPE_KERNEL;
|
||||
default:
|
||||
UVM_ASSERT(0);
|
||||
}
|
||||
|
||||
return type;
|
||||
}
|
||||
@@ -306,6 +322,13 @@ static NV_STATUS gpu_mem_check(uvm_gpu_t *gpu,
|
||||
NvU32 *verif_cpu_addr = uvm_mem_get_cpu_addr_kernel(verif_mem);
|
||||
size_t i;
|
||||
|
||||
// TODO: Bug 3839176: [UVM][HCC][uvm_test] Update tests that assume GPU
|
||||
// engines can directly access sysmem
|
||||
// Skip this test for now. To enable this test under SEV,
|
||||
// The GPU->CPU CE copy needs to be updated so it uses encryption when
|
||||
// CC is enabled.
|
||||
if (uvm_conf_computing_mode_enabled(gpu))
|
||||
return NV_OK;
|
||||
UVM_ASSERT(verif_mem->size >= size);
|
||||
memset(verif_cpu_addr, 0, size);
|
||||
|
||||
@@ -341,6 +364,11 @@ static NV_STATUS gpu_mem_check(uvm_gpu_t *gpu,
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
static uvm_gpu_address_t chunk_copy_addr(uvm_gpu_t *gpu, uvm_gpu_chunk_t *chunk)
|
||||
{
|
||||
return uvm_gpu_address_copy(gpu, uvm_gpu_phys_address(UVM_APERTURE_VID, chunk->address));
|
||||
}
|
||||
|
||||
static NV_STATUS init_test_chunk(uvm_va_space_t *va_space,
|
||||
uvm_pmm_gpu_t *pmm,
|
||||
test_chunk_t *test_chunk,
|
||||
@@ -362,10 +390,7 @@ static NV_STATUS init_test_chunk(uvm_va_space_t *va_space,
|
||||
|
||||
TEST_NV_CHECK_GOTO(uvm_mmu_chunk_map(test_chunk->chunk), chunk_free);
|
||||
|
||||
if (uvm_mmu_gpu_needs_static_vidmem_mapping(gpu) || uvm_mmu_gpu_needs_dynamic_vidmem_mapping(gpu))
|
||||
chunk_addr = uvm_gpu_address_virtual_from_vidmem_phys(gpu, test_chunk->chunk->address);
|
||||
else
|
||||
chunk_addr = uvm_gpu_address_physical(UVM_APERTURE_VID, test_chunk->chunk->address);
|
||||
chunk_addr = chunk_copy_addr(gpu, test_chunk->chunk);
|
||||
|
||||
// Fill the chunk
|
||||
TEST_NV_CHECK_GOTO(do_memset_4(gpu, chunk_addr, pattern, size, &test_chunk->tracker), chunk_unmap);
|
||||
@@ -407,15 +432,10 @@ static NV_STATUS destroy_test_chunk(uvm_pmm_gpu_t *pmm, test_chunk_t *test_chunk
|
||||
{
|
||||
uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
|
||||
NV_STATUS status;
|
||||
uvm_gpu_address_t chunk_addr;
|
||||
uvm_gpu_chunk_t *chunk = test_chunk->chunk;
|
||||
uvm_gpu_address_t chunk_addr = chunk_copy_addr(gpu, chunk);
|
||||
uvm_chunk_size_t size = uvm_gpu_chunk_get_size(chunk);
|
||||
|
||||
if (uvm_mmu_gpu_needs_static_vidmem_mapping(gpu) || uvm_mmu_gpu_needs_dynamic_vidmem_mapping(gpu))
|
||||
chunk_addr = uvm_gpu_address_virtual_from_vidmem_phys(gpu, chunk->address);
|
||||
else
|
||||
chunk_addr = uvm_gpu_address_physical(UVM_APERTURE_VID, chunk->address);
|
||||
|
||||
status = gpu_mem_check(gpu, verif_mem, chunk_addr, size, test_chunk->pattern, &test_chunk->tracker);
|
||||
|
||||
list_del(&test_chunk->node);
|
||||
@@ -511,7 +531,7 @@ static NV_STATUS basic_test(uvm_va_space_t *va_space, uvm_gpu_t *gpu,
|
||||
|
||||
if (mode == UvmTestPmmSanityModeBasic) {
|
||||
first_memory_type = UVM_PMM_GPU_MEMORY_TYPE_USER;
|
||||
last_memory_type = UVM_PMM_GPU_MEMORY_TYPE_USER;
|
||||
last_memory_type = UVM_PMM_GPU_MEMORY_TYPE_USER_UNPROTECTED;
|
||||
first_free_pattern = BASIC_TEST_FREE_PATTERN_EVERY_N;
|
||||
last_free_pattern = BASIC_TEST_FREE_PATTERN_EVERY_N;
|
||||
}
|
||||
@@ -867,6 +887,8 @@ NV_STATUS uvm_test_pmm_check_leak(UVM_TEST_PMM_CHECK_LEAK_PARAMS *params, struct
|
||||
uvm_pmm_gpu_memory_type_t last_user_mode = UVM_PMM_GPU_MEMORY_TYPE_USER;
|
||||
uvm_pmm_gpu_memory_type_t current_user_mode = first_user_mode;
|
||||
|
||||
last_user_mode = UVM_PMM_GPU_MEMORY_TYPE_USER_UNPROTECTED;
|
||||
|
||||
if (params->alloc_limit < -1)
|
||||
return NV_ERR_INVALID_ARGUMENT;
|
||||
|
||||
@@ -1002,6 +1024,8 @@ NV_STATUS uvm_test_pmm_async_alloc(UVM_TEST_PMM_ASYNC_ALLOC_PARAMS *params, stru
|
||||
uvm_pmm_gpu_memory_type_t last_user_mode = UVM_PMM_GPU_MEMORY_TYPE_USER;
|
||||
uvm_pmm_gpu_memory_type_t current_user_mode = first_user_mode;
|
||||
|
||||
last_user_mode = UVM_PMM_GPU_MEMORY_TYPE_USER_UNPROTECTED;
|
||||
|
||||
uvm_va_space_down_read(va_space);
|
||||
gpu = uvm_va_space_get_gpu_by_uuid(va_space, ¶ms->gpu_uuid);
|
||||
if (!gpu) {
|
||||
@@ -1226,7 +1250,7 @@ static NV_STATUS test_indirect_peers(uvm_gpu_t *owning_gpu, uvm_gpu_t *accessing
|
||||
}
|
||||
|
||||
// Check that accessing_gpu can read and write
|
||||
local_addr = uvm_gpu_address_physical(UVM_APERTURE_VID, chunks[0]->address);
|
||||
local_addr = chunk_copy_addr(owning_gpu, chunks[0]);
|
||||
peer_addr = uvm_pmm_gpu_peer_copy_address(&owning_gpu->pmm, chunks[0], accessing_gpu);
|
||||
|
||||
// Init on local GPU
|
||||
@@ -1391,7 +1415,7 @@ NV_STATUS uvm_test_pmm_chunk_with_elevated_page(UVM_TEST_PMM_CHUNK_WITH_ELEVATED
|
||||
uvm_va_space_down_read(va_space);
|
||||
|
||||
for_each_va_space_gpu(gpu, va_space) {
|
||||
if (!gpu->parent->numa_info.enabled)
|
||||
if (!gpu->mem_info.numa.enabled)
|
||||
continue;
|
||||
|
||||
ran_test = true;
|
||||
|
||||
@@ -30,7 +30,7 @@
|
||||
#include "uvm_gpu.h"
|
||||
#include "uvm_va_space_mm.h"
|
||||
|
||||
bool uvm_is_valid_vma_range(struct mm_struct *mm, NvU64 start, NvU64 length)
|
||||
static bool uvm_is_valid_vma_range(struct mm_struct *mm, NvU64 start, NvU64 length)
|
||||
{
|
||||
const NvU64 end = start + length;
|
||||
struct vm_area_struct *vma;
|
||||
@@ -50,7 +50,7 @@ bool uvm_is_valid_vma_range(struct mm_struct *mm, NvU64 start, NvU64 length)
|
||||
return false;
|
||||
}
|
||||
|
||||
NV_STATUS uvm_api_range_type_check(uvm_va_space_t *va_space, struct mm_struct *mm, NvU64 base, NvU64 length)
|
||||
uvm_api_range_type_t uvm_api_range_type_check(uvm_va_space_t *va_space, struct mm_struct *mm, NvU64 base, NvU64 length)
|
||||
{
|
||||
uvm_va_range_t *va_range, *va_range_last;
|
||||
const NvU64 last_address = base + length - 1;
|
||||
@@ -61,21 +61,23 @@ NV_STATUS uvm_api_range_type_check(uvm_va_space_t *va_space, struct mm_struct *m
|
||||
uvm_assert_rwsem_locked(&va_space->lock);
|
||||
|
||||
if (uvm_api_range_invalid(base, length))
|
||||
return NV_ERR_INVALID_ADDRESS;
|
||||
return UVM_API_RANGE_TYPE_INVALID;
|
||||
|
||||
// Check if passed interval overlaps with any VA range.
|
||||
if (uvm_va_space_range_empty(va_space, base, last_address)) {
|
||||
if (g_uvm_global.ats.enabled &&
|
||||
uvm_va_space_pageable_mem_access_supported(va_space) &&
|
||||
mm &&
|
||||
uvm_is_valid_vma_range(mm, base, length))
|
||||
return NV_WARN_NOTHING_TO_DO;
|
||||
else if (uvm_hmm_is_enabled(va_space) &&
|
||||
mm &&
|
||||
uvm_is_valid_vma_range(mm, base, length))
|
||||
return NV_OK;
|
||||
else
|
||||
return NV_ERR_INVALID_ADDRESS;
|
||||
uvm_is_valid_vma_range(mm, base, length)) {
|
||||
|
||||
return UVM_API_RANGE_TYPE_ATS;
|
||||
}
|
||||
else if (uvm_hmm_is_enabled(va_space) && mm && uvm_is_valid_vma_range(mm, base, length)) {
|
||||
return UVM_API_RANGE_TYPE_HMM;
|
||||
}
|
||||
else {
|
||||
return UVM_API_RANGE_TYPE_INVALID;
|
||||
}
|
||||
}
|
||||
|
||||
va_range_last = NULL;
|
||||
@@ -86,10 +88,10 @@ NV_STATUS uvm_api_range_type_check(uvm_va_space_t *va_space, struct mm_struct *m
|
||||
// Check if passed interval overlaps with an unmanaged VA range, or a
|
||||
// sub-interval not tracked by a VA range
|
||||
if (!va_range_last || va_range_last->node.end < last_address)
|
||||
return NV_ERR_INVALID_ADDRESS;
|
||||
return UVM_API_RANGE_TYPE_INVALID;
|
||||
|
||||
// Passed interval is fully covered by managed VA ranges
|
||||
return NV_OK;
|
||||
return UVM_API_RANGE_TYPE_MANAGED;
|
||||
}
|
||||
|
||||
static NV_STATUS split_as_needed(uvm_va_space_t *va_space,
|
||||
@@ -282,7 +284,7 @@ static NV_STATUS preferred_location_set(uvm_va_space_t *va_space,
|
||||
|
||||
NV_STATUS uvm_api_set_preferred_location(const UVM_SET_PREFERRED_LOCATION_PARAMS *params, struct file *filp)
|
||||
{
|
||||
NV_STATUS status;
|
||||
NV_STATUS status = NV_OK;
|
||||
NV_STATUS tracker_status;
|
||||
uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
|
||||
uvm_va_space_t *va_space = uvm_va_space_get(filp);
|
||||
@@ -294,7 +296,7 @@ NV_STATUS uvm_api_set_preferred_location(const UVM_SET_PREFERRED_LOCATION_PARAMS
|
||||
const NvU64 start = params->requestedBase;
|
||||
const NvU64 length = params->length;
|
||||
const NvU64 end = start + length - 1;
|
||||
bool range_is_ats = false;
|
||||
uvm_api_range_type_t type;
|
||||
|
||||
UVM_ASSERT(va_space);
|
||||
|
||||
@@ -302,13 +304,10 @@ NV_STATUS uvm_api_set_preferred_location(const UVM_SET_PREFERRED_LOCATION_PARAMS
|
||||
uvm_va_space_down_write(va_space);
|
||||
has_va_space_write_lock = true;
|
||||
|
||||
status = uvm_api_range_type_check(va_space, mm, start, length);
|
||||
if (status != NV_OK) {
|
||||
if (status != NV_WARN_NOTHING_TO_DO)
|
||||
goto done;
|
||||
|
||||
status = NV_OK;
|
||||
range_is_ats = true;
|
||||
type = uvm_api_range_type_check(va_space, mm, start, length);
|
||||
if (type == UVM_API_RANGE_TYPE_INVALID) {
|
||||
status = NV_ERR_INVALID_ADDRESS;
|
||||
goto done;
|
||||
}
|
||||
|
||||
// If the CPU is the preferred location, we don't have to find the associated uvm_gpu_t
|
||||
@@ -333,10 +332,23 @@ NV_STATUS uvm_api_set_preferred_location(const UVM_SET_PREFERRED_LOCATION_PARAMS
|
||||
|
||||
UVM_ASSERT(status == NV_OK);
|
||||
|
||||
// TODO: Bug 2098544: On ATS systems, honor the preferred location policy
|
||||
// for system memory ranges instead of ignoring it.
|
||||
if (range_is_ats)
|
||||
// UvmSetPreferredLocation on non-ATS regions targets the VA range of the
|
||||
// associated file descriptor, not the calling process. Since
|
||||
// UvmSetPreferredLocation on ATS regions are handled in userspace,
|
||||
// implementing the non-ATS behavior is not possible. So, return an error
|
||||
// instead. Although the out of process case can be supported for HMM,
|
||||
// return an error to make the API behavior consistent for all SAM regions.
|
||||
if ((type != UVM_API_RANGE_TYPE_MANAGED) && (current->mm != mm)) {
|
||||
status = NV_ERR_NOT_SUPPORTED;
|
||||
goto done;
|
||||
}
|
||||
|
||||
// For ATS regions, let userspace handle it.
|
||||
if (type == UVM_API_RANGE_TYPE_ATS) {
|
||||
UVM_ASSERT(g_uvm_global.ats.enabled);
|
||||
status = NV_WARN_NOTHING_TO_DO;
|
||||
goto done;
|
||||
}
|
||||
|
||||
status = preferred_location_set(va_space, mm, start, length, preferred_location_id, &first_va_range_to_migrate, &local_tracker);
|
||||
if (status != NV_OK)
|
||||
@@ -384,19 +396,38 @@ NV_STATUS uvm_api_unset_preferred_location(const UVM_UNSET_PREFERRED_LOCATION_PA
|
||||
uvm_va_space_t *va_space = uvm_va_space_get(filp);
|
||||
struct mm_struct *mm;
|
||||
uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
|
||||
uvm_api_range_type_t type;
|
||||
|
||||
UVM_ASSERT(va_space);
|
||||
|
||||
mm = uvm_va_space_mm_or_current_retain_lock(va_space);
|
||||
uvm_va_space_down_write(va_space);
|
||||
|
||||
status = uvm_api_range_type_check(va_space, mm, params->requestedBase, params->length);
|
||||
type = uvm_api_range_type_check(va_space, mm, params->requestedBase, params->length);
|
||||
if (type == UVM_API_RANGE_TYPE_INVALID) {
|
||||
status = NV_ERR_INVALID_ADDRESS;
|
||||
goto done;
|
||||
}
|
||||
|
||||
if (status == NV_OK)
|
||||
status = preferred_location_set(va_space, mm, params->requestedBase, params->length, UVM_ID_INVALID, NULL, &local_tracker);
|
||||
else if (status == NV_WARN_NOTHING_TO_DO)
|
||||
status = NV_OK;
|
||||
if ((type != UVM_API_RANGE_TYPE_MANAGED) && (current->mm != mm)) {
|
||||
status = NV_ERR_NOT_SUPPORTED;
|
||||
goto done;
|
||||
}
|
||||
|
||||
if (type == UVM_API_RANGE_TYPE_ATS) {
|
||||
status = NV_WARN_NOTHING_TO_DO;
|
||||
goto done;
|
||||
}
|
||||
|
||||
status = preferred_location_set(va_space,
|
||||
mm,
|
||||
params->requestedBase,
|
||||
params->length,
|
||||
UVM_ID_INVALID,
|
||||
NULL,
|
||||
&local_tracker);
|
||||
|
||||
done:
|
||||
tracker_status = uvm_tracker_wait_deinit(&local_tracker);
|
||||
|
||||
uvm_va_space_up_write(va_space);
|
||||
@@ -483,26 +514,23 @@ static NV_STATUS accessed_by_set(uvm_va_space_t *va_space,
|
||||
bool set_bit)
|
||||
{
|
||||
uvm_processor_id_t processor_id = UVM_ID_INVALID;
|
||||
uvm_va_range_t *va_range, *va_range_last;
|
||||
struct mm_struct *mm;
|
||||
const NvU64 last_address = base + length - 1;
|
||||
bool range_is_sysmem = false;
|
||||
accessed_by_split_params_t split_params;
|
||||
uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
|
||||
NV_STATUS status;
|
||||
NV_STATUS status = NV_OK;
|
||||
NV_STATUS tracker_status;
|
||||
uvm_api_range_type_t type;
|
||||
|
||||
UVM_ASSERT(va_space);
|
||||
|
||||
mm = uvm_va_space_mm_or_current_retain_lock(va_space);
|
||||
uvm_va_space_down_write(va_space);
|
||||
|
||||
status = uvm_api_range_type_check(va_space, mm, base, length);
|
||||
if (status != NV_OK) {
|
||||
if (status != NV_WARN_NOTHING_TO_DO)
|
||||
goto done;
|
||||
status = NV_OK;
|
||||
range_is_sysmem = true;
|
||||
type = uvm_api_range_type_check(va_space, mm, base, length);
|
||||
if (type == UVM_API_RANGE_TYPE_INVALID) {
|
||||
status = NV_ERR_INVALID_ADDRESS;
|
||||
goto done;
|
||||
}
|
||||
|
||||
if (uvm_uuid_is_cpu(processor_uuid)) {
|
||||
@@ -523,8 +551,10 @@ static NV_STATUS accessed_by_set(uvm_va_space_t *va_space,
|
||||
processor_id = gpu->id;
|
||||
}
|
||||
|
||||
if (range_is_sysmem)
|
||||
if (type == UVM_API_RANGE_TYPE_ATS) {
|
||||
status = NV_OK;
|
||||
goto done;
|
||||
}
|
||||
|
||||
split_params.processor_id = processor_id;
|
||||
split_params.set_bit = set_bit;
|
||||
@@ -536,36 +566,35 @@ static NV_STATUS accessed_by_set(uvm_va_space_t *va_space,
|
||||
if (status != NV_OK)
|
||||
goto done;
|
||||
|
||||
va_range_last = NULL;
|
||||
uvm_for_each_managed_va_range_in_contig(va_range, va_space, base, last_address) {
|
||||
va_range_last = va_range;
|
||||
if (type == UVM_API_RANGE_TYPE_MANAGED) {
|
||||
uvm_va_range_t *va_range;
|
||||
uvm_va_range_t *va_range_last = NULL;
|
||||
|
||||
// If we didn't split the ends, check that they match
|
||||
if (va_range->node.start < base || va_range->node.end > last_address)
|
||||
UVM_ASSERT(uvm_processor_mask_test(&uvm_va_range_get_policy(va_range)->accessed_by,
|
||||
processor_id) == set_bit);
|
||||
uvm_for_each_managed_va_range_in_contig(va_range, va_space, base, last_address) {
|
||||
va_range_last = va_range;
|
||||
|
||||
if (set_bit) {
|
||||
status = uvm_va_range_set_accessed_by(va_range, processor_id, mm, &local_tracker);
|
||||
if (status != NV_OK)
|
||||
goto done;
|
||||
// If we didn't split the ends, check that they match
|
||||
if (va_range->node.start < base || va_range->node.end > last_address)
|
||||
UVM_ASSERT(uvm_processor_mask_test(&uvm_va_range_get_policy(va_range)->accessed_by,
|
||||
processor_id) == set_bit);
|
||||
|
||||
if (set_bit) {
|
||||
status = uvm_va_range_set_accessed_by(va_range, processor_id, mm, &local_tracker);
|
||||
if (status != NV_OK)
|
||||
goto done;
|
||||
}
|
||||
else {
|
||||
uvm_va_range_unset_accessed_by(va_range, processor_id, &local_tracker);
|
||||
}
|
||||
}
|
||||
else {
|
||||
uvm_va_range_unset_accessed_by(va_range, processor_id, &local_tracker);
|
||||
}
|
||||
}
|
||||
|
||||
if (va_range_last) {
|
||||
UVM_ASSERT(va_range_last);
|
||||
UVM_ASSERT(va_range_last->node.end >= last_address);
|
||||
goto done;
|
||||
}
|
||||
|
||||
status = uvm_hmm_set_accessed_by(va_space,
|
||||
processor_id,
|
||||
set_bit,
|
||||
base,
|
||||
last_address,
|
||||
&local_tracker);
|
||||
else {
|
||||
UVM_ASSERT(type == UVM_API_RANGE_TYPE_HMM);
|
||||
status = uvm_hmm_set_accessed_by(va_space, processor_id, set_bit, base, last_address, &local_tracker);
|
||||
}
|
||||
|
||||
done:
|
||||
tracker_status = uvm_tracker_wait_deinit(&local_tracker);
|
||||
@@ -756,11 +785,11 @@ static bool read_duplication_is_split_needed(const uvm_va_policy_t *policy, void
|
||||
|
||||
static NV_STATUS read_duplication_set(uvm_va_space_t *va_space, NvU64 base, NvU64 length, bool enable)
|
||||
{
|
||||
uvm_va_range_t *va_range, *va_range_last;
|
||||
struct mm_struct *mm;
|
||||
const NvU64 last_address = base + length - 1;
|
||||
NV_STATUS status;
|
||||
uvm_read_duplication_policy_t new_policy;
|
||||
uvm_api_range_type_t type;
|
||||
|
||||
UVM_ASSERT(va_space);
|
||||
|
||||
@@ -768,11 +797,13 @@ static NV_STATUS read_duplication_set(uvm_va_space_t *va_space, NvU64 base, NvU6
|
||||
mm = uvm_va_space_mm_or_current_retain_lock(va_space);
|
||||
uvm_va_space_down_write(va_space);
|
||||
|
||||
status = uvm_api_range_type_check(va_space, mm, base, length);
|
||||
if (status != NV_OK) {
|
||||
if (status == NV_WARN_NOTHING_TO_DO)
|
||||
status = NV_OK;
|
||||
|
||||
type = uvm_api_range_type_check(va_space, mm, base, length);
|
||||
if (type == UVM_API_RANGE_TYPE_INVALID) {
|
||||
status = NV_ERR_INVALID_ADDRESS;
|
||||
goto done;
|
||||
}
|
||||
else if (type == UVM_API_RANGE_TYPE_ATS) {
|
||||
status = NV_OK;
|
||||
goto done;
|
||||
}
|
||||
|
||||
@@ -787,43 +818,44 @@ static NV_STATUS read_duplication_set(uvm_va_space_t *va_space, NvU64 base, NvU6
|
||||
if (status != NV_OK)
|
||||
goto done;
|
||||
|
||||
va_range_last = NULL;
|
||||
uvm_for_each_managed_va_range_in_contig(va_range, va_space, base, last_address) {
|
||||
va_range_last = va_range;
|
||||
if (type == UVM_API_RANGE_TYPE_MANAGED) {
|
||||
uvm_va_range_t *va_range;
|
||||
uvm_va_range_t *va_range_last = NULL;
|
||||
|
||||
// If we didn't split the ends, check that they match
|
||||
if (va_range->node.start < base || va_range->node.end > last_address)
|
||||
UVM_ASSERT(uvm_va_range_get_policy(va_range)->read_duplication == new_policy);
|
||||
uvm_for_each_managed_va_range_in_contig(va_range, va_space, base, last_address) {
|
||||
va_range_last = va_range;
|
||||
|
||||
// If the va_space cannot currently read duplicate, only change the user
|
||||
// state. All memory should already have read duplication unset.
|
||||
if (uvm_va_space_can_read_duplicate(va_space, NULL)) {
|
||||
// If we didn't split the ends, check that they match
|
||||
if (va_range->node.start < base || va_range->node.end > last_address)
|
||||
UVM_ASSERT(uvm_va_range_get_policy(va_range)->read_duplication == new_policy);
|
||||
|
||||
// Handle SetAccessedBy mappings
|
||||
if (new_policy == UVM_READ_DUPLICATION_ENABLED) {
|
||||
status = uvm_va_range_set_read_duplication(va_range, mm);
|
||||
if (status != NV_OK)
|
||||
goto done;
|
||||
}
|
||||
else {
|
||||
// If unsetting read duplication fails, the return status is
|
||||
// not propagated back to the caller
|
||||
(void)uvm_va_range_unset_read_duplication(va_range, mm);
|
||||
// If the va_space cannot currently read duplicate, only change the user
|
||||
// state. All memory should already have read duplication unset.
|
||||
if (uvm_va_space_can_read_duplicate(va_space, NULL)) {
|
||||
|
||||
// Handle SetAccessedBy mappings
|
||||
if (new_policy == UVM_READ_DUPLICATION_ENABLED) {
|
||||
status = uvm_va_range_set_read_duplication(va_range, mm);
|
||||
if (status != NV_OK)
|
||||
goto done;
|
||||
}
|
||||
else {
|
||||
// If unsetting read duplication fails, the return status is
|
||||
// not propagated back to the caller
|
||||
(void)uvm_va_range_unset_read_duplication(va_range, mm);
|
||||
}
|
||||
}
|
||||
|
||||
uvm_va_range_get_policy(va_range)->read_duplication = new_policy;
|
||||
}
|
||||
|
||||
uvm_va_range_get_policy(va_range)->read_duplication = new_policy;
|
||||
}
|
||||
|
||||
if (va_range_last) {
|
||||
UVM_ASSERT(va_range_last);
|
||||
UVM_ASSERT(va_range_last->node.end >= last_address);
|
||||
goto done;
|
||||
}
|
||||
|
||||
status = uvm_hmm_set_read_duplication(va_space,
|
||||
new_policy,
|
||||
base,
|
||||
last_address);
|
||||
else {
|
||||
UVM_ASSERT(type == UVM_API_RANGE_TYPE_HMM);
|
||||
status = uvm_hmm_set_read_duplication(va_space, new_policy, base, last_address);
|
||||
}
|
||||
|
||||
done:
|
||||
uvm_va_space_up_write(va_space);
|
||||
|
||||
@@ -30,6 +30,14 @@
|
||||
#include "uvm_va_space.h"
|
||||
#include "uvm_populate_pageable.h"
|
||||
|
||||
#if defined(NV_HANDLE_MM_FAULT_HAS_MM_ARG)
|
||||
#define UVM_HANDLE_MM_FAULT(vma, addr, flags) handle_mm_fault(vma->vm_mm, vma, addr, flags)
|
||||
#elif defined(NV_HANDLE_MM_FAULT_HAS_PT_REGS_ARG)
|
||||
#define UVM_HANDLE_MM_FAULT(vma, addr, flags) handle_mm_fault(vma, addr, flags, NULL)
|
||||
#else
|
||||
#define UVM_HANDLE_MM_FAULT(vma, addr, flags) handle_mm_fault(vma, addr, flags)
|
||||
#endif
|
||||
|
||||
static bool is_write_populate(struct vm_area_struct *vma, uvm_populate_permissions_t populate_permissions)
|
||||
{
|
||||
switch (populate_permissions) {
|
||||
@@ -45,6 +53,34 @@ static bool is_write_populate(struct vm_area_struct *vma, uvm_populate_permissio
|
||||
}
|
||||
}
|
||||
|
||||
NV_STATUS uvm_handle_fault(struct vm_area_struct *vma, unsigned long start, unsigned long vma_num_pages, bool write)
|
||||
{
|
||||
NV_STATUS status = NV_OK;
|
||||
|
||||
unsigned long i;
|
||||
unsigned int ret = 0;
|
||||
unsigned int fault_flags = write ? FAULT_FLAG_WRITE : 0;
|
||||
|
||||
#ifdef FAULT_FLAG_REMOTE
|
||||
fault_flags |= (FAULT_FLAG_REMOTE);
|
||||
#endif
|
||||
|
||||
for (i = 0; i < vma_num_pages; i++) {
|
||||
ret = UVM_HANDLE_MM_FAULT(vma, start + (i * PAGE_SIZE), fault_flags);
|
||||
if (ret & VM_FAULT_ERROR) {
|
||||
#if defined(NV_VM_FAULT_TO_ERRNO_PRESENT)
|
||||
int err = vm_fault_to_errno(ret, fault_flags);
|
||||
status = errno_to_nv_status(err);
|
||||
#else
|
||||
status = errno_to_nv_status(-EFAULT);
|
||||
#endif
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
NV_STATUS uvm_populate_pageable_vma(struct vm_area_struct *vma,
|
||||
unsigned long start,
|
||||
unsigned long length,
|
||||
@@ -97,6 +133,10 @@ NV_STATUS uvm_populate_pageable_vma(struct vm_area_struct *vma,
|
||||
if (uvm_managed_vma)
|
||||
uvm_record_unlock_mmap_lock_read(mm);
|
||||
|
||||
status = uvm_handle_fault(vma, start, vma_num_pages, !!(gup_flags & FOLL_WRITE));
|
||||
if (status != NV_OK)
|
||||
goto out;
|
||||
|
||||
if (touch)
|
||||
ret = NV_PIN_USER_PAGES_REMOTE(mm, start, vma_num_pages, gup_flags, pages, NULL, NULL);
|
||||
else
|
||||
|
||||
@@ -54,7 +54,7 @@ static void uvm_pte_batch_flush_ptes_inline(uvm_pte_batch_t *batch)
|
||||
uvm_push_set_flag(batch->push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
|
||||
uvm_push_set_flag(batch->push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
|
||||
gpu->parent->ce_hal->memcopy(batch->push,
|
||||
uvm_gpu_address_from_phys(batch->pte_first_address),
|
||||
uvm_mmu_gpu_address(gpu, batch->pte_first_address),
|
||||
inline_data_addr,
|
||||
ptes_size);
|
||||
}
|
||||
@@ -62,7 +62,7 @@ static void uvm_pte_batch_flush_ptes_inline(uvm_pte_batch_t *batch)
|
||||
static void uvm_pte_batch_flush_ptes_memset(uvm_pte_batch_t *batch)
|
||||
{
|
||||
uvm_gpu_t *gpu = uvm_push_get_gpu(batch->push);
|
||||
uvm_gpu_address_t addr = uvm_gpu_address_from_phys(batch->pte_first_address);
|
||||
uvm_gpu_address_t addr = uvm_mmu_gpu_address(gpu, batch->pte_first_address);
|
||||
NvU32 i;
|
||||
|
||||
UVM_ASSERT(batch->pte_count != 0);
|
||||
@@ -201,7 +201,7 @@ void uvm_pte_batch_clear_ptes(uvm_pte_batch_t *batch, uvm_gpu_phys_address_t fir
|
||||
uvm_push_set_flag(batch->push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
|
||||
uvm_push_set_flag(batch->push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
|
||||
gpu->parent->ce_hal->memset_8(batch->push,
|
||||
uvm_gpu_address_from_phys(first_pte),
|
||||
uvm_mmu_gpu_address(gpu, first_pte),
|
||||
empty_pte_bits,
|
||||
entry_size * entry_count);
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2015-2021 NVIDIA Corporation
|
||||
Copyright (c) 2015-2023 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
@@ -351,11 +351,11 @@ void *uvm_push_inline_data_get(uvm_push_inline_data_t *data, size_t size)
|
||||
UVM_ASSERT(!uvm_global_is_suspended());
|
||||
|
||||
UVM_ASSERT_MSG(uvm_push_get_size(data->push) + uvm_push_inline_data_size(data) + UVM_METHOD_SIZE + size <= UVM_MAX_PUSH_SIZE,
|
||||
"push size %u inline data size %zu new data size %zu max push %u\n",
|
||||
uvm_push_get_size(data->push), uvm_push_inline_data_size(data), size, UVM_MAX_PUSH_SIZE);
|
||||
"push size %u inline data size %zu new data size %zu max push %u\n",
|
||||
uvm_push_get_size(data->push), uvm_push_inline_data_size(data), size, UVM_MAX_PUSH_SIZE);
|
||||
UVM_ASSERT_MSG(uvm_push_inline_data_size(data) + size <= UVM_PUSH_INLINE_DATA_MAX_SIZE,
|
||||
"inline data size %zu new data size %zu max %u\n",
|
||||
uvm_push_inline_data_size(data), size, UVM_PUSH_INLINE_DATA_MAX_SIZE);
|
||||
"inline data size %zu new data size %zu max %u\n",
|
||||
uvm_push_inline_data_size(data), size, UVM_PUSH_INLINE_DATA_MAX_SIZE);
|
||||
|
||||
data->next_data += size;
|
||||
|
||||
@@ -368,6 +368,7 @@ void *uvm_push_inline_data_get_aligned(uvm_push_inline_data_t *data, size_t size
|
||||
size_t offset = 0;
|
||||
char *buffer;
|
||||
|
||||
UVM_ASSERT(alignment <= UVM_PAGE_SIZE_4K);
|
||||
UVM_ASSERT_MSG(IS_ALIGNED(alignment, UVM_METHOD_SIZE), "alignment %zu\n", alignment);
|
||||
|
||||
offset = UVM_ALIGN_UP(next_ptr, alignment) - next_ptr;
|
||||
@@ -404,16 +405,16 @@ uvm_gpu_address_t uvm_push_inline_data_end(uvm_push_inline_data_t *data)
|
||||
return uvm_gpu_address_virtual(inline_data_address);
|
||||
}
|
||||
|
||||
// Same as uvm_push_get_single_inline_buffer() but provides the specified
|
||||
// alignment.
|
||||
static void *push_get_single_inline_buffer_aligned(uvm_push_t *push,
|
||||
size_t size,
|
||||
size_t alignment,
|
||||
uvm_gpu_address_t *gpu_address)
|
||||
void *uvm_push_get_single_inline_buffer(uvm_push_t *push,
|
||||
size_t size,
|
||||
size_t alignment,
|
||||
uvm_gpu_address_t *gpu_address)
|
||||
{
|
||||
uvm_push_inline_data_t data;
|
||||
void *buffer;
|
||||
|
||||
UVM_ASSERT(IS_ALIGNED(alignment, UVM_METHOD_SIZE));
|
||||
|
||||
uvm_push_inline_data_begin(push, &data);
|
||||
buffer = uvm_push_inline_data_get_aligned(&data, size, alignment);
|
||||
*gpu_address = uvm_push_inline_data_end(&data);
|
||||
@@ -423,11 +424,6 @@ static void *push_get_single_inline_buffer_aligned(uvm_push_t *push,
|
||||
return buffer;
|
||||
}
|
||||
|
||||
void *uvm_push_get_single_inline_buffer(uvm_push_t *push, size_t size, uvm_gpu_address_t *gpu_address)
|
||||
{
|
||||
return push_get_single_inline_buffer_aligned(push, size, UVM_METHOD_SIZE, gpu_address);
|
||||
}
|
||||
|
||||
NvU64 *uvm_push_timestamp(uvm_push_t *push)
|
||||
{
|
||||
uvm_gpu_t *gpu = uvm_push_get_gpu(push);
|
||||
@@ -435,12 +431,15 @@ NvU64 *uvm_push_timestamp(uvm_push_t *push)
|
||||
NvU64 *timestamp;
|
||||
uvm_gpu_address_t address;
|
||||
|
||||
timestamp = (NvU64 *)push_get_single_inline_buffer_aligned(push, timestamp_size, timestamp_size, &address);
|
||||
timestamp = (NvU64 *)uvm_push_get_single_inline_buffer(push, timestamp_size, timestamp_size, &address);
|
||||
|
||||
// Timestamp is in the second half of the 16 byte semaphore release
|
||||
timestamp += 1;
|
||||
|
||||
if (uvm_channel_is_ce(push->channel))
|
||||
gpu->parent->ce_hal->semaphore_timestamp(push, address.address);
|
||||
else if (uvm_channel_is_sec2(push->channel))
|
||||
gpu->parent->sec2_hal->semaphore_timestamp(push, address.address);
|
||||
else
|
||||
UVM_ASSERT_MSG(0, "Semaphore release timestamp on an unsupported channel.\n");
|
||||
|
||||
@@ -457,6 +456,8 @@ bool uvm_push_method_is_valid(uvm_push_t *push, NvU8 subch, NvU32 method_address
|
||||
return gpu->parent->host_hal->method_is_valid(push, method_address, method_data);
|
||||
else if (subch == UVM_SW_OBJ_SUBCHANNEL)
|
||||
return gpu->parent->host_hal->sw_method_is_valid(push, method_address, method_data);
|
||||
else if (subch == UVM_SUBCHANNEL_SEC2)
|
||||
return true;
|
||||
|
||||
UVM_ERR_PRINT("Unsupported subchannel 0x%x\n", subch);
|
||||
return false;
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2015-2021 NVIDIA Corporation
|
||||
Copyright (c) 2015-2023 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
@@ -31,10 +31,6 @@
|
||||
#include "uvm_tracker.h"
|
||||
#include "nvtypes.h"
|
||||
|
||||
// Space (in bytes) used by uvm_push_end() on a CE channel.
|
||||
// This is the storage required by a semaphore release.
|
||||
#define UVM_PUSH_CE_END_SIZE 24
|
||||
|
||||
// The max amount of inline push data is limited by how much space can be jumped
|
||||
// over with a single NOOP method.
|
||||
#define UVM_PUSH_INLINE_DATA_MAX_SIZE (UVM_METHOD_COUNT_MAX * UVM_METHOD_SIZE)
|
||||
@@ -93,6 +89,12 @@ struct uvm_push_struct
|
||||
|
||||
// A bitmap of flags from uvm_push_flag_t
|
||||
DECLARE_BITMAP(flags, UVM_PUSH_FLAG_COUNT);
|
||||
|
||||
// IV to use when launching WLC push
|
||||
UvmCslIv launch_iv;
|
||||
|
||||
// Channel to use for indirect submission
|
||||
uvm_channel_t *launch_channel;
|
||||
};
|
||||
|
||||
#define UVM_PUSH_ACQUIRE_INFO_MAX_ENTRIES 16
|
||||
@@ -127,7 +129,7 @@ struct uvm_push_acquire_info_struct
|
||||
};
|
||||
} values[UVM_PUSH_ACQUIRE_INFO_MAX_ENTRIES];
|
||||
|
||||
NvU32 num_values;
|
||||
NvU32 num_values;
|
||||
};
|
||||
|
||||
struct uvm_push_info_struct
|
||||
@@ -361,9 +363,17 @@ static bool uvm_push_has_space(uvm_push_t *push, NvU32 free_space)
|
||||
// These do just enough for inline push data and uvm_push_get_gpu() to work.
|
||||
// Used by tests that run on fake GPUs without a channel manager (see
|
||||
// uvm_page_tree_test.c for an example).
|
||||
// When the Confidential Computing feature is enabled, LCIC channels also use
|
||||
// fake push for other things, like encrypting semaphore values to unprotected
|
||||
// sysmem.
|
||||
NV_STATUS uvm_push_begin_fake(uvm_gpu_t *gpu, uvm_push_t *push);
|
||||
void uvm_push_end_fake(uvm_push_t *push);
|
||||
|
||||
static bool uvm_push_is_fake(uvm_push_t *push)
|
||||
{
|
||||
return !push->channel;
|
||||
}
|
||||
|
||||
// Begin an inline data fragment in the push
|
||||
//
|
||||
// The inline data will be ignored by the GPU, but can be referenced from
|
||||
@@ -380,6 +390,7 @@ void uvm_push_end_fake(uvm_push_t *push);
|
||||
static void uvm_push_inline_data_begin(uvm_push_t *push, uvm_push_inline_data_t *data)
|
||||
{
|
||||
data->push = push;
|
||||
|
||||
// +1 for the NOOP method inserted at inline_data_end()
|
||||
data->next_data = (char*)(push->next + 1);
|
||||
}
|
||||
@@ -407,7 +418,8 @@ void *uvm_push_inline_data_get(uvm_push_inline_data_t *data, size_t size);
|
||||
// Same as uvm_push_inline_data_get() but provides the specified alignment.
|
||||
void *uvm_push_inline_data_get_aligned(uvm_push_inline_data_t *data, size_t size, size_t alignment);
|
||||
|
||||
// Get a single buffer of size bytes of inline data in the push
|
||||
// Get a single buffer of size bytes of inline data in the push, alignment must
|
||||
// be positive and a multiple of UVM_METHOD_SIZE.
|
||||
//
|
||||
// Returns the CPU pointer to the beginning of the buffer. The buffer can be
|
||||
// accessed as long as the push is on-going. Also returns the GPU address of the
|
||||
@@ -415,7 +427,10 @@ void *uvm_push_inline_data_get_aligned(uvm_push_inline_data_t *data, size_t size
|
||||
//
|
||||
// This is a wrapper around uvm_push_inline_data_begin() and
|
||||
// uvm_push_inline_data_end() so see their comments for more details.
|
||||
void *uvm_push_get_single_inline_buffer(uvm_push_t *push, size_t size, uvm_gpu_address_t *gpu_address);
|
||||
void *uvm_push_get_single_inline_buffer(uvm_push_t *push,
|
||||
size_t size,
|
||||
size_t alignment,
|
||||
uvm_gpu_address_t *gpu_address);
|
||||
|
||||
// Helper that copies size bytes of data from src into the inline data fragment
|
||||
static void uvm_push_inline_data_add(uvm_push_inline_data_t *data, const void *src, size_t size)
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2015-2021 NVIDIA Corporation
|
||||
Copyright (c) 2015-2023 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
@@ -108,6 +108,14 @@
|
||||
// value. For example, Kepler reserves subchannels 5-7 for software objects.
|
||||
#define UVM_SUBCHANNEL_C076 UVM_SW_OBJ_SUBCHANNEL
|
||||
|
||||
// NVA06F_SUBCHANNEL_COMPUTE is a semi-arbitrary value for UVM_SUBCHANNEL_SEC2.
|
||||
// We need a "unique" subchannel across all subchannels UVM submits work. This
|
||||
// is used when we are post-processing a pushbuffer and we need to extract SEC2
|
||||
// methods from a it, having a unique subchannel facilitates the SEC2 method
|
||||
// identification.
|
||||
#define UVM_SUBCHANNEL_SEC2 NVA06F_SUBCHANNEL_COMPUTE
|
||||
#define UVM_SUBCHANNEL_CBA2 UVM_SUBCHANNEL_SEC2
|
||||
|
||||
#define UVM_METHOD_SIZE 4
|
||||
#define UVM_METHOD_COUNT_MAX HWMASK(B06F, DMA, INCR_COUNT)
|
||||
#if HWMASK(B06F, DMA, INCR_COUNT) != HWMASK(B06F, DMA, NONINCR_COUNT)
|
||||
@@ -126,29 +134,29 @@
|
||||
HWVALUE(B06F, DMA, NONINCR_SUBCHANNEL, (subch)) | \
|
||||
HWVALUE(B06F, DMA, NONINCR_COUNT, (count)))
|
||||
|
||||
#define __UVM_ASSERT_CONTIGUOUS_METHODS(a1, a2) BUILD_BUG_ON((a2) - (a1) != 4)
|
||||
#define __UVM_ASSERT_CONTIGUOUS_METHODS(a1, a2) BUILD_BUG_ON((a2) - (a1) != UVM_METHOD_SIZE)
|
||||
|
||||
// __NV_PUSH_*U support being called recursively from the N+1 sized method with
|
||||
// the _0U doing all the common things.
|
||||
// Notably all the push macros assume that symbol "push" of type uvm_push_t * is
|
||||
// in scope.
|
||||
#define __NV_PUSH_0U(subch, count, a1) \
|
||||
do { \
|
||||
UVM_ASSERT(!uvm_global_is_suspended()); \
|
||||
UVM_ASSERT(uvm_push_get_size(push) + (count + 1) * 4 <= UVM_MAX_PUSH_SIZE); \
|
||||
UVM_ASSERT_MSG(a1 % 4 == 0, "Address %u\n", a1); \
|
||||
\
|
||||
push->next[0] = UVM_METHOD_INC(subch, a1, count); \
|
||||
++push->next; \
|
||||
#define __NV_PUSH_0U(subch, count, a1) \
|
||||
do { \
|
||||
UVM_ASSERT(!uvm_global_is_suspended()); \
|
||||
UVM_ASSERT(uvm_push_get_size(push) + (count + 1) * UVM_METHOD_SIZE <= UVM_MAX_PUSH_SIZE); \
|
||||
UVM_ASSERT_MSG(IS_ALIGNED(a1, UVM_METHOD_SIZE), "Address %u\n", a1); \
|
||||
\
|
||||
push->next[0] = UVM_METHOD_INC(subch, a1, count); \
|
||||
++push->next; \
|
||||
} while (0)
|
||||
|
||||
#define __NV_PUSH_1U(subch, count, a1,d1) \
|
||||
do { \
|
||||
__NV_PUSH_0U(subch, count, a1); \
|
||||
push->next[0] = d1; \
|
||||
UVM_ASSERT_MSG(uvm_push_method_is_valid(push, subch, a1, d1), \
|
||||
"Method validation failed in channel %s\n", \
|
||||
push->channel->name); \
|
||||
"Method validation failed in channel %s\n", \
|
||||
push->channel->name); \
|
||||
push->next[0] = d1; \
|
||||
++push->next; \
|
||||
} while (0)
|
||||
|
||||
@@ -157,8 +165,8 @@
|
||||
__UVM_ASSERT_CONTIGUOUS_METHODS(a1, a2); \
|
||||
__NV_PUSH_1U(subch, count, a1,d1); \
|
||||
UVM_ASSERT_MSG(uvm_push_method_is_valid(push, subch, a2, d2), \
|
||||
"Method validation failed in channel %s\n", \
|
||||
push->channel->name); \
|
||||
"Method validation failed in channel %s\n", \
|
||||
push->channel->name); \
|
||||
push->next[0] = d2; \
|
||||
++push->next; \
|
||||
} while (0)
|
||||
@@ -168,8 +176,8 @@
|
||||
__UVM_ASSERT_CONTIGUOUS_METHODS(a2, a3); \
|
||||
__NV_PUSH_2U(subch, count, a1,d1, a2,d2); \
|
||||
UVM_ASSERT_MSG(uvm_push_method_is_valid(push, subch, a3, d3), \
|
||||
"Method validation failed in channel %s\n", \
|
||||
push->channel->name); \
|
||||
"Method validation failed in channel %s\n", \
|
||||
push->channel->name); \
|
||||
push->next[0] = d3; \
|
||||
++push->next; \
|
||||
} while (0)
|
||||
@@ -179,8 +187,8 @@
|
||||
__UVM_ASSERT_CONTIGUOUS_METHODS(a3, a4); \
|
||||
__NV_PUSH_3U(subch, count, a1,d1, a2,d2, a3,d3); \
|
||||
UVM_ASSERT_MSG(uvm_push_method_is_valid(push, subch, a4, d4), \
|
||||
"Method validation failed in channel %s\n", \
|
||||
push->channel->name); \
|
||||
"Method validation failed in channel %s\n", \
|
||||
push->channel->name); \
|
||||
push->next[0] = d4; \
|
||||
++push->next; \
|
||||
} while (0)
|
||||
@@ -190,8 +198,8 @@
|
||||
__UVM_ASSERT_CONTIGUOUS_METHODS(a4, a5); \
|
||||
__NV_PUSH_4U(subch, count, a1,d1, a2,d2, a3,d3, a4,d4); \
|
||||
UVM_ASSERT_MSG(uvm_push_method_is_valid(push, subch, a5, d5), \
|
||||
"Method validation failed in channel %s\n", \
|
||||
push->channel->name); \
|
||||
"Method validation failed in channel %s\n", \
|
||||
push->channel->name); \
|
||||
push->next[0] = d5; \
|
||||
++push->next; \
|
||||
} while (0)
|
||||
@@ -201,8 +209,8 @@
|
||||
__UVM_ASSERT_CONTIGUOUS_METHODS(a5, a6); \
|
||||
__NV_PUSH_5U(subch, count, a1,d1, a2,d2, a3,d3, a4,d4, a5,d5); \
|
||||
UVM_ASSERT_MSG(uvm_push_method_is_valid(push, subch, a6, d6), \
|
||||
"Method validation failed in channel %s\n", \
|
||||
push->channel->name); \
|
||||
"Method validation failed in channel %s\n", \
|
||||
push->channel->name); \
|
||||
push->next[0] = d6; \
|
||||
++push->next; \
|
||||
} while (0)
|
||||
@@ -248,13 +256,13 @@
|
||||
|
||||
// Non-incrementing method with count data fields following it. The data is left
|
||||
// untouched and hence it's primarily useful for a NOP method.
|
||||
#define __NV_PUSH_NU_NONINC(subch, count, address) \
|
||||
do { \
|
||||
UVM_ASSERT(!uvm_global_is_suspended()); \
|
||||
UVM_ASSERT(uvm_push_get_size(push) + (count + 1) * 4 <= UVM_MAX_PUSH_SIZE); \
|
||||
UVM_ASSERT_MSG(address % 4 == 0, "Address %u\n", address); \
|
||||
push->next[0] = UVM_METHOD_NONINC(subch, address, count); \
|
||||
push->next += count + 1; \
|
||||
#define __NV_PUSH_NU_NONINC(subch, count, address) \
|
||||
do { \
|
||||
UVM_ASSERT(!uvm_global_is_suspended()); \
|
||||
UVM_ASSERT(uvm_push_get_size(push) + (count + 1) * UVM_METHOD_SIZE <= UVM_MAX_PUSH_SIZE); \
|
||||
UVM_ASSERT_MSG(IS_ALIGNED(address, UVM_METHOD_SIZE), "Address %u\n", address); \
|
||||
push->next[0] = UVM_METHOD_NONINC(subch, address, count); \
|
||||
push->next += count + 1; \
|
||||
} while (0)
|
||||
|
||||
#define NV_PUSH_NU_NONINC(class, a1, count) \
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2015-2022 NVIDIA Corporation
|
||||
Copyright (c) 2015-2023 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
@@ -38,10 +38,78 @@
|
||||
|
||||
#define TEST_PUSH_INTERLEAVING_NUM_PAUSED_PUSHES 2
|
||||
|
||||
static NvU32 get_push_end_size(uvm_channel_t *channel)
|
||||
static NvU32 get_push_begin_size(uvm_channel_t *channel)
|
||||
{
|
||||
if (uvm_channel_is_ce(channel))
|
||||
return UVM_PUSH_CE_END_SIZE;
|
||||
if (uvm_channel_is_sec2(channel)) {
|
||||
// SEC2 channels allocate CSL signature buffer at the beginning.
|
||||
return UVM_CONF_COMPUTING_SIGN_BUF_MAX_SIZE + UVM_METHOD_SIZE;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// This is the storage required by a semaphore release.
|
||||
static NvU32 get_push_end_min_size(uvm_channel_t *channel)
|
||||
{
|
||||
if (uvm_channel_is_ce(channel)) {
|
||||
if (uvm_channel_is_wlc(channel)) {
|
||||
// Space (in bytes) used by uvm_push_end() on a Secure CE channel.
|
||||
// Note that Secure CE semaphore release pushes two memset and one
|
||||
// encryption method on top of the regular release.
|
||||
// Memset size
|
||||
// -------------
|
||||
// PUSH_2U (SET_REMAP) : 3 Words
|
||||
// PUSH_2U (OFFSET_OUT) : 3 Words
|
||||
// PUSH_1U (LINE_LENGTH_IN) : 2 Words
|
||||
// PUSH_1U (LAUNCH_DMA) : 2 Words
|
||||
// Total 10 * UVM_METHOD_SIZE : 40 Bytes
|
||||
//
|
||||
// Encrypt size
|
||||
// -------------
|
||||
// PUSH_1U (SET_SECURE_COPY_MODE) : 2 Words
|
||||
// PUSH_4U (ENCRYPT_AUTH_TAG + IV) : 5 Words
|
||||
// PUSH_4U (OFFSET_IN_OUT) : 5 Words
|
||||
// PUSH_2U (LINE_LENGTH_IN) : 2 Words
|
||||
// PUSH_2U (LAUNCH_DMA) : 2 Words
|
||||
// Total 16 * UVM_METHOD_SIZE : 64 Bytes
|
||||
//
|
||||
// TOTAL : 144 Bytes
|
||||
|
||||
// Same as CE + LCIC GPPut update + LCIC doorbell
|
||||
return 24 + 144 + 24 + 24;
|
||||
}
|
||||
else if (uvm_channel_is_secure_ce(channel)) {
|
||||
return 24 + 144;
|
||||
}
|
||||
// Space (in bytes) used by uvm_push_end() on a CE channel.
|
||||
return 24;
|
||||
}
|
||||
else if (uvm_channel_is_sec2(channel)) {
|
||||
// A perfectly aligned inline buffer in SEC2 semaphore release.
|
||||
// We add UVM_METHOD_SIZE because of the NOP method to reserve
|
||||
// UVM_CSL_SIGN_AUTH_TAG_SIZE_BYTES (the inline buffer.)
|
||||
return 48 + UVM_CSL_SIGN_AUTH_TAG_SIZE_BYTES + UVM_METHOD_SIZE;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static NvU32 get_push_end_max_size(uvm_channel_t *channel)
|
||||
{
|
||||
if (uvm_channel_is_ce(channel)) {
|
||||
if (uvm_channel_is_wlc(channel)) {
|
||||
// WLC pushes are always padded to UVM_MAX_WLC_PUSH_SIZE
|
||||
return UVM_MAX_WLC_PUSH_SIZE;
|
||||
}
|
||||
// Space (in bytes) used by uvm_push_end() on a CE channel.
|
||||
return get_push_end_min_size(channel);
|
||||
}
|
||||
else if (uvm_channel_is_sec2(channel)) {
|
||||
// Space (in bytes) used by uvm_push_end() on a SEC2 channel.
|
||||
// Note that SEC2 semaphore release uses an inline buffer with alignment
|
||||
// requirements. This is the "worst" case semaphore_release storage.
|
||||
return 48 + UVM_CSL_SIGN_AUTH_TAG_SIZE_BYTES + UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -56,27 +124,41 @@ static NV_STATUS test_push_end_size(uvm_va_space_t *va_space)
|
||||
for (type = 0; type < UVM_CHANNEL_TYPE_COUNT; ++type) {
|
||||
uvm_push_t push;
|
||||
NvU32 push_size_before;
|
||||
NvU32 push_end_size_observed, push_end_size_expected;
|
||||
NvU32 push_end_size_observed;
|
||||
NvU32 push_end_size_expected[2];
|
||||
|
||||
// SEC2 is only available when Confidential Computing is enabled
|
||||
if ((type == UVM_CHANNEL_TYPE_SEC2) && !uvm_conf_computing_mode_enabled(gpu))
|
||||
continue;
|
||||
|
||||
// WLC is only available when Confidential Computing is enabled
|
||||
if ((type == UVM_CHANNEL_TYPE_WLC) && !uvm_conf_computing_mode_enabled(gpu))
|
||||
continue;
|
||||
|
||||
// LCIC doesn't accept pushes
|
||||
if (type == UVM_CHANNEL_TYPE_LCIC)
|
||||
continue;
|
||||
TEST_NV_CHECK_RET(uvm_push_begin(gpu->channel_manager,
|
||||
type,
|
||||
&push,
|
||||
"type %s\n",
|
||||
"type %s",
|
||||
uvm_channel_type_to_string(type)));
|
||||
|
||||
push_size_before = uvm_push_get_size(&push);
|
||||
uvm_push_end(&push);
|
||||
|
||||
push_end_size_expected = get_push_end_size(push.channel);
|
||||
push_end_size_observed = uvm_push_get_size(&push) - push_size_before;
|
||||
|
||||
if (push_end_size_observed != push_end_size_expected) {
|
||||
UVM_TEST_PRINT("push_end_size incorrect, %u instead of %u on channel type %s for GPU %s\n",
|
||||
push_end_size_expected[0] = get_push_end_min_size(push.channel);
|
||||
push_end_size_expected[1] = get_push_end_max_size(push.channel);
|
||||
|
||||
if (push_end_size_observed < push_end_size_expected[0] ||
|
||||
push_end_size_observed > push_end_size_expected[1]) {
|
||||
UVM_TEST_PRINT("push_end_size incorrect, %u instead of [%u:%u] on channel type %s for GPU %s\n",
|
||||
push_end_size_observed,
|
||||
push_end_size_expected,
|
||||
push_end_size_expected[0],
|
||||
push_end_size_expected[1],
|
||||
uvm_channel_type_to_string(type),
|
||||
uvm_gpu_name(gpu));
|
||||
|
||||
// The size mismatch error gets precedence over a wait error
|
||||
(void) uvm_push_wait(&push);
|
||||
|
||||
@@ -107,6 +189,11 @@ static NV_STATUS test_push_inline_data_gpu(uvm_gpu_t *gpu)
|
||||
uvm_mem_t *mem = NULL;
|
||||
char *verif;
|
||||
|
||||
// TODO: Bug 3839176: test is waived on Confidential Computing because
|
||||
// it assumes that GPU can access system memory without using encryption.
|
||||
if (uvm_conf_computing_mode_enabled(gpu))
|
||||
return NV_OK;
|
||||
|
||||
status = uvm_mem_alloc_sysmem_and_map_cpu_kernel(UVM_PUSH_INLINE_DATA_MAX_SIZE, current->mm, &mem);
|
||||
TEST_CHECK_GOTO(status == NV_OK, done);
|
||||
|
||||
@@ -152,7 +239,10 @@ static NV_STATUS test_push_inline_data_gpu(uvm_gpu_t *gpu)
|
||||
inline_buf[j] = 1 + i + j;
|
||||
break;
|
||||
case TEST_INLINE_SINGLE_BUFFER:
|
||||
inline_buf = (char*)uvm_push_get_single_inline_buffer(&push, test_size, &data_gpu_address);
|
||||
inline_buf = (char*)uvm_push_get_single_inline_buffer(&push,
|
||||
test_size,
|
||||
UVM_METHOD_SIZE,
|
||||
&data_gpu_address);
|
||||
inline_data_size = test_size;
|
||||
for (j = 0; j < test_size; ++j)
|
||||
inline_buf[j] = 1 + i + j;
|
||||
@@ -221,6 +311,12 @@ static NV_STATUS test_concurrent_pushes(uvm_va_space_t *va_space)
|
||||
|
||||
for_each_va_space_gpu(gpu, va_space) {
|
||||
|
||||
// A secure channels reserved at the start of a push cannot be reserved
|
||||
// again until that push ends. The test would block indefinitely
|
||||
// if secure pools are not skipped, because the number of pushes started
|
||||
// per pool exceeds the number of channels in the pool.
|
||||
if (uvm_channel_type_requires_secure_pool(gpu, channel_type))
|
||||
goto done;
|
||||
for (i = 0; i < UVM_PUSH_MAX_CONCURRENT_PUSHES; ++i) {
|
||||
uvm_push_t *push = &pushes[i];
|
||||
status = uvm_push_begin(gpu->channel_manager, channel_type, push, "concurrent push %u", i);
|
||||
@@ -278,6 +374,11 @@ static NV_STATUS test_push_interleaving_on_gpu(uvm_gpu_t* gpu)
|
||||
uvm_rm_mem_t *mem = NULL;
|
||||
atomic_t on_complete_counter = ATOMIC_INIT(0);
|
||||
|
||||
// TODO: Bug 3839176: test is waived on Confidential Computing because
|
||||
// it assumes that GPU can access system memory without using encryption.
|
||||
if (uvm_conf_computing_mode_enabled(gpu))
|
||||
return NV_OK;
|
||||
|
||||
// This test issues virtual memcopies/memsets, which in SR-IOV heavy cannot
|
||||
// be pushed to a proxy channel. Pushing to a UVM internal CE channel works
|
||||
// in all scenarios.
|
||||
@@ -294,7 +395,7 @@ static NV_STATUS test_push_interleaving_on_gpu(uvm_gpu_t* gpu)
|
||||
num_non_paused_pushes = channel->num_gpfifo_entries;
|
||||
|
||||
// The UVM driver only allows push interleaving across separate threads, but
|
||||
// it is hard to consistenly replicate the interleaving. Instead, we
|
||||
// it is hard to consistently replicate the interleaving. Instead, we
|
||||
// temporarily disable lock tracking, so we can interleave pushes from a
|
||||
// single thread.
|
||||
uvm_thread_context_lock_disable_tracking();
|
||||
@@ -302,7 +403,7 @@ static NV_STATUS test_push_interleaving_on_gpu(uvm_gpu_t* gpu)
|
||||
status = uvm_rm_mem_alloc_and_map_cpu(gpu, UVM_RM_MEM_TYPE_SYS, size, 0, &mem);
|
||||
TEST_CHECK_GOTO(status == NV_OK, done);
|
||||
host_va = (NvU32*)uvm_rm_mem_get_cpu_va(mem);
|
||||
gpu_va = uvm_rm_mem_get_gpu_va(mem, gpu, uvm_channel_is_proxy(channel));
|
||||
gpu_va = uvm_rm_mem_get_gpu_va(mem, gpu, uvm_channel_is_proxy(channel)).address;
|
||||
memset(host_va, 0, size);
|
||||
|
||||
// Begin a few pushes on the channel, but do not end them yet.
|
||||
@@ -434,14 +535,14 @@ static NV_STATUS test_push_exactly_max_push(uvm_gpu_t *gpu,
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
TEST_CHECK_RET(uvm_push_has_space(push, UVM_MAX_PUSH_SIZE));
|
||||
TEST_CHECK_RET(!uvm_push_has_space(push, UVM_MAX_PUSH_SIZE + 1));
|
||||
TEST_CHECK_RET(uvm_push_has_space(push, UVM_MAX_PUSH_SIZE - get_push_begin_size(push->channel)));
|
||||
TEST_CHECK_RET(!uvm_push_has_space(push, UVM_MAX_PUSH_SIZE - get_push_begin_size(push->channel) + 1));
|
||||
|
||||
semaphore_gpu_va = uvm_gpu_semaphore_get_gpu_va(sema_to_acquire, gpu, uvm_channel_is_proxy(push->channel));
|
||||
gpu->parent->host_hal->semaphore_acquire(push, semaphore_gpu_va, value);
|
||||
|
||||
// Push a noop leaving just push_end_size in the pushbuffer.
|
||||
push_end_size = get_push_end_size(push->channel);
|
||||
push_end_size = get_push_end_max_size(push->channel);
|
||||
gpu->parent->host_hal->noop(push, UVM_MAX_PUSH_SIZE - uvm_push_get_size(push) - push_end_size);
|
||||
|
||||
TEST_CHECK_RET(uvm_push_has_space(push, push_end_size));
|
||||
@@ -476,7 +577,7 @@ static NvU32 test_count_available_chunks(uvm_pushbuffer_t *pushbuffer)
|
||||
|
||||
// Test doing pushes of exactly UVM_MAX_PUSH_SIZE size and only allowing them to
|
||||
// complete one by one.
|
||||
static NV_STATUS test_max_pushes_on_gpu_and_channel_type(uvm_gpu_t *gpu, uvm_channel_type_t channel_type)
|
||||
static NV_STATUS test_max_pushes_on_gpu(uvm_gpu_t *gpu)
|
||||
{
|
||||
NV_STATUS status;
|
||||
|
||||
@@ -485,6 +586,7 @@ static NV_STATUS test_max_pushes_on_gpu_and_channel_type(uvm_gpu_t *gpu, uvm_cha
|
||||
NvU32 total_push_size = 0;
|
||||
NvU32 push_count = 0;
|
||||
NvU32 i;
|
||||
uvm_channel_type_t channel_type = UVM_CHANNEL_TYPE_GPU_INTERNAL;
|
||||
|
||||
uvm_tracker_init(&tracker);
|
||||
|
||||
@@ -492,6 +594,13 @@ static NV_STATUS test_max_pushes_on_gpu_and_channel_type(uvm_gpu_t *gpu, uvm_cha
|
||||
TEST_CHECK_GOTO(status == NV_OK, done);
|
||||
|
||||
uvm_gpu_semaphore_set_payload(&sema, 0);
|
||||
if (uvm_conf_computing_mode_enabled(gpu)) {
|
||||
// Use SEC2 channel when Confidential Compute is enabled
|
||||
// since all other channel types need extra space for
|
||||
// work launch, and the channel type really doesn't
|
||||
// matter for this test.
|
||||
channel_type = UVM_CHANNEL_TYPE_SEC2;
|
||||
}
|
||||
|
||||
// Need to wait for all channels to completely idle so that the pushbuffer
|
||||
// is in completely idle state when we begin.
|
||||
@@ -553,14 +662,6 @@ done:
|
||||
return status;
|
||||
}
|
||||
|
||||
static NV_STATUS test_max_pushes_on_gpu(uvm_gpu_t *gpu)
|
||||
{
|
||||
|
||||
TEST_NV_CHECK_RET(test_max_pushes_on_gpu_and_channel_type(gpu, UVM_CHANNEL_TYPE_GPU_INTERNAL));
|
||||
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
// Test doing UVM_PUSHBUFFER_CHUNKS independent pushes expecting each one to use
|
||||
// a different chunk in the pushbuffer.
|
||||
static NV_STATUS test_idle_chunks_on_gpu(uvm_gpu_t *gpu)
|
||||
@@ -570,6 +671,15 @@ static NV_STATUS test_idle_chunks_on_gpu(uvm_gpu_t *gpu)
|
||||
uvm_gpu_semaphore_t sema;
|
||||
uvm_tracker_t tracker = UVM_TRACKER_INIT();
|
||||
NvU32 i;
|
||||
uvm_channel_type_t channel_type = UVM_CHANNEL_TYPE_GPU_INTERNAL;
|
||||
|
||||
if (uvm_conf_computing_mode_enabled(gpu)) {
|
||||
// Use SEC2 channel when Confidential Compute is enabled
|
||||
// since all other channel types need extra space for
|
||||
// work launch, and the channel type really doesn't
|
||||
// matter for this test.
|
||||
channel_type = UVM_CHANNEL_TYPE_SEC2;
|
||||
}
|
||||
|
||||
uvm_tracker_init(&tracker);
|
||||
|
||||
@@ -587,7 +697,7 @@ static NV_STATUS test_idle_chunks_on_gpu(uvm_gpu_t *gpu)
|
||||
NvU64 semaphore_gpu_va;
|
||||
uvm_push_t push;
|
||||
|
||||
status = uvm_push_begin(gpu->channel_manager, UVM_CHANNEL_TYPE_GPU_INTERNAL, &push, "Push using chunk %u", i);
|
||||
status = uvm_push_begin(gpu->channel_manager, channel_type, &push, "Push using chunk %u", i);
|
||||
TEST_CHECK_GOTO(status == NV_OK, done);
|
||||
|
||||
semaphore_gpu_va = uvm_gpu_semaphore_get_gpu_va(&sema, gpu, uvm_channel_is_proxy(push.channel));
|
||||
@@ -666,6 +776,15 @@ static NV_STATUS test_timestamp_on_gpu(uvm_gpu_t *gpu)
|
||||
NvU32 i;
|
||||
NvU64 last_stamp = 0;
|
||||
|
||||
// TODO: Bug 3988992: [UVM][HCC] RFE - Support encrypted semaphore for secure CE channels
|
||||
// This test is waived when Confidential Computing is enabled because it
|
||||
// assumes that CPU can directly read the result of a semaphore timestamp
|
||||
// operation. Instead the operation needs to be follower up by an encrypt
|
||||
// -decrypt trip to be accessible to CPU. This will be cleaner and simpler
|
||||
// once encrypted semaphores are available.
|
||||
if (uvm_conf_computing_mode_enabled(gpu))
|
||||
return NV_OK;
|
||||
|
||||
for (i = 0; i < 10; ++i) {
|
||||
status = uvm_push_begin(gpu->channel_manager, UVM_CHANNEL_TYPE_GPU_INTERNAL, &push, "Releasing a timestamp");
|
||||
if (status != NV_OK)
|
||||
@@ -769,6 +888,10 @@ static NV_STATUS test_push_gpu_to_gpu(uvm_va_space_t *va_space)
|
||||
|
||||
for_each_va_space_gpu(gpu_a, va_space) {
|
||||
|
||||
// TODO: Bug 3839176: the test is waived on Confidential Computing because
|
||||
// it assumes that GPU can access system memory without using encryption.
|
||||
if (uvm_conf_computing_mode_enabled(gpu_a))
|
||||
return NV_OK;
|
||||
for_each_va_space_gpu(gpu_b, va_space) {
|
||||
if (can_do_peer_copies(va_space, gpu_a, gpu_b)) {
|
||||
waive = false;
|
||||
|
||||
@@ -32,6 +32,7 @@
|
||||
#include "uvm_gpu.h"
|
||||
#include "uvm_common.h"
|
||||
#include "uvm_linux.h"
|
||||
#include "uvm_conf_computing.h"
|
||||
|
||||
// Print pushbuffer state into a seq_file if provided or with UVM_DBG_PRINT() if not.
|
||||
static void uvm_pushbuffer_print_common(uvm_pushbuffer_t *pushbuffer, struct seq_file *s);
|
||||
@@ -120,6 +121,36 @@ NV_STATUS uvm_pushbuffer_create(uvm_channel_manager_t *channel_manager, uvm_push
|
||||
if (status != NV_OK)
|
||||
goto error;
|
||||
|
||||
if (uvm_conf_computing_mode_enabled(gpu)) {
|
||||
UVM_ASSERT(channel_manager->conf.pushbuffer_loc == UVM_BUFFER_LOCATION_SYS);
|
||||
|
||||
// Move the above allocation to unprotected_sysmem
|
||||
pushbuffer->memory_unprotected_sysmem = pushbuffer->memory;
|
||||
pushbuffer->memory = NULL;
|
||||
|
||||
// Make sure the base can be least 4KB aligned. Pushes can include inline buffers
|
||||
// with specific alignment requirement. Different base between backing memory
|
||||
// locations would change that.
|
||||
pushbuffer->memory_protected_sysmem = uvm_kvmalloc_zero(UVM_PUSHBUFFER_SIZE + UVM_PAGE_SIZE_4K);
|
||||
if (!pushbuffer->memory_protected_sysmem) {
|
||||
status = NV_ERR_NO_MEMORY;
|
||||
goto error;
|
||||
}
|
||||
|
||||
|
||||
status = uvm_rm_mem_alloc(gpu,
|
||||
UVM_RM_MEM_TYPE_GPU,
|
||||
UVM_PUSHBUFFER_SIZE,
|
||||
pushbuffer_alignment,
|
||||
&pushbuffer->memory);
|
||||
if (status != NV_OK)
|
||||
goto error;
|
||||
|
||||
status = uvm_rm_mem_map_gpu(pushbuffer->memory_unprotected_sysmem, gpu, pushbuffer_alignment);
|
||||
if (status != NV_OK)
|
||||
goto error;
|
||||
}
|
||||
|
||||
// Verify the GPU can access the pushbuffer.
|
||||
UVM_ASSERT((uvm_pushbuffer_get_gpu_va_base(pushbuffer) + UVM_PUSHBUFFER_SIZE - 1) < gpu->parent->max_host_va);
|
||||
|
||||
@@ -227,9 +258,24 @@ done:
|
||||
return chunk != NULL;
|
||||
}
|
||||
|
||||
static char *get_base_cpu_va(uvm_pushbuffer_t *pushbuffer)
|
||||
{
|
||||
// Confidential Computing pushes are assembled in protected sysmem
|
||||
// and safely (through encrypt/decrypt) moved to protected vidmem.
|
||||
// Or signed and moved to unprotected sysmem.
|
||||
if (uvm_conf_computing_mode_enabled(pushbuffer->channel_manager->gpu)) {
|
||||
// Align protected sysmem base to 4kB. This should be enough to give
|
||||
// the same alignment behaviour for inline buffers as the other two
|
||||
// backing memory locations.
|
||||
return (char*)(UVM_ALIGN_UP((uintptr_t)pushbuffer->memory_protected_sysmem, UVM_PAGE_SIZE_4K));
|
||||
}
|
||||
|
||||
return (char *)uvm_rm_mem_get_cpu_va(pushbuffer->memory);
|
||||
}
|
||||
|
||||
static NvU32 *chunk_get_next_push_start_addr(uvm_pushbuffer_t *pushbuffer, uvm_pushbuffer_chunk_t *chunk)
|
||||
{
|
||||
char *push_start = (char *)uvm_rm_mem_get_cpu_va(pushbuffer->memory);
|
||||
char *push_start = get_base_cpu_va(pushbuffer);
|
||||
push_start += chunk_get_offset(pushbuffer, chunk);
|
||||
push_start += chunk->next_push_start;
|
||||
|
||||
@@ -266,6 +312,16 @@ NV_STATUS uvm_pushbuffer_begin_push(uvm_pushbuffer_t *pushbuffer, uvm_push_t *pu
|
||||
|
||||
UVM_ASSERT(pushbuffer);
|
||||
UVM_ASSERT(push);
|
||||
UVM_ASSERT(push->channel);
|
||||
|
||||
if (uvm_channel_is_wlc(push->channel)) {
|
||||
// WLC pushes use static PB and don't count against max concurrent
|
||||
// pushes.
|
||||
push->begin = (void*)UVM_ALIGN_UP((uintptr_t)push->channel->conf_computing.static_pb_protected_sysmem,
|
||||
UVM_PAGE_SIZE_4K);
|
||||
push->next = push->begin;
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
// Note that this semaphore is uvm_up()ed in end_push().
|
||||
uvm_down(&pushbuffer->concurrent_pushes_sema);
|
||||
@@ -374,6 +430,8 @@ void uvm_pushbuffer_destroy(uvm_pushbuffer_t *pushbuffer)
|
||||
|
||||
proc_remove(pushbuffer->procfs.info_file);
|
||||
|
||||
uvm_rm_mem_free(pushbuffer->memory_unprotected_sysmem);
|
||||
uvm_kvfree(pushbuffer->memory_protected_sysmem);
|
||||
uvm_rm_mem_free(pushbuffer->memory);
|
||||
uvm_kvfree(pushbuffer);
|
||||
}
|
||||
@@ -426,7 +484,17 @@ void uvm_pushbuffer_mark_completed(uvm_pushbuffer_t *pushbuffer, uvm_gpfifo_entr
|
||||
|
||||
NvU32 uvm_pushbuffer_get_offset_for_push(uvm_pushbuffer_t *pushbuffer, uvm_push_t *push)
|
||||
{
|
||||
NvU32 offset = (char*)push->begin - (char *)uvm_rm_mem_get_cpu_va(pushbuffer->memory);
|
||||
NvU32 offset;
|
||||
|
||||
if (uvm_channel_is_wlc(push->channel)) {
|
||||
// WLC channels use private static PB and their gpfifo entries are not
|
||||
// added to any chunk's list. This only needs to return legal offset.
|
||||
// Completion cleanup will not find WLC gpfifo entries as either first
|
||||
// or last entry of any chunk.
|
||||
return 0;
|
||||
}
|
||||
|
||||
offset = (char*)push->begin - get_base_cpu_va(pushbuffer);
|
||||
|
||||
UVM_ASSERT(((NvU64)offset) % sizeof(NvU32) == 0);
|
||||
|
||||
@@ -439,14 +507,65 @@ NvU64 uvm_pushbuffer_get_gpu_va_for_push(uvm_pushbuffer_t *pushbuffer, uvm_push_
|
||||
uvm_gpu_t *gpu = uvm_push_get_gpu(push);
|
||||
bool is_proxy_channel = uvm_channel_is_proxy(push->channel);
|
||||
|
||||
pushbuffer_base = uvm_rm_mem_get_gpu_va(pushbuffer->memory, gpu, is_proxy_channel);
|
||||
pushbuffer_base = uvm_rm_mem_get_gpu_va(pushbuffer->memory, gpu, is_proxy_channel).address;
|
||||
|
||||
if (uvm_channel_is_wlc(push->channel) || uvm_channel_is_lcic(push->channel)) {
|
||||
// We need to use the same static locations for PB as the fixed
|
||||
// schedule because that's what the channels are initialized to use.
|
||||
return uvm_rm_mem_get_gpu_uvm_va(push->channel->conf_computing.static_pb_protected_vidmem, gpu);
|
||||
}
|
||||
else if (uvm_channel_is_sec2(push->channel)) {
|
||||
// SEC2 PBs are in unprotected sysmem
|
||||
pushbuffer_base = uvm_pushbuffer_get_sec2_gpu_va_base(pushbuffer);
|
||||
}
|
||||
|
||||
return pushbuffer_base + uvm_pushbuffer_get_offset_for_push(pushbuffer, push);
|
||||
}
|
||||
|
||||
void *uvm_pushbuffer_get_unprotected_cpu_va_for_push(uvm_pushbuffer_t *pushbuffer, uvm_push_t *push)
|
||||
{
|
||||
char *pushbuffer_base;
|
||||
|
||||
if (uvm_channel_is_wlc(push->channel)) {
|
||||
// Reuse existing WLC static pb for initialization
|
||||
UVM_ASSERT(!uvm_channel_manager_is_wlc_ready(push->channel->pool->manager));
|
||||
return push->channel->conf_computing.static_pb_unprotected_sysmem_cpu;
|
||||
}
|
||||
|
||||
pushbuffer_base = uvm_rm_mem_get_cpu_va(pushbuffer->memory_unprotected_sysmem);
|
||||
|
||||
return pushbuffer_base + uvm_pushbuffer_get_offset_for_push(pushbuffer, push);
|
||||
}
|
||||
|
||||
NvU64 uvm_pushbuffer_get_unprotected_gpu_va_for_push(uvm_pushbuffer_t *pushbuffer, uvm_push_t *push)
|
||||
{
|
||||
NvU64 pushbuffer_base;
|
||||
|
||||
if (uvm_channel_is_wlc(push->channel)) {
|
||||
// Reuse existing WLC static pb for initialization
|
||||
UVM_ASSERT(!uvm_channel_manager_is_wlc_ready(push->channel->pool->manager));
|
||||
return uvm_rm_mem_get_gpu_uvm_va(push->channel->conf_computing.static_pb_unprotected_sysmem,
|
||||
uvm_push_get_gpu(push));
|
||||
}
|
||||
|
||||
pushbuffer_base = uvm_rm_mem_get_gpu_uvm_va(pushbuffer->memory_unprotected_sysmem, uvm_push_get_gpu(push));
|
||||
|
||||
return pushbuffer_base + uvm_pushbuffer_get_offset_for_push(pushbuffer, push);
|
||||
}
|
||||
|
||||
void uvm_pushbuffer_end_push(uvm_pushbuffer_t *pushbuffer, uvm_push_t *push, uvm_gpfifo_entry_t *gpfifo)
|
||||
{
|
||||
uvm_pushbuffer_chunk_t *chunk = gpfifo_to_chunk(pushbuffer, gpfifo);
|
||||
uvm_pushbuffer_chunk_t *chunk;
|
||||
|
||||
if (uvm_channel_is_wlc(push->channel)) {
|
||||
// WLC channels use static pushbuffer and don't count towards max
|
||||
// concurrent pushes. Initializing the node as an empty list head makes
// sure the deletion in "uvm_pushbuffer_mark_completed" doesn't crash.
|
||||
INIT_LIST_HEAD(&gpfifo->pending_list_node);
|
||||
return;
|
||||
}
|
||||
|
||||
chunk = gpfifo_to_chunk(pushbuffer, gpfifo);
|
||||
|
||||
uvm_channel_pool_assert_locked(push->channel->pool);
|
||||
|
||||
@@ -513,3 +632,10 @@ NvU64 uvm_pushbuffer_get_gpu_va_base(uvm_pushbuffer_t *pushbuffer)
|
||||
{
|
||||
return uvm_rm_mem_get_gpu_uvm_va(pushbuffer->memory, pushbuffer->channel_manager->gpu);
|
||||
}
|
||||
|
||||
NvU64 uvm_pushbuffer_get_sec2_gpu_va_base(uvm_pushbuffer_t *pushbuffer)
|
||||
{
|
||||
UVM_ASSERT(uvm_conf_computing_mode_enabled(pushbuffer->channel_manager->gpu));
|
||||
|
||||
return uvm_rm_mem_get_gpu_uvm_va(pushbuffer->memory_unprotected_sysmem, pushbuffer->channel_manager->gpu);
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2015-2022 NVIDIA Corporation
|
||||
Copyright (c) 2015-2023 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
@@ -155,6 +155,42 @@
|
||||
// uvm_push_end().
|
||||
#define UVM_PUSH_MAX_CONCURRENT_PUSHES UVM_PUSHBUFFER_CHUNKS
|
||||
|
||||
// Push space needed for the static part of the WLC schedule, as initialized in
|
||||
// 'setup_wlc_schedule':
|
||||
// * CE decrypt (of WLC PB): 56B
|
||||
// * WFI: 8B
|
||||
// Total: 64B
|
||||
//
|
||||
// Push space needed for secure work launch is 224B. The push is constructed
|
||||
// in 'internal_channel_submit_work_indirect' and 'uvm_channel_end_push':
|
||||
// * CE decrypt (of indirect PB): 56B
|
||||
// * 2*semaphore release (indirect GPFIFO entry): 2*24B
|
||||
// * semaphore release (indirect GPPUT): 24B
|
||||
// * semaphore release (indirect doorbell): 24B
|
||||
// Appendix added in 'uvm_channel_end_push':
|
||||
// * semaphore release (WLC tracking): 168B
|
||||
// * semaphore increment (memcopy): 24B
|
||||
// * notifier memset: 40B
|
||||
// * payload encryption: 64B
|
||||
// * notifier memset: 40B
|
||||
// * semaphore increment (LCIC GPPUT): 24B
|
||||
// * semaphore release (LCIC doorbell): 24B
|
||||
// Total: 368B
|
||||
#define UVM_MAX_WLC_PUSH_SIZE (368)
|
||||
|
||||
// Push space needed for the static LCIC schedule, as initialized in
|
||||
// 'setup_lcic_schedule':
|
||||
// * WFI: 8B
|
||||
// * semaphore increment (WLC GPPUT): 24B
|
||||
// * semaphore increment (WLC GPPUT): 24B
|
||||
// * semaphore increment (LCIC tracking): 160B
|
||||
// * semaphore increment (memcopy): 24B
|
||||
// * notifier memcopy: 36B
|
||||
// * payload encryption: 64B
|
||||
// * notifier memcopy: 36B
|
||||
// Total: 216B
|
||||
#define UVM_LCIC_PUSH_SIZE (216)
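// The byte budgets above are upper bounds for fixed-schedule pushes. A hedged
// sketch of how such a budget might be checked when a static push is
// finalized; uvm_push_get_size() is assumed to return the bytes written so
// far, and the helper itself is illustrative, not driver code.
static void example_check_wlc_push_budget(uvm_push_t *push)
{
    UVM_ASSERT(uvm_push_get_size(push) <= UVM_MAX_WLC_PUSH_SIZE);
}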
|
||||
|
||||
typedef struct
|
||||
{
|
||||
// Offset within the chunk of where a next push should begin if there is
|
||||
@@ -176,6 +212,12 @@ struct uvm_pushbuffer_struct
|
||||
// Memory allocation backing the pushbuffer
|
||||
uvm_rm_mem_t *memory;
|
||||
|
||||
// Mirror image of 'memory' in unprotected DMA sysmem
|
||||
uvm_rm_mem_t *memory_unprotected_sysmem;
|
||||
|
||||
// Secure sysmem backing memory
|
||||
void *memory_protected_sysmem;
|
||||
|
||||
// Array of the pushbuffer chunks
|
||||
uvm_pushbuffer_chunk_t chunks[UVM_PUSHBUFFER_CHUNKS];
|
||||
|
||||
@@ -221,6 +263,12 @@ void uvm_pushbuffer_mark_completed(uvm_pushbuffer_t *pushbuffer, uvm_gpfifo_entr
|
||||
// Get the GPU VA for an ongoing push
|
||||
NvU64 uvm_pushbuffer_get_gpu_va_for_push(uvm_pushbuffer_t *pushbuffer, uvm_push_t *push);
|
||||
|
||||
// Get the CPU VA for encrypted sysmem mirror
|
||||
void *uvm_pushbuffer_get_unprotected_cpu_va_for_push(uvm_pushbuffer_t *pushbuffer, uvm_push_t *push);
|
||||
|
||||
// Get the GPU VA for encrypted sysmem mirror
|
||||
NvU64 uvm_pushbuffer_get_unprotected_gpu_va_for_push(uvm_pushbuffer_t *pushbuffer, uvm_push_t *push);
|
||||
|
||||
// Get the offset of the beginning of the push from the base of the pushbuffer allocation
|
||||
NvU32 uvm_pushbuffer_get_offset_for_push(uvm_pushbuffer_t *pushbuffer, uvm_push_t *push);
|
||||
|
||||
@@ -239,4 +287,8 @@ void uvm_pushbuffer_print(uvm_pushbuffer_t *pushbuffer);
|
||||
// Helper to retrieve the pushbuffer->memory GPU VA.
|
||||
NvU64 uvm_pushbuffer_get_gpu_va_base(uvm_pushbuffer_t *pushbuffer);
|
||||
|
||||
// SEC2 variant to retrieve GPU VA for push location.
|
||||
// Unlike other channels, SEC2 uses signed pushes in unprotected sysmem.
|
||||
NvU64 uvm_pushbuffer_get_sec2_gpu_va_base(uvm_pushbuffer_t *pushbuffer);
|
||||
|
||||
#endif // __UVM_PUSHBUFFER_H__
|
||||
|
||||
@@ -78,7 +78,7 @@ static NvU64 test_free_range(uvm_range_allocator_t *range_allocator, uvm_range_a
|
||||
return size;
|
||||
}
|
||||
|
||||
#define BASIC_TEST_SIZE (1024ull * 1024 * 1024)
|
||||
#define BASIC_TEST_SIZE UVM_SIZE_1GB
|
||||
#define BASIC_TEST_MAX_ALLOCS (128)
|
||||
|
||||
// Check that a specific range is free in the allocator
|
||||
|
||||
@@ -102,12 +102,21 @@ NvU64 uvm_rm_mem_get_gpu_proxy_va(uvm_rm_mem_t *rm_mem, uvm_gpu_t *gpu)
|
||||
return rm_mem->proxy_vas[uvm_global_id_value(gpu->global_id)];
|
||||
}
|
||||
|
||||
NvU64 uvm_rm_mem_get_gpu_va(uvm_rm_mem_t *rm_mem, uvm_gpu_t *gpu, bool is_proxy_va_space)
|
||||
uvm_gpu_address_t uvm_rm_mem_get_gpu_va(uvm_rm_mem_t *rm_mem, uvm_gpu_t *gpu, bool is_proxy_va_space)
|
||||
{
|
||||
uvm_gpu_address_t gpu_va = {0};
|
||||
|
||||
gpu_va.aperture = UVM_APERTURE_MAX;
|
||||
gpu_va.is_virtual = true;
|
||||
if (uvm_conf_computing_mode_enabled(gpu) && (rm_mem->type == UVM_RM_MEM_TYPE_SYS))
|
||||
gpu_va.is_unprotected = true;
|
||||
|
||||
if (is_proxy_va_space)
|
||||
return uvm_rm_mem_get_gpu_proxy_va(rm_mem, gpu);
|
||||
gpu_va.address = uvm_rm_mem_get_gpu_proxy_va(rm_mem, gpu);
|
||||
else
|
||||
return uvm_rm_mem_get_gpu_uvm_va(rm_mem, gpu);
|
||||
gpu_va.address = uvm_rm_mem_get_gpu_uvm_va(rm_mem, gpu);
|
||||
|
||||
return gpu_va;
|
||||
}
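// With uvm_rm_mem_get_gpu_va() now returning a uvm_gpu_address_t instead of a
// raw NvU64, call sites read the field they need. A hedged caller-side sketch;
// the helper name and parameters are placeholders, UVM headers assumed in
// scope.
static NvU64 example_get_raw_gpu_va(uvm_rm_mem_t *rm_mem, uvm_gpu_t *gpu, bool is_proxy)
{
    uvm_gpu_address_t addr = uvm_rm_mem_get_gpu_va(rm_mem, gpu, is_proxy);

    // The former NvU64 return value lives in .address; .is_unprotected carries
    // the new Confidential Computing information.
    return addr.address;
}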
|
||||
|
||||
void *uvm_rm_mem_get_cpu_va(uvm_rm_mem_t *rm_mem)
|
||||
@@ -199,6 +208,9 @@ NV_STATUS uvm_rm_mem_alloc(uvm_gpu_t *gpu,
|
||||
if (rm_mem == NULL)
|
||||
return NV_ERR_NO_MEMORY;
|
||||
|
||||
if (!uvm_conf_computing_mode_enabled(gpu) || type == UVM_RM_MEM_TYPE_SYS)
|
||||
alloc_info.bUnprotected = NV_TRUE;
|
||||
|
||||
alloc_info.alignment = gpu_alignment;
|
||||
|
||||
if (type == UVM_RM_MEM_TYPE_SYS)
|
||||
@@ -245,6 +257,8 @@ NV_STATUS uvm_rm_mem_map_cpu(uvm_rm_mem_t *rm_mem)
|
||||
|
||||
gpu = rm_mem->gpu_owner;
|
||||
gpu_va = uvm_rm_mem_get_gpu_uvm_va(rm_mem, gpu);
|
||||
if (uvm_conf_computing_mode_enabled(gpu))
|
||||
UVM_ASSERT(rm_mem->type == UVM_RM_MEM_TYPE_SYS);
|
||||
|
||||
status = uvm_rm_locked_call(nvUvmInterfaceMemoryCpuMap(gpu->rm_address_space,
|
||||
gpu_va,
|
||||
|
||||
@@ -27,6 +27,7 @@
|
||||
#include "uvm_forward_decl.h"
|
||||
#include "uvm_processors.h"
|
||||
#include "uvm_test_ioctl.h"
|
||||
#include "uvm_hal_types.h"
|
||||
|
||||
typedef enum
|
||||
{
|
||||
@@ -143,9 +144,7 @@ NvU64 uvm_rm_mem_get_gpu_proxy_va(uvm_rm_mem_t *rm_mem, uvm_gpu_t *gpu);
|
||||
|
||||
// Get the GPU VA of the given memory in UVM's internal address space (if the
|
||||
// flag is false), or proxy address space (if flag is true).
|
||||
NvU64 uvm_rm_mem_get_gpu_va(uvm_rm_mem_t *rm_mem,
|
||||
uvm_gpu_t *gpu,
|
||||
bool is_proxy_va_space);
|
||||
uvm_gpu_address_t uvm_rm_mem_get_gpu_va(uvm_rm_mem_t *rm_mem, uvm_gpu_t *gpu, bool is_proxy_va_space);
|
||||
|
||||
// Query if the memory is mapped on the CPU, GPU (UVM internal/kernel address
|
||||
// space), or GPU (proxy address space)
|
||||
|
||||
@@ -195,12 +195,16 @@ static NV_STATUS test_all_gpus_in_va(uvm_va_space_t *va_space)
|
||||
for (i = 0; i < ARRAY_SIZE(sizes); ++i) {
|
||||
for (j = 0; j < ARRAY_SIZE(mem_types); ++j) {
|
||||
for (k = 0; k < ARRAY_SIZE(alignments); ++k) {
|
||||
bool test_cpu_mappings = true;
|
||||
|
||||
// Create an allocation in the GPU's address space
|
||||
TEST_NV_CHECK_RET(uvm_rm_mem_alloc(gpu, mem_types[j], sizes[i], alignments[k], &rm_mem));
|
||||
|
||||
test_cpu_mappings = mem_types[j] == UVM_RM_MEM_TYPE_SYS ||
|
||||
!uvm_conf_computing_mode_enabled(gpu);
|
||||
// Test CPU mappings
|
||||
TEST_NV_CHECK_GOTO(map_cpu(rm_mem), error);
|
||||
if (test_cpu_mappings)
|
||||
TEST_NV_CHECK_GOTO(map_cpu(rm_mem), error);
|
||||
|
||||
// Test mappings in the GPU owning the allocation
|
||||
TEST_NV_CHECK_GOTO(map_gpu_owner(rm_mem, alignments[k]), error);
|
||||
|
||||
614
kernel-open/nvidia-uvm/uvm_sec2_test.c
Normal file
@@ -0,0 +1,614 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2021-2023 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
deal in the Software without restriction, including without limitation the
|
||||
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
sell copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
DEALINGS IN THE SOFTWARE.
|
||||
|
||||
*******************************************************************************/
|
||||
|
||||
#include <uvm_common.h>
|
||||
#include <uvm_gpu.h>
|
||||
#include <uvm_mem.h>
|
||||
#include <uvm_push.h>
|
||||
#include <uvm_hal.h>
|
||||
#include <uvm_test.h>
|
||||
#include <uvm_va_space.h>
|
||||
#include <uvm_kvmalloc.h>
|
||||
#include <linux/string.h>
|
||||
#include "nv_uvm_interface.h"
|
||||
|
||||
typedef struct test_sem_mem_t {
|
||||
void *cpu_va;
|
||||
NvU64 gpu_va;
|
||||
|
||||
union {
|
||||
uvm_mem_t *uvm_mem;
|
||||
uvm_rm_mem_t *rm_mem;
|
||||
};
|
||||
} test_sem_mem;
|
||||
|
||||
static NV_STATUS test_semaphore_alloc_uvm_rm_mem(uvm_gpu_t *gpu, const size_t size, test_sem_mem *mem_out)
|
||||
{
|
||||
NV_STATUS status;
|
||||
uvm_rm_mem_t *mem = NULL;
|
||||
NvU64 gpu_va;
|
||||
|
||||
status = uvm_rm_mem_alloc_and_map_cpu(gpu, UVM_RM_MEM_TYPE_SYS, size, 0, &mem);
|
||||
TEST_NV_CHECK_RET(status);
|
||||
|
||||
gpu_va = uvm_rm_mem_get_gpu_uvm_va(mem, gpu);
|
||||
TEST_CHECK_GOTO(gpu_va < gpu->parent->max_host_va, error);
|
||||
|
||||
mem_out->cpu_va = uvm_rm_mem_get_cpu_va(mem);
|
||||
mem_out->gpu_va = gpu_va;
|
||||
mem_out->rm_mem = mem;
|
||||
|
||||
return NV_OK;
|
||||
|
||||
error:
|
||||
uvm_rm_mem_free(mem);
|
||||
return status;
|
||||
}
|
||||
|
||||
static NV_STATUS test_semaphore_alloc_sem(uvm_gpu_t *gpu, const size_t size, test_sem_mem *mem_out)
|
||||
{
|
||||
NV_STATUS status = NV_OK;
|
||||
uvm_mem_t *mem = NULL;
|
||||
NvU64 gpu_va;
|
||||
|
||||
TEST_NV_CHECK_RET(uvm_mem_alloc_sysmem_dma(size, gpu, current->mm, &mem));
|
||||
|
||||
TEST_NV_CHECK_GOTO(uvm_mem_map_gpu_kernel(mem, gpu), error);
|
||||
gpu_va = uvm_mem_get_gpu_va_kernel(mem, gpu);
|
||||
|
||||
// Use an RM allocation when SEC2 cannot address the semaphore.
|
||||
// SEC2 VA width is similar to Host's.
|
||||
if (gpu_va >= gpu->parent->max_host_va) {
|
||||
uvm_mem_free(mem);
|
||||
return test_semaphore_alloc_uvm_rm_mem(gpu, size, mem_out);
|
||||
}
|
||||
|
||||
// This semaphore resides in the uvm_mem region, i.e., it has the GPU VA
|
||||
// MSbit set. The intent is to validate semaphore operations when the
|
||||
// semaphore's VA is in the high-end of the GPU effective virtual address
|
||||
// space spectrum, i.e., its VA upper-bit is set.
|
||||
TEST_CHECK_GOTO(gpu_va & (1ULL << (gpu->address_space_tree.hal->num_va_bits() - 1)), error);
|
||||
|
||||
TEST_NV_CHECK_GOTO(uvm_mem_map_cpu_kernel(mem), error);
|
||||
|
||||
mem_out->cpu_va = uvm_mem_get_cpu_addr_kernel(mem);
|
||||
mem_out->gpu_va = gpu_va;
|
||||
mem_out->uvm_mem = mem;
|
||||
|
||||
return NV_OK;
|
||||
|
||||
error:
|
||||
uvm_mem_free(mem);
|
||||
return status;
|
||||
}
|
||||
|
||||
static void test_semaphore_free_sem(uvm_gpu_t *gpu, test_sem_mem *mem)
|
||||
{
|
||||
if (mem->gpu_va >= gpu->parent->uvm_mem_va_base)
|
||||
uvm_mem_free(mem->uvm_mem);
|
||||
else
|
||||
uvm_rm_mem_free(mem->rm_mem);
|
||||
}
|
||||
|
||||
// This test is similar to the test_semaphore_release() test in
|
||||
// uvm_host_test.c, except that this one uses sec2_hal->semaphore_release();
|
||||
static NV_STATUS test_semaphore_release(uvm_gpu_t *gpu)
|
||||
{
|
||||
NV_STATUS status;
|
||||
test_sem_mem mem = { 0 };
|
||||
uvm_push_t push;
|
||||
NvU32 value;
|
||||
NvU32 payload = 0xA5A55A5A;
|
||||
NvU32 *cpu_ptr;
|
||||
|
||||
// Semaphore release needs 1 word (4 bytes).
|
||||
const size_t size = sizeof(NvU32);
|
||||
|
||||
status = test_semaphore_alloc_sem(gpu, size, &mem);
|
||||
TEST_NV_CHECK_RET(status);
|
||||
|
||||
// Initialize the payload.
|
||||
cpu_ptr = (NvU32 *)mem.cpu_va;
|
||||
*cpu_ptr = 0;
|
||||
|
||||
status = uvm_push_begin(gpu->channel_manager, UVM_CHANNEL_TYPE_SEC2, &push, "semaphore_release test");
|
||||
TEST_NV_CHECK_GOTO(status, done);
|
||||
|
||||
gpu->parent->sec2_hal->semaphore_release(&push, mem.gpu_va, payload);
|
||||
|
||||
status = uvm_push_end_and_wait(&push);
|
||||
TEST_NV_CHECK_GOTO(status, done);
|
||||
|
||||
value = *cpu_ptr;
|
||||
if (value != payload) {
|
||||
UVM_TEST_PRINT("Semaphore payload = %u instead of %u, GPU %s\n", value, payload, uvm_gpu_name(gpu));
|
||||
status = NV_ERR_INVALID_STATE;
|
||||
goto done;
|
||||
}
|
||||
|
||||
done:
|
||||
test_semaphore_free_sem(gpu, &mem);
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
// This test is similar to the test_semaphore_timestamp() test in
|
||||
// uvm_ce_test.c, except that this one uses sec2_hal->semaphore_timestamp();
|
||||
static NV_STATUS test_semaphore_timestamp(uvm_gpu_t *gpu)
|
||||
{
|
||||
NV_STATUS status;
|
||||
test_sem_mem mem = { 0 };
|
||||
uvm_push_t push;
|
||||
NvU32 i;
|
||||
NvU64 *timestamp;
|
||||
NvU64 last_timestamp = 0;
|
||||
|
||||
// 2 iterations:
|
||||
// 1: compare retrieved timestamp with 0;
|
||||
// 2: compare retrieved timestamp with previous timestamp (obtained in 1).
|
||||
const NvU32 iterations = 2;
|
||||
|
||||
// The semaphore is 4 words long (16 bytes).
|
||||
const size_t size = 16;
|
||||
|
||||
// TODO: Bug 3804752: SEC2 semaphore timestamp is not implemented for
|
||||
// Hopper
|
||||
if (uvm_conf_computing_mode_is_hcc(gpu))
|
||||
return NV_OK;
|
||||
|
||||
status = test_semaphore_alloc_sem(gpu, size, &mem);
|
||||
TEST_NV_CHECK_RET(status);
|
||||
|
||||
timestamp = (NvU64 *)mem.cpu_va;
|
||||
TEST_CHECK_GOTO(timestamp != NULL, done);
|
||||
memset(timestamp, 0, size);
|
||||
|
||||
// Shift the timestamp pointer to where the semaphore timestamp info is.
|
||||
timestamp += 1;
|
||||
|
||||
for (i = 0; i < iterations; i++) {
|
||||
status = uvm_push_begin(gpu->channel_manager,
|
||||
UVM_CHANNEL_TYPE_SEC2,
|
||||
&push,
|
||||
"semaphore_timestamp test, iter: %u",
|
||||
i);
|
||||
TEST_NV_CHECK_GOTO(status, done);
|
||||
|
||||
gpu->parent->sec2_hal->semaphore_timestamp(&push, mem.gpu_va);
|
||||
|
||||
status = uvm_push_end_and_wait(&push);
|
||||
TEST_NV_CHECK_GOTO(status, done);
|
||||
|
||||
TEST_CHECK_GOTO(*timestamp != 0, done);
|
||||
TEST_CHECK_GOTO(*timestamp >= last_timestamp, done);
|
||||
last_timestamp = *timestamp;
|
||||
}
|
||||
|
||||
done:
|
||||
test_semaphore_free_sem(gpu, &mem);
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
typedef enum
|
||||
{
|
||||
MEM_ALLOC_TYPE_SYSMEM_DMA,
|
||||
MEM_ALLOC_TYPE_VIDMEM_PROTECTED
|
||||
} mem_alloc_type_t;
|
||||
|
||||
static bool mem_match(uvm_mem_t *mem1, uvm_mem_t *mem2)
|
||||
{
|
||||
void *mem1_addr;
|
||||
void *mem2_addr;
|
||||
|
||||
UVM_ASSERT(uvm_mem_is_sysmem(mem1));
|
||||
UVM_ASSERT(uvm_mem_is_sysmem(mem2));
|
||||
UVM_ASSERT(mem1->size == mem2->size);
|
||||
|
||||
mem1_addr = uvm_mem_get_cpu_addr_kernel(mem1);
|
||||
mem2_addr = uvm_mem_get_cpu_addr_kernel(mem2);
|
||||
|
||||
return !memcmp(mem1_addr, mem2_addr, mem1->size);
|
||||
}
|
||||
|
||||
static NV_STATUS ce_memset_gpu(uvm_gpu_t *gpu, uvm_mem_t *mem, size_t size, NvU32 val)
|
||||
{
|
||||
uvm_push_t push;
|
||||
|
||||
TEST_NV_CHECK_RET(uvm_push_begin(gpu->channel_manager,
|
||||
UVM_CHANNEL_TYPE_GPU_INTERNAL,
|
||||
&push,
|
||||
"VPR memset"));
|
||||
|
||||
gpu->parent->ce_hal->memset_4(&push, uvm_mem_gpu_address_virtual_kernel(mem, gpu), val, size);
|
||||
|
||||
TEST_NV_CHECK_RET(uvm_push_end_and_wait(&push));
|
||||
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
static void write_range_cpu(uvm_mem_t *mem, size_t size, NvU64 base_val)
|
||||
{
|
||||
char *start, *end;
|
||||
|
||||
UVM_ASSERT(uvm_mem_is_sysmem(mem));
|
||||
|
||||
start = uvm_mem_get_cpu_addr_kernel(mem);
|
||||
end = start + size;
|
||||
|
||||
for (; start < end; start += sizeof(NvU64))
|
||||
*(NvU64 *) start = base_val++;
|
||||
}
|
||||
|
||||
static NV_STATUS alloc_and_init_mem(uvm_gpu_t *gpu, uvm_mem_t **mem, size_t size, mem_alloc_type_t type)
|
||||
{
|
||||
NV_STATUS status = NV_OK;
|
||||
|
||||
UVM_ASSERT(mem);
|
||||
|
||||
*mem = NULL;
|
||||
|
||||
if (type == MEM_ALLOC_TYPE_VIDMEM_PROTECTED) {
|
||||
TEST_NV_CHECK_RET(uvm_mem_alloc_vidmem_protected(size, gpu, mem));
|
||||
TEST_NV_CHECK_GOTO(uvm_mem_map_gpu_kernel(*mem, gpu), err);
|
||||
TEST_NV_CHECK_GOTO(ce_memset_gpu(gpu, *mem, size, 0xdead), err);
|
||||
}
|
||||
else {
|
||||
TEST_NV_CHECK_RET(uvm_mem_alloc_sysmem_dma(size, gpu, NULL, mem));
|
||||
TEST_NV_CHECK_GOTO(uvm_mem_map_cpu_kernel(*mem), err);
|
||||
TEST_NV_CHECK_GOTO(uvm_mem_map_gpu_kernel(*mem, gpu), err);
|
||||
write_range_cpu(*mem, size, 0xdeaddead);
|
||||
}
|
||||
|
||||
return NV_OK;
|
||||
|
||||
err:
|
||||
uvm_mem_free(*mem);
|
||||
return status;
|
||||
}
|
||||
|
||||
static void cpu_encrypt(uvm_channel_t *channel,
|
||||
uvm_mem_t *dst_mem,
|
||||
uvm_mem_t *src_mem,
|
||||
uvm_mem_t *auth_tag_mem,
|
||||
size_t size,
|
||||
size_t copy_size)
|
||||
{
|
||||
size_t i;
|
||||
void *src_plain = uvm_mem_get_cpu_addr_kernel(src_mem);
|
||||
void *dst_cipher = uvm_mem_get_cpu_addr_kernel(dst_mem);
|
||||
void *auth_tag_buffer = uvm_mem_get_cpu_addr_kernel(auth_tag_mem);
|
||||
|
||||
UVM_ASSERT(IS_ALIGNED(size, copy_size));
|
||||
|
||||
for (i = 0; i < size / copy_size; i++) {
|
||||
uvm_conf_computing_cpu_encrypt(channel, dst_cipher, src_plain, NULL, copy_size, auth_tag_buffer);
|
||||
|
||||
src_plain = (char *) src_plain + copy_size;
|
||||
dst_cipher = (char *) dst_cipher + copy_size;
|
||||
auth_tag_buffer = (char *) auth_tag_buffer + UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
|
||||
}
|
||||
}
|
||||
|
||||
static NV_STATUS cpu_decrypt(uvm_channel_t *channel,
|
||||
uvm_mem_t *dst_mem,
|
||||
uvm_mem_t *src_mem,
|
||||
UvmCslIv *decrypt_iv,
|
||||
uvm_mem_t *auth_tag_mem,
|
||||
size_t size,
|
||||
size_t copy_size)
|
||||
{
|
||||
size_t i;
|
||||
void *src_cipher = uvm_mem_get_cpu_addr_kernel(src_mem);
|
||||
void *dst_plain = uvm_mem_get_cpu_addr_kernel(dst_mem);
|
||||
void *auth_tag_buffer = uvm_mem_get_cpu_addr_kernel(auth_tag_mem);
|
||||
|
||||
UVM_ASSERT(IS_ALIGNED(size, copy_size));
|
||||
|
||||
for (i = 0; i < size / copy_size; i++) {
|
||||
TEST_NV_CHECK_RET(uvm_conf_computing_cpu_decrypt(channel,
|
||||
dst_plain,
|
||||
src_cipher,
|
||||
&decrypt_iv[i],
|
||||
copy_size,
|
||||
auth_tag_buffer));
|
||||
|
||||
dst_plain = (char *) dst_plain + copy_size;
|
||||
src_cipher = (char *) src_cipher + copy_size;
|
||||
auth_tag_buffer = (char *) auth_tag_buffer + UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
|
||||
}
|
||||
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
// gpu_encrypt uses a secure CE for encryption (instead of SEC2). SEC2 does not
|
||||
// support encryption. The following function is copied from uvm_ce_test.c and
|
||||
// adapted to SEC2 tests.
|
||||
static void gpu_encrypt(uvm_push_t *push,
|
||||
uvm_mem_t *dst_mem,
|
||||
uvm_mem_t *src_mem,
|
||||
UvmCslIv *decrypt_iv,
|
||||
uvm_mem_t *auth_tag_mem,
|
||||
size_t size,
|
||||
size_t copy_size)
|
||||
{
|
||||
size_t i;
|
||||
size_t num_iterations = size / copy_size;
|
||||
uvm_gpu_t *gpu = uvm_push_get_gpu(push);
|
||||
uvm_gpu_address_t dst_cipher_address = uvm_mem_gpu_address_virtual_kernel(dst_mem, gpu);
|
||||
uvm_gpu_address_t src_plain_address = uvm_mem_gpu_address_virtual_kernel(src_mem, gpu);
|
||||
uvm_gpu_address_t auth_tag_address = uvm_mem_gpu_address_virtual_kernel(auth_tag_mem, gpu);
|
||||
|
||||
for (i = 0; i < num_iterations; i++) {
|
||||
uvm_conf_computing_log_gpu_encryption(push->channel, decrypt_iv);
|
||||
|
||||
if (i > 0)
|
||||
uvm_push_set_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
|
||||
|
||||
uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
|
||||
|
||||
gpu->parent->ce_hal->encrypt(push, dst_cipher_address, src_plain_address, copy_size, auth_tag_address);
|
||||
dst_cipher_address.address += copy_size;
|
||||
src_plain_address.address += copy_size;
|
||||
auth_tag_address.address += UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
|
||||
decrypt_iv++;
|
||||
}
|
||||
}
|
||||
|
||||
static void gpu_decrypt(uvm_push_t *push,
|
||||
uvm_mem_t *dst_mem,
|
||||
uvm_mem_t *src_mem,
|
||||
uvm_mem_t *auth_tag_mem,
|
||||
size_t size,
|
||||
size_t copy_size)
|
||||
{
|
||||
size_t i;
|
||||
size_t num_iterations = size / copy_size;
|
||||
uvm_gpu_t *gpu = uvm_push_get_gpu(push);
|
||||
uvm_gpu_address_t src_cipher_address = uvm_mem_gpu_address_virtual_kernel(src_mem, gpu);
|
||||
uvm_gpu_address_t dst_plain_address = uvm_mem_gpu_address_virtual_kernel(dst_mem, gpu);
|
||||
uvm_gpu_address_t auth_tag_gpu_address = uvm_mem_gpu_address_virtual_kernel(auth_tag_mem, gpu);
|
||||
|
||||
UVM_ASSERT(IS_ALIGNED(size, copy_size));
|
||||
|
||||
for (i = 0; i < num_iterations; i++) {
|
||||
uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
|
||||
gpu->parent->sec2_hal->decrypt(push,
|
||||
dst_plain_address.address,
|
||||
src_cipher_address.address,
|
||||
copy_size,
|
||||
auth_tag_gpu_address.address);
|
||||
|
||||
dst_plain_address.address += copy_size;
|
||||
src_cipher_address.address += copy_size;
|
||||
auth_tag_gpu_address.address += UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
|
||||
}
|
||||
}
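// The encrypt/decrypt helpers above all walk the buffer in copy_size chunks,
// with one IV and one authentication tag per chunk. A small sketch of the
// sizing arithmetic they rely on; the helper is illustrative, not part of the
// test.
static size_t example_auth_tag_buffer_size(size_t size, size_t copy_size)
{
    // 'size' must be a multiple of 'copy_size', as asserted by the helpers.
    return (size / copy_size) * UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
}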
|
||||
|
||||
// This test only uses sysmem so that we can use the CPU for encryption and SEC2
|
||||
// for decryption, i.e., the test doesn't depend on any other GPU engine for
|
||||
// the encryption operation (refer to test_cpu_to_gpu_roundtrip()). This is not
|
||||
// how SEC2 is used in the driver. The intended SEC2 usage is to decrypt from
|
||||
// unprotected sysmem to protected vidmem, which is tested in
|
||||
// test_cpu_to_gpu_roundtrip().
|
||||
static NV_STATUS test_cpu_to_gpu_sysmem(uvm_gpu_t *gpu, size_t copy_size, size_t size)
|
||||
{
|
||||
NV_STATUS status = NV_OK;
|
||||
uvm_mem_t *src_plain = NULL;
|
||||
uvm_mem_t *cipher = NULL;
|
||||
uvm_mem_t *dst_plain = NULL;
|
||||
uvm_mem_t *auth_tag_mem = NULL;
|
||||
size_t auth_tag_buffer_size = (size / copy_size) * UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
|
||||
uvm_push_t push;
|
||||
|
||||
TEST_NV_CHECK_GOTO(alloc_and_init_mem(gpu, &src_plain, size, MEM_ALLOC_TYPE_SYSMEM_DMA), out);
|
||||
TEST_NV_CHECK_GOTO(alloc_and_init_mem(gpu, &dst_plain, size, MEM_ALLOC_TYPE_SYSMEM_DMA), out);
|
||||
TEST_NV_CHECK_GOTO(alloc_and_init_mem(gpu, &cipher, size, MEM_ALLOC_TYPE_SYSMEM_DMA), out);
|
||||
TEST_NV_CHECK_GOTO(alloc_and_init_mem(gpu, &auth_tag_mem, auth_tag_buffer_size, MEM_ALLOC_TYPE_SYSMEM_DMA), out);
|
||||
|
||||
write_range_cpu(src_plain, size, uvm_get_stale_thread_id());
|
||||
write_range_cpu(dst_plain, size, 0xA5A5A5A5);
|
||||
|
||||
TEST_NV_CHECK_GOTO(uvm_push_begin(gpu->channel_manager, UVM_CHANNEL_TYPE_SEC2, &push, "enc(cpu)_dec(gpu)"), out);
|
||||
|
||||
cpu_encrypt(push.channel, cipher, src_plain, auth_tag_mem, size, copy_size);
|
||||
gpu_decrypt(&push, dst_plain, cipher, auth_tag_mem, size, copy_size);
|
||||
|
||||
TEST_NV_CHECK_GOTO(uvm_push_end_and_wait(&push), out);
|
||||
|
||||
TEST_CHECK_GOTO(mem_match(src_plain, dst_plain), out);
|
||||
|
||||
out:
|
||||
uvm_mem_free(auth_tag_mem);
|
||||
uvm_mem_free(cipher);
|
||||
uvm_mem_free(dst_plain);
|
||||
uvm_mem_free(src_plain);
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
// This test depends on the CE for the encryption, so we assume tests from
|
||||
// uvm_ce_test.c have successfully passed.
|
||||
static NV_STATUS test_cpu_to_gpu_roundtrip(uvm_gpu_t *gpu, size_t copy_size, size_t size)
|
||||
{
|
||||
NV_STATUS status = NV_OK;
|
||||
uvm_mem_t *src_plain = NULL;
|
||||
uvm_mem_t *src_cipher = NULL;
|
||||
uvm_mem_t *dst_cipher = NULL;
|
||||
uvm_mem_t *dst_plain = NULL;
|
||||
uvm_mem_t *dst_plain_cpu = NULL;
|
||||
uvm_mem_t *auth_tag_mem = NULL;
|
||||
size_t auth_tag_buffer_size = (size / copy_size) * UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
|
||||
uvm_push_t push;
|
||||
UvmCslIv *decrypt_iv;
|
||||
uvm_tracker_t tracker;
|
||||
|
||||
decrypt_iv = uvm_kvmalloc_zero((size / copy_size) * sizeof(UvmCslIv));
|
||||
if (!decrypt_iv)
|
||||
return NV_ERR_NO_MEMORY;
|
||||
|
||||
uvm_tracker_init(&tracker);
|
||||
|
||||
TEST_NV_CHECK_GOTO(alloc_and_init_mem(gpu, &src_plain, size, MEM_ALLOC_TYPE_SYSMEM_DMA), out);
|
||||
TEST_NV_CHECK_GOTO(alloc_and_init_mem(gpu, &src_cipher, size, MEM_ALLOC_TYPE_SYSMEM_DMA), out);
|
||||
TEST_NV_CHECK_GOTO(alloc_and_init_mem(gpu, &dst_cipher, size, MEM_ALLOC_TYPE_SYSMEM_DMA), out);
|
||||
TEST_NV_CHECK_GOTO(alloc_and_init_mem(gpu, &dst_plain, size, MEM_ALLOC_TYPE_VIDMEM_PROTECTED), out);
|
||||
TEST_NV_CHECK_GOTO(alloc_and_init_mem(gpu, &dst_plain_cpu, size, MEM_ALLOC_TYPE_SYSMEM_DMA), out);
|
||||
TEST_NV_CHECK_GOTO(alloc_and_init_mem(gpu, &auth_tag_mem, auth_tag_buffer_size, MEM_ALLOC_TYPE_SYSMEM_DMA), out);
|
||||
|
||||
write_range_cpu(src_plain, size, uvm_get_stale_thread_id());
|
||||
|
||||
TEST_NV_CHECK_GOTO(uvm_push_begin(gpu->channel_manager, UVM_CHANNEL_TYPE_SEC2, &push, "enc(cpu)_dec(gpu)"), out);
|
||||
|
||||
cpu_encrypt(push.channel, src_cipher, src_plain, auth_tag_mem, size, copy_size);
|
||||
gpu_decrypt(&push, dst_plain, src_cipher, auth_tag_mem, size, copy_size);
|
||||
|
||||
uvm_push_end(&push);
|
||||
TEST_NV_CHECK_GOTO(uvm_tracker_add_push(&tracker, &push), out);
|
||||
|
||||
TEST_NV_CHECK_GOTO(uvm_push_begin_acquire(gpu->channel_manager,
|
||||
UVM_CHANNEL_TYPE_GPU_TO_CPU,
|
||||
&tracker,
|
||||
&push,
|
||||
"enc(gpu)_dec(cpu)"),
|
||||
out);
|
||||
|
||||
gpu_encrypt(&push, dst_cipher, dst_plain, decrypt_iv, auth_tag_mem, size, copy_size);
|
||||
|
||||
TEST_NV_CHECK_GOTO(uvm_push_end_and_wait(&push), out);
|
||||
|
||||
TEST_CHECK_GOTO(!mem_match(src_plain, src_cipher), out);
|
||||
TEST_CHECK_GOTO(!mem_match(dst_cipher, src_plain), out);
|
||||
|
||||
TEST_NV_CHECK_GOTO(cpu_decrypt(push.channel,
|
||||
dst_plain_cpu,
|
||||
dst_cipher,
|
||||
decrypt_iv,
|
||||
auth_tag_mem,
|
||||
size,
|
||||
copy_size),
|
||||
out);
|
||||
|
||||
TEST_CHECK_GOTO(mem_match(src_plain, dst_plain_cpu), out);
|
||||
|
||||
out:
|
||||
uvm_mem_free(auth_tag_mem);
|
||||
uvm_mem_free(dst_plain_cpu);
|
||||
uvm_mem_free(dst_plain);
|
||||
uvm_mem_free(dst_cipher);
|
||||
uvm_mem_free(src_cipher);
|
||||
uvm_mem_free(src_plain);
|
||||
|
||||
uvm_kvfree(decrypt_iv);
|
||||
|
||||
uvm_tracker_deinit(&tracker);
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
static NV_STATUS test_encryption_decryption(uvm_gpu_t *gpu)
|
||||
{
|
||||
size_t copy_sizes[] = { 4, 16, 512, 2 * UVM_SIZE_1KB, 4 * UVM_SIZE_1KB, 64 * UVM_SIZE_1KB, 2 * UVM_SIZE_1MB };
|
||||
int i;
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(copy_sizes); i++) {
|
||||
// Limit the number of methods in the gpu_encrypt()/gpu_decrypt() work
|
||||
// submission.
|
||||
size_t size = min(UVM_VA_BLOCK_SIZE, 256ull * copy_sizes[i]);
|
||||
|
||||
// gpu_encrypt() and gpu_decrypt() iterate over a 'size' buffer. If
|
||||
// copy_sizes[i] < 16 (SEC2 src and dst alignment requirement is
|
||||
// 16-byte), SEC2 and our HAL implementation assert/fail. When
|
||||
// copy_sizes[i] < 16, we only perform a single copy_sizes[i] copy.
|
||||
if (copy_sizes[i] < 16)
|
||||
size = copy_sizes[i];
|
||||
|
||||
UVM_ASSERT(size % copy_sizes[i] == 0);
|
||||
|
||||
TEST_NV_CHECK_RET(test_cpu_to_gpu_sysmem(gpu, copy_sizes[i], size));
|
||||
TEST_NV_CHECK_RET(test_cpu_to_gpu_roundtrip(gpu, copy_sizes[i], size));
|
||||
}
|
||||
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
static NV_STATUS test_sec2(uvm_va_space_t *va_space)
|
||||
{
|
||||
uvm_gpu_t *gpu;
|
||||
|
||||
for_each_va_space_gpu(gpu, va_space) {
|
||||
TEST_CHECK_RET(uvm_conf_computing_mode_enabled(gpu));
|
||||
|
||||
TEST_NV_CHECK_RET(test_semaphore_release(gpu));
|
||||
TEST_NV_CHECK_RET(test_semaphore_timestamp(gpu));
|
||||
TEST_NV_CHECK_RET(test_encryption_decryption(gpu));
|
||||
}
|
||||
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
NV_STATUS uvm_test_sec2_sanity(UVM_TEST_SEC2_SANITY_PARAMS *params, struct file *filp)
|
||||
{
|
||||
NV_STATUS status;
|
||||
uvm_va_space_t *va_space = uvm_va_space_get(filp);
|
||||
|
||||
uvm_va_space_down_read_rm(va_space);
|
||||
|
||||
status = test_sec2(va_space);
|
||||
if (status != NV_OK)
|
||||
goto done;
|
||||
|
||||
done:
|
||||
uvm_va_space_up_read_rm(va_space);
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
NV_STATUS uvm_test_sec2_cpu_gpu_roundtrip(UVM_TEST_SEC2_CPU_GPU_ROUNDTRIP_PARAMS *params, struct file *filp)
|
||||
{
|
||||
NV_STATUS status = NV_OK;
|
||||
uvm_va_space_t *va_space = uvm_va_space_get(filp);
|
||||
uvm_gpu_t *gpu;
|
||||
|
||||
uvm_va_space_down_read(va_space);
|
||||
|
||||
for_each_va_space_gpu(gpu, va_space) {
|
||||
TEST_CHECK_RET(uvm_conf_computing_mode_enabled(gpu));
|
||||
|
||||
// To exercise certain SEC2 context save/restore races, do a looped
|
||||
// decrypt with smaller copy sizes instead of larger copy sizes since we
|
||||
// need SEC2 to context switch with pending work in different channels
|
||||
// and smaller decrypt copies increase the probability of exercising
|
||||
// SEC2 context switching. A single push of the entire size may not be
|
||||
// enough to re-create this scenario since SEC2 doesn't preempt in the
|
||||
// middle of the decrypt.
|
||||
status = test_cpu_to_gpu_roundtrip(gpu, UVM_PAGE_SIZE_4K, UVM_VA_BLOCK_SIZE);
|
||||
if (status != NV_OK)
|
||||
goto done;
|
||||
}
|
||||
|
||||
done:
|
||||
uvm_va_space_up_read(va_space);
|
||||
|
||||
return status;
|
||||
}
|
||||
@@ -300,7 +300,6 @@ long uvm_test_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
|
||||
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_PMM_REVERSE_MAP, uvm_test_pmm_reverse_map);
|
||||
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_PMM_INDIRECT_PEERS, uvm_test_pmm_indirect_peers);
|
||||
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_VA_SPACE_MM_RETAIN, uvm_test_va_space_mm_retain);
|
||||
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_VA_SPACE_MM_DELAY_SHUTDOWN, uvm_test_va_space_mm_delay_shutdown);
|
||||
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_PMM_CHUNK_WITH_ELEVATED_PAGE, uvm_test_pmm_chunk_with_elevated_page);
|
||||
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_VA_SPACE_INJECT_ERROR, uvm_test_va_space_inject_error);
|
||||
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_GET_GPU_TIME, uvm_test_get_gpu_time);
|
||||
@@ -327,6 +326,8 @@ long uvm_test_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
|
||||
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_VA_RANGE_INJECT_ADD_GPU_VA_SPACE_ERROR,
|
||||
uvm_test_va_range_inject_add_gpu_va_space_error);
|
||||
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_DESTROY_GPU_VA_SPACE_DELAY, uvm_test_destroy_gpu_va_space_delay);
|
||||
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_SEC2_SANITY, uvm_test_sec2_sanity);
|
||||
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_SEC2_CPU_GPU_ROUNDTRIP, uvm_test_sec2_cpu_gpu_roundtrip);
|
||||
UVM_ROUTE_CMD_STACK_NO_INIT_CHECK(UVM_TEST_CGROUP_ACCOUNTING_SUPPORTED, uvm_test_cgroup_accounting_supported);
|
||||
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_SPLIT_INVALIDATE_DELAY, uvm_test_split_invalidate_delay);
|
||||
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_CPU_CHUNK_API, uvm_test_cpu_chunk_api);
|
||||
|
||||
@@ -187,5 +187,7 @@ NV_STATUS uvm_test_tools_flush_replay_events(UVM_TEST_TOOLS_FLUSH_REPLAY_EVENTS_
|
||||
NV_STATUS uvm_test_register_unload_state_buffer(UVM_TEST_REGISTER_UNLOAD_STATE_BUFFER_PARAMS *params, struct file *filp);
|
||||
NV_STATUS uvm_test_rb_tree_directed(UVM_TEST_RB_TREE_DIRECTED_PARAMS *params, struct file *filp);
|
||||
NV_STATUS uvm_test_rb_tree_random(UVM_TEST_RB_TREE_RANDOM_PARAMS *params, struct file *filp);
|
||||
NV_STATUS uvm_test_sec2_sanity(UVM_TEST_SEC2_SANITY_PARAMS *params, struct file *filp);
|
||||
NV_STATUS uvm_test_sec2_cpu_gpu_roundtrip(UVM_TEST_SEC2_CPU_GPU_ROUNDTRIP_PARAMS *params, struct file *filp);
|
||||
NV_STATUS uvm_test_cpu_chunk_api(UVM_TEST_CPU_CHUNK_API_PARAMS *params, struct file *filp);
|
||||
#endif
|
||||
|
||||
@@ -1081,20 +1081,6 @@ typedef struct
|
||||
NV_STATUS rmStatus; // Out
|
||||
} UVM_TEST_VA_SPACE_MM_RETAIN_PARAMS;
|
||||
|
||||
// Forces the VA space mm_shutdown callback to delay until more than one thread
|
||||
// has entered the callback. This provides a high probability of exercising code
|
||||
// to handle this race condition between exit_mmap and file close.
|
||||
//
|
||||
// The delay has an upper bound to prevent an infinite stall.
|
||||
#define UVM_TEST_VA_SPACE_MM_DELAY_SHUTDOWN UVM_TEST_IOCTL_BASE(68)
|
||||
typedef struct
|
||||
{
|
||||
NvBool verbose;
|
||||
|
||||
// NV_ERR_PAGE_TABLE_NOT_AVAIL if no va_space_mm is present
|
||||
NV_STATUS rmStatus;
|
||||
} UVM_TEST_VA_SPACE_MM_DELAY_SHUTDOWN_PARAMS;
|
||||
|
||||
#define UVM_TEST_PMM_CHUNK_WITH_ELEVATED_PAGE UVM_TEST_IOCTL_BASE(69)
|
||||
typedef struct
|
||||
{
|
||||
@@ -1371,10 +1357,6 @@ typedef struct
|
||||
// Approximate duration for which to sleep with the va_space_mm retained.
|
||||
NvU64 sleep_us NV_ALIGN_BYTES(8); // In
|
||||
|
||||
// On success, this contains the value of mm->mm_users before mmput() is
|
||||
// called.
|
||||
NvU64 mm_users NV_ALIGN_BYTES(8); // Out
|
||||
|
||||
// NV_ERR_PAGE_TABLE_NOT_AVAIL Could not retain va_space_mm
|
||||
// (uvm_va_space_mm_or_current_retain returned
|
||||
// NULL)
|
||||
@@ -1420,6 +1402,12 @@ typedef struct
|
||||
NV_STATUS rmStatus; // Out
|
||||
} UVM_TEST_DESTROY_GPU_VA_SPACE_DELAY_PARAMS;
|
||||
|
||||
#define UVM_TEST_SEC2_SANITY UVM_TEST_IOCTL_BASE(95)
|
||||
typedef struct
|
||||
{
|
||||
NV_STATUS rmStatus; // Out
|
||||
} UVM_TEST_SEC2_SANITY_PARAMS;
|
||||
|
||||
#define UVM_TEST_CGROUP_ACCOUNTING_SUPPORTED UVM_TEST_IOCTL_BASE(96)
|
||||
typedef struct
|
||||
{
|
||||
@@ -1433,6 +1421,14 @@ typedef struct
|
||||
NV_STATUS rmStatus; // Out
|
||||
} UVM_TEST_SPLIT_INVALIDATE_DELAY_PARAMS;
|
||||
|
||||
// Tests the CSL/SEC2 encryption/decryption methods by doing a secure transfer
|
||||
// of memory from CPU->GPU and a subsequent GPU->CPU transfer.
|
||||
#define UVM_TEST_SEC2_CPU_GPU_ROUNDTRIP UVM_TEST_IOCTL_BASE(99)
|
||||
typedef struct
|
||||
{
|
||||
NV_STATUS rmStatus; // Out
|
||||
} UVM_TEST_SEC2_CPU_GPU_ROUNDTRIP_PARAMS;
|
||||
|
||||
#define UVM_TEST_CPU_CHUNK_API UVM_TEST_IOCTL_BASE(100)
|
||||
typedef struct
|
||||
{
|
||||
|
||||
@@ -76,6 +76,9 @@ struct uvm_thread_context_struct
|
||||
// calls try_to_migrate() doesn't pass the pgmap_owner.
|
||||
uvm_va_block_t *ignore_hmm_invalidate_va_block;
|
||||
|
||||
// Used to filter out invalidations we don't care about.
|
||||
unsigned long hmm_invalidate_seqnum;
|
||||
|
||||
// Pointer to enclosing node (if any) in red-black tree
|
||||
//
|
||||
// This field is ignored in interrupt paths
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2016-2022 NVIDIA Corporation
|
||||
Copyright (c) 2016-2023 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
@@ -1613,7 +1613,9 @@ NV_STATUS uvm_api_tools_init_event_tracker(UVM_TOOLS_INIT_EVENT_TRACKER_PARAMS *
|
||||
goto fail;
|
||||
}
|
||||
|
||||
if (!uvm_fd_va_space(event_tracker->uvm_file)) {
|
||||
// We don't use uvm_fd_va_space() here because tools can work
|
||||
// without an associated va_space_mm.
|
||||
if (!uvm_fd_get_type(event_tracker->uvm_file, UVM_FD_VA_SPACE)) {
|
||||
fput(event_tracker->uvm_file);
|
||||
event_tracker->uvm_file = NULL;
|
||||
status = NV_ERR_ILLEGAL_ACTION;
|
||||
@@ -2045,17 +2047,23 @@ static NV_STATUS tools_access_process_memory(uvm_va_space_t *va_space,
|
||||
|
||||
// The RM flavor of the lock is needed to perform ECC checks.
|
||||
uvm_va_space_down_read_rm(va_space);
|
||||
status = uvm_va_block_find_create(va_space, UVM_ALIGN_DOWN(target_va_start, PAGE_SIZE), block_context, &block);
|
||||
if (status != NV_OK) {
|
||||
uvm_va_space_up_read_rm(va_space);
|
||||
if (mm)
|
||||
uvm_up_read_mmap_lock(mm);
|
||||
goto exit;
|
||||
}
|
||||
status = uvm_va_block_find_create(va_space, UVM_PAGE_ALIGN_DOWN(target_va_start), block_context, &block);
|
||||
if (status != NV_OK)
|
||||
goto unlock_and_exit;
|
||||
|
||||
uvm_va_space_global_gpus(va_space, global_gpus);
|
||||
|
||||
for_each_global_gpu_in_mask(gpu, global_gpus) {
|
||||
|
||||
// When CC is enabled, the staging memory cannot be mapped on the
|
||||
// GPU (it is protected sysmem), but it is still used to store the
|
||||
// unencrypted version of the page contents when the page is
|
||||
// resident on vidmem.
|
||||
if (uvm_conf_computing_mode_enabled(gpu)) {
|
||||
UVM_ASSERT(uvm_global_processor_mask_empty(retained_global_gpus));
|
||||
|
||||
break;
|
||||
}
|
||||
if (uvm_global_processor_mask_test_and_set(retained_global_gpus, gpu->global_id))
|
||||
continue;
|
||||
|
||||
@@ -2069,26 +2077,15 @@ static NV_STATUS tools_access_process_memory(uvm_va_space_t *va_space,
|
||||
// (even if those mappings may never be used) as tools read/write is
|
||||
// not on a performance critical path.
|
||||
status = uvm_mem_map_gpu_kernel(stage_mem, gpu);
|
||||
if (status != NV_OK) {
|
||||
uvm_va_space_up_read_rm(va_space);
|
||||
if (mm)
|
||||
uvm_up_read_mmap_lock(mm);
|
||||
goto exit;
|
||||
}
|
||||
if (status != NV_OK)
|
||||
goto unlock_and_exit;
|
||||
}
|
||||
|
||||
// Make sure a CPU resident page has an up to date struct page pointer.
|
||||
if (uvm_va_block_is_hmm(block)) {
|
||||
status = uvm_hmm_va_block_update_residency_info(block,
|
||||
mm,
|
||||
UVM_ALIGN_DOWN(target_va_start, PAGE_SIZE),
|
||||
true);
|
||||
if (status != NV_OK) {
|
||||
uvm_va_space_up_read_rm(va_space);
|
||||
if (mm)
|
||||
uvm_up_read_mmap_lock(mm);
|
||||
goto exit;
|
||||
}
|
||||
status = uvm_hmm_va_block_update_residency_info(block, mm, UVM_PAGE_ALIGN_DOWN(target_va_start), true);
|
||||
if (status != NV_OK)
|
||||
goto unlock_and_exit;
|
||||
}
|
||||
|
||||
status = tools_access_va_block(block, block_context, target_va_start, bytes_now, is_write, stage_mem);
|
||||
@@ -2127,6 +2124,13 @@ static NV_STATUS tools_access_process_memory(uvm_va_space_t *va_space,
|
||||
*bytes += bytes_now;
|
||||
}
|
||||
|
||||
unlock_and_exit:
|
||||
if (status != NV_OK) {
|
||||
uvm_va_space_up_read_rm(va_space);
|
||||
if (mm)
|
||||
uvm_up_read_mmap_lock(mm);
|
||||
}
|
||||
|
||||
exit:
|
||||
uvm_va_block_context_free(block_context);
|
||||
|
||||
|
||||
@@ -83,6 +83,13 @@ static NV_STATUS test_tracker_completion(uvm_va_space_t *va_space)
|
||||
uvm_for_each_pool(pool, gpu->channel_manager) {
|
||||
uvm_channel_t *channel;
|
||||
|
||||
// Skip WLC channels as they are used for secure work launch
|
||||
if (uvm_channel_pool_is_wlc(pool))
|
||||
continue;
|
||||
|
||||
// Skip LCIC channels as those can't accept pushes
|
||||
if (uvm_channel_pool_is_lcic(pool))
|
||||
continue;
|
||||
uvm_for_each_channel_in_pool(channel, pool) {
|
||||
uvm_push_t push;
|
||||
NvU64 semaphore_gpu_va;
|
||||
@@ -214,6 +221,9 @@ static NV_STATUS test_tracker_basic(uvm_va_space_t *va_space)
|
||||
uvm_for_each_pool(pool, gpu->channel_manager) {
|
||||
uvm_channel_t *channel;
|
||||
|
||||
// Skip LCIC channels as those can't accept pushes
|
||||
if (uvm_channel_pool_is_lcic(pool))
|
||||
continue;
|
||||
uvm_for_each_channel_in_pool(channel, pool) {
|
||||
uvm_push_t push;
|
||||
status = uvm_push_begin_on_channel(channel, &push, "Test push");
|
||||
|
||||
@@ -46,11 +46,13 @@ void uvm_hal_turing_arch_init_properties(uvm_parent_gpu_t *parent_gpu)
|
||||
// A single top level PDE on Turing covers 128 TB and that's the minimum
|
||||
// size that can be used.
|
||||
parent_gpu->rm_va_base = 0;
|
||||
parent_gpu->rm_va_size = 128ull * 1024 * 1024 * 1024 * 1024;
|
||||
parent_gpu->rm_va_size = 128 * UVM_SIZE_1TB;
|
||||
|
||||
parent_gpu->uvm_mem_va_base = 384ull * 1024 * 1024 * 1024 * 1024;
|
||||
parent_gpu->uvm_mem_va_base = 384 * UVM_SIZE_1TB;
|
||||
parent_gpu->uvm_mem_va_size = UVM_MEM_VA_SIZE;
|
||||
|
||||
parent_gpu->ce_phys_vidmem_write_supported = true;
|
||||
|
||||
parent_gpu->peer_copy_mode = UVM_GPU_PEER_COPY_MODE_VIRTUAL;
|
||||
|
||||
// Not all units on Turing support 49-bit addressing, including those which
|
||||
|
||||
@@ -83,9 +83,14 @@ void uvm_hal_turing_host_clear_faulted_channel_method(uvm_push_t *push,
|
||||
|
||||
// Direct copy of uvm_hal_maxwell_host_set_gpfifo_entry(). It removes
|
||||
// GP_ENTRY1_PRIV_KERNEL, which has been deprecated in Turing+.
|
||||
void uvm_hal_turing_host_set_gpfifo_entry(NvU64 *fifo_entry, NvU64 pushbuffer_va, NvU32 pushbuffer_length)
|
||||
void uvm_hal_turing_host_set_gpfifo_entry(NvU64 *fifo_entry,
|
||||
NvU64 pushbuffer_va,
|
||||
NvU32 pushbuffer_length,
|
||||
uvm_gpfifo_sync_t sync_flag)
|
||||
{
|
||||
NvU64 fifo_entry_value;
|
||||
const NvU32 sync_value = (sync_flag == UVM_GPFIFO_SYNC_WAIT) ? HWCONST(C46F, GP_ENTRY1, SYNC, WAIT) :
|
||||
HWCONST(C46F, GP_ENTRY1, SYNC, PROCEED);
|
||||
|
||||
UVM_ASSERT(!uvm_global_is_suspended());
|
||||
UVM_ASSERT_MSG(pushbuffer_va % 4 == 0, "pushbuffer va unaligned: %llu\n", pushbuffer_va);
|
||||
@@ -93,7 +98,8 @@ void uvm_hal_turing_host_set_gpfifo_entry(NvU64 *fifo_entry, NvU64 pushbuffer_va
|
||||
|
||||
fifo_entry_value = HWVALUE(C46F, GP_ENTRY0, GET, NvU64_LO32(pushbuffer_va) >> 2);
|
||||
fifo_entry_value |= (NvU64)(HWVALUE(C46F, GP_ENTRY1, GET_HI, NvU64_HI32(pushbuffer_va)) |
|
||||
HWVALUE(C46F, GP_ENTRY1, LENGTH, pushbuffer_length >> 2)) << 32;
|
||||
HWVALUE(C46F, GP_ENTRY1, LENGTH, pushbuffer_length >> 2) |
|
||||
sync_value) << 32;
|
||||
|
||||
*fifo_entry = fifo_entry_value;
|
||||
}
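// A hedged usage sketch of the new sync parameter: UVM_GPFIFO_SYNC_WAIT is the
// value tested above, while the wrapper below is illustrative only, not driver
// code.
static void example_build_wait_entry(NvU64 *fifo_entry, NvU64 pushbuffer_va, NvU32 pushbuffer_length)
{
    // Builds a GPFIFO entry that waits for the previous entry to complete
    // before this pushbuffer segment is fetched.
    uvm_hal_turing_host_set_gpfifo_entry(fifo_entry, pushbuffer_va, pushbuffer_length, UVM_GPFIFO_SYNC_WAIT);
}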
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2013-2020 NVidia Corporation
|
||||
Copyright (c) 2013-2023 NVidia Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
@@ -493,7 +493,8 @@ typedef enum
|
||||
UvmEventFatalReasonInternalError = 5,
|
||||
|
||||
// This value is reported when a fault is triggered in an invalid context
|
||||
// Example: CPU fault on a managed allocation while a kernel is running on a pre-Pascal GPU
|
||||
// Example: CPU fault on a managed allocation while a kernel is running on a
|
||||
// pre-Pascal GPU
|
||||
UvmEventFatalReasonInvalidOperation = 6,
|
||||
// ---- Add new values above this line
|
||||
UvmEventNumFatalReasons
|
||||
@@ -659,8 +660,8 @@ typedef struct
|
||||
// or malign-double will have no effect on the field offsets
|
||||
//
|
||||
NvU8 padding8bits;
|
||||
NvU32 batchId; // Per-GPU unique id to identify the faults that have
|
||||
// been serviced in batch
|
||||
NvU32 batchId; // Per-GPU unique id to identify the faults that
|
||||
// have been serviced in batch
|
||||
NvU64 timeStamp; // cpu time when the replay of the faulting memory
|
||||
// accesses is queued on the gpu
|
||||
NvU64 timeStampGpu; // gpu time stamp when the replay operation finished
|
||||
@@ -674,15 +675,16 @@ typedef struct
|
||||
{
|
||||
//
|
||||
// eventType has to be the 1st argument of this structure.
|
||||
// Setting eventType = UvmEventTypeFatalFault helps to identify event data in
|
||||
// a queue.
|
||||
// Setting eventType = UvmEventTypeFatalFault helps to identify event data
|
||||
// in a queue.
|
||||
//
|
||||
NvU8 eventType;
|
||||
NvU8 faultType; // type of gpu fault, refer UvmEventFaultType. Only valid
|
||||
// if processorIndex is a GPU
|
||||
NvU8 faultType; // type of gpu fault, refer UvmEventFaultType. Only
|
||||
// valid if processorIndex is a GPU
|
||||
NvU8 accessType; // memory access type, refer UvmEventMemoryAccessType
|
||||
NvU8 processorIndex; // processor that experienced the fault
|
||||
NvU8 reason; // reason why the fault is fatal, refer UvmEventFatalReason
|
||||
NvU8 reason; // reason why the fault is fatal, refer
|
||||
// UvmEventFatalReason
|
||||
NvU8 padding8bits;
|
||||
NvU16 padding16bits;
|
||||
NvU64 address; // virtual address at which the processor faulted
|
||||
@@ -798,8 +800,8 @@ typedef struct
|
||||
{
|
||||
//
|
||||
// eventType has to be the 1st argument of this structure.
|
||||
// Setting eventType = UvmEventTypeThrottlingStart helps to identify event data
|
||||
// in a queue.
|
||||
// Setting eventType = UvmEventTypeThrottlingStart helps to identify event
|
||||
// data in a queue.
|
||||
//
|
||||
NvU8 eventType;
|
||||
NvU8 processorIndex; // index of the cpu/gpu that was throttled
|
||||
@@ -819,8 +821,8 @@ typedef struct
|
||||
{
|
||||
//
|
||||
// eventType has to be the 1st argument of this structure.
|
||||
// Setting eventType = UvmEventTypeThrottlingEnd helps to identify event data
|
||||
// in a queue.
|
||||
// Setting eventType = UvmEventTypeThrottlingEnd helps to identify event
|
||||
// data in a queue.
|
||||
//
|
||||
NvU8 eventType;
|
||||
NvU8 processorIndex; // index of the cpu/gpu that was throttled
|
||||
@@ -946,8 +948,8 @@ typedef struct
|
||||
{
|
||||
//
|
||||
// eventType has to be the 1st argument of this structure.
|
||||
// Setting eventType = UvmEventTypeAccessCounter helps to identify event data
|
||||
// in a queue.
|
||||
// Setting eventType = UvmEventTypeAccessCounter helps to identify event
|
||||
// data in a queue.
|
||||
//
|
||||
NvU8 eventType;
|
||||
NvU8 srcIndex; // index of the gpu that received the access counter
|
||||