Download the CUDA and cuDNN versions that match your graphics card
I'm working inside Docker. First pull the image; I'm using Ubuntu 20.04.
Mirror for faster pulls: hub.1panel.dev/
The devel tag is the development image (it ships nvcc and the CUDA headers). Note that for the GPU to be visible inside the container, the host also needs the NVIDIA Container Toolkit, and the container must be started with docker run --gpus all.
sudo docker pull hub.1panel.dev/nvidia/cuda:11.6.1-devel-ubuntu20.04
First check that CUDA is installed correctly (the output should report release 11.6):
nvcc -V
Update the package index and install vim and wget:
apt update
apt install vim wget
Install cuDNN
cuDNN download page: https://developer.nvidia.com/rdp/cudnn-archive
# Extract the archive
tar -xf cudnn-linux-x86_64-8.9.7.29_cuda11-archive.tar.xz
# Enter the extracted directory
cd cudnn-linux-x86_64-8.9.7.29_cuda11-archive
# Copy the cuDNN headers into the CUDA include directory
# (cudnn*.h rather than just cudnn.h: since cuDNN 8 the version macros live in cudnn_version.h)
cp include/cudnn*.h /usr/local/cuda-11.6/include
# Copy everything under lib into the CUDA lib64 directory (-P preserves the .so symlinks)
cp -P lib/libcudnn* /usr/local/cuda-11.6/lib64
# Make the files readable by everyone (/usr/local/cuda is a symlink to /usr/local/cuda-11.6 in this image)
chmod a+r /usr/local/cuda/include/cudnn*.h /usr/local/cuda/lib64/libcudnn*
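Once a Python interpreter is available (e.g. after the Miniconda step below), a quick sanity check that the headers landed in the right place is to read the version macros back out. A minimal sketch, assuming the default /usr/local/cuda symlink:

import re
from pathlib import Path

# cuDNN 8.x records its version in cudnn_version.h (which is why we copied cudnn*.h above)
text = Path("/usr/local/cuda/include/cudnn_version.h").read_text()
parts = [
    re.search(rf"#define CUDNN_{k}\s+(\d+)", text).group(1)
    for k in ("MAJOR", "MINOR", "PATCHLEVEL")
]
print("cuDNN", ".".join(parts))  # expect 8.9.7 for the archive used here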
Install CMake
See my earlier post: https://blog.csdn.net/qq_42102546/article/details/135014765
Install Miniconda3
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
chmod +x Miniconda3-latest-Linux-x86_64.sh
./Miniconda3-latest-Linux-x86_64.sh
Activate the base environment
source /root/miniconda3/bin/activate
Create a virtual environment
conda create -n py_17 python=3.9
Activate the virtual environment
conda activate py_17
Install the YOLO stack (ultralytics pulls in PyTorch as a dependency)
pip install ultralytics -i https://pypi.tuna.tsinghua.edu.cn/simple
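To confirm the install works end to end, a quick sketch (yolov8n.pt and the sample image are fetched from the network on first run; device=0 assumes the GPU is visible in the container):

from ultralytics import YOLO

# Weights are downloaded automatically on first use
model = YOLO("yolov8n.pt")

# Run one prediction on the GPU to confirm torch + CUDA work
results = model.predict("https://ultralytics.com/images/bus.jpg", device=0)
print(results[0].boxes)

# Export to ONNX; the file is reused in the onnxruntime check below
onnx_path = model.export(format="onnx")
print("exported:", onnx_path)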
Install onnxruntime-gpu (with CUDA 11.6 you may need to pin an older release, since recent onnxruntime-gpu builds target CUDA 12)
pip install onnxruntime-gpu -i https://pypi.tuna.tsinghua.edu.cn/simple
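A quick check that the CUDA execution provider actually loads. This sketch assumes the yolov8n.onnx exported above (any ONNX model works). If the CUDA/cuDNN libraries can't be found, onnxruntime silently falls back to CPU, which is why we print the active providers:

import numpy as np
import onnxruntime as ort

session = ort.InferenceSession(
    "yolov8n.onnx",
    providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
)
print(session.get_providers())  # CUDAExecutionProvider should be listed first

# Run a dummy image through to prove inference actually executes
inp = session.get_inputs()[0]
dummy = np.zeros((1, 3, 640, 640), dtype=np.float32)  # YOLOv8's default export shape
outputs = session.run(None, {inp.name: dummy})
print([o.shape for o in outputs])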
Install the system libraries cv2 depends on
apt install libglib2.0-0 libgl1-mesa-glx
Test:
import torch
import os
import cv2

print(torch.__version__)  # confirm the PyTorch version
cuda_available = torch.cuda.is_available()
if cuda_available:
    print("GPU build of PyTorch; number of visible GPUs:", torch.cuda.device_count())
    print("Current GPU name:", torch.cuda.get_device_name(0))
else:
    print("CPU build of PyTorch")

# Run a small computation on the GPU
os.environ['CUDA_LAUNCH_BLOCKING'] = "0"
print(torch.rand(1).cuda())
a = torch.Tensor([1, 2])
a = a.cuda()
print(a)

# Check the torch version
print(torch.__version__)
# Check whether CUDA is usable
device = torch.device('cuda')
print(torch.cuda.is_available())
print("done")

print(cv2.__version__)
print(cv2.cuda.getCudaEnabledDeviceCount())

import onnxruntime as ort
import tensorrt
print(ort.get_device())
print(ort.get_available_providers())
print(tensorrt.__version__)
TensorRT isn't installed yet, so the tensorrt import error at this point is expected; once everything is installed we'll rerun this same script as the final check.
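One caveat about the script above: pip-installed PyTorch wheels bundle their own copy of cuDNN, so a passing torch test does not prove that the cuDNN we copied into /usr/local/cuda is intact (that copy is what the C++ and TensorRT builds below link against). To see which cuDNN torch itself loaded:

import torch

print(torch.backends.cudnn.is_available())  # True if torch can load cuDNN
print(torch.backends.cudnn.version())       # e.g. 8xxx for a cuDNN 8.x build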
To install OpenCV for C++, see my earlier post: https://blog.csdn.net/qq_42102546/article/details/145717954
TensorRT download page: https://developer.nvidia.cn/tensorrt
Download the build that matches your CUDA version; mine is CUDA 11.6 (the TensorRT 8.6 tarball below is built against CUDA 11.x, so it covers 11.6 even though the filename says cuda-11.8).
After downloading, extract it:
tar -zxvf TensorRT-8.6.0.12.Linux.x86_64-gnu.cuda-11.8.tar.gz
Then copy it to /usr/local. This is just personal preference; you could equally leave it where it is and point the environment variables at the extracted directory.
# -r is required because it is a directory tree; copying the folder itself
# keeps the /usr/local/TensorRT-8.6.0.12 paths used below valid
cp -r TensorRT-8.6.0.12 /usr/local
Configure the environment variables
vim ~/.bashrc
Press Shift+G to jump to the end of the file, then append:
export LD_LIBRARY_PATH=/usr/local/TensorRT-8.6.0.12/lib:$LD_LIBRARY_PATH
export LIBRARY_PATH=/usr/local/TensorRT-8.6.0.12/lib:$LIBRARY_PATH
It doesn't hurt to add CUDA as well:
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
Reload it so the changes take effect:
source ~/.bashrc
Test
cd /usr/local/TensorRT-8.6.0.12/samples/sampleOnnxMNIST; make; ../../bin/sample_onnx_mnist
Output: the sample compiles, runs MNIST inference, and should finish with a "&&&& PASSED" line.
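Before moving on to C++, it's worth making import tensorrt work in the conda env: the tarball ships Python wheels under /usr/local/TensorRT-8.6.0.12/python, so pip install the cp39 wheel that matches the Python 3.9 environment. Then a minimal build check, a sketch mirroring the C++ identity-network test below:

import tensorrt as trt

logger = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(logger)
# Explicit-batch network, same as the C++ example
flags = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
network = builder.create_network(flags)

inp = network.add_input("input", trt.float32, (1, 1, 1, 1))
identity = network.add_identity(inp)
identity.get_output(0).name = "output"
network.mark_output(identity.get_output(0))

config = builder.create_builder_config()
engine = builder.build_serialized_network(network, config)
print("tensorrt version:", trt.__version__)
print("engine built:", engine is not None)  # True means the install is healthy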
C++ test
CMakeLists.txt contents:
cmake_minimum_required(VERSION 3.16)
project(first_cmake)
# Find OpenCV
find_package(OpenCV REQUIRED)
message(STATUS "OpenCV version: ${OpenCV_VERSION}")
message(STATUS "OpenCV libraries: ${OpenCV_LIBS}")
message(STATUS "OpenCV include path: ${OpenCV_INCLUDE_DIRS}")
# Find CUDA
find_package(CUDA REQUIRED)
message(STATUS "CUDA version: ${CUDA_VERSION}")
message(STATUS "CUDA libraries: ${CUDA_LIBRARIES}")
message(STATUS "CUDA include path: ${CUDA_INCLUDE_DIRS}")
# Find the threading library
find_package(Threads REQUIRED)
# TensorRT paths
set(TENSORRT_INCLUDE_DIR /usr/local/TensorRT-8.6.0.12/include)
set(TENSORRT_LIBRARY_DIR /usr/local/TensorRT-8.6.0.12/lib)
# TensorRT header search path
include_directories(${TENSORRT_INCLUDE_DIR})
# TensorRT library search path
link_directories(${TENSORRT_LIBRARY_DIR})
# Executable target
add_executable(first_cmake open_ce.cpp)
# Link the libraries
target_link_libraries(first_cmake ${OpenCV_LIBS} Threads::Threads nvinfer nvinfer_plugin ${CUDA_LIBRARIES})
# Header search paths for the target
target_include_directories(first_cmake PRIVATE ${OpenCV_INCLUDE_DIRS} ${CUDA_INCLUDE_DIRS} ${TENSORRT_INCLUDE_DIR})
open_ce.cpp contents (build with the usual workflow: mkdir build && cd build && cmake .. && make):
#include <iostream>
#include <NvInfer.h>
#include <cuda_runtime_api.h>

// Custom logger: TensorRT requires an ILogger implementation
class Logger : public nvinfer1::ILogger {
    void log(Severity severity, const char* msg) noexcept override {
        if (severity != Severity::kINFO) {
            std::cerr << msg << std::endl;
        }
    }
};

int main() {
    // Create the logger
    Logger logger;

    // Create the builder
    nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(logger);
    if (!builder) {
        std::cerr << "Failed to create TensorRT builder." << std::endl;
        return -1;
    }

    // Create the network definition (explicit batch is required by createNetworkV2)
    const auto explicitBatch = 1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
    nvinfer1::INetworkDefinition* network = builder->createNetworkV2(explicitBatch);
    if (!network) {
        std::cerr << "Failed to create TensorRT network." << std::endl;
        builder->destroy();
        return -1;
    }

    // Create the input tensor
    nvinfer1::ITensor* input = network->addInput("input", nvinfer1::DataType::kFLOAT, nvinfer1::Dims4{1, 1, 1, 1});
    if (!input) {
        std::cerr << "Failed to create input tensor." << std::endl;
        network->destroy();
        builder->destroy();
        return -1;
    }

    // Add an identity layer
    nvinfer1::IIdentityLayer* identityLayer = network->addIdentity(*input);
    if (!identityLayer) {
        std::cerr << "Failed to add identity layer." << std::endl;
        network->destroy();
        builder->destroy();
        return -1;
    }

    // Get the identity layer's output tensor and mark it as the network output
    nvinfer1::ITensor* output = identityLayer->getOutput(0);
    output->setName("output");
    network->markOutput(*output);

    // Create the builder config
    nvinfer1::IBuilderConfig* config = builder->createBuilderConfig();
    if (!config) {
        std::cerr << "Failed to create TensorRT builder config." << std::endl;
        network->destroy();
        builder->destroy();
        return -1;
    }

    // Build the engine
    nvinfer1::ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    if (!engine) {
        std::cerr << "Failed to build TensorRT engine." << std::endl;
        config->destroy();
        network->destroy();
        builder->destroy();
        return -1;
    }

    // Create the execution context
    nvinfer1::IExecutionContext* context = engine->createExecutionContext();
    if (!context) {
        std::cerr << "Failed to create TensorRT execution context." << std::endl;
        engine->destroy();
        return -1;
    }

    // Prepare host-side input and output buffers
    float inputData[1] = {1.0f};
    float outputData[1];

    // Allocate device memory
    void* d_input;
    void* d_output;
    cudaMalloc(&d_input, sizeof(float));
    cudaMalloc(&d_output, sizeof(float));

    // Copy the input from host to device
    cudaMemcpy(d_input, inputData, sizeof(float), cudaMemcpyHostToDevice);

    // Binding pointers: index 0 is the input, index 1 the output
    void* buffers[2];
    buffers[0] = d_input;
    buffers[1] = d_output;

    // Run inference on the default stream
    context->enqueueV2(buffers, 0, nullptr);

    // Copy the result back from device to host
    cudaMemcpy(outputData, d_output, sizeof(float), cudaMemcpyDeviceToHost);

    // Print the result
    std::cout << "Output: " << outputData[0] << std::endl;

    // Free device memory
    cudaFree(d_input);
    cudaFree(d_output);

    // Release TensorRT objects (destroy() is deprecated in 8.x but still works)
    context->destroy();
    engine->destroy();
    config->destroy();
    network->destroy();
    builder->destroy();
    return 0;
}
Running the binary prints the TensorRT builder log; the tail ends with the identity output:
Trying to load shared library libnvinfer_builder_resource.so.8.6.0
Loaded shared library libnvinfer_builder_resource.so.8.6.0
CUDA lazy loading is enabled.
Original: 1 layers
After dead-layer removal: 1 layers
Graph construction completed in 0.00158627 seconds.
Running: IdentityToCastTransform on (Unnamed Layer* 0) [Identity]
Swap the layer type of (Unnamed Layer* 0) [Identity] from IDENTITY to CAST
After Myelin optimization: 1 layers
Applying ScaleNodes fusions.
After scale fusion: 1 layers
Running: CastToCopyTransform on (Unnamed Layer* 0) [Identity]
Swap the layer type of (Unnamed Layer* 0) [Identity] from CAST to CAST
After dupe layer removal: 1 layers
After final dead-layer removal: 1 layers
After tensor merging: 1 layers
After vertical fusions: 1 layers
After dupe layer removal: 1 layers
After final dead-layer removal: 1 layers
After tensor merging: 1 layers
After slice removal: 1 layers
After concat removal: 1 layers
Trying to split Reshape and strided tensor
Building graph using backend strategy 2
Constructing optimization profile number 0 [1/1].
Applying generic optimizations to the graph for inference.
Reserving memory for host IO tensors. Host: 0 bytes
=============== Computing reformatting costs
=============== Computing reformatting costs:
*************** Autotuning Reformat: Float(1,1,1,1) -> Float(1,1,1,1) ***************
--------------- Timing Runner: (Unnamed Layer* 0) [Identity] (Reformat[0x80000006])
Tactic: 0x00000000000003e8 Time: 0.0122651
Tactic: 0x00000000000003ea Time: 0.0224988
Tactic: 0x0000000000000000 Time: 0.00841467
(Unnamed Layer* 0) [Identity] (Reformat[0x80000006]) profiling completed in 0.0247727 seconds. Fastest Tactic: 0x0000000000000000 Time: 0.00841467
--------------- Timing Runner: (Unnamed Layer* 0) [Identity] (MyelinReformat[0x80000035])
(foreignNode) Set user's cuda kernel library
(foreignNode) Pass fuse_conv_padding is currently skipped for dynamic shapes
(foreignNode) Pass pad_conv_channel is currently skipped for dynamic shapes
(foreignNode) Pass fuse_conv_padding is currently skipped for dynamic shapes
(foreignNode) Pass pad_conv_channel is currently skipped for dynamic shapes
Tactic: 0x0000000000000000 Time: 0.00626647
(Unnamed Layer* 0) [Identity] (MyelinReformat[0x80000035]) profiling completed in 0.3183 seconds. Fastest Tactic: 0x0000000000000000 Time: 0.00626647
Chose Runner Type: MyelinReformat Tactic: 0x0000000000000000
Formats and tactics selection completed in 0.343932 seconds.
After reformat layers: 1 layers
Total number of blocks in pre-optimized block assignment: 1
(foreignNode) Set user's cuda kernel library
(foreignNode) Pass fuse_conv_padding is currently skipped for dynamic shapes
(foreignNode) Pass pad_conv_channel is currently skipped for dynamic shapes
(foreignNode) Pass fuse_conv_padding is currently skipped for dynamic shapes
(foreignNode) Pass pad_conv_channel is currently skipped for dynamic shapes
Layer: (Unnamed Layer* 0) [Identity] Host Persistent: 32 Device Persistent: 0 Scratch Memory: 0
Skipped printing memory information for 0 layers with 0 memory size i.e. Host Persistent + Device Persistent + Scratch Memory == 0.
Total number of blocks in optimized block assignment: 0
Total number of generated kernels selected for the engine: 0
Disabling unused tactic source: EDGE_MASK_CONVOLUTIONS
Disabling unused tactic source: JIT_CONVOLUTIONS
Engine generation completed in 0.522099 seconds.
Deleting timing cache: 1 entries, served 0 hits since creation.
Engine Layer Information:
Layer(MyelinReformat): (Unnamed Layer* 0) [Identity], Tactic: 0x0000000000000000, input (Float[1,1,1,1]) -> output (Float[1,1,1,1])
Total per-runner device persistent memory is 0
Total per-runner host persistent memory is 32
Allocated activation device memory of size 0
CUDA lazy loading is enabled.
Output: 1