about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Jonas Hahnfeld <hahnjo@hahnjo.de> 2018-02-28 17:53:46 +0000
committer Jonas Hahnfeld <hahnjo@hahnjo.de> 2018-02-28 17:53:46 +0000
commit f828172bcfd7d6d10497c645c3cc5eee321cd669 (patch)
tree 283b488bc46e6684bdbd211bb1fbcf166ca8b84b
parent c84a8973645f32cff0883c36cc896da033471dec (diff)
[CUDA] Include single GPU binary, NFCI.
Binaries for multiple architectures are combined by fatbinary, so the current code was effectively not needed.

Differential Revision: https://reviews.llvm.org/D43461

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@326342 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r-- include/clang/Frontend/CodeGenOptions.h 7
-rw-r--r-- lib/CodeGen/CGCUDANV.cpp 135
-rw-r--r-- lib/Driver/ToolChains/Clang.cpp 11
-rw-r--r-- lib/Frontend/CompilerInvocation.cpp 4
-rw-r--r-- test/Driver/cuda-options.cu 23
5 files changed, 87 insertions, 93 deletions
diff --git a/include/clang/Frontend/CodeGenOptions.h b/include/clang/Frontend/CodeGenOptions.h
index 183e6ac712..dd2e9bd599 100644
--- a/include/clang/Frontend/CodeGenOptions.h
+++ b/include/clang/Frontend/CodeGenOptions.h
@@ -205,10 +205,9 @@ public:
/// the summary and module symbol table (and not, e.g. any debug metadata).
std::string ThinLinkBitcodeFile;
- /// A list of file names passed with -fcuda-include-gpubinary options to
- /// forward to CUDA runtime back-end for incorporating them into host-side
- /// object file.
- std::vector<std::string> CudaGpuBinaryFileNames;
+ /// Name of file passed with -fcuda-include-gpubinary option to forward to
+ /// CUDA runtime back-end for incorporating it into host-side object file.
+ std::string CudaGpuBinaryFileName;
/// The name of the file to which the backend should save YAML optimization
/// records.
diff --git a/lib/CodeGen/CGCUDANV.cpp b/lib/CodeGen/CGCUDANV.cpp
index d24ef0a8a9..4272aef058 100644
--- a/lib/CodeGen/CGCUDANV.cpp
+++ b/lib/CodeGen/CGCUDANV.cpp
@@ -41,10 +41,10 @@ private:
/// Keeps track of kernel launch stubs emitted in this module
llvm::SmallVector<llvm::Function *, 16> EmittedKernels;
llvm::SmallVector<std::pair<llvm::GlobalVariable *, unsigned>, 16> DeviceVars;
- /// Keeps track of variables containing handles of GPU binaries. Populated by
+ /// Tracks the variable holding the GPU binary handle. Populated by
/// ModuleCtorFunction() and used to create corresponding cleanup calls in
/// ModuleDtorFunction()
- llvm::SmallVector<llvm::GlobalVariable *, 16> GpuBinaryHandles;
+ llvm::GlobalVariable *GpuBinaryHandle = nullptr;
llvm::Constant *getSetupArgumentFn() const;
llvm::Constant *getLaunchFn() const;
@@ -245,16 +245,14 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
/// Creates a global constructor function for the module:
/// \code
/// void __cuda_module_ctor(void*) {
-/// Handle0 = __cudaRegisterFatBinary(GpuBinaryBlob0);
-/// __cuda_register_globals(Handle0);
-/// ...
-/// HandleN = __cudaRegisterFatBinary(GpuBinaryBlobN);
-/// __cuda_register_globals(HandleN);
+/// Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
+/// __cuda_register_globals(Handle);
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
- // No need to generate ctors/dtors if there are no GPU binaries.
- if (CGM.getCodeGenOpts().CudaGpuBinaryFileNames.empty())
+ // No need to generate ctors/dtors if there is no GPU binary.
+ std::string GpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
+ if (GpuBinaryFileName.empty())
return nullptr;
// void __cuda_register_globals(void* handle);
@@ -267,6 +265,18 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
llvm::StructType *FatbinWrapperTy =
llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy);
+ // Register GPU binary with the CUDA runtime, store returned handle in a
+ // global variable and save a reference in GpuBinaryHandle to be cleaned up
+ // in destructor on exit. Then associate all known kernels with the GPU binary
+ // handle so CUDA runtime can figure out what to call on the GPU side.
+ llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> GpuBinaryOrErr =
+ llvm::MemoryBuffer::getFileOrSTDIN(GpuBinaryFileName);
+ if (std::error_code EC = GpuBinaryOrErr.getError()) {
+ CGM.getDiags().Report(diag::err_cannot_open_file)
+ << GpuBinaryFileName << EC.message();
+ return nullptr;
+ }
+
llvm::Function *ModuleCtorFunc = llvm::Function::Create(
llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
llvm::GlobalValue::InternalLinkage, "__cuda_module_ctor", &TheModule);
@@ -276,79 +286,56 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
CtorBuilder.SetInsertPoint(CtorEntryBB);
- // For each GPU binary, register it with the CUDA runtime and store returned
- // handle in a global variable and save the handle in GpuBinaryHandles vector
- // to be cleaned up in destructor on exit. Then associate all known kernels
- // with the GPU binary handle so CUDA runtime can figure out what to call on
- // the GPU side.
- for (const std::string &GpuBinaryFileName :
- CGM.getCodeGenOpts().CudaGpuBinaryFileNames) {
- llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> GpuBinaryOrErr =
- llvm::MemoryBuffer::getFileOrSTDIN(GpuBinaryFileName);
- if (std::error_code EC = GpuBinaryOrErr.getError()) {
- CGM.getDiags().Report(diag::err_cannot_open_file) << GpuBinaryFileName
- << EC.message();
- continue;
- }
-
- const char *FatbinConstantName =
- CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
- // NVIDIA's cuobjdump looks for fatbins in this section.
- const char *FatbinSectionName =
- CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";
-
- // Create initialized wrapper structure that points to the loaded GPU binary
- ConstantInitBuilder Builder(CGM);
- auto Values = Builder.beginStruct(FatbinWrapperTy);
- // Fatbin wrapper magic.
- Values.addInt(IntTy, 0x466243b1);
- // Fatbin version.
- Values.addInt(IntTy, 1);
- // Data.
- Values.add(makeConstantString(GpuBinaryOrErr.get()->getBuffer(),
- "", FatbinConstantName, 8));
- // Unused in fatbin v1.
- Values.add(llvm::ConstantPointerNull::get(VoidPtrTy));
- llvm::GlobalVariable *FatbinWrapper =
- Values.finishAndCreateGlobal("__cuda_fatbin_wrapper",
- CGM.getPointerAlign(),
- /*constant*/ true);
- FatbinWrapper->setSection(FatbinSectionName);
-
- // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
- llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
- RegisterFatbinFunc,
- CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
- llvm::GlobalVariable *GpuBinaryHandle = new llvm::GlobalVariable(
- TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
- llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle");
- CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
- CGM.getPointerAlign());
-
- // Call __cuda_register_globals(GpuBinaryHandle);
- if (RegisterGlobalsFunc)
- CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);
-
- // Save GpuBinaryHandle so we can unregister it in destructor.
- GpuBinaryHandles.push_back(GpuBinaryHandle);
- }
+ const char *FatbinConstantName =
+ CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
+ // NVIDIA's cuobjdump looks for fatbins in this section.
+ const char *FatbinSectionName =
+ CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";
+
+ // Create initialized wrapper structure that points to the loaded GPU binary
+ ConstantInitBuilder Builder(CGM);
+ auto Values = Builder.beginStruct(FatbinWrapperTy);
+ // Fatbin wrapper magic.
+ Values.addInt(IntTy, 0x466243b1);
+ // Fatbin version.
+ Values.addInt(IntTy, 1);
+ // Data.
+ Values.add(makeConstantString(GpuBinaryOrErr.get()->getBuffer(), "",
+ FatbinConstantName, 8));
+ // Unused in fatbin v1.
+ Values.add(llvm::ConstantPointerNull::get(VoidPtrTy));
+ llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
+ "__cuda_fatbin_wrapper", CGM.getPointerAlign(),
+ /*constant*/ true);
+ FatbinWrapper->setSection(FatbinSectionName);
+
+ // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
+ llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
+ RegisterFatbinFunc, CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
+ GpuBinaryHandle = new llvm::GlobalVariable(
+ TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
+ llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle");
+ CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
+ CGM.getPointerAlign());
+
+ // Call __cuda_register_globals(GpuBinaryHandle);
+ if (RegisterGlobalsFunc)
+ CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);
CtorBuilder.CreateRetVoid();
return ModuleCtorFunc;
}
-/// Creates a global destructor function that unregisters all GPU code blobs
+/// Creates a global destructor function that unregisters the GPU code blob
/// registered by constructor.
/// \code
/// void __cuda_module_dtor(void*) {
-/// __cudaUnregisterFatBinary(Handle0);
-/// ...
-/// __cudaUnregisterFatBinary(HandleN);
+/// __cudaUnregisterFatBinary(Handle);
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
- // No need for destructor if we don't have handles to unregister.
- if (GpuBinaryHandles.empty())
+ // No need for destructor if we don't have a handle to unregister.
+ if (!GpuBinaryHandle)
return nullptr;
// void __cudaUnregisterFatBinary(void ** handle);
@@ -364,11 +351,9 @@ llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
CGBuilderTy DtorBuilder(CGM, Context);
DtorBuilder.SetInsertPoint(DtorEntryBB);
- for (llvm::GlobalVariable *GpuBinaryHandle : GpuBinaryHandles) {
- auto HandleValue =
+ auto HandleValue =
DtorBuilder.CreateAlignedLoad(GpuBinaryHandle, CGM.getPointerAlign());
- DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
- }
+ DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
DtorBuilder.CreateRetVoid();
return ModuleDtorFunc;
diff --git a/lib/Driver/ToolChains/Clang.cpp b/lib/Driver/ToolChains/Clang.cpp
index 6ad38bc0e4..3683121b8a 100644
--- a/lib/Driver/ToolChains/Clang.cpp
+++ b/lib/Driver/ToolChains/Clang.cpp
@@ -4677,13 +4677,12 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
}
if (IsCuda) {
- // Host-side cuda compilation receives device-side outputs as Inputs[1...].
- // Include them with -fcuda-include-gpubinary.
+ // Host-side cuda compilation receives all device-side outputs in a single
+ // fatbin as Inputs[1]. Include the binary with -fcuda-include-gpubinary.
if (Inputs.size() > 1) {
- for (auto I = std::next(Inputs.begin()), E = Inputs.end(); I != E; ++I) {
- CmdArgs.push_back("-fcuda-include-gpubinary");
- CmdArgs.push_back(I->getFilename());
- }
+ assert(Inputs.size() == 2 && "More than one GPU binary!");
+ CmdArgs.push_back("-fcuda-include-gpubinary");
+ CmdArgs.push_back(Inputs[1].getFilename());
}
if (Args.hasFlag(options::OPT_fcuda_rdc, options::OPT_fno_cuda_rdc, false))
diff --git a/lib/Frontend/CompilerInvocation.cpp b/lib/Frontend/CompilerInvocation.cpp
index 4be390b49b..a217d356dc 100644
--- a/lib/Frontend/CompilerInvocation.cpp
+++ b/lib/Frontend/CompilerInvocation.cpp
@@ -1046,8 +1046,8 @@ static bool ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args, InputKind IK,
Args.getAllArgValues(OPT_fsanitize_trap_EQ), Diags,
Opts.SanitizeTrap);
- Opts.CudaGpuBinaryFileNames =
- Args.getAllArgValues(OPT_fcuda_include_gpubinary);
+ Opts.CudaGpuBinaryFileName =
+ Args.getLastArgValue(OPT_fcuda_include_gpubinary);
Opts.Backchain = Args.hasArg(OPT_mbackchain);
diff --git a/test/Driver/cuda-options.cu b/test/Driver/cuda-options.cu
index c4bfda903d..67d4fdeae2 100644
--- a/test/Driver/cuda-options.cu
+++ b/test/Driver/cuda-options.cu
@@ -73,11 +73,10 @@
// and that all results are included on the host side.
// RUN: %clang -### -target x86_64-linux-gnu \
// RUN: --cuda-gpu-arch=sm_35 --cuda-gpu-arch=sm_30 -c %s 2>&1 \
-// RUN: | FileCheck -check-prefix DEVICE -check-prefix DEVICE-NOSAVE \
-// RUN: -check-prefix DEVICE2 -check-prefix DEVICE-SM30 \
-// RUN: -check-prefix DEVICE2-SM35 -check-prefix HOST \
-// RUN: -check-prefix HOST-NOSAVE -check-prefix INCLUDES-DEVICE \
-// RUN: -check-prefix NOLINK %s
+// RUN: | FileCheck -check-prefixes DEVICE,DEVICE-NOSAVE,DEVICE2 \
+// RUN: -check-prefixes DEVICE-SM30,DEVICE2-SM35 \
+// RUN: -check-prefixes INCLUDES-DEVICE,INCLUDES-DEVICE2 \
+// RUN: -check-prefixes HOST,HOST-NOSAVE,NOLINK %s
// Verify that device-side results are passed to the correct tool when
// -save-temps is used.
@@ -182,9 +181,15 @@
// DEVICE2-SAME: "-aux-triple" "x86_64--linux-gnu"
// DEVICE2-SAME: "-fcuda-is-device"
// DEVICE2-SM35-SAME: "-target-cpu" "sm_35"
-// DEVICE2-SAME: "-o" "[[GPUBINARY2:[^"]*]]"
+// DEVICE2-SAME: "-o" "[[PTXFILE2:[^"]*]]"
// DEVICE2-SAME: "-x" "cuda"
+// Match another call to ptxas.
+// DEVICE2: ptxas
+// DEVICE2-SM35-DAG: "--gpu-name" "sm_35"
+// DEVICE2-DAG: "--output-file" "[[CUBINFILE2:[^"]*]]"
+// DEVICE2-DAG: "[[PTXFILE2]]"
+
// Match no device-side compilation.
// NODEVICE-NOT: "-cc1" "-triple" "nvptx64-nvidia-cuda"
// NODEVICE-NOT: "-fcuda-is-device"
@@ -193,6 +198,8 @@
// INCLUDES-DEVICE-DAG: "--create" "[[FATBINARY:[^"]*]]"
// INCLUDES-DEVICE-DAG: "--image=profile=sm_{{[0-9]+}},file=[[CUBINFILE]]"
// INCLUDES-DEVICE-DAG: "--image=profile=compute_{{[0-9]+}},file=[[PTXFILE]]"
+// INCLUDES-DEVICE2-DAG: "--image=profile=sm_{{[0-9]+}},file=[[CUBINFILE2]]"
+// INCLUDES-DEVICE2-DAG: "--image=profile=compute_{{[0-9]+}},file=[[PTXFILE2]]"
// Match host-side preprocessor job with -save-temps.
// HOST-SAVE: "-cc1" "-triple" "x86_64--linux-gnu"
@@ -207,7 +214,11 @@
// HOST-SAME: "-o" "[[HOSTOUTPUT:[^"]*]]"
// HOST-NOSAVE-SAME: "-x" "cuda"
// HOST-SAVE-SAME: "-x" "cuda-cpp-output"
+// There is only one GPU binary after combining it with fatbinary!
+// INCLUDES-DEVICE2-NOT: "-fcuda-include-gpubinary"
// INCLUDES-DEVICE-SAME: "-fcuda-include-gpubinary" "[[FATBINARY]]"
+// There is only one GPU binary after combining it with fatbinary.
+// INCLUDES-DEVICE2-NOT: "-fcuda-include-gpubinary"
// Match external assembler that uses compilation output.
// HOST-AS: "-o" "{{.*}}.o" "[[HOSTOUTPUT]]"