|
| 1 | +//===- CIRGenCUDANV.cpp - Interface to NVIDIA CUDA Runtime -----===// |
| 2 | +// |
| 3 | +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | +// See https://llvm.org/LICENSE.txt for license information. |
| 5 | +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | +// |
| 7 | +//===----------------------------------------------------------------------===// |
| 8 | +// |
| 9 | +// This provides an abstract class for CUDA CIR generation. Concrete |
| 10 | +// subclasses of this implement code generation for specific CUDA
| 11 | +// runtime libraries. |
| 12 | +// |
| 13 | +//===----------------------------------------------------------------------===// |
| 14 | + |
| 15 | +#include "CIRGenCUDARuntime.h" |
| 16 | +#include "CIRGenCXXABI.h" |
| 17 | +#include "CIRGenFunction.h" |
| 18 | +#include "CIRGenModule.h" |
| 19 | +#include "mlir/IR/Operation.h" |
| 20 | +#include "clang/Basic/Cuda.h" |
| 21 | +#include "clang/CIR/Dialect/IR/CIRTypes.h" |
| 22 | +#include "llvm/Support/Casting.h" |
| 23 | +#include "llvm/Support/raw_ostream.h" |
| 24 | + |
| 25 | +using namespace clang; |
| 26 | +using namespace clang::CIRGen; |
| 27 | + |
| 28 | +static std::unique_ptr<MangleContext> initDeviceMC(CIRGenModule &cgm) { |
| 29 | + // If the host and device have different C++ ABIs, mark it as the device |
| 30 | + // mangle context so that the mangling needs to retrieve the additional |
| 31 | + // device lambda mangling number instead of the regular host one. |
| 32 | + if (cgm.getASTContext().getAuxTargetInfo() && |
| 33 | + cgm.getASTContext().getTargetInfo().getCXXABI().isMicrosoft() && |
| 34 | + cgm.getASTContext().getAuxTargetInfo()->getCXXABI().isItaniumFamily()) { |
| 35 | + return std::unique_ptr<MangleContext>( |
| 36 | + cgm.getASTContext().createDeviceMangleContext( |
| 37 | + *cgm.getASTContext().getAuxTargetInfo())); |
| 38 | + } |
| 39 | + |
| 40 | + return std::unique_ptr<MangleContext>(cgm.getASTContext().createMangleContext( |
| 41 | + cgm.getASTContext().getAuxTargetInfo())); |
| 42 | +} |
| 43 | + |
| 44 | +namespace { |
| 45 | + |
| 46 | +class CIRGenNVCUDARuntime : public CIRGenCUDARuntime { |
| 47 | +protected: |
| 48 | + StringRef Prefix; |
| 49 | + |
| 50 | +  // Map a device stub function to a symbol for identifying the kernel in
| 51 | +  // host code. For CUDA, the symbol for identifying the kernel is the same
| 52 | +  // as the device stub function. For HIP, they are different.
| 53 | + llvm::DenseMap<StringRef, mlir::Operation *> KernelHandles; |
| 54 | + |
| 55 | + // Map a kernel handle to the kernel stub. |
| 56 | + llvm::DenseMap<mlir::Operation *, mlir::Operation *> KernelStubs; |
| 57 | + |
| 58 | + // Mangle context for device. |
| 59 | + std::unique_ptr<MangleContext> deviceMC; |
| 60 | + |
| 61 | +private: |
| 62 | + void emitDeviceStubBodyLegacy(CIRGenFunction &cgf, cir::FuncOp fn, |
| 63 | + FunctionArgList &args); |
| 64 | + void emitDeviceStubBodyNew(CIRGenFunction &cgf, cir::FuncOp fn, |
| 65 | + FunctionArgList &args); |
| 66 | + std::string addPrefixToName(StringRef FuncName) const; |
| 67 | + std::string addUnderscoredPrefixToName(StringRef FuncName) const; |
| 68 | + |
| 69 | +public: |
| 70 | + CIRGenNVCUDARuntime(CIRGenModule &cgm); |
| 71 | + ~CIRGenNVCUDARuntime(); |
| 72 | + |
| 73 | + void emitDeviceStub(CIRGenFunction &cgf, cir::FuncOp fn, |
| 74 | + FunctionArgList &args) override; |
| 75 | + |
| 76 | + mlir::Operation *getKernelHandle(cir::FuncOp fn, GlobalDecl GD) override; |
| 77 | + |
| 78 | + void internalizeDeviceSideVar(const VarDecl *d, |
| 79 | + cir::GlobalLinkageKind &linkage) override; |
| 80 | + /// Returns function or variable name on device side even if the current |
| 81 | + /// compilation is for host. |
| 82 | + std::string getDeviceSideName(const NamedDecl *nd) override; |
| 83 | +}; |
| 84 | + |
| 85 | +} // namespace |
| 86 | + |
| 87 | +CIRGenCUDARuntime *clang::CIRGen::CreateNVCUDARuntime(CIRGenModule &cgm) { |
| 88 | + return new CIRGenNVCUDARuntime(cgm); |
| 89 | +} |
| 90 | + |
| 91 | +CIRGenNVCUDARuntime::~CIRGenNVCUDARuntime() {} |
| 92 | + |
| 93 | +CIRGenNVCUDARuntime::CIRGenNVCUDARuntime(CIRGenModule &cgm) |
| 94 | + : CIRGenCUDARuntime(cgm), deviceMC(initDeviceMC(cgm)) { |
| 95 | + if (cgm.getLangOpts().OffloadViaLLVM) |
| 96 | + llvm_unreachable("NYI"); |
| 97 | + else if (cgm.getLangOpts().HIP) |
| 98 | + Prefix = "hip"; |
| 99 | + else |
| 100 | + Prefix = "cuda"; |
| 101 | +} |
| 102 | + |
| 103 | +std::string CIRGenNVCUDARuntime::addPrefixToName(StringRef FuncName) const { |
| 104 | + return (Prefix + FuncName).str(); |
| 105 | +} |
| 106 | +std::string |
| 107 | +CIRGenNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const { |
| 108 | + return ("__" + Prefix + FuncName).str(); |
| 109 | +} |
| 110 | + |
| 111 | +void CIRGenNVCUDARuntime::emitDeviceStubBodyLegacy(CIRGenFunction &cgf, |
| 112 | + cir::FuncOp fn, |
| 113 | + FunctionArgList &args) { |
| 114 | + llvm_unreachable("NYI"); |
| 115 | +} |
| 116 | + |
| 117 | +void CIRGenNVCUDARuntime::emitDeviceStubBodyNew(CIRGenFunction &cgf, |
| 118 | + cir::FuncOp fn, |
| 119 | + FunctionArgList &args) { |
| 120 | + |
| 121 | + // This requires arguments to be sent to kernels in a different way. |
| 122 | + if (cgm.getLangOpts().OffloadViaLLVM) |
| 123 | + llvm_unreachable("NYI"); |
| 124 | + |
| 125 | + auto &builder = cgm.getBuilder(); |
| 126 | + |
| 127 | + // For [cuda|hip]LaunchKernel, we must add another layer of indirection |
| 128 | + // to arguments. For example, for function `add(int a, float b)`, |
| 129 | + // we need to pass it as `void *args[2] = { &a, &b }`. |
| 130 | + |
| 131 | + auto loc = fn.getLoc(); |
| 132 | + auto voidPtrArrayTy = |
| 133 | + cir::ArrayType::get(&cgm.getMLIRContext(), cgm.VoidPtrTy, args.size()); |
| 134 | + mlir::Value kernelArgs = builder.createAlloca( |
| 135 | + loc, cir::PointerType::get(voidPtrArrayTy), voidPtrArrayTy, "kernel_args", |
| 136 | + CharUnits::fromQuantity(16)); |
| 137 | + |
| 138 | + mlir::Value kernelArgsDecayed = |
| 139 | + builder.createCast(cir::CastKind::array_to_ptrdecay, kernelArgs, |
| 140 | + cir::PointerType::get(cgm.VoidPtrTy)); |
| 141 | + |
| 142 | + // Store arguments into kernelArgs |
| 143 | + for (auto [i, arg] : llvm::enumerate(args)) { |
| 144 | + mlir::Value index = |
| 145 | + builder.getConstInt(loc, llvm::APInt(/*numBits=*/32, i)); |
| 146 | + mlir::Value storePos = |
| 147 | + builder.createPtrStride(loc, kernelArgsDecayed, index); |
| 148 | + builder.CIRBaseBuilderTy::createStore( |
| 149 | + loc, cgf.GetAddrOfLocalVar(arg).getPointer(), storePos); |
| 150 | + } |
| 151 | + |
| 152 | +  // We retrieve the dim3 type by looking at the second argument of
| 153 | +  // cudaLaunchKernel, as is done in OG CodeGen.
| 154 | + TranslationUnitDecl *tuDecl = cgm.getASTContext().getTranslationUnitDecl(); |
| 155 | + DeclContext *dc = TranslationUnitDecl::castToDeclContext(tuDecl); |
| 156 | + |
| 157 | + // The default stream is usually stream 0 (the legacy default stream). |
| 158 | + // For per-thread default stream, we need a different LaunchKernel function. |
| 159 | + if (cgm.getLangOpts().GPUDefaultStream == |
| 160 | + LangOptions::GPUDefaultStreamKind::PerThread) |
| 161 | + llvm_unreachable("NYI"); |
| 162 | + |
| 163 | + std::string launchAPI = addPrefixToName("LaunchKernel"); |
| 164 | + const IdentifierInfo &launchII = cgm.getASTContext().Idents.get(launchAPI); |
| 165 | + FunctionDecl *launchFD = nullptr; |
| 166 | + for (auto *result : dc->lookup(&launchII)) { |
| 167 | + if (FunctionDecl *fd = dyn_cast<FunctionDecl>(result)) |
| 168 | + launchFD = fd; |
| 169 | + } |
| 170 | + |
| 171 | + if (launchFD == nullptr) { |
| 172 | + cgm.Error(cgf.CurFuncDecl->getLocation(), |
| 173 | + "Can't find declaration for " + launchAPI); |
| 174 | + return; |
| 175 | + } |
| 176 | + |
| 177 | + // Use this function to retrieve arguments for cudaLaunchKernel: |
| 178 | + // int __[cuda|hip]PopCallConfiguration(dim3 *gridDim, dim3 *blockDim, size_t |
| 179 | + // *sharedMem, cudaStream_t *stream) |
| 180 | + // |
| 181 | + // Here [cuda|hip]Stream_t, while also being the 6th argument of |
| 182 | + // [cuda|hip]LaunchKernel, is a pointer to some opaque struct. |
| 183 | + |
| 184 | + mlir::Type dim3Ty = |
| 185 | + cgf.getTypes().convertType(launchFD->getParamDecl(1)->getType()); |
| 186 | + mlir::Type streamTy = |
| 187 | + cgf.getTypes().convertType(launchFD->getParamDecl(5)->getType()); |
| 188 | + |
| 189 | + mlir::Value gridDim = |
| 190 | + builder.createAlloca(loc, cir::PointerType::get(dim3Ty), dim3Ty, |
| 191 | + "grid_dim", CharUnits::fromQuantity(8)); |
| 192 | + mlir::Value blockDim = |
| 193 | + builder.createAlloca(loc, cir::PointerType::get(dim3Ty), dim3Ty, |
| 194 | + "block_dim", CharUnits::fromQuantity(8)); |
| 195 | + mlir::Value sharedMem = |
| 196 | + builder.createAlloca(loc, cir::PointerType::get(cgm.SizeTy), cgm.SizeTy, |
| 197 | + "shared_mem", cgm.getSizeAlign()); |
| 198 | + mlir::Value stream = |
| 199 | + builder.createAlloca(loc, cir::PointerType::get(streamTy), streamTy, |
| 200 | + "stream", cgm.getPointerAlign()); |
| 201 | + |
| 202 | + cir::FuncOp popConfig = cgm.createRuntimeFunction( |
| 203 | + cir::FuncType::get({gridDim.getType(), blockDim.getType(), |
| 204 | + sharedMem.getType(), stream.getType()}, |
| 205 | + cgm.SInt32Ty), |
| 206 | + addUnderscoredPrefixToName("PopCallConfiguration")); |
| 207 | + cgf.emitRuntimeCall(loc, popConfig, {gridDim, blockDim, sharedMem, stream}); |
| 208 | + |
| 209 | + // Now emit the call to cudaLaunchKernel |
| 210 | + // [cuda|hip]Error_t [cuda|hip]LaunchKernel(const void *func, dim3 gridDim, |
| 211 | + // dim3 blockDim, |
| 212 | + // void **args, size_t sharedMem, |
| 213 | + // [cuda|hip]Stream_t stream); |
| 214 | + |
| 215 | +  // We now pick either the stub function (CUDA) or the kernel handle
| 216 | +  // global (HIP) to pass as the kernel argument.
| 217 | + auto kernel = [&]() { |
| 218 | + if (auto globalOp = llvm::dyn_cast_or_null<cir::GlobalOp>( |
| 219 | + KernelHandles[fn.getSymName()])) { |
| 220 | + auto kernelTy = |
| 221 | + cir::PointerType::get(&cgm.getMLIRContext(), globalOp.getSymType()); |
| 222 | + mlir::Value kernel = builder.create<cir::GetGlobalOp>( |
| 223 | + loc, kernelTy, globalOp.getSymName()); |
| 224 | + return kernel; |
| 225 | + } |
| 226 | + if (auto funcOp = llvm::dyn_cast_or_null<cir::FuncOp>( |
| 227 | + KernelHandles[fn.getSymName()])) { |
| 228 | + auto kernelTy = cir::PointerType::get(&cgm.getMLIRContext(), |
| 229 | + funcOp.getFunctionType()); |
| 230 | + mlir::Value kernel = |
| 231 | + builder.create<cir::GetGlobalOp>(loc, kernelTy, funcOp.getSymName()); |
| 232 | + mlir::Value func = builder.createBitcast(kernel, cgm.VoidPtrTy); |
| 233 | + return func; |
| 234 | + } |
| 235 | +    llvm_unreachable("Expected stub handle to be GlobalOp or FuncOp");
| 236 | + }(); |
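|  | +  // Assemble the six [cuda|hip]LaunchKernel arguments: the kernel handle,
|  | +  // the grid and block dimensions (as dim3 aggregates), the argument array,
|  | +  // the shared memory size, and the stream.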
| 238 | + CallArgList launchArgs; |
| 239 | + |
| 240 | + launchArgs.add(RValue::get(kernel), launchFD->getParamDecl(0)->getType()); |
| 241 | + launchArgs.add( |
| 242 | + RValue::getAggregate(Address(gridDim, CharUnits::fromQuantity(8))), |
| 243 | + launchFD->getParamDecl(1)->getType()); |
| 244 | + launchArgs.add( |
| 245 | + RValue::getAggregate(Address(blockDim, CharUnits::fromQuantity(8))), |
| 246 | + launchFD->getParamDecl(2)->getType()); |
| 247 | + launchArgs.add(RValue::get(kernelArgsDecayed), |
| 248 | + launchFD->getParamDecl(3)->getType()); |
| 249 | + launchArgs.add( |
| 250 | + RValue::get(builder.CIRBaseBuilderTy::createLoad(loc, sharedMem)), |
| 251 | + launchFD->getParamDecl(4)->getType()); |
| 252 | + launchArgs.add(RValue::get(builder.CIRBaseBuilderTy::createLoad(loc, stream)), |
| 253 | + launchFD->getParamDecl(5)->getType()); |
| 254 | + |
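|  | +  // Declare (or reuse) the [cuda|hip]LaunchKernel runtime function and emit
|  | +  // the call through the regular call lowering so the dim3 aggregates are
|  | +  // passed according to the host ABI.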
| 255 | + mlir::Type launchTy = cgm.getTypes().convertType(launchFD->getType()); |
| 256 | + mlir::Operation *launchFn = |
| 257 | + cgm.createRuntimeFunction(cast<cir::FuncType>(launchTy), launchAPI); |
| 258 | + const auto &callInfo = cgm.getTypes().arrangeFunctionDeclaration(launchFD); |
| 259 | + cgf.emitCall(callInfo, CIRGenCallee::forDirect(launchFn), ReturnValueSlot(), |
| 260 | + launchArgs); |
| 261 | +} |
| 262 | + |
| 263 | +void CIRGenNVCUDARuntime::emitDeviceStub(CIRGenFunction &cgf, cir::FuncOp fn, |
| 264 | + FunctionArgList &args) { |
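|  | +  // For HIP, the kernel handle is a separate global; initialize it to point
|  | +  // at the stub function. For CUDA, the handle is the stub itself, so there
|  | +  // is nothing to do here.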
| 265 | + if (auto globalOp = |
| 266 | + llvm::dyn_cast<cir::GlobalOp>(KernelHandles[fn.getSymName()])) { |
| 267 | + auto symbol = mlir::FlatSymbolRefAttr::get(fn.getSymNameAttr()); |
| 268 | + // Set the initializer for the global |
| 269 | + cgm.setInitializer(globalOp, symbol); |
| 270 | + } |
| 271 | + // CUDA 9.0 changed the way to launch kernels. |
| 272 | + if (CudaFeatureEnabled(cgm.getTarget().getSDKVersion(), |
| 273 | + CudaFeature::CUDA_USES_NEW_LAUNCH) || |
| 274 | + (cgm.getLangOpts().HIP && cgm.getLangOpts().HIPUseNewLaunchAPI) || |
| 275 | + cgm.getLangOpts().OffloadViaLLVM) |
| 276 | + emitDeviceStubBodyNew(cgf, fn, args); |
| 277 | + else |
| 278 | + emitDeviceStubBodyLegacy(cgf, fn, args); |
| 279 | +} |
| 280 | + |
| 281 | +mlir::Operation *CIRGenNVCUDARuntime::getKernelHandle(cir::FuncOp fn, |
| 282 | + GlobalDecl GD) { |
| 283 | + |
| 284 | + // Check if we already have a kernel handle for this function |
| 285 | + auto Loc = KernelHandles.find(fn.getSymName()); |
| 286 | + if (Loc != KernelHandles.end()) { |
| 287 | + auto OldHandle = Loc->second; |
| 288 | +    // Here we know that fn did not change; return it.
| 289 | + if (KernelStubs[OldHandle] == fn) |
| 290 | + return OldHandle; |
| 291 | + |
| 292 | +    // We've found the function name, but fn itself has changed, so we need to
| 293 | + // update the references. |
| 294 | + if (cgm.getLangOpts().HIP) { |
| 295 | + // For HIP compilation the handle itself does not change, so we only need |
| 296 | + // to update the Stub value. |
| 297 | + KernelStubs[OldHandle] = fn; |
| 298 | + return OldHandle; |
| 299 | + } |
| 300 | + // For non-HIP compilation, erase the old Stub and fall-through to creating |
| 301 | + // new entries. |
| 302 | + KernelStubs.erase(OldHandle); |
| 303 | + } |
| 304 | + |
| 305 | + // If not targeting HIP, store the function itself |
| 306 | + if (!cgm.getLangOpts().HIP) { |
| 307 | + KernelHandles[fn.getSymName()] = fn; |
| 308 | + KernelStubs[fn] = fn; |
| 309 | + return fn; |
| 310 | + } |
| 311 | + |
| 312 | + // Create a new CIR global variable to represent the kernel handle |
| 313 | + auto &builder = cgm.getBuilder(); |
| 314 | + auto globalName = cgm.getMangledName( |
| 315 | + GD.getWithKernelReferenceKind(KernelReferenceKind::Kernel)); |
| 316 | + auto globalOp = cgm.getOrInsertGlobal( |
| 317 | + fn->getLoc(), globalName, fn.getFunctionType(), [&] { |
| 318 | + return CIRGenModule::createGlobalOp( |
| 319 | + cgm, fn->getLoc(), globalName, |
| 320 | + builder.getPointerTo(fn.getFunctionType()), true, /* addrSpace=*/{}, |
| 321 | + /*insertPoint=*/nullptr, fn.getLinkage()); |
| 322 | + }); |
| 323 | + |
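|  | +  // The handle is a pointer-sized global; give it pointer alignment and the
|  | +  // same visibility as the stub function.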
| 324 | + globalOp->setAttr("alignment", builder.getI64IntegerAttr( |
| 325 | + cgm.getPointerAlign().getQuantity())); |
| 326 | + globalOp->setAttr("visibility", fn->getAttr("sym_visibility")); |
| 327 | + |
| 328 | + // Store references |
| 329 | + KernelHandles[fn.getSymName()] = globalOp; |
| 330 | + KernelStubs[globalOp] = fn; |
| 331 | + |
| 332 | + return globalOp; |
| 333 | +} |
| 334 | + |
| 335 | +std::string CIRGenNVCUDARuntime::getDeviceSideName(const NamedDecl *nd) { |
| 336 | + GlobalDecl gd; |
| 337 | + // nd could be either a kernel or a variable. |
| 338 | + if (auto *fd = dyn_cast<FunctionDecl>(nd)) |
| 339 | + gd = GlobalDecl(fd, KernelReferenceKind::Kernel); |
| 340 | + else |
| 341 | + gd = GlobalDecl(nd); |
| 342 | + std::string deviceSideName; |
| 343 | + MangleContext *mc; |
| 344 | + if (cgm.getLangOpts().CUDAIsDevice) |
| 345 | + mc = &cgm.getCXXABI().getMangleContext(); |
| 346 | + else |
| 347 | + mc = deviceMC.get(); |
| 348 | + if (mc->shouldMangleDeclName(nd)) { |
| 349 | + SmallString<256> buffer; |
| 350 | + llvm::raw_svector_ostream out(buffer); |
| 351 | + mc->mangleName(gd, out); |
| 352 | + deviceSideName = std::string(out.str()); |
| 353 | + } else |
| 354 | + deviceSideName = std::string(nd->getIdentifier()->getName()); |
| 355 | + |
| 356 | + // Make unique name for device side static file-scope variable for HIP. |
| 357 | + if (cgm.getASTContext().shouldExternalize(nd) && |
| 358 | + cgm.getLangOpts().GPURelocatableDeviceCode) { |
| 359 | + SmallString<256> buffer; |
| 360 | + llvm::raw_svector_ostream out(buffer); |
| 361 | + out << deviceSideName; |
| 362 | + cgm.printPostfixForExternalizedDecl(out, nd); |
| 363 | + deviceSideName = std::string(out.str()); |
| 364 | + } |
| 365 | + return deviceSideName; |
| 366 | +} |
| 367 | + |
| 368 | +void CIRGenNVCUDARuntime::internalizeDeviceSideVar( |
| 369 | + const VarDecl *d, cir::GlobalLinkageKind &linkage) { |
| 370 | + if (cgm.getLangOpts().GPURelocatableDeviceCode) |
| 371 | + llvm_unreachable("NYI"); |
| 372 | + |
| 373 | + // __shared__ variables are odd. Shadows do get created, but |
| 374 | + // they are not registered with the CUDA runtime, so they |
| 375 | + // can't really be used to access their device-side |
| 376 | + // counterparts. It's not clear yet whether it's nvcc's bug or |
| 377 | + // a feature, but we've got to do the same for compatibility. |
| 378 | + if (d->hasAttr<CUDADeviceAttr>() || d->hasAttr<CUDAConstantAttr>() || |
| 379 | + d->hasAttr<CUDASharedAttr>()) { |
| 380 | + linkage = cir::GlobalLinkageKind::InternalLinkage; |
| 381 | + } |
| 382 | + |
| 383 | + if (d->getType()->isCUDADeviceBuiltinSurfaceType() || |
| 384 | + d->getType()->isCUDADeviceBuiltinTextureType()) |
| 385 | + llvm_unreachable("NYI"); |
| 386 | +} |