
Commit 67579ff

seven-mile and koparasy authored
[CIR][CIRGen][CUDA][NFC] Mirror CUDARuntime skeleton of OG (#1527)
This is a rebased version of the inactive PR #1380.

Co-authored-by: koparasy <parasyris1@llnl.gov>
1 parent 4d2fb6c commit 67579ff

File tree

5 files changed: +400, -347 lines

Diff for: clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp (+386 lines)
@@ -0,0 +1,386 @@
//===- CIRGenCUDANV.cpp - Interface to NVIDIA CUDA Runtime -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This provides an abstract class for CUDA CIR generation. Concrete
// subclasses of this implement code generation for specific CUDA
// runtime libraries.
//
//===----------------------------------------------------------------------===//

#include "CIRGenCUDARuntime.h"
#include "CIRGenCXXABI.h"
#include "CIRGenFunction.h"
#include "CIRGenModule.h"
#include "mlir/IR/Operation.h"
#include "clang/Basic/Cuda.h"
#include "clang/CIR/Dialect/IR/CIRTypes.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/raw_ostream.h"

using namespace clang;
using namespace clang::CIRGen;

static std::unique_ptr<MangleContext> initDeviceMC(CIRGenModule &cgm) {
  // If the host and device have different C++ ABIs, mark it as the device
  // mangle context so that the mangling needs to retrieve the additional
  // device lambda mangling number instead of the regular host one.
  if (cgm.getASTContext().getAuxTargetInfo() &&
      cgm.getASTContext().getTargetInfo().getCXXABI().isMicrosoft() &&
      cgm.getASTContext().getAuxTargetInfo()->getCXXABI().isItaniumFamily()) {
    return std::unique_ptr<MangleContext>(
        cgm.getASTContext().createDeviceMangleContext(
            *cgm.getASTContext().getAuxTargetInfo()));
  }

  return std::unique_ptr<MangleContext>(cgm.getASTContext().createMangleContext(
      cgm.getASTContext().getAuxTargetInfo()));
}
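
(For example, when CUDA is compiled with a Microsoft-ABI host and an Itanium-ABI device target, the two manglers assign lambda mangling numbers differently, so device-side names must come from a dedicated device mangle context; in the common case where host and device both use the Itanium ABI, a plain mangle context over the aux target suffices.)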

namespace {

class CIRGenNVCUDARuntime : public CIRGenCUDARuntime {
protected:
  StringRef Prefix;

  // Map a device stub function to a symbol for identifying kernel in host
  // code. For CUDA, the symbol for identifying the kernel is the same as the
  // device stub function. For HIP, they are different.
  llvm::DenseMap<StringRef, mlir::Operation *> KernelHandles;

  // Map a kernel handle to the kernel stub.
  llvm::DenseMap<mlir::Operation *, mlir::Operation *> KernelStubs;

  // Mangle context for device.
  std::unique_ptr<MangleContext> deviceMC;

private:
  void emitDeviceStubBodyLegacy(CIRGenFunction &cgf, cir::FuncOp fn,
                                FunctionArgList &args);
  void emitDeviceStubBodyNew(CIRGenFunction &cgf, cir::FuncOp fn,
                             FunctionArgList &args);
  std::string addPrefixToName(StringRef FuncName) const;
  std::string addUnderscoredPrefixToName(StringRef FuncName) const;

public:
  CIRGenNVCUDARuntime(CIRGenModule &cgm);
  ~CIRGenNVCUDARuntime();

  void emitDeviceStub(CIRGenFunction &cgf, cir::FuncOp fn,
                      FunctionArgList &args) override;

  mlir::Operation *getKernelHandle(cir::FuncOp fn, GlobalDecl GD) override;

  void internalizeDeviceSideVar(const VarDecl *d,
                                cir::GlobalLinkageKind &linkage) override;
  /// Returns function or variable name on device side even if the current
  /// compilation is for host.
  std::string getDeviceSideName(const NamedDecl *nd) override;
};

} // namespace

CIRGenCUDARuntime *clang::CIRGen::CreateNVCUDARuntime(CIRGenModule &cgm) {
  return new CIRGenNVCUDARuntime(cgm);
}

CIRGenNVCUDARuntime::~CIRGenNVCUDARuntime() {}

CIRGenNVCUDARuntime::CIRGenNVCUDARuntime(CIRGenModule &cgm)
    : CIRGenCUDARuntime(cgm), deviceMC(initDeviceMC(cgm)) {
  if (cgm.getLangOpts().OffloadViaLLVM)
    llvm_unreachable("NYI");
  else if (cgm.getLangOpts().HIP)
    Prefix = "hip";
  else
    Prefix = "cuda";
}

std::string CIRGenNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
  return (Prefix + FuncName).str();
}
std::string
CIRGenNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
  return ("__" + Prefix + FuncName).str();
}
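
Concretely, with Prefix == "cuda" these helpers produce names like "cudaLaunchKernel" and "__cudaPopCallConfiguration"; with Prefix == "hip", "hipLaunchKernel" and "__hipPopCallConfiguration".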

void CIRGenNVCUDARuntime::emitDeviceStubBodyLegacy(CIRGenFunction &cgf,
                                                   cir::FuncOp fn,
                                                   FunctionArgList &args) {
  llvm_unreachable("NYI");
}

void CIRGenNVCUDARuntime::emitDeviceStubBodyNew(CIRGenFunction &cgf,
                                                cir::FuncOp fn,
                                                FunctionArgList &args) {

  // This requires arguments to be sent to kernels in a different way.
  if (cgm.getLangOpts().OffloadViaLLVM)
    llvm_unreachable("NYI");

  auto &builder = cgm.getBuilder();

  // For [cuda|hip]LaunchKernel, we must add another layer of indirection
  // to arguments. For example, for function `add(int a, float b)`,
  // we need to pass it as `void *args[2] = { &a, &b }`.

  auto loc = fn.getLoc();
  auto voidPtrArrayTy =
      cir::ArrayType::get(&cgm.getMLIRContext(), cgm.VoidPtrTy, args.size());
  mlir::Value kernelArgs = builder.createAlloca(
      loc, cir::PointerType::get(voidPtrArrayTy), voidPtrArrayTy, "kernel_args",
      CharUnits::fromQuantity(16));

  mlir::Value kernelArgsDecayed =
      builder.createCast(cir::CastKind::array_to_ptrdecay, kernelArgs,
                         cir::PointerType::get(cgm.VoidPtrTy));

  // Store arguments into kernelArgs
  for (auto [i, arg] : llvm::enumerate(args)) {
    mlir::Value index =
        builder.getConstInt(loc, llvm::APInt(/*numBits=*/32, i));
    mlir::Value storePos =
        builder.createPtrStride(loc, kernelArgsDecayed, index);
    builder.CIRBaseBuilderTy::createStore(
        loc, cgf.GetAddrOfLocalVar(arg).getPointer(), storePos);
  }

  // We retrieve dim3 type by looking into the second argument of
  // cudaLaunchKernel, as is done in OG.
  TranslationUnitDecl *tuDecl = cgm.getASTContext().getTranslationUnitDecl();
  DeclContext *dc = TranslationUnitDecl::castToDeclContext(tuDecl);

  // The default stream is usually stream 0 (the legacy default stream).
  // For per-thread default stream, we need a different LaunchKernel function.
  if (cgm.getLangOpts().GPUDefaultStream ==
      LangOptions::GPUDefaultStreamKind::PerThread)
    llvm_unreachable("NYI");

  std::string launchAPI = addPrefixToName("LaunchKernel");
  const IdentifierInfo &launchII = cgm.getASTContext().Idents.get(launchAPI);
  FunctionDecl *launchFD = nullptr;
  for (auto *result : dc->lookup(&launchII)) {
    if (FunctionDecl *fd = dyn_cast<FunctionDecl>(result))
      launchFD = fd;
  }

  if (launchFD == nullptr) {
    cgm.Error(cgf.CurFuncDecl->getLocation(),
              "Can't find declaration for " + launchAPI);
    return;
  }

  // Use this function to retrieve arguments for cudaLaunchKernel:
  // int __[cuda|hip]PopCallConfiguration(dim3 *gridDim, dim3 *blockDim, size_t
  // *sharedMem, cudaStream_t *stream)
  //
  // Here [cuda|hip]Stream_t, while also being the 6th argument of
  // [cuda|hip]LaunchKernel, is a pointer to some opaque struct.

  mlir::Type dim3Ty =
      cgf.getTypes().convertType(launchFD->getParamDecl(1)->getType());
  mlir::Type streamTy =
      cgf.getTypes().convertType(launchFD->getParamDecl(5)->getType());

  mlir::Value gridDim =
      builder.createAlloca(loc, cir::PointerType::get(dim3Ty), dim3Ty,
                           "grid_dim", CharUnits::fromQuantity(8));
  mlir::Value blockDim =
      builder.createAlloca(loc, cir::PointerType::get(dim3Ty), dim3Ty,
                           "block_dim", CharUnits::fromQuantity(8));
  mlir::Value sharedMem =
      builder.createAlloca(loc, cir::PointerType::get(cgm.SizeTy), cgm.SizeTy,
                           "shared_mem", cgm.getSizeAlign());
  mlir::Value stream =
      builder.createAlloca(loc, cir::PointerType::get(streamTy), streamTy,
                           "stream", cgm.getPointerAlign());

  cir::FuncOp popConfig = cgm.createRuntimeFunction(
      cir::FuncType::get({gridDim.getType(), blockDim.getType(),
                          sharedMem.getType(), stream.getType()},
                         cgm.SInt32Ty),
      addUnderscoredPrefixToName("PopCallConfiguration"));
  cgf.emitRuntimeCall(loc, popConfig, {gridDim, blockDim, sharedMem, stream});

  // Now emit the call to cudaLaunchKernel
  // [cuda|hip]Error_t [cuda|hip]LaunchKernel(const void *func, dim3 gridDim,
  //                                          dim3 blockDim,
  //                                          void **args, size_t sharedMem,
  //                                          [cuda|hip]Stream_t stream);

  // We now either pick the function or the stub global for cuda, hip
  // respectively.
  auto kernel = [&]() {
    if (auto globalOp = llvm::dyn_cast_or_null<cir::GlobalOp>(
            KernelHandles[fn.getSymName()])) {
      auto kernelTy =
          cir::PointerType::get(&cgm.getMLIRContext(), globalOp.getSymType());
      mlir::Value kernel = builder.create<cir::GetGlobalOp>(
          loc, kernelTy, globalOp.getSymName());
      return kernel;
    }
    if (auto funcOp = llvm::dyn_cast_or_null<cir::FuncOp>(
            KernelHandles[fn.getSymName()])) {
      auto kernelTy = cir::PointerType::get(&cgm.getMLIRContext(),
                                            funcOp.getFunctionType());
      mlir::Value kernel =
          builder.create<cir::GetGlobalOp>(loc, kernelTy, funcOp.getSymName());
      mlir::Value func = builder.createBitcast(kernel, cgm.VoidPtrTy);
      return func;
    }
    assert(false && "Expected stub handle to be cir::GlobalOp or funcOp");
  }();
  CallArgList launchArgs;

  launchArgs.add(RValue::get(kernel), launchFD->getParamDecl(0)->getType());
  launchArgs.add(
      RValue::getAggregate(Address(gridDim, CharUnits::fromQuantity(8))),
      launchFD->getParamDecl(1)->getType());
  launchArgs.add(
      RValue::getAggregate(Address(blockDim, CharUnits::fromQuantity(8))),
      launchFD->getParamDecl(2)->getType());
  launchArgs.add(RValue::get(kernelArgsDecayed),
                 launchFD->getParamDecl(3)->getType());
  launchArgs.add(
      RValue::get(builder.CIRBaseBuilderTy::createLoad(loc, sharedMem)),
      launchFD->getParamDecl(4)->getType());
  launchArgs.add(RValue::get(builder.CIRBaseBuilderTy::createLoad(loc, stream)),
                 launchFD->getParamDecl(5)->getType());

  mlir::Type launchTy = cgm.getTypes().convertType(launchFD->getType());
  mlir::Operation *launchFn =
      cgm.createRuntimeFunction(cast<cir::FuncType>(launchTy), launchAPI);
  const auto &callInfo = cgm.getTypes().arrangeFunctionDeclaration(launchFD);
  cgf.emitCall(callInfo, CIRGenCallee::forDirect(launchFn), ReturnValueSlot(),
               launchArgs);
}
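
For context, the CIR emitted by emitDeviceStubBodyNew corresponds to the host-side stub below, written out as plain CUDA C++ for a hypothetical kernel add(int, float). This is an illustrative sketch only, not part of the commit; the declaration of __cudaPopCallConfiguration is normally provided by the CUDA runtime headers, and the int return type follows the comment in the code above.

    #include <cuda_runtime.h>

    __global__ void add(int a, float b); // hypothetical kernel

    // Normally declared by the CUDA runtime headers; shown for completeness.
    extern "C" int __cudaPopCallConfiguration(dim3 *gridDim, dim3 *blockDim,
                                              size_t *sharedMem,
                                              cudaStream_t *stream);

    // What the generated stub does, conceptually:
    void __device_stub__add(int a, float b) {
      // Extra layer of indirection: the runtime takes an array of pointers
      // to the arguments, not the argument values themselves.
      void *kernel_args[2] = {&a, &b};

      // Retrieve the launch configuration pushed by the <<<...>>> syntax.
      dim3 grid_dim, block_dim;
      size_t shared_mem;
      cudaStream_t stream;
      __cudaPopCallConfiguration(&grid_dim, &block_dim, &shared_mem, &stream);

      // Forward everything to the runtime's launch entry point.
      cudaLaunchKernel((const void *)add, grid_dim, block_dim, kernel_args,
                       shared_mem, stream);
    }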

void CIRGenNVCUDARuntime::emitDeviceStub(CIRGenFunction &cgf, cir::FuncOp fn,
                                         FunctionArgList &args) {
  if (auto globalOp =
          llvm::dyn_cast<cir::GlobalOp>(KernelHandles[fn.getSymName()])) {
    auto symbol = mlir::FlatSymbolRefAttr::get(fn.getSymNameAttr());
    // Set the initializer for the global
    cgm.setInitializer(globalOp, symbol);
  }
  // CUDA 9.0 changed the way to launch kernels.
  if (CudaFeatureEnabled(cgm.getTarget().getSDKVersion(),
                         CudaFeature::CUDA_USES_NEW_LAUNCH) ||
      (cgm.getLangOpts().HIP && cgm.getLangOpts().HIPUseNewLaunchAPI) ||
      cgm.getLangOpts().OffloadViaLLVM)
    emitDeviceStubBodyNew(cgf, fn, args);
  else
    emitDeviceStubBodyLegacy(cgf, fn, args);
}

mlir::Operation *CIRGenNVCUDARuntime::getKernelHandle(cir::FuncOp fn,
                                                      GlobalDecl GD) {

  // Check if we already have a kernel handle for this function
  auto Loc = KernelHandles.find(fn.getSymName());
  if (Loc != KernelHandles.end()) {
    auto OldHandle = Loc->second;
    // Here we know that the fn did not change. Return it
    if (KernelStubs[OldHandle] == fn)
      return OldHandle;

    // We've found the function name, but F itself has changed, so we need to
    // update the references.
    if (cgm.getLangOpts().HIP) {
      // For HIP compilation the handle itself does not change, so we only need
      // to update the Stub value.
      KernelStubs[OldHandle] = fn;
      return OldHandle;
    }
    // For non-HIP compilation, erase the old Stub and fall-through to creating
    // new entries.
    KernelStubs.erase(OldHandle);
  }

  // If not targeting HIP, store the function itself
  if (!cgm.getLangOpts().HIP) {
    KernelHandles[fn.getSymName()] = fn;
    KernelStubs[fn] = fn;
    return fn;
  }

  // Create a new CIR global variable to represent the kernel handle
  auto &builder = cgm.getBuilder();
  auto globalName = cgm.getMangledName(
      GD.getWithKernelReferenceKind(KernelReferenceKind::Kernel));
  auto globalOp = cgm.getOrInsertGlobal(
      fn->getLoc(), globalName, fn.getFunctionType(), [&] {
        return CIRGenModule::createGlobalOp(
            cgm, fn->getLoc(), globalName,
            builder.getPointerTo(fn.getFunctionType()), true,
            /*addrSpace=*/{},
            /*insertPoint=*/nullptr, fn.getLinkage());
      });

  globalOp->setAttr("alignment", builder.getI64IntegerAttr(
                                     cgm.getPointerAlign().getQuantity()));
  globalOp->setAttr("visibility", fn->getAttr("sym_visibility"));

  // Store references
  KernelHandles[fn.getSymName()] = globalOp;
  KernelStubs[globalOp] = fn;

  return globalOp;
}
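
Design note: under CUDA the stub cir.func itself serves as the kernel handle, so host code that takes the kernel's address gets the stub directly. Under HIP the handle is a distinct cir.global holding a pointer to the stub's function type, so the host-visible kernel symbol stays stable even when the stub is replaced, and KernelStubs maps each handle back to its current stub. This mirrors getKernelHandle in OG CodeGen's CGNVCUDARuntime.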

std::string CIRGenNVCUDARuntime::getDeviceSideName(const NamedDecl *nd) {
  GlobalDecl gd;
  // nd could be either a kernel or a variable.
  if (auto *fd = dyn_cast<FunctionDecl>(nd))
    gd = GlobalDecl(fd, KernelReferenceKind::Kernel);
  else
    gd = GlobalDecl(nd);
  std::string deviceSideName;
  MangleContext *mc;
  if (cgm.getLangOpts().CUDAIsDevice)
    mc = &cgm.getCXXABI().getMangleContext();
  else
    mc = deviceMC.get();
  if (mc->shouldMangleDeclName(nd)) {
    SmallString<256> buffer;
    llvm::raw_svector_ostream out(buffer);
    mc->mangleName(gd, out);
    deviceSideName = std::string(out.str());
  } else
    deviceSideName = std::string(nd->getIdentifier()->getName());

  // Make unique name for device side static file-scope variable for HIP.
  if (cgm.getASTContext().shouldExternalize(nd) &&
      cgm.getLangOpts().GPURelocatableDeviceCode) {
    SmallString<256> buffer;
    llvm::raw_svector_ostream out(buffer);
    out << deviceSideName;
    cgm.printPostfixForExternalizedDecl(out, nd);
    deviceSideName = std::string(out.str());
  }
  return deviceSideName;
}
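
For example (illustrative): a kernel declared as __global__ void add(int a, float b); has the Itanium-mangled device-side name _Z3addif, and that is the name returned here even during host compilation, which is why a device mangle context is kept around. The externalization postfix at the end applies only to HIP static file-scope device variables under relocatable device code, where a per-compilation suffix keeps otherwise-internal shadows unique across translation units.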

void CIRGenNVCUDARuntime::internalizeDeviceSideVar(
    const VarDecl *d, cir::GlobalLinkageKind &linkage) {
  if (cgm.getLangOpts().GPURelocatableDeviceCode)
    llvm_unreachable("NYI");

  // __shared__ variables are odd. Shadows do get created, but
  // they are not registered with the CUDA runtime, so they
  // can't really be used to access their device-side
  // counterparts. It's not clear yet whether it's nvcc's bug or
  // a feature, but we've got to do the same for compatibility.
  if (d->hasAttr<CUDADeviceAttr>() || d->hasAttr<CUDAConstantAttr>() ||
      d->hasAttr<CUDASharedAttr>()) {
    linkage = cir::GlobalLinkageKind::InternalLinkage;
  }

  if (d->getType()->isCUDADeviceBuiltinSurfaceType() ||
      d->getType()->isCUDADeviceBuiltinTextureType())
    llvm_unreachable("NYI");
}
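
For context, the variables this routine internalizes are host-side shadows of device globals, e.g. (illustrative CUDA declarations, not part of the commit):

    __device__ int counter;        // host shadow gets internal linkage
    __constant__ float coeffs[16]; // host shadow gets internal linkage

The host shadow exists only so the runtime can register and address the device-side copy, so it must not be exported as an external host symbol; __shared__ variables also get shadows, but as the comment above notes, those are never registered with the runtime.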

Comments (0)