hiprtc_runtime.h 550 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
74778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527
75278527952805281528252835284528552865287528852895290529152925293529452955296529752985299530053015302530353045305530653075308530953105311531253135314531553165317531853195320532153225323532453255326532753285329533053315332533353345335533653375338533953405341534253435344534553465347534853495350535153525353535453555356535753585359536053615362536353645365536653675368536953705371537253735374537553765377537853795380538153825383538453855386538753885389539053915392539353945395539653975398539954005401540254035404540554065407540854095410541154125413541454155416541754185419542054215422542354245425542654275428542954305431543254335434543554365437543854395440544154425443544454455446544754485449545054515452545354545455545654575458545954605461546254635464546554665467546854695470547154725473547454755476547754785479548054815482548354845485548654875488548954905491549254935494549554965497549854995500550155025503550455055506550755085509551055115512551355145515551655175518551955205521552255235524552555265527552855295530553155325533553455355536553755385539554055415542554355445545554655475548554955505551555255535554555555565557555855595560556155625563556455655566556755685569557055715572557355745575557655775578557955805581558255835584558555865587558855895590559155925593559455955596559755985599560056015602560356045605560656075608560956105611561256135614561556165617561856195620562156225623562456255626562756285629563056315632563356345635563656375638563956405641564256435644564556465647564856495650565156525653565456555656565756585659566056615662566356645665566656675668566956705671567256735674567556765677567856795680568156825683568456855686568756885689569056915692569356945695569656975698569957005701570257035704570557065707570857095710571157125713571457155716571757185719572057215722572357245725572657275728572957305731573257335734573557365737573857395740574157425743574457455746574757485749575057515752575357545755575657575758575957605761576257635764576557665767576857695770577157725773577457755776577
75778577957805781578257835784578557865787578857895790579157925793579457955796579757985799580058015802580358045805580658075808580958105811581258135814581558165817581858195820582158225823582458255826582758285829583058315832583358345835583658375838583958405841584258435844584558465847584858495850585158525853585458555856585758585859586058615862586358645865586658675868586958705871587258735874587558765877587858795880588158825883588458855886588758885889589058915892589358945895589658975898589959005901590259035904590559065907590859095910591159125913591459155916591759185919592059215922592359245925592659275928592959305931593259335934593559365937593859395940594159425943594459455946594759485949595059515952595359545955595659575958595959605961596259635964596559665967596859695970597159725973597459755976597759785979598059815982598359845985598659875988598959905991599259935994599559965997599859996000600160026003600460056006600760086009601060116012601360146015601660176018601960206021602260236024602560266027602860296030603160326033603460356036603760386039604060416042604360446045604660476048604960506051605260536054605560566057605860596060606160626063606460656066606760686069607060716072607360746075607660776078607960806081608260836084608560866087608860896090609160926093609460956096609760986099610061016102610361046105610661076108610961106111611261136114611561166117611861196120612161226123612461256126612761286129613061316132613361346135613661376138613961406141614261436144614561466147614861496150615161526153615461556156615761586159616061616162616361646165616661676168616961706171617261736174617561766177617861796180618161826183618461856186618761886189619061916192619361946195619661976198619962006201620262036204620562066207620862096210621162126213621462156216621762186219622062216222622362246225622662276228622962306231623262336234623562366237623862396240624162426243624462456246624762486249625062516252625362546255625662576258625962606261626262636264626562666267626862696270627162726273627462756276627
76278627962806281628262836284628562866287628862896290629162926293629462956296629762986299630063016302630363046305630663076308630963106311631263136314631563166317631863196320632163226323632463256326632763286329633063316332633363346335633663376338633963406341634263436344634563466347634863496350635163526353635463556356635763586359636063616362636363646365636663676368636963706371637263736374637563766377637863796380638163826383638463856386638763886389639063916392639363946395639663976398639964006401640264036404640564066407640864096410641164126413641464156416641764186419642064216422642364246425642664276428642964306431643264336434643564366437643864396440644164426443644464456446644764486449645064516452645364546455645664576458645964606461646264636464646564666467646864696470647164726473647464756476647764786479648064816482648364846485648664876488648964906491649264936494649564966497649864996500650165026503650465056506650765086509651065116512651365146515651665176518651965206521652265236524652565266527652865296530653165326533653465356536653765386539654065416542654365446545654665476548654965506551655265536554655565566557655865596560656165626563656465656566656765686569657065716572657365746575657665776578657965806581658265836584658565866587658865896590659165926593659465956596659765986599660066016602660366046605660666076608660966106611661266136614661566166617661866196620662166226623662466256626662766286629663066316632663366346635663666376638663966406641664266436644664566466647664866496650665166526653665466556656665766586659666066616662666366646665666666676668666966706671667266736674667566766677667866796680668166826683668466856686668766886689669066916692669366946695669666976698669967006701670267036704670567066707670867096710671167126713671467156716671767186719672067216722672367246725672667276728672967306731673267336734673567366737673867396740674167426743674467456746674767486749675067516752675367546755675667576758675967606761676267636764676567666767676867696770677167726773677467756776677
76778677967806781678267836784678567866787678867896790679167926793679467956796679767986799680068016802680368046805680668076808680968106811681268136814681568166817681868196820682168226823682468256826682768286829683068316832683368346835683668376838683968406841684268436844684568466847684868496850685168526853685468556856685768586859686068616862686368646865686668676868686968706871687268736874687568766877687868796880688168826883688468856886688768886889689068916892689368946895689668976898689969006901690269036904690569066907690869096910691169126913691469156916691769186919692069216922692369246925692669276928692969306931693269336934693569366937693869396940694169426943694469456946694769486949695069516952695369546955695669576958695969606961696269636964696569666967696869696970697169726973697469756976697769786979698069816982698369846985698669876988698969906991699269936994699569966997699869997000700170027003700470057006700770087009701070117012701370147015701670177018701970207021702270237024702570267027702870297030703170327033703470357036703770387039704070417042704370447045704670477048704970507051705270537054705570567057705870597060706170627063706470657066706770687069707070717072707370747075707670777078707970807081708270837084708570867087708870897090709170927093709470957096709770987099710071017102710371047105710671077108710971107111711271137114711571167117711871197120712171227123712471257126712771287129713071317132713371347135713671377138713971407141714271437144714571467147714871497150715171527153715471557156715771587159716071617162716371647165716671677168716971707171717271737174717571767177717871797180718171827183718471857186718771887189719071917192719371947195719671977198719972007201720272037204720572067207720872097210721172127213721472157216721772187219722072217222722372247225722672277228722972307231723272337234723572367237723872397240724172427243724472457246724772487249725072517252725372547255725672577258725972607261726272637264726572667267726872697270727172727273727472757276727
77278727972807281728272837284728572867287728872897290729172927293729472957296729772987299730073017302730373047305730673077308730973107311731273137314731573167317731873197320732173227323732473257326732773287329733073317332733373347335733673377338733973407341734273437344734573467347734873497350735173527353735473557356735773587359736073617362736373647365736673677368736973707371737273737374737573767377737873797380738173827383738473857386738773887389739073917392739373947395739673977398739974007401740274037404740574067407740874097410741174127413741474157416741774187419742074217422742374247425742674277428742974307431743274337434743574367437743874397440744174427443744474457446744774487449745074517452745374547455745674577458745974607461746274637464746574667467746874697470747174727473747474757476747774787479748074817482748374847485748674877488748974907491749274937494749574967497749874997500750175027503750475057506750775087509751075117512751375147515751675177518751975207521752275237524752575267527752875297530753175327533753475357536753775387539754075417542754375447545754675477548754975507551755275537554755575567557755875597560756175627563756475657566756775687569757075717572757375747575757675777578757975807581758275837584758575867587758875897590759175927593759475957596759775987599760076017602760376047605760676077608760976107611761276137614761576167617761876197620762176227623762476257626762776287629763076317632763376347635763676377638763976407641764276437644764576467647764876497650765176527653765476557656765776587659766076617662766376647665766676677668766976707671767276737674767576767677767876797680768176827683768476857686768776887689769076917692769376947695769676977698769977007701770277037704770577067707770877097710771177127713771477157716771777187719772077217722772377247725772677277728772977307731773277337734773577367737773877397740774177427743774477457746774777487749775077517752775377547755775677577758775977607761776277637764776577667767776877697770777177727773777477757776777
77778777977807781778277837784778577867787778877897790779177927793779477957796779777987799780078017802780378047805780678077808780978107811781278137814781578167817781878197820782178227823782478257826782778287829783078317832783378347835783678377838783978407841784278437844784578467847784878497850785178527853785478557856785778587859786078617862786378647865786678677868786978707871787278737874787578767877787878797880788178827883788478857886788778887889789078917892789378947895789678977898789979007901790279037904790579067907790879097910791179127913791479157916791779187919792079217922792379247925792679277928792979307931793279337934793579367937793879397940794179427943794479457946794779487949795079517952795379547955795679577958795979607961796279637964796579667967796879697970797179727973797479757976797779787979798079817982798379847985798679877988798979907991799279937994799579967997799879998000800180028003800480058006800780088009801080118012801380148015801680178018801980208021802280238024802580268027802880298030803180328033803480358036803780388039804080418042804380448045804680478048804980508051805280538054805580568057805880598060806180628063806480658066806780688069807080718072807380748075807680778078807980808081808280838084808580868087808880898090809180928093809480958096809780988099810081018102810381048105810681078108810981108111811281138114811581168117811881198120812181228123812481258126812781288129813081318132813381348135813681378138813981408141814281438144814581468147814881498150815181528153815481558156815781588159816081618162816381648165816681678168816981708171817281738174817581768177817881798180818181828183818481858186818781888189819081918192819381948195819681978198819982008201820282038204820582068207820882098210821182128213821482158216821782188219822082218222822382248225822682278228822982308231823282338234823582368237823882398240824182428243824482458246824782488249825082518252825382548255825682578258825982608261826282638264826582668267826882698270827182728273827482758276827
78278827982808281828282838284828582868287828882898290829182928293829482958296829782988299830083018302830383048305830683078308830983108311831283138314831583168317831883198320832183228323832483258326832783288329833083318332833383348335833683378338833983408341834283438344834583468347834883498350835183528353835483558356835783588359836083618362836383648365836683678368836983708371837283738374837583768377837883798380838183828383838483858386838783888389839083918392839383948395839683978398839984008401840284038404840584068407840884098410841184128413841484158416841784188419842084218422842384248425842684278428842984308431843284338434843584368437843884398440844184428443844484458446844784488449845084518452845384548455845684578458845984608461846284638464846584668467846884698470847184728473847484758476847784788479848084818482848384848485848684878488848984908491849284938494849584968497849884998500850185028503850485058506850785088509851085118512851385148515851685178518851985208521852285238524852585268527852885298530853185328533853485358536853785388539854085418542854385448545854685478548854985508551855285538554855585568557855885598560856185628563856485658566856785688569857085718572857385748575857685778578857985808581858285838584858585868587858885898590859185928593859485958596859785988599860086018602860386048605860686078608860986108611861286138614861586168617861886198620862186228623862486258626862786288629863086318632863386348635863686378638863986408641864286438644864586468647864886498650865186528653865486558656865786588659866086618662866386648665866686678668866986708671867286738674867586768677867886798680868186828683868486858686868786888689869086918692869386948695869686978698869987008701870287038704870587068707870887098710871187128713871487158716871787188719872087218722872387248725872687278728872987308731873287338734873587368737873887398740874187428743874487458746874787488749875087518752875387548755875687578758875987608761876287638764876587668767876887698770877187728773877487758776877
78778877987808781878287838784878587868787878887898790879187928793879487958796879787988799880088018802880388048805880688078808880988108811881288138814881588168817881888198820882188228823882488258826882788288829883088318832883388348835883688378838883988408841884288438844884588468847884888498850885188528853885488558856885788588859886088618862886388648865886688678868886988708871887288738874887588768877887888798880888188828883888488858886888788888889889088918892889388948895889688978898889989008901890289038904890589068907890889098910891189128913891489158916891789188919892089218922892389248925892689278928892989308931893289338934893589368937893889398940894189428943894489458946894789488949895089518952895389548955895689578958895989608961896289638964896589668967896889698970897189728973897489758976897789788979898089818982898389848985898689878988898989908991899289938994899589968997899889999000900190029003900490059006900790089009901090119012901390149015901690179018901990209021902290239024902590269027902890299030903190329033903490359036903790389039904090419042904390449045904690479048904990509051905290539054905590569057905890599060906190629063906490659066906790689069907090719072907390749075907690779078907990809081908290839084908590869087908890899090909190929093909490959096909790989099910091019102910391049105910691079108910991109111911291139114911591169117911891199120912191229123912491259126912791289129913091319132913391349135913691379138913991409141914291439144914591469147914891499150915191529153915491559156915791589159916091619162916391649165916691679168916991709171917291739174917591769177917891799180918191829183918491859186918791889189919091919192919391949195919691979198919992009201920292039204920592069207920892099210921192129213921492159216921792189219922092219222922392249225922692279228922992309231923292339234923592369237923892399240924192429243924492459246924792489249925092519252925392549255925692579258925992609261926292639264926592669267926892699270927192729273927492759276927
79278927992809281928292839284928592869287928892899290929192929293929492959296929792989299930093019302930393049305930693079308930993109311931293139314931593169317931893199320932193229323932493259326932793289329933093319332933393349335933693379338933993409341934293439344934593469347934893499350935193529353935493559356935793589359936093619362936393649365936693679368936993709371937293739374937593769377937893799380938193829383938493859386938793889389939093919392939393949395939693979398939994009401940294039404940594069407940894099410941194129413941494159416941794189419942094219422942394249425942694279428942994309431943294339434943594369437943894399440944194429443944494459446944794489449945094519452945394549455945694579458945994609461946294639464946594669467946894699470947194729473947494759476947794789479948094819482948394849485948694879488948994909491949294939494949594969497949894999500950195029503950495059506950795089509951095119512951395149515951695179518951995209521952295239524952595269527952895299530953195329533953495359536953795389539954095419542954395449545954695479548954995509551955295539554955595569557955895599560956195629563956495659566956795689569957095719572957395749575957695779578957995809581958295839584958595869587958895899590959195929593959495959596959795989599960096019602960396049605960696079608960996109611961296139614961596169617961896199620962196229623962496259626962796289629963096319632963396349635963696379638963996409641964296439644964596469647964896499650965196529653965496559656965796589659966096619662966396649665966696679668966996709671967296739674967596769677967896799680968196829683968496859686968796889689969096919692969396949695969696979698969997009701970297039704970597069707970897099710971197129713971497159716971797189719972097219722972397249725972697279728972997309731973297339734973597369737973897399740974197429743974497459746974797489749975097519752975397549755975697579758975997609761976297639764976597669767976897699770977197729773977497759776977
79778977997809781978297839784978597869787978897899790979197929793979497959796979797989799980098019802980398049805980698079808980998109811981298139814981598169817981898199820982198229823982498259826982798289829983098319832983398349835983698379838983998409841984298439844984598469847984898499850985198529853985498559856985798589859986098619862986398649865986698679868986998709871987298739874987598769877987898799880988198829883988498859886988798889889989098919892989398949895989698979898989999009901990299039904990599069907990899099910991199129913991499159916991799189919992099219922992399249925992699279928992999309931993299339934993599369937993899399940994199429943994499459946994799489949995099519952995399549955995699579958995999609961996299639964996599669967996899699970997199729973997499759976997799789979998099819982998399849985998699879988998999909991999299939994999599969997999899991000010001100021000310004100051000610007100081000910010100111001210013100141001510016100171001810019100201002110022100231002410025100261002710028100291003010031100321003310034100351003610037100381003910040100411004210043100441004510046100471004810049100501005110052100531005410055100561005710058100591006010061100621006310064100651006610067100681006910070100711007210073100741007510076100771007810079100801008110082100831008410085100861008710088100891009010091100921009310094100951009610097100981009910100101011010210103101041010510106101071010810109101101011110112101131011410115101161011710118101191012010121101221012310124101251012610127101281012910130101311013210133101341013510136101371013810139101401014110142101431014410145101461014710148101491015010151101521015310154101551015610157101581015910160101611016210163101641016510166101671016810169101701017110172101731017410175101761017710178101791018010181101821018310184101851018610187101881018910190101911019210193101941019510196101971019810199102001020110202102031020410205102061020710208102091021010211102121021310214102151021610217102181021910220102211
02221022310224102251022610227102281022910230102311023210233102341023510236102371023810239102401024110242102431024410245102461024710248102491025010251102521025310254102551025610257102581025910260102611026210263102641026510266102671026810269102701027110272102731027410275102761027710278102791028010281102821028310284102851028610287102881028910290102911029210293102941029510296102971029810299103001030110302103031030410305103061030710308103091031010311103121031310314103151031610317103181031910320103211032210323103241032510326103271032810329103301033110332103331033410335103361033710338103391034010341103421034310344103451034610347103481034910350103511035210353103541035510356103571035810359103601036110362103631036410365103661036710368103691037010371103721037310374103751037610377103781037910380103811038210383103841038510386103871038810389103901039110392103931039410395103961039710398103991040010401104021040310404104051040610407104081040910410104111041210413104141041510416104171041810419104201042110422104231042410425104261042710428104291043010431104321043310434104351043610437104381043910440104411044210443104441044510446104471044810449104501045110452104531045410455104561045710458104591046010461104621046310464104651046610467104681046910470104711047210473104741047510476104771047810479104801048110482104831048410485104861048710488104891049010491104921049310494104951049610497104981049910500105011050210503105041050510506105071050810509105101051110512105131051410515105161051710518105191052010521105221052310524105251052610527105281052910530105311053210533105341053510536105371053810539105401054110542105431054410545105461054710548105491055010551105521055310554105551055610557105581055910560105611056210563105641056510566105671056810569105701057110572105731057410575105761057710578105791058010581105821058310584105851058610587105881058910590105911059210593105941059510596105971059810599106001060110602106031060410605106061060710608106091061010611106121061310614106151061610617106181061910620106211
06221062310624106251062610627106281062910630106311063210633106341063510636106371063810639106401064110642106431064410645106461064710648106491065010651106521065310654106551065610657106581065910660106611066210663106641066510666106671066810669106701067110672106731067410675106761067710678106791068010681106821068310684106851068610687106881068910690106911069210693106941069510696106971069810699107001070110702107031070410705107061070710708107091071010711107121071310714107151071610717107181071910720107211072210723107241072510726107271072810729107301073110732107331073410735107361073710738107391074010741107421074310744107451074610747107481074910750107511075210753107541075510756107571075810759107601076110762107631076410765107661076710768107691077010771107721077310774107751077610777107781077910780107811078210783107841078510786107871078810789107901079110792107931079410795107961079710798107991080010801108021080310804108051080610807108081080910810108111081210813108141081510816108171081810819108201082110822108231082410825108261082710828108291083010831108321083310834108351083610837108381083910840108411084210843108441084510846108471084810849108501085110852108531085410855108561085710858108591086010861108621086310864108651086610867108681086910870108711087210873108741087510876108771087810879108801088110882108831088410885108861088710888108891089010891108921089310894108951089610897108981089910900109011090210903109041090510906109071090810909109101091110912109131091410915109161091710918109191092010921109221092310924109251092610927109281092910930109311093210933109341093510936109371093810939109401094110942109431094410945109461094710948109491095010951109521095310954109551095610957109581095910960109611096210963109641096510966109671096810969109701097110972109731097410975109761097710978109791098010981109821098310984109851098610987109881098910990109911099210993109941099510996109971099810999110001100111002110031100411005110061100711008110091101011011110121101311014110151101611017110181101911020110211
10221102311024110251102611027110281102911030110311103211033110341103511036110371103811039110401104111042110431104411045110461104711048110491105011051110521105311054110551105611057110581105911060110611106211063110641106511066110671106811069110701107111072110731107411075110761107711078110791108011081110821108311084110851108611087110881108911090110911109211093110941109511096110971109811099111001110111102111031110411105111061110711108111091111011111111121111311114111151111611117111181111911120111211112211123111241112511126111271112811129111301113111132111331113411135111361113711138111391114011141111421114311144111451114611147111481114911150111511115211153111541115511156111571115811159111601116111162111631116411165111661116711168111691117011171111721117311174111751117611177111781117911180111811118211183111841118511186111871118811189111901119111192111931119411195111961119711198111991120011201112021120311204112051120611207112081120911210112111121211213112141121511216112171121811219112201122111222112231122411225112261122711228112291123011231112321123311234112351123611237112381123911240112411124211243112441124511246112471124811249112501125111252112531125411255112561125711258112591126011261112621126311264112651126611267112681126911270112711127211273112741127511276112771127811279112801128111282112831128411285112861128711288112891129011291112921129311294112951129611297112981129911300113011130211303113041130511306113071130811309113101131111312113131131411315113161131711318113191132011321113221132311324113251132611327113281132911330113311133211333113341133511336113371133811339113401134111342113431134411345113461134711348113491135011351113521135311354113551135611357113581135911360113611136211363113641136511366113671136811369113701137111372113731137411375113761137711378113791138011381113821138311384113851138611387113881138911390113911139211393113941139511396113971139811399114001140111402114031140411405114061140711408114091141011411114121141311414114151141611417114181141911420114211
14221142311424114251142611427114281142911430114311143211433114341143511436114371143811439114401144111442114431144411445114461144711448114491145011451114521145311454114551145611457114581145911460114611146211463114641146511466114671146811469114701147111472114731147411475114761147711478114791148011481114821148311484114851148611487114881148911490114911149211493114941149511496114971149811499115001150111502115031150411505115061150711508115091151011511115121151311514115151151611517115181151911520115211152211523115241152511526115271152811529115301153111532115331153411535115361153711538115391154011541115421154311544115451154611547115481154911550115511155211553115541155511556115571155811559115601156111562115631156411565115661156711568115691157011571115721157311574115751157611577115781157911580115811158211583115841158511586115871158811589115901159111592115931159411595115961159711598115991160011601116021160311604116051160611607116081160911610116111161211613116141161511616116171161811619116201162111622116231162411625116261162711628116291163011631116321163311634116351163611637116381163911640116411164211643116441164511646116471164811649116501165111652116531165411655116561165711658116591166011661116621166311664116651166611667116681166911670116711167211673116741167511676116771167811679116801168111682116831168411685116861168711688116891169011691116921169311694116951169611697116981169911700117011170211703117041170511706117071170811709117101171111712117131171411715117161171711718117191172011721117221172311724117251172611727117281172911730117311173211733117341173511736117371173811739117401174111742117431174411745117461174711748117491175011751117521175311754117551175611757117581175911760117611176211763117641176511766117671176811769117701177111772117731177411775117761177711778117791178011781117821178311784117851178611787117881178911790117911179211793117941179511796117971179811799118001180111802118031180411805118061180711808118091181011811118121181311814118151181611817118181181911820118211
18221182311824118251182611827118281182911830118311183211833118341183511836118371183811839118401184111842118431184411845118461184711848118491185011851118521185311854118551185611857118581185911860118611186211863118641186511866118671186811869118701187111872118731187411875118761187711878118791188011881118821188311884118851188611887118881188911890118911189211893118941189511896118971189811899119001190111902119031190411905119061190711908119091191011911119121191311914119151191611917119181191911920119211192211923119241192511926119271192811929119301193111932119331193411935119361193711938119391194011941119421194311944119451194611947119481194911950119511195211953119541195511956119571195811959119601196111962119631196411965119661196711968119691197011971119721197311974119751197611977119781197911980119811198211983119841198511986119871198811989119901199111992119931199411995119961199711998119991200012001120021200312004120051200612007120081200912010120111201212013120141201512016120171201812019120201202112022120231202412025120261202712028120291203012031120321203312034120351203612037120381203912040120411204212043120441204512046120471204812049120501205112052120531205412055120561205712058120591206012061120621206312064120651206612067120681206912070120711207212073120741207512076120771207812079120801208112082120831208412085120861208712088120891209012091120921209312094120951209612097120981209912100121011210212103121041210512106121071210812109121101211112112121131211412115121161211712118121191212012121121221212312124121251212612127121281212912130121311213212133121341213512136121371213812139121401214112142121431214412145121461214712148121491215012151121521215312154121551215612157121581215912160121611216212163121641216512166121671216812169121701217112172121731217412175121761217712178121791218012181121821218312184121851218612187121881218912190121911219212193121941219512196121971219812199122001220112202122031220412205122061220712208122091221012211122121221312214122151221612217122181221912220122211
22221222312224122251222612227122281222912230122311223212233122341223512236122371223812239122401224112242122431224412245122461224712248122491225012251122521225312254122551225612257122581225912260122611226212263122641226512266122671226812269122701227112272122731227412275122761227712278122791228012281122821228312284122851228612287122881228912290122911229212293122941229512296122971229812299123001230112302123031230412305123061230712308123091231012311123121231312314123151231612317123181231912320123211232212323123241232512326123271232812329123301233112332123331233412335123361233712338123391234012341123421234312344123451234612347123481234912350123511235212353123541235512356123571235812359123601236112362123631236412365123661236712368123691237012371123721237312374123751237612377123781237912380123811238212383123841238512386123871238812389123901239112392123931239412395123961239712398123991240012401124021240312404124051240612407124081240912410124111241212413124141241512416124171241812419124201242112422124231242412425124261242712428124291243012431124321243312434124351243612437124381243912440124411244212443124441244512446124471244812449124501245112452124531245412455124561245712458124591246012461124621246312464124651246612467124681246912470124711247212473124741247512476124771247812479124801248112482124831248412485124861248712488124891249012491124921249312494124951249612497124981249912500125011250212503125041250512506125071250812509125101251112512125131251412515125161251712518125191252012521125221252312524125251252612527125281252912530125311253212533125341253512536125371253812539125401254112542125431254412545125461254712548125491255012551125521255312554125551255612557125581255912560125611256212563125641256512566125671256812569125701257112572125731257412575125761257712578125791258012581125821258312584125851258612587125881258912590125911259212593125941259512596125971259812599126001260112602126031260412605126061260712608126091261012611126121261312614126151261612617126181261912620126211
26221262312624126251262612627126281262912630126311263212633126341263512636126371263812639126401264112642126431264412645126461264712648126491265012651126521265312654126551265612657126581265912660126611266212663126641266512666126671266812669126701267112672126731267412675126761267712678126791268012681126821268312684126851268612687126881268912690126911269212693126941269512696126971269812699127001270112702127031270412705127061270712708127091271012711127121271312714127151271612717127181271912720127211272212723127241272512726127271272812729127301273112732127331273412735127361273712738127391274012741127421274312744127451274612747127481274912750127511275212753127541275512756127571275812759127601276112762127631276412765127661276712768127691277012771127721277312774127751277612777127781277912780127811278212783127841278512786127871278812789127901279112792127931279412795127961279712798127991280012801128021280312804128051280612807128081280912810128111281212813128141281512816128171281812819128201282112822128231282412825128261282712828128291283012831128321283312834128351283612837128381283912840128411284212843128441284512846128471284812849128501285112852128531285412855128561285712858128591286012861128621286312864128651286612867128681286912870128711287212873128741287512876128771287812879128801288112882128831288412885128861288712888128891289012891128921289312894128951289612897128981289912900129011290212903129041290512906129071290812909129101291112912129131291412915129161291712918129191292012921129221292312924129251292612927129281292912930129311293212933129341293512936129371293812939129401294112942129431294412945129461294712948129491295012951129521295312954129551295612957129581295912960129611296212963129641296512966129671296812969129701297112972129731297412975129761297712978129791298012981129821298312984129851298612987129881298912990129911299212993129941299512996129971299812999130001300113002130031300413005130061300713008130091301013011130121301313014130151301613017130181301913020130211
30221302313024130251302613027130281302913030130311303213033130341303513036130371303813039130401304113042130431304413045130461304713048130491305013051130521305313054130551305613057130581305913060130611306213063130641306513066130671306813069130701307113072130731307413075130761307713078130791308013081130821308313084130851308613087130881308913090130911309213093130941309513096130971309813099131001310113102131031310413105131061310713108131091311013111131121311313114131151311613117131181311913120131211312213123131241312513126131271312813129131301313113132131331313413135131361313713138131391314013141131421314313144131451314613147131481314913150131511315213153131541315513156131571315813159131601316113162131631316413165131661316713168131691317013171131721317313174131751317613177131781317913180131811318213183131841318513186131871318813189131901319113192131931319413195131961319713198131991320013201132021320313204132051320613207132081320913210132111321213213132141321513216132171321813219132201322113222132231322413225132261322713228132291323013231132321323313234132351323613237132381323913240132411324213243132441324513246132471324813249132501325113252132531325413255132561325713258132591326013261132621326313264132651326613267132681326913270132711327213273132741327513276132771327813279132801328113282132831328413285132861328713288132891329013291132921329313294132951329613297132981329913300133011330213303133041330513306133071330813309133101331113312133131331413315133161331713318133191332013321133221332313324133251332613327133281332913330133311333213333133341333513336133371333813339133401334113342133431334413345133461334713348133491335013351133521335313354133551335613357133581335913360133611336213363133641336513366133671336813369133701337113372133731337413375133761337713378133791338013381133821338313384133851338613387133881338913390133911339213393133941339513396133971339813399134001340113402134031340413405134061340713408134091341013411134121341313414134151341613417134181341913420134211
34221342313424134251342613427134281342913430134311343213433134341343513436134371343813439134401344113442134431344413445134461344713448134491345013451134521345313454134551345613457134581345913460134611346213463134641346513466134671346813469134701347113472134731347413475134761347713478134791348013481134821348313484134851348613487134881348913490134911349213493134941349513496134971349813499135001350113502135031350413505135061350713508135091351013511135121351313514135151351613517135181351913520135211352213523135241352513526135271352813529135301353113532135331353413535135361353713538135391354013541135421354313544135451354613547135481354913550135511355213553135541355513556135571355813559135601356113562135631356413565135661356713568135691357013571135721357313574135751357613577135781357913580135811358213583135841358513586135871358813589135901359113592135931359413595135961359713598135991360013601136021360313604136051360613607136081360913610136111361213613136141361513616136171361813619136201362113622136231362413625136261362713628136291363013631136321363313634136351363613637136381363913640136411364213643136441364513646136471364813649136501365113652136531365413655136561365713658136591366013661136621366313664136651366613667136681366913670136711367213673136741367513676136771367813679136801368113682136831368413685136861368713688136891369013691136921369313694136951369613697136981369913700137011370213703137041370513706137071370813709137101371113712137131371413715137161371713718137191372013721137221372313724137251372613727137281372913730137311373213733137341373513736137371373813739137401374113742137431374413745137461374713748137491375013751137521375313754137551375613757137581375913760137611376213763137641376513766137671376813769137701377113772137731377413775137761377713778137791378013781137821378313784137851378613787137881378913790137911379213793137941379513796137971379813799138001380113802138031380413805138061380713808138091381013811138121381313814138151381613817138181381913820138211
38221382313824138251382613827138281382913830138311383213833138341383513836138371383813839138401384113842138431384413845138461384713848138491385013851138521385313854138551385613857138581385913860138611386213863138641386513866138671386813869138701387113872138731387413875138761387713878138791388013881138821388313884138851388613887138881388913890138911389213893138941389513896138971389813899139001390113902139031390413905139061390713908139091391013911139121391313914139151391613917139181391913920139211392213923139241392513926139271392813929139301393113932139331393413935139361393713938139391394013941139421394313944139451394613947139481394913950139511395213953139541395513956139571395813959139601396113962139631396413965139661396713968139691397013971139721397313974139751397613977139781397913980139811398213983139841398513986139871398813989139901399113992139931399413995139961399713998139991400014001140021400314004140051400614007140081400914010140111401214013140141401514016140171401814019140201402114022140231402414025140261402714028140291403014031140321403314034140351403614037140381403914040140411404214043140441404514046140471404814049140501405114052140531405414055140561405714058140591406014061140621406314064140651406614067140681406914070140711407214073140741407514076140771407814079140801408114082140831408414085140861408714088140891409014091140921409314094140951409614097140981409914100141011410214103141041410514106141071410814109141101411114112141131411414115141161411714118141191412014121141221412314124141251412614127141281412914130141311413214133141341413514136141371413814139141401414114142141431414414145141461414714148141491415014151141521415314154141551415614157141581415914160141611416214163141641416514166141671416814169141701417114172141731417414175141761417714178141791418014181141821418314184141851418614187141881418914190141911419214193141941419514196141971419814199142001420114202142031420414205142061420714208142091421014211142121421314214142151421614217142181421914220142211
42221422314224142251422614227142281422914230142311423214233142341423514236142371423814239142401424114242142431424414245142461424714248142491425014251142521425314254142551425614257142581425914260142611426214263142641426514266142671426814269142701427114272142731427414275142761427714278142791428014281142821428314284142851428614287142881428914290142911429214293142941429514296142971429814299143001430114302143031430414305143061430714308143091431014311143121431314314143151431614317143181431914320143211432214323143241432514326143271432814329143301433114332143331433414335143361433714338143391434014341143421434314344143451434614347143481434914350143511435214353143541435514356143571435814359143601436114362143631436414365143661436714368143691437014371143721437314374143751437614377143781437914380143811438214383143841438514386143871438814389143901439114392143931439414395143961439714398143991440014401144021440314404144051440614407144081440914410144111441214413144141441514416144171441814419144201442114422144231442414425144261442714428144291443014431144321443314434144351443614437144381443914440144411444214443144441444514446144471444814449144501445114452144531445414455144561445714458144591446014461144621446314464144651446614467144681446914470144711447214473144741447514476144771447814479144801448114482144831448414485144861448714488144891449014491144921449314494144951449614497144981449914500145011450214503145041450514506145071450814509145101451114512145131451414515145161451714518145191452014521145221452314524145251452614527145281452914530145311453214533145341453514536145371453814539145401454114542145431454414545145461454714548145491455014551145521455314554145551455614557145581455914560145611456214563145641456514566145671456814569145701457114572145731457414575145761457714578145791458014581145821458314584145851458614587145881458914590145911459214593145941459514596145971459814599146001460114602146031460414605146061460714608146091461014611146121461314614146151461614617146181461914620146211
46221462314624146251462614627146281462914630146311463214633146341463514636146371463814639146401464114642146431464414645146461464714648146491465014651146521465314654146551465614657146581465914660146611466214663146641466514666146671466814669146701467114672146731467414675146761467714678146791468014681146821468314684146851468614687146881468914690146911469214693146941469514696146971469814699147001470114702147031470414705147061470714708147091471014711147121471314714147151471614717147181471914720147211472214723147241472514726147271472814729147301473114732147331473414735147361473714738147391474014741147421474314744147451474614747147481474914750147511475214753147541475514756147571475814759147601476114762147631476414765147661476714768147691477014771147721477314774147751477614777147781477914780147811478214783147841478514786147871478814789147901479114792147931479414795147961479714798147991480014801148021480314804148051480614807148081480914810148111481214813148141481514816148171481814819148201482114822148231482414825148261482714828148291483014831148321483314834148351483614837148381483914840148411484214843148441484514846148471484814849148501485114852148531485414855148561485714858148591486014861148621486314864148651486614867148681486914870148711487214873148741487514876148771487814879148801488114882148831488414885148861488714888148891489014891148921489314894148951489614897148981489914900149011490214903149041490514906149071490814909149101491114912149131491414915149161491714918149191492014921149221492314924149251492614927149281492914930149311493214933149341493514936149371493814939149401494114942149431494414945149461494714948149491495014951149521495314954149551495614957149581495914960149611496214963149641496514966149671496814969149701497114972149731497414975149761497714978149791498014981149821498314984149851498614987149881498914990149911499214993149941499514996149971499814999150001500115002150031500415005150061500715008150091501015011150121501315014150151501615017150181501915020150211
502215023150241502515026150271502815029150301503115032150331503415035150361503715038150391504015041150421504315044150451504615047150481504915050150511505215053150541505515056150571505815059150601506115062150631506415065150661506715068150691507015071150721507315074150751507615077150781507915080150811508215083150841508515086150871508815089150901509115092150931509415095150961509715098150991510015101151021510315104151051510615107151081510915110151111511215113151141511515116151171511815119151201512115122151231512415125151261512715128151291513015131151321513315134151351513615137151381513915140151411514215143
  1. #pragma clang diagnostic ignored "-Weverything"
  2. # 1 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/out/ubuntu-22.04/22.04/build/hip-on-rocclr/hipamd/src/hiprtc/hip_rtc_gen/hipRTC_header.h"
  3. # 1 "<built-in>" 1
  4. # 1 "<built-in>" 3
  5. # 845 "<built-in>" 3
  6. # 1 "<command line>" 1
  7. # 1 "<built-in>" 2
  8. # 1 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_runtime_wrapper.h" 1 3
  9. # 33 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_runtime_wrapper.h" 3
  10. extern "C" {
  11. __attribute__((__visibility__("default")))
  12. __attribute__((weak))
  13. __attribute__((noreturn))
  14. __attribute__((device)) void __cxa_pure_virtual(void) {
  15. __builtin_trap();
  16. }
  17. __attribute__((__visibility__("default")))
  18. __attribute__((weak))
  19. __attribute__((noreturn))
  20. __attribute__((device)) void __cxa_deleted_virtual(void) {
  21. __builtin_trap();
  22. }
  23. }
  24. # 57 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_runtime_wrapper.h" 3
  25. typedef long unsigned int size_t;
  26. # 74 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_runtime_wrapper.h" 3
  27. typedef long unsigned int __hip_size_t;
// Device-side heap support: device-code malloc/free are routed to the
// ROCm device-library (ockl) dynamic-memory allocator. Addresses cross
// the interface as 64-bit integers rather than pointers.
  28. extern "C" {
// Raw allocator entry points resolved from the ockl device library at link time.
  29. extern "C" __attribute__((device)) unsigned long long __ockl_dm_alloc(unsigned long long __size);
  30. extern "C" __attribute__((device)) void __ockl_dm_dealloc(unsigned long long __addr);
  31. # 95 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_runtime_wrapper.h" 3
// Weak definitions so user code may provide its own device allocator.
  32. __attribute__((weak)) inline __attribute__((device)) void *malloc(__hip_size_t __size) {
  33. return (void *) __ockl_dm_alloc(__size);
  34. }
  35. __attribute__((weak)) inline __attribute__((device)) void free(void *__ptr) {
// NOTE(review): the round-trip via unsigned long long presumes the pointer
// value fits and survives the integer cast — matches __ockl_dm_alloc's return.
  36. __ockl_dm_dealloc((unsigned long long)__ptr);
  37. }
  38. # 124 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_runtime_wrapper.h" 3
  39. }
  40. # 1 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_libdevice_declares.h" 1 3
  41. # 14 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_libdevice_declares.h" 3
  42. extern "C" {
  43. __attribute__((device)) __attribute__((const)) float __ocml_acos_f32(float);
  44. __attribute__((device)) __attribute__((pure)) float __ocml_acosh_f32(float);
  45. __attribute__((device)) __attribute__((const)) float __ocml_asin_f32(float);
  46. __attribute__((device)) __attribute__((pure)) float __ocml_asinh_f32(float);
  47. __attribute__((device)) __attribute__((const)) float __ocml_atan2_f32(float, float);
  48. __attribute__((device)) __attribute__((const)) float __ocml_atan_f32(float);
  49. __attribute__((device)) __attribute__((pure)) float __ocml_atanh_f32(float);
  50. __attribute__((device)) __attribute__((pure)) float __ocml_cbrt_f32(float);
  51. __attribute__((device)) __attribute__((const)) float __ocml_ceil_f32(float);
  52. __attribute__((device)) __attribute__((const)) __attribute__((device)) float __ocml_copysign_f32(float,
  53. float);
  54. __attribute__((device)) float __ocml_cos_f32(float);
  55. __attribute__((device)) float __ocml_native_cos_f32(float);
  56. __attribute__((device)) __attribute__((pure)) __attribute__((device)) float __ocml_cosh_f32(float);
  57. __attribute__((device)) float __ocml_cospi_f32(float);
  58. __attribute__((device)) float __ocml_i0_f32(float);
  59. __attribute__((device)) float __ocml_i1_f32(float);
  60. __attribute__((device)) __attribute__((pure)) float __ocml_erfc_f32(float);
  61. __attribute__((device)) __attribute__((pure)) float __ocml_erfcinv_f32(float);
  62. __attribute__((device)) __attribute__((pure)) float __ocml_erfcx_f32(float);
  63. __attribute__((device)) __attribute__((pure)) float __ocml_erf_f32(float);
  64. __attribute__((device)) __attribute__((pure)) float __ocml_erfinv_f32(float);
  65. __attribute__((device)) __attribute__((pure)) float __ocml_exp10_f32(float);
  66. __attribute__((device)) __attribute__((pure)) float __ocml_native_exp10_f32(float);
  67. __attribute__((device)) __attribute__((pure)) float __ocml_exp2_f32(float);
  68. __attribute__((device)) __attribute__((pure)) float __ocml_exp_f32(float);
  69. __attribute__((device)) __attribute__((pure)) float __ocml_native_exp_f32(float);
  70. __attribute__((device)) __attribute__((pure)) float __ocml_expm1_f32(float);
  71. __attribute__((device)) __attribute__((const)) float __ocml_fabs_f32(float);
  72. __attribute__((device)) __attribute__((const)) float __ocml_fdim_f32(float, float);
  73. __attribute__((device)) __attribute__((const)) float __ocml_floor_f32(float);
  74. __attribute__((device)) __attribute__((const)) float __ocml_fma_f32(float, float, float);
  75. __attribute__((device)) __attribute__((const)) float __ocml_fmax_f32(float, float);
  76. __attribute__((device)) __attribute__((const)) float __ocml_fmin_f32(float, float);
  77. __attribute__((device)) __attribute__((const)) __attribute__((device)) float __ocml_fmod_f32(float,
  78. float);
  79. __attribute__((device)) float __ocml_frexp_f32(float,
  80. __attribute__((address_space(5))) int *);
  81. __attribute__((device)) __attribute__((const)) float __ocml_hypot_f32(float, float);
  82. __attribute__((device)) __attribute__((const)) int __ocml_ilogb_f32(float);
  83. __attribute__((device)) __attribute__((const)) int __ocml_isfinite_f32(float);
  84. __attribute__((device)) __attribute__((const)) int __ocml_isinf_f32(float);
  85. __attribute__((device)) __attribute__((const)) int __ocml_isnan_f32(float);
  86. __attribute__((device)) float __ocml_j0_f32(float);
  87. __attribute__((device)) float __ocml_j1_f32(float);
  88. __attribute__((device)) __attribute__((const)) float __ocml_ldexp_f32(float, int);
  89. __attribute__((device)) float __ocml_lgamma_f32(float);
  90. __attribute__((device)) __attribute__((pure)) float __ocml_log10_f32(float);
  91. __attribute__((device)) __attribute__((pure)) float __ocml_native_log10_f32(float);
  92. __attribute__((device)) __attribute__((pure)) float __ocml_log1p_f32(float);
  93. __attribute__((device)) __attribute__((pure)) float __ocml_log2_f32(float);
  94. __attribute__((device)) __attribute__((pure)) float __ocml_native_log2_f32(float);
  95. __attribute__((device)) __attribute__((const)) float __ocml_logb_f32(float);
  96. __attribute__((device)) __attribute__((pure)) float __ocml_log_f32(float);
  97. __attribute__((device)) __attribute__((pure)) float __ocml_native_log_f32(float);
  98. __attribute__((device)) float __ocml_modf_f32(float,
  99. __attribute__((address_space(5))) float *);
  100. __attribute__((device)) __attribute__((const)) float __ocml_nearbyint_f32(float);
  101. __attribute__((device)) __attribute__((const)) float __ocml_nextafter_f32(float, float);
  102. __attribute__((device)) __attribute__((const)) float __ocml_len3_f32(float, float, float);
  103. __attribute__((device)) __attribute__((const)) float __ocml_len4_f32(float, float, float,
  104. float);
  105. __attribute__((device)) __attribute__((pure)) float __ocml_ncdf_f32(float);
  106. __attribute__((device)) __attribute__((pure)) float __ocml_ncdfinv_f32(float);
  107. __attribute__((device)) __attribute__((pure)) float __ocml_pow_f32(float, float);
  108. __attribute__((device)) __attribute__((pure)) float __ocml_pown_f32(float, int);
  109. __attribute__((device)) __attribute__((pure)) float __ocml_rcbrt_f32(float);
  110. __attribute__((device)) __attribute__((const)) float __ocml_remainder_f32(float, float);
  111. __attribute__((device)) float __ocml_remquo_f32(float, float,
  112. __attribute__((address_space(5))) int *);
  113. __attribute__((device)) __attribute__((const)) float __ocml_rhypot_f32(float, float);
  114. __attribute__((device)) __attribute__((const)) float __ocml_rint_f32(float);
  115. __attribute__((device)) __attribute__((const)) float __ocml_rlen3_f32(float, float, float);
  116. __attribute__((device)) __attribute__((const)) float __ocml_rlen4_f32(float, float, float,
  117. float);
  118. __attribute__((device)) __attribute__((const)) float __ocml_round_f32(float);
  119. __attribute__((device)) __attribute__((pure)) float __ocml_rsqrt_f32(float);
  120. __attribute__((device)) __attribute__((const)) float __ocml_scalb_f32(float, float);
  121. __attribute__((device)) __attribute__((const)) float __ocml_scalbn_f32(float, int);
  122. __attribute__((device)) __attribute__((const)) int __ocml_signbit_f32(float);
  123. __attribute__((device)) float __ocml_sincos_f32(float,
  124. __attribute__((address_space(5))) float *);
  125. __attribute__((device)) float __ocml_sincospi_f32(float,
  126. __attribute__((address_space(5))) float *);
  127. __attribute__((device)) float __ocml_sin_f32(float);
  128. __attribute__((device)) float __ocml_native_sin_f32(float);
  129. __attribute__((device)) __attribute__((pure)) float __ocml_sinh_f32(float);
  130. __attribute__((device)) float __ocml_sinpi_f32(float);
  131. __attribute__((device)) __attribute__((const)) float __ocml_sqrt_f32(float);
  132. __attribute__((device)) __attribute__((const)) float __ocml_native_sqrt_f32(float);
  133. __attribute__((device)) float __ocml_tan_f32(float);
  134. __attribute__((device)) __attribute__((pure)) float __ocml_tanh_f32(float);
  135. __attribute__((device)) float __ocml_tgamma_f32(float);
  136. __attribute__((device)) __attribute__((const)) float __ocml_trunc_f32(float);
  137. __attribute__((device)) float __ocml_y0_f32(float);
  138. __attribute__((device)) float __ocml_y1_f32(float);
  139. __attribute__((device)) __attribute__((const)) float __ocml_add_rte_f32(float, float);
  140. __attribute__((device)) __attribute__((const)) float __ocml_add_rtn_f32(float, float);
  141. __attribute__((device)) __attribute__((const)) float __ocml_add_rtp_f32(float, float);
  142. __attribute__((device)) __attribute__((const)) float __ocml_add_rtz_f32(float, float);
  143. __attribute__((device)) __attribute__((const)) float __ocml_sub_rte_f32(float, float);
  144. __attribute__((device)) __attribute__((const)) float __ocml_sub_rtn_f32(float, float);
  145. __attribute__((device)) __attribute__((const)) float __ocml_sub_rtp_f32(float, float);
  146. __attribute__((device)) __attribute__((const)) float __ocml_sub_rtz_f32(float, float);
  147. __attribute__((device)) __attribute__((const)) float __ocml_mul_rte_f32(float, float);
  148. __attribute__((device)) __attribute__((const)) float __ocml_mul_rtn_f32(float, float);
  149. __attribute__((device)) __attribute__((const)) float __ocml_mul_rtp_f32(float, float);
  150. __attribute__((device)) __attribute__((const)) float __ocml_mul_rtz_f32(float, float);
  151. __attribute__((device)) __attribute__((const)) float __ocml_div_rte_f32(float, float);
  152. __attribute__((device)) __attribute__((const)) float __ocml_div_rtn_f32(float, float);
  153. __attribute__((device)) __attribute__((const)) float __ocml_div_rtp_f32(float, float);
  154. __attribute__((device)) __attribute__((const)) float __ocml_div_rtz_f32(float, float);
  155. __attribute__((device)) __attribute__((const)) float __ocml_sqrt_rte_f32(float);
  156. __attribute__((device)) __attribute__((const)) float __ocml_sqrt_rtn_f32(float);
  157. __attribute__((device)) __attribute__((const)) float __ocml_sqrt_rtp_f32(float);
  158. __attribute__((device)) __attribute__((const)) float __ocml_sqrt_rtz_f32(float);
  159. __attribute__((device)) __attribute__((const)) float __ocml_fma_rte_f32(float, float, float);
  160. __attribute__((device)) __attribute__((const)) float __ocml_fma_rtn_f32(float, float, float);
  161. __attribute__((device)) __attribute__((const)) float __ocml_fma_rtp_f32(float, float, float);
  162. __attribute__((device)) __attribute__((const)) float __ocml_fma_rtz_f32(float, float, float);
// Legacy __llvm_amdgcn_* entry points kept for source compatibility:
// each simply forwards to the corresponding __builtin_amdgcn_* clang
// intrinsic. All are marked const (no side effects, result depends only
// on the argument).
  163. __attribute__((device)) inline __attribute__((const)) float
  164. __llvm_amdgcn_cos_f32(float __x) {
  165. return __builtin_amdgcn_cosf(__x);
  166. }
  167. __attribute__((device)) inline __attribute__((const)) float
  168. __llvm_amdgcn_rcp_f32(float __x) {
  169. return __builtin_amdgcn_rcpf(__x);
  170. }
  171. __attribute__((device)) inline __attribute__((const)) float
  172. __llvm_amdgcn_rsq_f32(float __x) {
  173. return __builtin_amdgcn_rsqf(__x);
  174. }
  175. __attribute__((device)) inline __attribute__((const)) float
  176. __llvm_amdgcn_sin_f32(float __x) {
  177. return __builtin_amdgcn_sinf(__x);
  178. }
  179. __attribute__((device)) __attribute__((const)) double __ocml_acos_f64(double);
  180. __attribute__((device)) __attribute__((pure)) double __ocml_acosh_f64(double);
  181. __attribute__((device)) __attribute__((const)) double __ocml_asin_f64(double);
  182. __attribute__((device)) __attribute__((pure)) double __ocml_asinh_f64(double);
  183. __attribute__((device)) __attribute__((const)) double __ocml_atan2_f64(double, double);
  184. __attribute__((device)) __attribute__((const)) double __ocml_atan_f64(double);
  185. __attribute__((device)) __attribute__((pure)) double __ocml_atanh_f64(double);
  186. __attribute__((device)) __attribute__((pure)) double __ocml_cbrt_f64(double);
  187. __attribute__((device)) __attribute__((const)) double __ocml_ceil_f64(double);
  188. __attribute__((device)) __attribute__((const)) double __ocml_copysign_f64(double, double);
  189. __attribute__((device)) double __ocml_cos_f64(double);
  190. __attribute__((device)) __attribute__((pure)) double __ocml_cosh_f64(double);
  191. __attribute__((device)) double __ocml_cospi_f64(double);
  192. __attribute__((device)) double __ocml_i0_f64(double);
  193. __attribute__((device)) double __ocml_i1_f64(double);
  194. __attribute__((device)) __attribute__((pure)) double __ocml_erfc_f64(double);
  195. __attribute__((device)) __attribute__((pure)) double __ocml_erfcinv_f64(double);
  196. __attribute__((device)) __attribute__((pure)) double __ocml_erfcx_f64(double);
  197. __attribute__((device)) __attribute__((pure)) double __ocml_erf_f64(double);
  198. __attribute__((device)) __attribute__((pure)) double __ocml_erfinv_f64(double);
  199. __attribute__((device)) __attribute__((pure)) double __ocml_exp10_f64(double);
  200. __attribute__((device)) __attribute__((pure)) double __ocml_exp2_f64(double);
  201. __attribute__((device)) __attribute__((pure)) double __ocml_exp_f64(double);
  202. __attribute__((device)) __attribute__((pure)) double __ocml_expm1_f64(double);
  203. __attribute__((device)) __attribute__((const)) double __ocml_fabs_f64(double);
  204. __attribute__((device)) __attribute__((const)) double __ocml_fdim_f64(double, double);
  205. __attribute__((device)) __attribute__((const)) double __ocml_floor_f64(double);
  206. __attribute__((device)) __attribute__((const)) double __ocml_fma_f64(double, double, double);
  207. __attribute__((device)) __attribute__((const)) double __ocml_fmax_f64(double, double);
  208. __attribute__((device)) __attribute__((const)) double __ocml_fmin_f64(double, double);
  209. __attribute__((device)) __attribute__((const)) double __ocml_fmod_f64(double, double);
  210. __attribute__((device)) double __ocml_frexp_f64(double,
  211. __attribute__((address_space(5))) int *);
  212. __attribute__((device)) __attribute__((const)) double __ocml_hypot_f64(double, double);
  213. __attribute__((device)) __attribute__((const)) int __ocml_ilogb_f64(double);
  214. __attribute__((device)) __attribute__((const)) int __ocml_isfinite_f64(double);
  215. __attribute__((device)) __attribute__((const)) int __ocml_isinf_f64(double);
  216. __attribute__((device)) __attribute__((const)) int __ocml_isnan_f64(double);
  217. __attribute__((device)) double __ocml_j0_f64(double);
  218. __attribute__((device)) double __ocml_j1_f64(double);
  219. __attribute__((device)) __attribute__((const)) double __ocml_ldexp_f64(double, int);
  220. __attribute__((device)) double __ocml_lgamma_f64(double);
  221. __attribute__((device)) __attribute__((pure)) double __ocml_log10_f64(double);
  222. __attribute__((device)) __attribute__((pure)) double __ocml_log1p_f64(double);
  223. __attribute__((device)) __attribute__((pure)) double __ocml_log2_f64(double);
  224. __attribute__((device)) __attribute__((const)) double __ocml_logb_f64(double);
  225. __attribute__((device)) __attribute__((pure)) double __ocml_log_f64(double);
  226. __attribute__((device)) double __ocml_modf_f64(double,
  227. __attribute__((address_space(5))) double *);
  228. __attribute__((device)) __attribute__((const)) double __ocml_nearbyint_f64(double);
  229. __attribute__((device)) __attribute__((const)) double __ocml_nextafter_f64(double, double);
  230. __attribute__((device)) __attribute__((const)) double __ocml_len3_f64(double, double,
  231. double);
  232. __attribute__((device)) __attribute__((const)) double __ocml_len4_f64(double, double, double,
  233. double);
  234. __attribute__((device)) __attribute__((pure)) double __ocml_ncdf_f64(double);
  235. __attribute__((device)) __attribute__((pure)) double __ocml_ncdfinv_f64(double);
  236. __attribute__((device)) __attribute__((pure)) double __ocml_pow_f64(double, double);
  237. __attribute__((device)) __attribute__((pure)) double __ocml_pown_f64(double, int);
  238. __attribute__((device)) __attribute__((pure)) double __ocml_rcbrt_f64(double);
  239. __attribute__((device)) __attribute__((const)) double __ocml_remainder_f64(double, double);
  240. __attribute__((device)) double __ocml_remquo_f64(double, double,
  241. __attribute__((address_space(5))) int *);
  242. __attribute__((device)) __attribute__((const)) double __ocml_rhypot_f64(double, double);
  243. __attribute__((device)) __attribute__((const)) double __ocml_rint_f64(double);
  244. __attribute__((device)) __attribute__((const)) double __ocml_rlen3_f64(double, double,
  245. double);
  246. __attribute__((device)) __attribute__((const)) double __ocml_rlen4_f64(double, double,
  247. double, double);
  248. __attribute__((device)) __attribute__((const)) double __ocml_round_f64(double);
  249. __attribute__((device)) __attribute__((pure)) double __ocml_rsqrt_f64(double);
  250. __attribute__((device)) __attribute__((const)) double __ocml_scalb_f64(double, double);
  251. __attribute__((device)) __attribute__((const)) double __ocml_scalbn_f64(double, int);
  252. __attribute__((device)) __attribute__((const)) int __ocml_signbit_f64(double);
  253. __attribute__((device)) double __ocml_sincos_f64(double,
  254. __attribute__((address_space(5))) double *);
  255. __attribute__((device)) double
  256. __ocml_sincospi_f64(double, __attribute__((address_space(5))) double *);
  257. __attribute__((device)) double __ocml_sin_f64(double);
  258. __attribute__((device)) __attribute__((pure)) double __ocml_sinh_f64(double);
  259. __attribute__((device)) double __ocml_sinpi_f64(double);
  260. __attribute__((device)) __attribute__((const)) double __ocml_sqrt_f64(double);
  261. __attribute__((device)) double __ocml_tan_f64(double);
  262. __attribute__((device)) __attribute__((pure)) double __ocml_tanh_f64(double);
  263. __attribute__((device)) double __ocml_tgamma_f64(double);
  264. __attribute__((device)) __attribute__((const)) double __ocml_trunc_f64(double);
  265. __attribute__((device)) double __ocml_y0_f64(double);
  266. __attribute__((device)) double __ocml_y1_f64(double);
  267. __attribute__((device)) __attribute__((const)) double __ocml_add_rte_f64(double, double);
  268. __attribute__((device)) __attribute__((const)) double __ocml_add_rtn_f64(double, double);
  269. __attribute__((device)) __attribute__((const)) double __ocml_add_rtp_f64(double, double);
  270. __attribute__((device)) __attribute__((const)) double __ocml_add_rtz_f64(double, double);
  271. __attribute__((device)) __attribute__((const)) double __ocml_sub_rte_f64(double, double);
  272. __attribute__((device)) __attribute__((const)) double __ocml_sub_rtn_f64(double, double);
  273. __attribute__((device)) __attribute__((const)) double __ocml_sub_rtp_f64(double, double);
  274. __attribute__((device)) __attribute__((const)) double __ocml_sub_rtz_f64(double, double);
  275. __attribute__((device)) __attribute__((const)) double __ocml_mul_rte_f64(double, double);
  276. __attribute__((device)) __attribute__((const)) double __ocml_mul_rtn_f64(double, double);
  277. __attribute__((device)) __attribute__((const)) double __ocml_mul_rtp_f64(double, double);
  278. __attribute__((device)) __attribute__((const)) double __ocml_mul_rtz_f64(double, double);
  279. __attribute__((device)) __attribute__((const)) double __ocml_div_rte_f64(double, double);
  280. __attribute__((device)) __attribute__((const)) double __ocml_div_rtn_f64(double, double);
  281. __attribute__((device)) __attribute__((const)) double __ocml_div_rtp_f64(double, double);
  282. __attribute__((device)) __attribute__((const)) double __ocml_div_rtz_f64(double, double);
  283. __attribute__((device)) __attribute__((const)) double __ocml_sqrt_rte_f64(double);
  284. __attribute__((device)) __attribute__((const)) double __ocml_sqrt_rtn_f64(double);
  285. __attribute__((device)) __attribute__((const)) double __ocml_sqrt_rtp_f64(double);
  286. __attribute__((device)) __attribute__((const)) double __ocml_sqrt_rtz_f64(double);
  287. __attribute__((device)) __attribute__((const)) double __ocml_fma_rte_f64(double, double,
  288. double);
  289. __attribute__((device)) __attribute__((const)) double __ocml_fma_rtn_f64(double, double,
  290. double);
  291. __attribute__((device)) __attribute__((const)) double __ocml_fma_rtp_f64(double, double,
  292. double);
  293. __attribute__((device)) __attribute__((const)) double __ocml_fma_rtz_f64(double, double,
  294. double);
// Double-precision counterparts of the __llvm_amdgcn_* wrappers above,
// forwarding to the clang reciprocal / reciprocal-square-root builtins.
  295. __attribute__((device)) inline __attribute__((const)) double
  296. __llvm_amdgcn_rcp_f64(double __x) {
  297. return __builtin_amdgcn_rcp(__x);
  298. }
  299. __attribute__((device)) inline __attribute__((const)) double
  300. __llvm_amdgcn_rsq_f64(double __x) {
  301. return __builtin_amdgcn_rsq(__x);
  302. }
  303. __attribute__((device)) __attribute__((const)) _Float16 __ocml_ceil_f16(_Float16);
  304. __attribute__((device)) _Float16 __ocml_cos_f16(_Float16);
  305. __attribute__((device)) __attribute__((const)) _Float16 __ocml_cvtrtn_f16_f32(float);
  306. __attribute__((device)) __attribute__((const)) _Float16 __ocml_cvtrtp_f16_f32(float);
  307. __attribute__((device)) __attribute__((const)) _Float16 __ocml_cvtrtz_f16_f32(float);
  308. __attribute__((device)) __attribute__((pure)) _Float16 __ocml_exp_f16(_Float16);
  309. __attribute__((device)) __attribute__((pure)) _Float16 __ocml_exp10_f16(_Float16);
  310. __attribute__((device)) __attribute__((pure)) _Float16 __ocml_exp2_f16(_Float16);
  311. __attribute__((device)) __attribute__((const)) _Float16 __ocml_floor_f16(_Float16);
  312. __attribute__((device)) __attribute__((const)) _Float16 __ocml_fma_f16(_Float16, _Float16,
  313. _Float16);
  314. __attribute__((device)) __attribute__((const)) _Float16 __ocml_fmax_f16(_Float16, _Float16);
  315. __attribute__((device)) __attribute__((const)) _Float16 __ocml_fmin_f16(_Float16, _Float16);
  316. __attribute__((device)) __attribute__((const)) _Float16 __ocml_fabs_f16(_Float16);
  317. __attribute__((device)) __attribute__((const)) int __ocml_isinf_f16(_Float16);
  318. __attribute__((device)) __attribute__((const)) int __ocml_isnan_f16(_Float16);
  319. __attribute__((device)) __attribute__((pure)) _Float16 __ocml_log_f16(_Float16);
  320. __attribute__((device)) __attribute__((pure)) _Float16 __ocml_log10_f16(_Float16);
  321. __attribute__((device)) __attribute__((pure)) _Float16 __ocml_log2_f16(_Float16);
  322. __attribute__((device)) __attribute__((const)) _Float16 __llvm_amdgcn_rcp_f16(_Float16);
  323. __attribute__((device)) __attribute__((const)) _Float16 __ocml_rint_f16(_Float16);
  324. __attribute__((device)) __attribute__((const)) _Float16 __ocml_rsqrt_f16(_Float16);
  325. __attribute__((device)) _Float16 __ocml_sin_f16(_Float16);
  326. __attribute__((device)) __attribute__((const)) _Float16 __ocml_sqrt_f16(_Float16);
  327. __attribute__((device)) __attribute__((const)) _Float16 __ocml_trunc_f16(_Float16);
  328. __attribute__((device)) __attribute__((pure)) _Float16 __ocml_pown_f16(_Float16, int);
  329. typedef _Float16 __2f16 __attribute__((ext_vector_type(2)));
  330. typedef short __2i16 __attribute__((ext_vector_type(2)));
  331. __attribute__((device)) __attribute__((const)) float __ockl_fdot2(__2f16 a, __2f16 b,
  332. float c, bool s);
  333. __attribute__((device)) __attribute__((const)) __2f16 __ocml_ceil_2f16(__2f16);
  334. __attribute__((device)) __attribute__((const)) __2f16 __ocml_fabs_2f16(__2f16);
  335. __attribute__((device)) __2f16 __ocml_cos_2f16(__2f16);
  336. __attribute__((device)) __attribute__((pure)) __2f16 __ocml_exp_2f16(__2f16);
  337. __attribute__((device)) __attribute__((pure)) __2f16 __ocml_exp10_2f16(__2f16);
  338. __attribute__((device)) __attribute__((pure)) __2f16 __ocml_exp2_2f16(__2f16);
  339. __attribute__((device)) __attribute__((const)) __2f16 __ocml_floor_2f16(__2f16);
  340. __attribute__((device)) __attribute__((const))
  341. __2f16 __ocml_fma_2f16(__2f16, __2f16, __2f16);
  342. __attribute__((device)) __attribute__((const)) __2i16 __ocml_isinf_2f16(__2f16);
  343. __attribute__((device)) __attribute__((const)) __2i16 __ocml_isnan_2f16(__2f16);
  344. __attribute__((device)) __attribute__((pure)) __2f16 __ocml_log_2f16(__2f16);
  345. __attribute__((device)) __attribute__((pure)) __2f16 __ocml_log10_2f16(__2f16);
  346. __attribute__((device)) __attribute__((pure)) __2f16 __ocml_log2_2f16(__2f16);
// Element-wise reciprocal of a 2 x _Float16 vector: applies the scalar
// __llvm_amdgcn_rcp_f16 (declared above) to each lane and rebuilds the
// vector with an OpenCL-style vector literal.
  347. __attribute__((device)) inline __2f16
  348. __llvm_amdgcn_rcp_2f16(__2f16 __x)
  349. {
  350. return (__2f16)(__llvm_amdgcn_rcp_f16(__x.x), __llvm_amdgcn_rcp_f16(__x.y));
  351. }
  352. __attribute__((device)) __attribute__((const)) __2f16 __ocml_rint_2f16(__2f16);
  353. __attribute__((device)) __attribute__((const)) __2f16 __ocml_rsqrt_2f16(__2f16);
  354. __attribute__((device)) __2f16 __ocml_sin_2f16(__2f16);
  355. __attribute__((device)) __attribute__((const)) __2f16 __ocml_sqrt_2f16(__2f16);
  356. __attribute__((device)) __attribute__((const)) __2f16 __ocml_trunc_2f16(__2f16);
  357. __attribute__((device)) __attribute__((const)) __2f16 __ocml_pown_2f16(__2f16, __2i16);
  358. }
  359. # 128 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_runtime_wrapper.h" 2 3
  360. # 1 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 1 3
  361. # 94 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3
  362. static __attribute__((device)) inline __attribute__((always_inline))
  363. long unsigned int __make_mantissa_base8(const char *__tagp __attribute__((nonnull))) {
  364. long unsigned int __r = 0;
  365. while (*__tagp != '\0') {
  366. char __tmp = *__tagp;
  367. if (__tmp >= '0' && __tmp <= '7')
  368. __r = (__r * 8u) + __tmp - '0';
  369. else
  370. return 0;
  371. ++__tagp;
  372. }
  373. return __r;
  374. }
  375. static __attribute__((device)) inline __attribute__((always_inline))
  376. long unsigned int __make_mantissa_base10(const char *__tagp __attribute__((nonnull))) {
  377. long unsigned int __r = 0;
  378. while (*__tagp != '\0') {
  379. char __tmp = *__tagp;
  380. if (__tmp >= '0' && __tmp <= '9')
  381. __r = (__r * 10u) + __tmp - '0';
  382. else
  383. return 0;
  384. ++__tagp;
  385. }
  386. return __r;
  387. }
  388. static __attribute__((device)) inline __attribute__((always_inline))
  389. long unsigned int __make_mantissa_base16(const char *__tagp __attribute__((nonnull))) {
  390. long unsigned int __r = 0;
  391. while (*__tagp != '\0') {
  392. char __tmp = *__tagp;
  393. if (__tmp >= '0' && __tmp <= '9')
  394. __r = (__r * 16u) + __tmp - '0';
  395. else if (__tmp >= 'a' && __tmp <= 'f')
  396. __r = (__r * 16u) + __tmp - 'a' + 10;
  397. else if (__tmp >= 'A' && __tmp <= 'F')
  398. __r = (__r * 16u) + __tmp - 'A' + 10;
  399. else
  400. return 0;
  401. ++__tagp;
  402. }
  403. return __r;
  404. }
  405. static __attribute__((device)) inline __attribute__((always_inline))
  406. long unsigned int __make_mantissa(const char *__tagp __attribute__((nonnull))) {
  407. if (*__tagp == '0') {
  408. ++__tagp;
  409. if (*__tagp == 'x' || *__tagp == 'X')
  410. return __make_mantissa_base16(__tagp);
  411. else
  412. return __make_mantissa_base8(__tagp);
  413. }
  414. return __make_mantissa_base10(__tagp);
  415. }
  416. static __attribute__((device)) inline __attribute__((always_inline))
  417. int abs(int __x) {
  418. int __sgn = __x >> (sizeof(int) * 8 - 1);
  419. return (__x ^ __sgn) - __sgn;
  420. }
  421. static __attribute__((device)) inline __attribute__((always_inline))
  422. long labs(long __x) {
  423. long __sgn = __x >> (sizeof(long) * 8 - 1);
  424. return (__x ^ __sgn) - __sgn;
  425. }
  426. static __attribute__((device)) inline __attribute__((always_inline))
  427. long long llabs(long long __x) {
  428. long long __sgn = __x >> (sizeof(long long) * 8 - 1);
  429. return (__x ^ __sgn) - __sgn;
  430. }
// Single-precision libm entry points for device code. Each wrapper
// delegates to the matching __ocml_* routine from the ROCm device library
// (declared earlier in this header). Exceptions: fabsf lowers to the
// clang builtin, and fdividef is plain IEEE division.
  431. static __attribute__((device)) inline __attribute__((always_inline))
  432. float acosf(float __x) { return __ocml_acos_f32(__x); }
  433. static __attribute__((device)) inline __attribute__((always_inline))
  434. float acoshf(float __x) { return __ocml_acosh_f32(__x); }
  435. static __attribute__((device)) inline __attribute__((always_inline))
  436. float asinf(float __x) { return __ocml_asin_f32(__x); }
  437. static __attribute__((device)) inline __attribute__((always_inline))
  438. float asinhf(float __x) { return __ocml_asinh_f32(__x); }
  439. static __attribute__((device)) inline __attribute__((always_inline))
  440. float atan2f(float __x, float __y) { return __ocml_atan2_f32(__x, __y); }
  441. static __attribute__((device)) inline __attribute__((always_inline))
  442. float atanf(float __x) { return __ocml_atan_f32(__x); }
  443. static __attribute__((device)) inline __attribute__((always_inline))
  444. float atanhf(float __x) { return __ocml_atanh_f32(__x); }
  445. static __attribute__((device)) inline __attribute__((always_inline))
  446. float cbrtf(float __x) { return __ocml_cbrt_f32(__x); }
  447. static __attribute__((device)) inline __attribute__((always_inline))
  448. float ceilf(float __x) { return __ocml_ceil_f32(__x); }
  449. static __attribute__((device)) inline __attribute__((always_inline))
  450. float copysignf(float __x, float __y) { return __ocml_copysign_f32(__x, __y); }
  451. static __attribute__((device)) inline __attribute__((always_inline))
  452. float cosf(float __x) { return __ocml_cos_f32(__x); }
  453. static __attribute__((device)) inline __attribute__((always_inline))
  454. float coshf(float __x) { return __ocml_cosh_f32(__x); }
  455. static __attribute__((device)) inline __attribute__((always_inline))
  456. float cospif(float __x) { return __ocml_cospi_f32(__x); }
// CUDA-compatible modified Bessel functions I0/I1 map to ocml i0/i1.
  457. static __attribute__((device)) inline __attribute__((always_inline))
  458. float cyl_bessel_i0f(float __x) { return __ocml_i0_f32(__x); }
  459. static __attribute__((device)) inline __attribute__((always_inline))
  460. float cyl_bessel_i1f(float __x) { return __ocml_i1_f32(__x); }
  461. static __attribute__((device)) inline __attribute__((always_inline))
  462. float erfcf(float __x) { return __ocml_erfc_f32(__x); }
  463. static __attribute__((device)) inline __attribute__((always_inline))
  464. float erfcinvf(float __x) { return __ocml_erfcinv_f32(__x); }
  465. static __attribute__((device)) inline __attribute__((always_inline))
  466. float erfcxf(float __x) { return __ocml_erfcx_f32(__x); }
  467. static __attribute__((device)) inline __attribute__((always_inline))
  468. float erff(float __x) { return __ocml_erf_f32(__x); }
  469. static __attribute__((device)) inline __attribute__((always_inline))
  470. float erfinvf(float __x) { return __ocml_erfinv_f32(__x); }
  471. static __attribute__((device)) inline __attribute__((always_inline))
  472. float exp10f(float __x) { return __ocml_exp10_f32(__x); }
  473. static __attribute__((device)) inline __attribute__((always_inline))
  474. float exp2f(float __x) { return __ocml_exp2_f32(__x); }
  475. static __attribute__((device)) inline __attribute__((always_inline))
  476. float expf(float __x) { return __ocml_exp_f32(__x); }
  477. static __attribute__((device)) inline __attribute__((always_inline))
  478. float expm1f(float __x) { return __ocml_expm1_f32(__x); }
// fabsf uses the compiler builtin directly rather than an ocml call.
  479. static __attribute__((device)) inline __attribute__((always_inline))
  480. float fabsf(float __x) { return __builtin_fabsf(__x); }
  481. static __attribute__((device)) inline __attribute__((always_inline))
  482. float fdimf(float __x, float __y) { return __ocml_fdim_f32(__x, __y); }
// fdividef is defined as ordinary division here (no fast-math variant).
  483. static __attribute__((device)) inline __attribute__((always_inline))
  484. float fdividef(float __x, float __y) { return __x / __y; }
  485. static __attribute__((device)) inline __attribute__((always_inline))
  486. float floorf(float __x) { return __ocml_floor_f32(__x); }
  487. static __attribute__((device)) inline __attribute__((always_inline))
  488. float fmaf(float __x, float __y, float __z) {
  489. return __ocml_fma_f32(__x, __y, __z);
  490. }
  491. static __attribute__((device)) inline __attribute__((always_inline))
  492. float fmaxf(float __x, float __y) { return __ocml_fmax_f32(__x, __y); }
  493. static __attribute__((device)) inline __attribute__((always_inline))
  494. float fminf(float __x, float __y) { return __ocml_fmin_f32(__x, __y); }
  495. static __attribute__((device)) inline __attribute__((always_inline))
  496. float fmodf(float __x, float __y) { return __ocml_fmod_f32(__x, __y); }
  497. static __attribute__((device)) inline __attribute__((always_inline))
  498. float frexpf(float __x, int *__nptr) {
  499. int __tmp;
  500. float __r =
  501. __ocml_frexp_f32(__x, (__attribute__((address_space(5))) int *)&__tmp);
  502. *__nptr = __tmp;
  503. return __r;
  504. }
  505. static __attribute__((device)) inline __attribute__((always_inline))
  506. float hypotf(float __x, float __y) { return __ocml_hypot_f32(__x, __y); }
  507. static __attribute__((device)) inline __attribute__((always_inline))
  508. int ilogbf(float __x) { return __ocml_ilogb_f32(__x); }
  509. static __attribute__((device)) inline __attribute__((always_inline))
  510. bool __finitef(float __x) { return __ocml_isfinite_f32(__x); }
  511. static __attribute__((device)) inline __attribute__((always_inline))
  512. bool __isinff(float __x) { return __ocml_isinf_f32(__x); }
  513. static __attribute__((device)) inline __attribute__((always_inline))
  514. bool __isnanf(float __x) { return __ocml_isnan_f32(__x); }
  515. static __attribute__((device)) inline __attribute__((always_inline))
  516. float j0f(float __x) { return __ocml_j0_f32(__x); }
  517. static __attribute__((device)) inline __attribute__((always_inline))
  518. float j1f(float __x) { return __ocml_j1_f32(__x); }
  519. static __attribute__((device)) inline __attribute__((always_inline))
  520. float jnf(int __n, float __x) {
  521. if (__n == 0)
  522. return j0f(__x);
  523. if (__n == 1)
  524. return j1f(__x);
  525. float __x0 = j0f(__x);
  526. float __x1 = j1f(__x);
  527. for (int __i = 1; __i < __n; ++__i) {
  528. float __x2 = (2 * __i) / __x * __x1 - __x0;
  529. __x0 = __x1;
  530. __x1 = __x2;
  531. }
  532. return __x1;
  533. }
static __attribute__((device)) inline __attribute__((always_inline))
float ldexpf(float __x, int __e) { return __ocml_ldexp_f32(__x, __e); }
static __attribute__((device)) inline __attribute__((always_inline))
float lgammaf(float __x) { return __ocml_lgamma_f32(__x); }
// llrintf/llroundf: implicit float -> long long conversion of the
// rounded value (UB if out of range, as with the C library functions).
static __attribute__((device)) inline __attribute__((always_inline))
long long int llrintf(float __x) { return __ocml_rint_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
long long int llroundf(float __x) { return __ocml_round_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float log10f(float __x) { return __ocml_log10_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float log1pf(float __x) { return __ocml_log1p_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float log2f(float __x) { return __ocml_log2_f32(__x); }
// log2fi: convenience variant taking an int argument (converted to float).
static __attribute__((device)) inline __attribute__((always_inline))
float log2fi(int __x) { return __ocml_log2_f32((float) __x); }
static __attribute__((device)) inline __attribute__((always_inline))
float logbf(float __x) { return __ocml_logb_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float logf(float __x) { return __ocml_log_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
long int lrintf(float __x) { return __ocml_rint_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
long int lroundf(float __x) { return __ocml_round_f32(__x); }
// modff: integral part is produced through an address-space-5 temporary
// (OCML out-pointer ABI), then copied to the caller's generic pointer.
static __attribute__((device)) inline __attribute__((always_inline))
float modff(float __x, float *__iptr) {
float __tmp;
float __r =
__ocml_modf_f32(__x, (__attribute__((address_space(5))) float *)&__tmp);
*__iptr = __tmp;
return __r;
}
// nanf: build a quiet NaN whose payload is parsed from __tagp by
// __make_mantissa. IEEE-754 binary32 layout: 22 payload bits + 1 quiet
// bit (= 23-bit mantissa), 8-bit exponent set to all ones, sign clear.
static __attribute__((device)) inline __attribute__((always_inline))
float nanf(const char *__tagp __attribute__((nonnull))) {
union {
float val;
struct ieee_float {
unsigned int mantissa : 22;
unsigned int quiet : 1;
unsigned int exponent : 8;
unsigned int sign : 1;
} bits;
} __tmp;
static_assert((sizeof(__tmp.val)) == (sizeof(__tmp.bits)), "");
__tmp.bits.sign = 0u;
__tmp.bits.exponent = ~0u;
__tmp.bits.quiet = 1u;
__tmp.bits.mantissa = __make_mantissa(__tagp);
return __tmp.val;
}
static __attribute__((device)) inline __attribute__((always_inline))
float nearbyintf(float __x) { return __ocml_nearbyint_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float nextafterf(float __x, float __y) {
return __ocml_nextafter_f32(__x, __y);
}
// norm3df/norm4df: 3D/4D Euclidean length via OCML len3/len4.
static __attribute__((device)) inline __attribute__((always_inline))
float norm3df(float __x, float __y, float __z) {
return __ocml_len3_f32(__x, __y, __z);
}
static __attribute__((device)) inline __attribute__((always_inline))
float norm4df(float __x, float __y, float __z, float __w) {
return __ocml_len4_f32(__x, __y, __z, __w);
}
static __attribute__((device)) inline __attribute__((always_inline))
float normcdff(float __x) { return __ocml_ncdf_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float normcdfinvf(float __x) { return __ocml_ncdfinv_f32(__x); }
// normf: Euclidean norm of __a[0..__dim-1]. Naive sum of squares —
// can overflow/underflow for extreme magnitudes, unlike hypotf.
static __attribute__((device)) inline __attribute__((always_inline))
float normf(int __dim,
const float *__a) {
float __r = 0;
while (__dim--) {
__r += __a[0] * __a[0];
++__a;
}
return __ocml_sqrt_f32(__r);
}
static __attribute__((device)) inline __attribute__((always_inline))
float powf(float __x, float __y) { return __ocml_pow_f32(__x, __y); }
// powif: float base raised to an integer power (OCML pown).
static __attribute__((device)) inline __attribute__((always_inline))
float powif(float __x, int __y) { return __ocml_pown_f32(__x, __y); }
  616. static __attribute__((device)) inline __attribute__((always_inline))
  617. int powii(int __base, int __exp) {
  618. if (__exp < 0 )
  619. return -1;
  620. int __result = 1;
  621. for (;;) {
  622. if (__exp & 1)
  623. __result *= __base;
  624. __exp >>= 1;
  625. if (!__exp)
  626. break;
  627. __base *= __base;
  628. }
  629. return __result;
  630. }
// Reciprocal/rounding helpers, still forwarding to OCML float intrinsics.
static __attribute__((device)) inline __attribute__((always_inline))
float rcbrtf(float __x) { return __ocml_rcbrt_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float remainderf(float __x, float __y) {
return __ocml_remainder_f32(__x, __y);
}
// remquof: quotient bits come back through an address-space-5 temporary
// (OCML out-pointer ABI), then are copied to the generic pointer.
static __attribute__((device)) inline __attribute__((always_inline))
float remquof(float __x, float __y, int *__quo) {
int __tmp;
float __r = __ocml_remquo_f32(
__x, __y, (__attribute__((address_space(5))) int *)&__tmp);
*__quo = __tmp;
return __r;
}
static __attribute__((device)) inline __attribute__((always_inline))
float rhypotf(float __x, float __y) { return __ocml_rhypot_f32(__x, __y); }
static __attribute__((device)) inline __attribute__((always_inline))
float rintf(float __x) { return __ocml_rint_f32(__x); }
// rnorm3df/rnorm4df: reciprocal 3D/4D Euclidean length.
static __attribute__((device)) inline __attribute__((always_inline))
float rnorm3df(float __x, float __y, float __z) {
return __ocml_rlen3_f32(__x, __y, __z);
}
static __attribute__((device)) inline __attribute__((always_inline))
float rnorm4df(float __x, float __y, float __z, float __w) {
return __ocml_rlen4_f32(__x, __y, __z, __w);
}
// rnormf: reciprocal Euclidean norm of __a[0..__dim-1] (naive sum of
// squares, same caveats as normf).
static __attribute__((device)) inline __attribute__((always_inline))
float rnormf(int __dim,
const float *__a) {
float __r = 0;
while (__dim--) {
__r += __a[0] * __a[0];
++__a;
}
return __ocml_rsqrt_f32(__r);
}
static __attribute__((device)) inline __attribute__((always_inline))
float roundf(float __x) { return __ocml_round_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float rsqrtf(float __x) { return __ocml_rsqrt_f32(__x); }
  671. static __attribute__((device)) inline __attribute__((always_inline))
  672. float scalblnf(float __x, long int __n) {
  673. return (__n < 9223372036854775807L) ? __ocml_scalbn_f32(__x, __n)
  674. : __ocml_scalb_f32(__x, __n);
  675. }
static __attribute__((device)) inline __attribute__((always_inline))
float scalbnf(float __x, int __n) { return __ocml_scalbn_f32(__x, __n); }
static __attribute__((device)) inline __attribute__((always_inline))
bool __signbitf(float __x) { return __ocml_signbit_f32(__x); }
// sincosf: the cosine result is written through an address-space-5
// temporary (OCML out-pointer ABI); sine is the return value.
static __attribute__((device)) inline __attribute__((always_inline))
void sincosf(float __x, float *__sinptr, float *__cosptr) {
float __tmp;
*__sinptr =
__ocml_sincos_f32(__x, (__attribute__((address_space(5))) float *)&__tmp);
*__cosptr = __tmp;
}
// sincospif: same contract as sincosf, for sin(pi*x)/cos(pi*x).
static __attribute__((device)) inline __attribute__((always_inline))
void sincospif(float __x, float *__sinptr, float *__cosptr) {
float __tmp;
*__sinptr = __ocml_sincospi_f32(
__x, (__attribute__((address_space(5))) float *)&__tmp);
*__cosptr = __tmp;
}
static __attribute__((device)) inline __attribute__((always_inline))
float sinf(float __x) { return __ocml_sin_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float sinhf(float __x) { return __ocml_sinh_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float sinpif(float __x) { return __ocml_sinpi_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float sqrtf(float __x) { return __ocml_sqrt_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float tanf(float __x) { return __ocml_tan_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float tanhf(float __x) { return __ocml_tanh_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float tgammaf(float __x) { return __ocml_tgamma_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float truncf(float __x) { return __ocml_trunc_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float y0f(float __x) { return __ocml_y0_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float y1f(float __x) { return __ocml_y1_f32(__x); }
// ynf: Bessel Y_n via the same forward recurrence as jnf (stable for Y,
// but still divides by __x).
static __attribute__((device)) inline __attribute__((always_inline))
float ynf(int __n, float __x) {
if (__n == 0)
return y0f(__x);
if (__n == 1)
return y1f(__x);
float __x0 = y0f(__x);
float __x1 = y1f(__x);
for (int __i = 1; __i < __n; ++__i) {
float __x2 = (2 * __i) / __x * __x1 - __x0;
__x0 = __x1;
__x1 = __x2;
}
return __x1;
}
// Fast/approximate single-precision variants (double-underscore names).
// "native" OCML entry points trade accuracy for speed; the *_rn names
// are the CUDA-style round-to-nearest-even intrinsics, implemented here
// as plain IEEE arithmetic. Preprocessor linemarkers below are kept from
// the original preprocessed header.
static __attribute__((device)) inline __attribute__((always_inline))
float __cosf(float __x) { return __ocml_native_cos_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float __exp10f(float __x) { return __ocml_native_exp10_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float __expf(float __x) { return __ocml_native_exp_f32(__x); }
# 627 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3
static __attribute__((device)) inline __attribute__((always_inline))
float __fadd_rn(float __x, float __y) { return __x + __y; }
# 641 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3
static __attribute__((device)) inline __attribute__((always_inline))
float __fdiv_rn(float __x, float __y) { return __x / __y; }
static __attribute__((device)) inline __attribute__((always_inline))
float __fdividef(float __x, float __y) { return __x / __y; }
# 666 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3
static __attribute__((device)) inline __attribute__((always_inline))
float __fmaf_rn(float __x, float __y, float __z) {
return __ocml_fma_f32(__x, __y, __z);
}
# 682 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3
static __attribute__((device)) inline __attribute__((always_inline))
float __fmul_rn(float __x, float __y) { return __x * __y; }
# 696 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3
static __attribute__((device)) inline __attribute__((always_inline))
float __frcp_rn(float __x) { return 1.0f / __x; }
// __frsqrt_rn: hardware reciprocal square root (AMDGCN rsq).
static __attribute__((device)) inline __attribute__((always_inline))
float __frsqrt_rn(float __x) { return __llvm_amdgcn_rsq_f32(__x); }
# 713 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3
static __attribute__((device)) inline __attribute__((always_inline))
float __fsqrt_rn(float __x) { return __ocml_native_sqrt_f32(__x); }
# 727 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3
static __attribute__((device)) inline __attribute__((always_inline))
float __fsub_rn(float __x, float __y) { return __x - __y; }
static __attribute__((device)) inline __attribute__((always_inline))
float __log10f(float __x) { return __ocml_native_log10_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float __log2f(float __x) { return __ocml_native_log2_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float __logf(float __x) { return __ocml_native_log_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float __powf(float __x, float __y) { return __ocml_pow_f32(__x, __y); }
// __saturatef: clamp to [0, 1].
static __attribute__((device)) inline __attribute__((always_inline))
float __saturatef(float __x) { return (__x < 0) ? 0 : ((__x > 1) ? 1 : __x); }
static __attribute__((device)) inline __attribute__((always_inline))
void __sincosf(float __x, float *__sinptr, float *__cosptr) {
*__sinptr = __ocml_native_sin_f32(__x);
*__cosptr = __ocml_native_cos_f32(__x);
}
static __attribute__((device)) inline __attribute__((always_inline))
float __sinf(float __x) { return __ocml_native_sin_f32(__x); }
// __tanf forwards to the precise tan (no native variant is used here).
static __attribute__((device)) inline __attribute__((always_inline))
float __tanf(float __x) { return __ocml_tan_f32(__x); }
// Double-precision math functions: same pattern as the float section,
// forwarding to the OCML __ocml_*_f64 intrinsics or builtins.
static __attribute__((device)) inline __attribute__((always_inline))
double acos(double __x) { return __ocml_acos_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double acosh(double __x) { return __ocml_acosh_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double asin(double __x) { return __ocml_asin_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double asinh(double __x) { return __ocml_asinh_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double atan(double __x) { return __ocml_atan_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double atan2(double __x, double __y) { return __ocml_atan2_f64(__x, __y); }
static __attribute__((device)) inline __attribute__((always_inline))
double atanh(double __x) { return __ocml_atanh_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double cbrt(double __x) { return __ocml_cbrt_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double ceil(double __x) { return __ocml_ceil_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double copysign(double __x, double __y) {
return __ocml_copysign_f64(__x, __y);
}
static __attribute__((device)) inline __attribute__((always_inline))
double cos(double __x) { return __ocml_cos_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double cosh(double __x) { return __ocml_cosh_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double cospi(double __x) { return __ocml_cospi_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double cyl_bessel_i0(double __x) { return __ocml_i0_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double cyl_bessel_i1(double __x) { return __ocml_i1_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double erf(double __x) { return __ocml_erf_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double erfc(double __x) { return __ocml_erfc_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double erfcinv(double __x) { return __ocml_erfcinv_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double erfcx(double __x) { return __ocml_erfcx_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double erfinv(double __x) { return __ocml_erfinv_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double exp(double __x) { return __ocml_exp_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double exp10(double __x) { return __ocml_exp10_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double exp2(double __x) { return __ocml_exp2_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double expm1(double __x) { return __ocml_expm1_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double fabs(double __x) { return __builtin_fabs(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double fdim(double __x, double __y) { return __ocml_fdim_f64(__x, __y); }
static __attribute__((device)) inline __attribute__((always_inline))
double floor(double __x) { return __ocml_floor_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double fma(double __x, double __y, double __z) {
return __ocml_fma_f64(__x, __y, __z);
}
static __attribute__((device)) inline __attribute__((always_inline))
double fmax(double __x, double __y) { return __ocml_fmax_f64(__x, __y); }
static __attribute__((device)) inline __attribute__((always_inline))
double fmin(double __x, double __y) { return __ocml_fmin_f64(__x, __y); }
static __attribute__((device)) inline __attribute__((always_inline))
double fmod(double __x, double __y) { return __ocml_fmod_f64(__x, __y); }
// frexp: exponent out-pointer goes through an address-space-5 temporary
// (OCML out-pointer ABI), then is copied to the generic pointer.
static __attribute__((device)) inline __attribute__((always_inline))
double frexp(double __x, int *__nptr) {
int __tmp;
double __r =
__ocml_frexp_f64(__x, (__attribute__((address_space(5))) int *)&__tmp);
*__nptr = __tmp;
return __r;
}
static __attribute__((device)) inline __attribute__((always_inline))
double hypot(double __x, double __y) { return __ocml_hypot_f64(__x, __y); }
static __attribute__((device)) inline __attribute__((always_inline))
int ilogb(double __x) { return __ocml_ilogb_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
bool __finite(double __x) { return __ocml_isfinite_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
bool __isinf(double __x) { return __ocml_isinf_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
bool __isnan(double __x) { return __ocml_isnan_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double j0(double __x) { return __ocml_j0_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double j1(double __x) { return __ocml_j1_f64(__x); }
// jn: Bessel J_n via forward recurrence (see jnf for caveats).
static __attribute__((device)) inline __attribute__((always_inline))
double jn(int __n, double __x) {
if (__n == 0)
return j0(__x);
if (__n == 1)
return j1(__x);
double __x0 = j0(__x);
double __x1 = j1(__x);
for (int __i = 1; __i < __n; ++__i) {
double __x2 = (2 * __i) / __x * __x1 - __x0;
__x0 = __x1;
__x1 = __x2;
}
return __x1;
}
static __attribute__((device)) inline __attribute__((always_inline))
double ldexp(double __x, int __e) { return __ocml_ldexp_f64(__x, __e); }
static __attribute__((device)) inline __attribute__((always_inline))
double lgamma(double __x) { return __ocml_lgamma_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
long long int llrint(double __x) { return __ocml_rint_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
long long int llround(double __x) { return __ocml_round_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double log(double __x) { return __ocml_log_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double log10(double __x) { return __ocml_log10_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double log1p(double __x) { return __ocml_log1p_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double log2(double __x) { return __ocml_log2_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double logb(double __x) { return __ocml_logb_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
long int lrint(double __x) { return __ocml_rint_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
long int lround(double __x) { return __ocml_round_f64(__x); }
// modf: integral part via address-space-5 temporary, copied out.
static __attribute__((device)) inline __attribute__((always_inline))
double modf(double __x, double *__iptr) {
double __tmp;
double __r =
__ocml_modf_f64(__x, (__attribute__((address_space(5))) double *)&__tmp);
*__iptr = __tmp;
return __r;
}
// nan: build a quiet double NaN whose payload is parsed from __tagp by
// __make_mantissa. IEEE-754 binary64 layout: 51 payload bits + 1 quiet
// bit (= 52-bit mantissa), 11-bit exponent set to all ones, sign clear.
static __attribute__((device)) inline __attribute__((always_inline))
double nan(const char *__tagp) {
union {
double val;
struct ieee_double {
long unsigned int mantissa : 51;
unsigned int quiet : 1;
unsigned int exponent : 11;
unsigned int sign : 1;
} bits;
} __tmp;
static_assert((sizeof(__tmp.val)) == (sizeof(__tmp.bits)), "");
__tmp.bits.sign = 0u;
__tmp.bits.exponent = ~0u;
__tmp.bits.quiet = 1u;
__tmp.bits.mantissa = __make_mantissa(__tagp);
return __tmp.val;
}
static __attribute__((device)) inline __attribute__((always_inline))
double nearbyint(double __x) { return __ocml_nearbyint_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double nextafter(double __x, double __y) {
return __ocml_nextafter_f64(__x, __y);
}
// norm: Euclidean norm of __a[0..__dim-1] (naive sum of squares; may
// overflow/underflow for extreme inputs, unlike hypot).
static __attribute__((device)) inline __attribute__((always_inline))
double norm(int __dim,
const double *__a) {
double __r = 0;
while (__dim--) {
__r += __a[0] * __a[0];
++__a;
}
return __ocml_sqrt_f64(__r);
}
static __attribute__((device)) inline __attribute__((always_inline))
double norm3d(double __x, double __y, double __z) {
return __ocml_len3_f64(__x, __y, __z);
}
static __attribute__((device)) inline __attribute__((always_inline))
double norm4d(double __x, double __y, double __z, double __w) {
return __ocml_len4_f64(__x, __y, __z, __w);
}
static __attribute__((device)) inline __attribute__((always_inline))
double normcdf(double __x) { return __ocml_ncdf_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double normcdfinv(double __x) { return __ocml_ncdfinv_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double pow(double __x, double __y) { return __ocml_pow_f64(__x, __y); }
// powi: double base raised to an integer power (OCML pown).
static __attribute__((device)) inline __attribute__((always_inline))
double powi(double __x, int __y) { return __ocml_pown_f64(__x, __y); }
static __attribute__((device)) inline __attribute__((always_inline))
double rcbrt(double __x) { return __ocml_rcbrt_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double remainder(double __x, double __y) {
return __ocml_remainder_f64(__x, __y);
}
// remquo: quotient bits via address-space-5 temporary, copied out.
static __attribute__((device)) inline __attribute__((always_inline))
double remquo(double __x, double __y, int *__quo) {
int __tmp;
double __r = __ocml_remquo_f64(
__x, __y, (__attribute__((address_space(5))) int *)&__tmp);
*__quo = __tmp;
return __r;
}
static __attribute__((device)) inline __attribute__((always_inline))
double rhypot(double __x, double __y) { return __ocml_rhypot_f64(__x, __y); }
static __attribute__((device)) inline __attribute__((always_inline))
double rint(double __x) { return __ocml_rint_f64(__x); }
// rnorm: reciprocal Euclidean norm (same caveats as norm).
static __attribute__((device)) inline __attribute__((always_inline))
double rnorm(int __dim,
const double *__a) {
double __r = 0;
while (__dim--) {
__r += __a[0] * __a[0];
++__a;
}
return __ocml_rsqrt_f64(__r);
}
static __attribute__((device)) inline __attribute__((always_inline))
double rnorm3d(double __x, double __y, double __z) {
return __ocml_rlen3_f64(__x, __y, __z);
}
static __attribute__((device)) inline __attribute__((always_inline))
double rnorm4d(double __x, double __y, double __z, double __w) {
return __ocml_rlen4_f64(__x, __y, __z, __w);
}
static __attribute__((device)) inline __attribute__((always_inline))
double round(double __x) { return __ocml_round_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double rsqrt(double __x) { return __ocml_rsqrt_f64(__x); }
  1004. static __attribute__((device)) inline __attribute__((always_inline))
  1005. double scalbln(double __x, long int __n) {
  1006. return (__n < 9223372036854775807L) ? __ocml_scalbn_f64(__x, __n)
  1007. : __ocml_scalb_f64(__x, __n);
  1008. }
static __attribute__((device)) inline __attribute__((always_inline))
double scalbn(double __x, int __n) { return __ocml_scalbn_f64(__x, __n); }
static __attribute__((device)) inline __attribute__((always_inline))
bool __signbit(double __x) { return __ocml_signbit_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double sin(double __x) { return __ocml_sin_f64(__x); }
// sincos: cosine comes back through an address-space-5 temporary (OCML
// out-pointer ABI); sine is the return value.
static __attribute__((device)) inline __attribute__((always_inline))
void sincos(double __x, double *__sinptr, double *__cosptr) {
double __tmp;
*__sinptr = __ocml_sincos_f64(
__x, (__attribute__((address_space(5))) double *)&__tmp);
*__cosptr = __tmp;
}
// sincospi: same contract as sincos, for sin(pi*x)/cos(pi*x).
static __attribute__((device)) inline __attribute__((always_inline))
void sincospi(double __x, double *__sinptr, double *__cosptr) {
double __tmp;
*__sinptr = __ocml_sincospi_f64(
__x, (__attribute__((address_space(5))) double *)&__tmp);
*__cosptr = __tmp;
}
static __attribute__((device)) inline __attribute__((always_inline))
double sinh(double __x) { return __ocml_sinh_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double sinpi(double __x) { return __ocml_sinpi_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double sqrt(double __x) { return __ocml_sqrt_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double tan(double __x) { return __ocml_tan_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double tanh(double __x) { return __ocml_tanh_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double tgamma(double __x) { return __ocml_tgamma_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double trunc(double __x) { return __ocml_trunc_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double y0(double __x) { return __ocml_y0_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double y1(double __x) { return __ocml_y1_f64(__x); }
// yn: Bessel Y_n via forward recurrence (see ynf for caveats).
static __attribute__((device)) inline __attribute__((always_inline))
double yn(int __n, double __x) {
if (__n == 0)
return y0(__x);
if (__n == 1)
return y1(__x);
double __x0 = y0(__x);
double __x1 = y1(__x);
for (int __i = 1; __i < __n; ++__i) {
double __x2 = (2 * __i) / __x * __x1 - __x0;
__x0 = __x1;
__x1 = __x2;
}
return __x1;
}
# 1190 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3
// CUDA-compatible "_rn" (round-to-nearest-even) double intrinsics.  They map
// to the plain C++ operators / OCML calls, which already round to nearest on
// this target, so no explicit rounding-mode control is needed here.
// (The linemarkers between definitions are preprocessor output and are kept
// verbatim so diagnostics still point into __clang_hip_math.h.)
1063. static __attribute__((device)) inline __attribute__((always_inline))
1064. double __dadd_rn(double __x, double __y) { return __x + __y; }
# 1212 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3
1066. static __attribute__((device)) inline __attribute__((always_inline))
1067. double __ddiv_rn(double __x, double __y) { return __x / __y; }
# 1234 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3
1069. static __attribute__((device)) inline __attribute__((always_inline))
1070. double __dmul_rn(double __x, double __y) { return __x * __y; }
# 1248 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3
1072. static __attribute__((device)) inline __attribute__((always_inline))
// Reciprocal: expressed as a full-precision division, not a fast-math rcp.
1073. double __drcp_rn(double __x) { return 1.0 / __x; }
# 1262 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3
1075. static __attribute__((device)) inline __attribute__((always_inline))
1076. double __dsqrt_rn(double __x) { return __ocml_sqrt_f64(__x); }
# 1284 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3
1078. static __attribute__((device)) inline __attribute__((always_inline))
1079. double __dsub_rn(double __x, double __y) { return __x - __y; }
# 1306 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3
1081. static __attribute__((device)) inline __attribute__((always_inline))
// Fused multiply-add (single rounding), via the OCML fma entry point.
1082. double __fma_rn(double __x, double __y, double __z) {
1083. return __ocml_fma_f64(__x, __y, __z);
1084. }
  1086. template <class T> static __attribute__((device)) inline __attribute__((always_inline)) T min(T __arg1, T __arg2) {
  1087. return (__arg1 < __arg2) ? __arg1 : __arg2;
  1088. }
  1089. template <class T> static __attribute__((device)) inline __attribute__((always_inline)) T max(T __arg1, T __arg2) {
  1090. return (__arg1 > __arg2) ? __arg1 : __arg2;
  1091. }
  1092. static __attribute__((device)) inline __attribute__((always_inline)) int min(int __arg1, int __arg2) {
  1093. return (__arg1 < __arg2) ? __arg1 : __arg2;
  1094. }
  1095. static __attribute__((device)) inline __attribute__((always_inline)) int max(int __arg1, int __arg2) {
  1096. return (__arg1 > __arg2) ? __arg1 : __arg2;
  1097. }
  1098. static __attribute__((device)) inline __attribute__((always_inline))
  1099. float max(float __x, float __y) { return fmaxf(__x, __y); }
  1100. static __attribute__((device)) inline __attribute__((always_inline))
  1101. double max(double __x, double __y) { return fmax(__x, __y); }
  1102. static __attribute__((device)) inline __attribute__((always_inline))
  1103. float min(float __x, float __y) { return fminf(__x, __y); }
  1104. static __attribute__((device)) inline __attribute__((always_inline))
  1105. double min(double __x, double __y) { return fmin(__x, __y); }
  1106. # 129 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_runtime_wrapper.h" 2 3
  1107. # 1 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_stdlib.h" 1 3
  1108. # 130 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_runtime_wrapper.h" 2 3
  1109. # 1 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_cmath.h" 1 3
  1110. # 41 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_cmath.h" 3
// C++-style cmath overloads for device code (from __clang_hip_cmath.h):
// each overload dispatches to the appropriately-suffixed C function or to a
// compiler builtin, giving <cmath>-like overload resolution on the device.
1111. static __attribute__((device)) inline __attribute__((always_inline)) double abs(double __x) { return ::fabs(__x); }
1112. static __attribute__((device)) inline __attribute__((always_inline)) float abs(float __x) { return ::fabsf(__x); }
1113. static __attribute__((device)) inline __attribute__((always_inline)) long long abs(long long __n) { return ::llabs(__n); }
1114. static __attribute__((device)) inline __attribute__((always_inline)) long abs(long __n) { return ::labs(__n); }
1115. static __attribute__((device)) inline __attribute__((always_inline)) float fma(float __x, float __y, float __z) {
1116. return ::fmaf(__x, __y, __z);
1117. }
# 61 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_cmath.h" 3
1119. static __attribute__((device)) inline __attribute__((always_inline)) float frexp(float __arg, int *__exp) {
1120. return ::frexpf(__arg, __exp);
1121. }
# 93 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_cmath.h" 3
// Classification predicates: note these return bool (the C++ signature),
// wrapping the int-returning C classification helpers.
1123. static __attribute__((device)) inline __attribute__((always_inline)) bool isinf(float __x) { return ::__isinff(__x); }
1124. static __attribute__((device)) inline __attribute__((always_inline)) bool isinf(double __x) { return ::__isinf(__x); }
1125. static __attribute__((device)) inline __attribute__((always_inline)) bool isfinite(float __x) { return ::__finitef(__x); }
1126. static __attribute__((device)) inline __attribute__((always_inline)) bool isfinite(double __x) { return ::__finite(__x); }
1127. static __attribute__((device)) inline __attribute__((always_inline)) bool isnan(float __x) { return ::__isnanf(__x); }
1128. static __attribute__((device)) inline __attribute__((always_inline)) bool isnan(double __x) { return ::__isnan(__x); }
// Quiet (non-signaling) comparison predicates, via the clang builtins.
1129. static __attribute__((device)) inline __attribute__((always_inline)) bool isgreater(float __x, float __y) {
1130. return __builtin_isgreater(__x, __y);
1131. }
1132. static __attribute__((device)) inline __attribute__((always_inline)) bool isgreater(double __x, double __y) {
1133. return __builtin_isgreater(__x, __y);
1134. }
1135. static __attribute__((device)) inline __attribute__((always_inline)) bool isgreaterequal(float __x, float __y) {
1136. return __builtin_isgreaterequal(__x, __y);
1137. }
1138. static __attribute__((device)) inline __attribute__((always_inline)) bool isgreaterequal(double __x, double __y) {
1139. return __builtin_isgreaterequal(__x, __y);
1140. }
1141. static __attribute__((device)) inline __attribute__((always_inline)) bool isless(float __x, float __y) {
1142. return __builtin_isless(__x, __y);
1143. }
1144. static __attribute__((device)) inline __attribute__((always_inline)) bool isless(double __x, double __y) {
1145. return __builtin_isless(__x, __y);
1146. }
1147. static __attribute__((device)) inline __attribute__((always_inline)) bool islessequal(float __x, float __y) {
1148. return __builtin_islessequal(__x, __y);
1149. }
1150. static __attribute__((device)) inline __attribute__((always_inline)) bool islessequal(double __x, double __y) {
1151. return __builtin_islessequal(__x, __y);
1152. }
1153. static __attribute__((device)) inline __attribute__((always_inline)) bool islessgreater(float __x, float __y) {
1154. return __builtin_islessgreater(__x, __y);
1155. }
1156. static __attribute__((device)) inline __attribute__((always_inline)) bool islessgreater(double __x, double __y) {
1157. return __builtin_islessgreater(__x, __y);
1158. }
1159. static __attribute__((device)) inline __attribute__((always_inline)) bool isnormal(float __x) {
1160. return __builtin_isnormal(__x);
1161. }
1162. static __attribute__((device)) inline __attribute__((always_inline)) bool isnormal(double __x) {
1163. return __builtin_isnormal(__x);
1164. }
1165. static __attribute__((device)) inline __attribute__((always_inline)) bool isunordered(float __x, float __y) {
1166. return __builtin_isunordered(__x, __y);
1167. }
1168. static __attribute__((device)) inline __attribute__((always_inline)) bool isunordered(double __x, double __y) {
1169. return __builtin_isunordered(__x, __y);
1170. }
1171. static __attribute__((device)) inline __attribute__((always_inline)) float modf(float __x, float *__iptr) {
1172. return ::modff(__x, __iptr);
1173. }
// pow with an integer exponent maps to the pown-style helpers (powif/powi)
// rather than the general pow, matching CUDA's overload set.
1174. static __attribute__((device)) inline __attribute__((always_inline)) float pow(float __base, int __iexp) {
1175. return ::powif(__base, __iexp);
1176. }
1177. static __attribute__((device)) inline __attribute__((always_inline)) double pow(double __base, int __iexp) {
1178. return ::powi(__base, __iexp);
1179. }
1180. static __attribute__((device)) inline __attribute__((always_inline)) float remquo(float __x, float __y, int *__quo) {
1181. return ::remquof(__x, __y, __quo);
1182. }
1183. static __attribute__((device)) inline __attribute__((always_inline)) float scalbln(float __x, long int __n) {
1184. return ::scalblnf(__x, __n);
1185. }
1186. static __attribute__((device)) inline __attribute__((always_inline)) bool signbit(float __x) { return ::__signbitf(__x); }
1187. static __attribute__((device)) inline __attribute__((always_inline)) bool signbit(double __x) { return ::__signbit(__x); }
// _Float16 (half) overloads go straight to the OCML f16 entry points.
1188. static __attribute__((device)) inline __attribute__((always_inline)) _Float16 fma(_Float16 __x, _Float16 __y,
1189. _Float16 __z) {
1190. return __ocml_fma_f16(__x, __y, __z);
1191. }
1192. static __attribute__((device)) inline __attribute__((always_inline)) _Float16 pow(_Float16 __base, int __iexp) {
1193. return __ocml_pown_f16(__base, __iexp);
1194. }
# 202 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_cmath.h" 3
// float overload set: every function simply forwards to its f-suffixed C
// counterpart (acos -> acosf, ...), so C++ code calling e.g. sin(1.0f) stays
// in single precision instead of promoting to double.
1196. static __attribute__((device)) inline __attribute__((always_inline)) float acos(float __x) { return acosf(__x); }
1197. static __attribute__((device)) inline __attribute__((always_inline)) float acosh(float __x) { return acoshf(__x); }
1198. static __attribute__((device)) inline __attribute__((always_inline)) float asin(float __x) { return asinf(__x); }
1199. static __attribute__((device)) inline __attribute__((always_inline)) float asinh(float __x) { return asinhf(__x); }
1200. static __attribute__((device)) inline __attribute__((always_inline)) float atan(float __x) { return atanf(__x); }
1201. static __attribute__((device)) inline __attribute__((always_inline)) float atan2(float __x, float __y) { return atan2f(__x, __y); }
1202. static __attribute__((device)) inline __attribute__((always_inline)) float atanh(float __x) { return atanhf(__x); }
1203. static __attribute__((device)) inline __attribute__((always_inline)) float cbrt(float __x) { return cbrtf(__x); }
1204. static __attribute__((device)) inline __attribute__((always_inline)) float ceil(float __x) { return ceilf(__x); }
1205. static __attribute__((device)) inline __attribute__((always_inline)) float copysign(float __x, float __y) { return copysignf(__x, __y); }
1206. static __attribute__((device)) inline __attribute__((always_inline)) float cos(float __x) { return cosf(__x); }
1207. static __attribute__((device)) inline __attribute__((always_inline)) float cosh(float __x) { return coshf(__x); }
1208. static __attribute__((device)) inline __attribute__((always_inline)) float erf(float __x) { return erff(__x); }
1209. static __attribute__((device)) inline __attribute__((always_inline)) float erfc(float __x) { return erfcf(__x); }
1210. static __attribute__((device)) inline __attribute__((always_inline)) float exp(float __x) { return expf(__x); }
1211. static __attribute__((device)) inline __attribute__((always_inline)) float exp2(float __x) { return exp2f(__x); }
1212. static __attribute__((device)) inline __attribute__((always_inline)) float expm1(float __x) { return expm1f(__x); }
1213. static __attribute__((device)) inline __attribute__((always_inline)) float fabs(float __x) { return fabsf(__x); }
1214. static __attribute__((device)) inline __attribute__((always_inline)) float fdim(float __x, float __y) { return fdimf(__x, __y); }
1215. static __attribute__((device)) inline __attribute__((always_inline)) float floor(float __x) { return floorf(__x); }
1216. static __attribute__((device)) inline __attribute__((always_inline)) float fmax(float __x, float __y) { return fmaxf(__x, __y); }
1217. static __attribute__((device)) inline __attribute__((always_inline)) float fmin(float __x, float __y) { return fminf(__x, __y); }
1218. static __attribute__((device)) inline __attribute__((always_inline)) float fmod(float __x, float __y) { return fmodf(__x, __y); }
1219. static __attribute__((device)) inline __attribute__((always_inline)) float hypot(float __x, float __y) { return hypotf(__x, __y); }
1220. static __attribute__((device)) inline __attribute__((always_inline)) int ilogb(float __x) { return ilogbf(__x); }
1221. static __attribute__((device)) inline __attribute__((always_inline)) float ldexp(float __x, int __y) { return ldexpf(__x, __y); }
1222. static __attribute__((device)) inline __attribute__((always_inline)) float lgamma(float __x) { return lgammaf(__x); }
1223. static __attribute__((device)) inline __attribute__((always_inline)) float log(float __x) { return logf(__x); }
1224. static __attribute__((device)) inline __attribute__((always_inline)) float log10(float __x) { return log10f(__x); }
1225. static __attribute__((device)) inline __attribute__((always_inline)) float log1p(float __x) { return log1pf(__x); }
1226. static __attribute__((device)) inline __attribute__((always_inline)) float log2(float __x) { return log2f(__x); }
1227. static __attribute__((device)) inline __attribute__((always_inline)) float logb(float __x) { return logbf(__x); }
1228. static __attribute__((device)) inline __attribute__((always_inline)) long long llrint(float __x) { return llrintf(__x); }
1229. static __attribute__((device)) inline __attribute__((always_inline)) long long llround(float __x) { return llroundf(__x); }
1230. static __attribute__((device)) inline __attribute__((always_inline)) long lrint(float __x) { return lrintf(__x); }
1231. static __attribute__((device)) inline __attribute__((always_inline)) long lround(float __x) { return lroundf(__x); }
1232. static __attribute__((device)) inline __attribute__((always_inline)) float nearbyint(float __x) { return nearbyintf(__x); }
1233. static __attribute__((device)) inline __attribute__((always_inline)) float nextafter(float __x, float __y) { return nextafterf(__x, __y); }
1234. static __attribute__((device)) inline __attribute__((always_inline)) float pow(float __x, float __y) { return powf(__x, __y); }
1235. static __attribute__((device)) inline __attribute__((always_inline)) float remainder(float __x, float __y) { return remainderf(__x, __y); }
1236. static __attribute__((device)) inline __attribute__((always_inline)) float rint(float __x) { return rintf(__x); }
1237. static __attribute__((device)) inline __attribute__((always_inline)) float round(float __x) { return roundf(__x); }
1238. static __attribute__((device)) inline __attribute__((always_inline)) float scalbn(float __x, int __y) { return scalbnf(__x, __y); }
1239. static __attribute__((device)) inline __attribute__((always_inline)) float sin(float __x) { return sinf(__x); }
1240. static __attribute__((device)) inline __attribute__((always_inline)) float sinh(float __x) { return sinhf(__x); }
1241. static __attribute__((device)) inline __attribute__((always_inline)) float sqrt(float __x) { return sqrtf(__x); }
1242. static __attribute__((device)) inline __attribute__((always_inline)) float tan(float __x) { return tanf(__x); }
1243. static __attribute__((device)) inline __attribute__((always_inline)) float tanh(float __x) { return tanhf(__x); }
1244. static __attribute__((device)) inline __attribute__((always_inline)) float tgamma(float __x) { return tgammaf(__x); }
1245. static __attribute__((device)) inline __attribute__((always_inline)) float trunc(float __x) { return truncf(__x); }
# 265 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_cmath.h" 3
// Minimal, freestanding re-implementations of the <type_traits> pieces the
// math overloads below need (enable_if, is_integral, is_arithmetic, is_same,
// declval) plus a libc++-style __promote.  They are hand-rolled here so the
// device headers do not depend on a host C++ standard library.
template <bool __B, class __T = void> struct __hip_enable_if {};
template <class __T> struct __hip_enable_if<true, __T> { typedef __T type; };
1249. namespace __hip {
// is_integral: 0 by default, specialized to 1 for each builtin integer type.
template <class _Tp> struct is_integral {
1251. enum { value = 0 };
1252. };
template <> struct is_integral<bool> {
1254. enum { value = 1 };
1255. };
template <> struct is_integral<char> {
1257. enum { value = 1 };
1258. };
template <> struct is_integral<signed char> {
1260. enum { value = 1 };
1261. };
template <> struct is_integral<unsigned char> {
1263. enum { value = 1 };
1264. };
template <> struct is_integral<wchar_t> {
1266. enum { value = 1 };
1267. };
template <> struct is_integral<short> {
1269. enum { value = 1 };
1270. };
template <> struct is_integral<unsigned short> {
1272. enum { value = 1 };
1273. };
template <> struct is_integral<int> {
1275. enum { value = 1 };
1276. };
template <> struct is_integral<unsigned int> {
1278. enum { value = 1 };
1279. };
template <> struct is_integral<long> {
1281. enum { value = 1 };
1282. };
template <> struct is_integral<unsigned long> {
1284. enum { value = 1 };
1285. };
template <> struct is_integral<long long> {
1287. enum { value = 1 };
1288. };
template <> struct is_integral<unsigned long long> {
1290. enum { value = 1 };
1291. };
// is_arithmetic: the integral set plus float and double.
template <class _Tp> struct is_arithmetic {
1293. enum { value = 0 };
1294. };
template <> struct is_arithmetic<bool> {
1296. enum { value = 1 };
1297. };
template <> struct is_arithmetic<char> {
1299. enum { value = 1 };
1300. };
template <> struct is_arithmetic<signed char> {
1302. enum { value = 1 };
1303. };
template <> struct is_arithmetic<unsigned char> {
1305. enum { value = 1 };
1306. };
template <> struct is_arithmetic<wchar_t> {
1308. enum { value = 1 };
1309. };
template <> struct is_arithmetic<short> {
1311. enum { value = 1 };
1312. };
template <> struct is_arithmetic<unsigned short> {
1314. enum { value = 1 };
1315. };
template <> struct is_arithmetic<int> {
1317. enum { value = 1 };
1318. };
template <> struct is_arithmetic<unsigned int> {
1320. enum { value = 1 };
1321. };
template <> struct is_arithmetic<long> {
1323. enum { value = 1 };
1324. };
template <> struct is_arithmetic<unsigned long> {
1326. enum { value = 1 };
1327. };
template <> struct is_arithmetic<long long> {
1329. enum { value = 1 };
1330. };
template <> struct is_arithmetic<unsigned long long> {
1332. enum { value = 1 };
1333. };
template <> struct is_arithmetic<float> {
1335. enum { value = 1 };
1336. };
template <> struct is_arithmetic<double> {
1338. enum { value = 1 };
1339. };
1340. struct true_type {
1341. static const __attribute__((constant)) bool value = true;
1342. };
1343. struct false_type {
1344. static const __attribute__((constant)) bool value = false;
1345. };
template <typename __T, typename __U> struct is_same : public false_type {};
template <typename __T> struct is_same<__T, __T> : public true_type {};
// declval support: add_rvalue_reference + an unevaluated declval(), used
// only inside decltype below.
template <typename __T> struct add_rvalue_reference { typedef __T &&type; };
template <typename __T> typename add_rvalue_reference<__T>::type declval();
// __numeric_type: maps _Tp to its "math" type via overload resolution —
// _Float16 stays _Float16, float stays float, integral types and char map
// to double, and long double maps to double (device code has no long
// double support).  value is false when _Tp matches none of the overloads.
template <class _Tp> struct __numeric_type {
1351. static void __test(...);
1352. static _Float16 __test(_Float16);
1353. static float __test(float);
1354. static double __test(char);
1355. static double __test(int);
1356. static double __test(unsigned);
1357. static double __test(long);
1358. static double __test(unsigned long);
1359. static double __test(long long);
1360. static double __test(unsigned long long);
1361. static double __test(double);
1362. static double __test(long double);
1363. typedef decltype(__test(declval<_Tp>())) type;
1364. static const bool value = !is_same<type, void>::value;
1365. };
template <> struct __numeric_type<void> { static const bool value = true; };
// __promote_imp / __promote: compute the common promoted type of up to
// three arithmetic arguments as decltype of their sum (the usual arithmetic
// conversions); unused slots default to void.
template <class _A1, class _A2 = void, class _A3 = void,
1368. bool = __numeric_type<_A1>::value &&__numeric_type<_A2>::value
1369. &&__numeric_type<_A3>::value>
1370. class __promote_imp {
1371. public:
1372. static const bool value = false;
1373. };
template <class _A1, class _A2, class _A3>
1375. class __promote_imp<_A1, _A2, _A3, true> {
1376. private:
1377. typedef typename __promote_imp<_A1>::type __type1;
1378. typedef typename __promote_imp<_A2>::type __type2;
1379. typedef typename __promote_imp<_A3>::type __type3;
1380. public:
1381. typedef decltype(__type1() + __type2() + __type3()) type;
1382. static const bool value = true;
1383. };
template <class _A1, class _A2> class __promote_imp<_A1, _A2, void, true> {
1385. private:
1386. typedef typename __promote_imp<_A1>::type __type1;
1387. typedef typename __promote_imp<_A2>::type __type2;
1388. public:
1389. typedef decltype(__type1() + __type2()) type;
1390. static const bool value = true;
1391. };
template <class _A1> class __promote_imp<_A1, void, void, true> {
1393. public:
1394. typedef typename __numeric_type<_A1>::type type;
1395. static const bool value = true;
1396. };
template <class _A1, class _A2 = void, class _A3 = void>
1398. class __promote : public __promote_imp<_A1, _A2, _A3> {};
1399. }
# 478 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_cmath.h" 3
// Template catch-alls so math functions accept non-floating arguments:
//  * single-argument forms are enabled only for integral __T (via
//    __hip_enable_if + __hip::is_integral), cast the argument to double, and
//    call the double overload;
//  * two-argument forms are enabled when both arguments are arithmetic,
//    promote both to __hip::__promote<__T1,__T2>::type, and call the overload
//    for that common type.
// NOTE(review): the mixed-type isgreater/isless/... templates return the
// promoted floating type (implicitly converted from the bool comparison
// result), not bool — kept as-is for fidelity with the vendor header.
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type acos(__T __x) { return ::acos((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type acosh(__T __x) { return ::acosh((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type asin(__T __x) { return ::asin((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type asinh(__T __x) { return ::asinh((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type atan(__T __x) { return ::atan((double)__x); }
template <typename __T1, typename __T2> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type atan2(__T1 __x, __T2 __y) { typedef typename __hip::__promote<__T1, __T2>::type __result_type; return atan2((__result_type)__x, (__result_type)__y); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type atanh(__T __x) { return ::atanh((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type cbrt(__T __x) { return ::cbrt((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type ceil(__T __x) { return ::ceil((double)__x); }
template <typename __T1, typename __T2> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type copysign(__T1 __x, __T2 __y) { typedef typename __hip::__promote<__T1, __T2>::type __result_type; return copysign((__result_type)__x, (__result_type)__y); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type cos(__T __x) { return ::cos((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type cosh(__T __x) { return ::cosh((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type erf(__T __x) { return ::erf((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type erfc(__T __x) { return ::erfc((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type exp(__T __x) { return ::exp((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type exp2(__T __x) { return ::exp2((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type expm1(__T __x) { return ::expm1((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type fabs(__T __x) { return ::fabs((double)__x); }
template <typename __T1, typename __T2> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type fdim(__T1 __x, __T2 __y) { typedef typename __hip::__promote<__T1, __T2>::type __result_type; return fdim((__result_type)__x, (__result_type)__y); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type floor(__T __x) { return ::floor((double)__x); }
template <typename __T1, typename __T2> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type fmax(__T1 __x, __T2 __y) { typedef typename __hip::__promote<__T1, __T2>::type __result_type; return fmax((__result_type)__x, (__result_type)__y); }
template <typename __T1, typename __T2> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type fmin(__T1 __x, __T2 __y) { typedef typename __hip::__promote<__T1, __T2>::type __result_type; return fmin((__result_type)__x, (__result_type)__y); }
template <typename __T1, typename __T2> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type fmod(__T1 __x, __T2 __y) { typedef typename __hip::__promote<__T1, __T2>::type __result_type; return fmod((__result_type)__x, (__result_type)__y); }
template <typename __T1, typename __T2> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type hypot(__T1 __x, __T2 __y) { typedef typename __hip::__promote<__T1, __T2>::type __result_type; return hypot((__result_type)__x, (__result_type)__y); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, int>::type ilogb(__T __x) { return ::ilogb((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, bool>::type isfinite(__T __x) { return ::isfinite((double)__x); }
template <typename __T1, typename __T2> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type isgreater(__T1 __x, __T2 __y) { typedef typename __hip::__promote<__T1, __T2>::type __result_type; return isgreater((__result_type)__x, (__result_type)__y); }
template <typename __T1, typename __T2> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type isgreaterequal(__T1 __x, __T2 __y) { typedef typename __hip::__promote<__T1, __T2>::type __result_type; return isgreaterequal((__result_type)__x, (__result_type)__y); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, bool>::type isinf(__T __x) { return ::isinf((double)__x); }
template <typename __T1, typename __T2> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type isless(__T1 __x, __T2 __y) { typedef typename __hip::__promote<__T1, __T2>::type __result_type; return isless((__result_type)__x, (__result_type)__y); }
template <typename __T1, typename __T2> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type islessequal(__T1 __x, __T2 __y) { typedef typename __hip::__promote<__T1, __T2>::type __result_type; return islessequal((__result_type)__x, (__result_type)__y); }
template <typename __T1, typename __T2> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type islessgreater(__T1 __x, __T2 __y) { typedef typename __hip::__promote<__T1, __T2>::type __result_type; return islessgreater((__result_type)__x, (__result_type)__y); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, bool>::type isnan(__T __x) { return ::isnan((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, bool>::type isnormal(__T __x) { return ::isnormal((double)__x); }
template <typename __T1, typename __T2> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type isunordered(__T1 __x, __T2 __y) { typedef typename __hip::__promote<__T1, __T2>::type __result_type; return isunordered((__result_type)__x, (__result_type)__y); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type lgamma(__T __x) { return ::lgamma((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type log(__T __x) { return ::log((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type log10(__T __x) { return ::log10((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type log1p(__T __x) { return ::log1p((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type log2(__T __x) { return ::log2((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type logb(__T __x) { return ::logb((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, long long>::type llrint(__T __x) { return ::llrint((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, long long>::type llround(__T __x) { return ::llround((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, long>::type lrint(__T __x) { return ::lrint((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, long>::type lround(__T __x) { return ::lround((double)__x); }
  1446. template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type nearbyint(__T __x) { return ::nearbyint((double)__x); }
  1447. template <typename __T1, typename __T2> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type nextafter(__T1 __x, __T2 __y) { typedef typename __hip::__promote<__T1, __T2>::type __result_type; return nextafter((__result_type)__x, (__result_type)__y); }
  1448. template <typename __T1, typename __T2> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type pow(__T1 __x, __T2 __y) { typedef typename __hip::__promote<__T1, __T2>::type __result_type; return pow((__result_type)__x, (__result_type)__y); }
  1449. template <typename __T1, typename __T2> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type remainder(__T1 __x, __T2 __y) { typedef typename __hip::__promote<__T1, __T2>::type __result_type; return remainder((__result_type)__x, (__result_type)__y); }
  1450. template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type rint(__T __x) { return ::rint((double)__x); }
  1451. template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type round(__T __x) { return ::round((double)__x); }
  1452. template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, bool>::type signbit(__T __x) { return ::signbit((double)__x); }
  1453. template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type sin(__T __x) { return ::sin((double)__x); }
  1454. template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type sinh(__T __x) { return ::sinh((double)__x); }
  1455. template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type sqrt(__T __x) { return ::sqrt((double)__x); }
  1456. template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type tan(__T __x) { return ::tan((double)__x); }
  1457. template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type tanh(__T __x) { return ::tanh((double)__x); }
  1458. template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type tgamma(__T __x) { return ::tgamma((double)__x); }
  1459. template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type trunc(__T __x) { return ::trunc((double)__x); }
  1460. template <typename __T1, typename __T2> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type max(__T1 __x, __T2 __y) { typedef typename __hip::__promote<__T1, __T2>::type __result_type; return max((__result_type)__x, (__result_type)__y); }
  1461. template <typename __T1, typename __T2> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type min(__T1 __x, __T2 __y) { typedef typename __hip::__promote<__T1, __T2>::type __result_type; return min((__result_type)__x, (__result_type)__y); }
  1462. template <typename __T1, typename __T2, typename __T3>
  1463. static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<
  1464. __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value &&
  1465. __hip::is_arithmetic<__T3>::value,
  1466. typename __hip::__promote<__T1, __T2, __T3>::type>::type
  1467. fma(__T1 __x, __T2 __y, __T3 __z) {
  1468. typedef typename __hip::__promote<__T1, __T2, __T3>::type __result_type;
  1469. return ::fma((__result_type)__x, (__result_type)__y, (__result_type)__z);
  1470. }
  1471. # 568 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_cmath.h" 3
  1472. template <typename __T>
  1473. static __attribute__((device)) inline __attribute__((always_inline))
  1474. typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
  1475. frexp(__T __x, int *__exp) {
  1476. return ::frexp((double)__x, __exp);
  1477. }
  1478. template <typename __T>
  1479. static __attribute__((device)) inline __attribute__((always_inline))
  1480. typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
  1481. ldexp(__T __x, int __exp) {
  1482. return ::ldexp((double)__x, __exp);
  1483. }
  1484. template <typename __T>
  1485. static __attribute__((device)) inline __attribute__((always_inline))
  1486. typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
  1487. modf(__T __x, double *__exp) {
  1488. return ::modf((double)__x, __exp);
  1489. }
  1490. template <typename __T1, typename __T2>
  1491. static __attribute__((device)) inline __attribute__((always_inline))
  1492. typename __hip_enable_if<__hip::is_arithmetic<__T1>::value &&
  1493. __hip::is_arithmetic<__T2>::value,
  1494. typename __hip::__promote<__T1, __T2>::type>::type
  1495. remquo(__T1 __x, __T2 __y, int *__quo) {
  1496. typedef typename __hip::__promote<__T1, __T2>::type __result_type;
  1497. return ::remquo((__result_type)__x, (__result_type)__y, __quo);
  1498. }
  1499. # 610 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_cmath.h" 3
  1500. template <typename __T>
  1501. static __attribute__((device)) inline __attribute__((always_inline))
  1502. typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
  1503. scalbln(__T __x, long int __exp) {
  1504. return ::scalbln((double)__x, __exp);
  1505. }
  1506. template <typename __T>
  1507. static __attribute__((device)) inline __attribute__((always_inline))
  1508. typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
  1509. scalbn(__T __x, int __exp) {
  1510. return ::scalbn((double)__x, __exp);
  1511. }
  1512. # 133 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_runtime_wrapper.h" 2 3
  1513. # 2 "<built-in>" 2
  1514. # 1 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/out/ubuntu-22.04/22.04/build/hip-on-rocclr/hipamd/src/hiprtc/hip_rtc_gen/hipRTC_header.h" 2
  1515. # 1 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/hip-on-vdi/include/hip/hip_runtime.h" 1 3
  1516. # 58 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/hip-on-vdi/include/hip/hip_runtime.h" 3
  1517. # 1 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/out/ubuntu-22.04/22.04/build/hip-on-rocclr/hipamd/include/hip/hip_version.h" 1 3
  1518. # 59 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/hip-on-vdi/include/hip/hip_runtime.h" 2 3
  1519. # 1 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/hip-on-vdi/include/hip/hip_common.h" 1 3
  1520. # 27 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/hip-on-vdi/include/hip/hip_common.h" 3
  1521. #pragma clang diagnostic push
  1522. #pragma clang diagnostic ignored "-Wreserved-macro-identifier"
  1523. # 97 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/hip-on-vdi/include/hip/hip_common.h" 3
  1524. #pragma clang diagnostic pop
  1525. # 60 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/hip-on-vdi/include/hip/hip_runtime.h" 2 3
  1526. # 1 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_runtime.h" 1 3
  1527. # 32 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_runtime.h" 3
  1528. # 1 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_common.h" 1 3
  1529. # 33 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_runtime.h" 2 3
  1530. # 43 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_runtime.h" 3
// C-linkage declarations for AMD debug-API build metadata queries.
// Only declarations appear here; definitions are supplied by the runtime.
extern "C" {
// Build name string of the debug API library.
const char* amd_dbgapi_get_build_name();
// Git hash string identifying the debug API build.
const char* amd_dbgapi_get_git_hash();
// Numeric build identifier of the debug API.
size_t amd_dbgapi_get_build_id();
}
  1539. # 92 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_runtime.h" 3
// Fixed-width integer typedefs for hipRTC compilation, where host <cstdint>
// is unavailable. 64-bit types use 'long long' so the widths are the same
// regardless of the host data model (LP64 vs LLP64).
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
typedef signed int int32_t;
typedef signed long long int64_t;
// Mirror the names into namespace std so code using std::uint32_t etc.
// still compiles under hipRTC.
namespace std {
using ::uint32_t;
using ::uint64_t;
using ::int32_t;
using ::int64_t;
}
  1550. # 124 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_runtime.h" 3
  1551. # 1 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/hip_ldg.h" 1 3
  1552. # 27 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/hip_ldg.h" 3
  1553. # 1 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_vector_types.h" 1 3
  1554. # 31 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_vector_types.h" 3
  1555. # 1 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/host_defines.h" 1 3
  1556. # 38 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/host_defines.h" 3
// Minimal, self-contained subset of <cstdint>, <type_traits> and <iosfwd>
// for hipRTC device compilation, kept in a private namespace so it cannot
// collide with a real standard library. Mirrors the copy in namespace std
// later in this header; keep the two in sync.
namespace __hip_internal {
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
typedef signed char int8_t;
typedef signed short int16_t;
typedef signed int int32_t;
typedef signed long long int64_t;
// std::integral_constant equivalent: compile-time constant wrapper.
template <class _Tp, _Tp __v> struct integral_constant {
static constexpr const _Tp value = __v;
typedef _Tp value_type;
typedef integral_constant type;
constexpr operator value_type() const { return value; }
constexpr value_type operator()() const { return value; }
};
// Out-of-class definition required pre-C++17 when 'value' is odr-used.
template <class _Tp, _Tp __v> constexpr const _Tp integral_constant<_Tp, __v>::value;
typedef integral_constant<bool, true> true_type;
typedef integral_constant<bool, false> false_type;
template <bool B> using bool_constant = integral_constant<bool, B>;
// Redeclaration via the alias; names the same types, so this is legal.
typedef bool_constant<true> true_type;
typedef bool_constant<false> false_type;
// std::enable_if equivalent for SFINAE.
template <bool __B, class __T = void> struct enable_if {};
template <class __T> struct enable_if<true, __T> { typedef __T type; };
// Maps a bool constant to true_type/false_type (helper for is_signed).
template<bool _B> struct true_or_false_type : public false_type {};
template<> struct true_or_false_type<true> : public true_type {};
// is_integral: explicit specializations for every built-in integer type.
template <class _Tp> struct is_integral : public false_type {};
template <> struct is_integral<bool> : public true_type {};
template <> struct is_integral<char> : public true_type {};
template <> struct is_integral<signed char> : public true_type {};
template <> struct is_integral<unsigned char> : public true_type {};
template <> struct is_integral<wchar_t> : public true_type {};
template <> struct is_integral<short> : public true_type {};
template <> struct is_integral<unsigned short> : public true_type {};
template <> struct is_integral<int> : public true_type {};
template <> struct is_integral<unsigned int> : public true_type {};
template <> struct is_integral<long> : public true_type {};
template <> struct is_integral<unsigned long> : public true_type {};
template <> struct is_integral<long long> : public true_type {};
template <> struct is_integral<unsigned long long> : public true_type {};
// is_arithmetic: the integral set plus float/double (no long double here —
// presumably unsupported in device code; TODO confirm).
template <class _Tp> struct is_arithmetic : public false_type {};
template <> struct is_arithmetic<bool> : public true_type {};
template <> struct is_arithmetic<char> : public true_type {};
template <> struct is_arithmetic<signed char> : public true_type {};
template <> struct is_arithmetic<unsigned char> : public true_type {};
template <> struct is_arithmetic<wchar_t> : public true_type {};
template <> struct is_arithmetic<short> : public true_type {};
template <> struct is_arithmetic<unsigned short> : public true_type {};
template <> struct is_arithmetic<int> : public true_type {};
template <> struct is_arithmetic<unsigned int> : public true_type {};
template <> struct is_arithmetic<long> : public true_type {};
template <> struct is_arithmetic<unsigned long> : public true_type {};
template <> struct is_arithmetic<long long> : public true_type {};
template <> struct is_arithmetic<unsigned long long> : public true_type {};
template <> struct is_arithmetic<float> : public true_type {};
template <> struct is_arithmetic<double> : public true_type {};
template<typename _Tp> struct is_floating_point : public false_type {};
template<> struct is_floating_point<float> : public true_type {};
template<> struct is_floating_point<double> : public true_type {};
template<> struct is_floating_point<long double> : public true_type {};
template <typename __T, typename __U> struct is_same : public false_type {};
template <typename __T> struct is_same<__T, __T> : public true_type {};
// is_signed: only defined (true/false) for arithmetic types; decided by
// whether _Tp(-1) compares below _Tp(0).
template<typename _Tp, bool = is_arithmetic<_Tp>::value>
struct is_signed : public false_type {};
template<typename _Tp>
struct is_signed<_Tp, true> : public true_or_false_type<_Tp(-1) < _Tp(0)> {};
// <iosfwd>-style forward declarations so stream operators can be declared
// without a real iostream implementation.
template<typename _CharT> struct char_traits;
template<typename _CharT, typename _Traits = char_traits<_CharT>> class basic_istream;
template<typename _CharT, typename _Traits = char_traits<_CharT>> class basic_ostream;
typedef basic_istream<char> istream;
typedef basic_ostream<char> ostream;
// Thin wrappers over the compiler intrinsics.
template<typename _Tp>
struct is_standard_layout
: public integral_constant<bool, __is_standard_layout(_Tp)>
{ };
template<typename _Tp>
struct is_trivial
: public integral_constant<bool, __is_trivial(_Tp)>
{ };
}
// Reserved-name aliases for the private fixed-width types, used by the rest
// of the HIP headers without pulling __hip_internal into the global namespace.
typedef __hip_internal::uint8_t __hip_uint8_t;
typedef __hip_internal::uint16_t __hip_uint16_t;
typedef __hip_internal::uint32_t __hip_uint32_t;
typedef __hip_internal::uint64_t __hip_uint64_t;
typedef __hip_internal::int8_t __hip_int8_t;
typedef __hip_internal::int16_t __hip_int16_t;
typedef __hip_internal::int32_t __hip_int32_t;
typedef __hip_internal::int64_t __hip_int64_t;
  1645. # 32 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_vector_types.h" 2 3
  1646. # 52 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_vector_types.h" 3
// hipRTC's stand-in for the pieces of namespace std that the vector-type
// header needs (<type_traits>, <iosfwd>). Duplicates the __hip_internal copy
// above, with two additions: is_convertible and is_scalar. Keep in sync.
namespace std {
using ::size_t;
template <class _Tp, _Tp __v> struct integral_constant {
static constexpr const _Tp value = __v;
typedef _Tp value_type;
typedef integral_constant type;
constexpr operator value_type() const { return value; }
constexpr value_type operator()() const { return value; }
};
// Out-of-class definition required pre-C++17 when 'value' is odr-used.
template <class _Tp, _Tp __v> constexpr const _Tp integral_constant<_Tp, __v>::value;
typedef integral_constant<bool, true> true_type;
typedef integral_constant<bool, false> false_type;
template <bool B> using bool_constant = integral_constant<bool, B>;
// Redeclaration via the alias; names the same types, so this is legal.
typedef bool_constant<true> true_type;
typedef bool_constant<false> false_type;
template <bool __B, class __T = void> struct enable_if {};
template <class __T> struct enable_if<true, __T> { typedef __T type; };
template<bool _B> struct true_or_false_type : public false_type {};
template<> struct true_or_false_type<true> : public true_type {};
template <class _Tp> struct is_integral : public false_type {};
template <> struct is_integral<bool> : public true_type {};
template <> struct is_integral<char> : public true_type {};
template <> struct is_integral<signed char> : public true_type {};
template <> struct is_integral<unsigned char> : public true_type {};
template <> struct is_integral<wchar_t> : public true_type {};
template <> struct is_integral<short> : public true_type {};
template <> struct is_integral<unsigned short> : public true_type {};
template <> struct is_integral<int> : public true_type {};
template <> struct is_integral<unsigned int> : public true_type {};
template <> struct is_integral<long> : public true_type {};
template <> struct is_integral<unsigned long> : public true_type {};
template <> struct is_integral<long long> : public true_type {};
template <> struct is_integral<unsigned long long> : public true_type {};
template <class _Tp> struct is_arithmetic : public false_type {};
template <> struct is_arithmetic<bool> : public true_type {};
template <> struct is_arithmetic<char> : public true_type {};
template <> struct is_arithmetic<signed char> : public true_type {};
template <> struct is_arithmetic<unsigned char> : public true_type {};
template <> struct is_arithmetic<wchar_t> : public true_type {};
template <> struct is_arithmetic<short> : public true_type {};
template <> struct is_arithmetic<unsigned short> : public true_type {};
template <> struct is_arithmetic<int> : public true_type {};
template <> struct is_arithmetic<unsigned int> : public true_type {};
template <> struct is_arithmetic<long> : public true_type {};
template <> struct is_arithmetic<unsigned long> : public true_type {};
template <> struct is_arithmetic<long long> : public true_type {};
template <> struct is_arithmetic<unsigned long long> : public true_type {};
template <> struct is_arithmetic<float> : public true_type {};
template <> struct is_arithmetic<double> : public true_type {};
template<typename _Tp> struct is_floating_point : public false_type {};
template<> struct is_floating_point<float> : public true_type {};
template<> struct is_floating_point<double> : public true_type {};
template<> struct is_floating_point<long double> : public true_type {};
template <typename __T, typename __U> struct is_same : public false_type {};
template <typename __T> struct is_same<__T, __T> : public true_type {};
template<typename _Tp, bool = is_arithmetic<_Tp>::value>
struct is_signed : public false_type {};
template<typename _Tp>
struct is_signed<_Tp, true> : public true_or_false_type<_Tp(-1) < _Tp(0)> {};
// Wraps the clang __is_convertible_to intrinsic.
template <class _T1, class _T2> struct is_convertible
: public true_or_false_type<__is_convertible_to(_T1, _T2)> {};
template<typename _CharT> struct char_traits;
template<typename _CharT, typename _Traits = char_traits<_CharT>> class basic_istream;
template<typename _CharT, typename _Traits = char_traits<_CharT>> class basic_ostream;
typedef basic_istream<char> istream;
typedef basic_ostream<char> ostream;
// Wraps the clang __is_scalar intrinsic.
template <typename __T> struct is_scalar : public integral_constant<bool, __is_scalar(__T)> {};
}
  1715. namespace hip_impl {
  1716. inline
  1717. constexpr
  1718. unsigned int next_pot(unsigned int x) {
  1719. return 1u << (32u - __builtin_clz(x - 1u));
  1720. }
  1721. }
// Storage layer for HIP vector types: HIP_vector_type derives from the
// rank-specific specialization below. The anonymous union overlays the
// clang ext_vector with named .x/.y/.z/.w members.
template<typename T, unsigned int n> struct HIP_vector_base;
// Rank-1 specialization: single component, backed by a 1-lane ext_vector.
template<typename T>
struct HIP_vector_base<T, 1> {
using Native_vec_ = T __attribute__((ext_vector_type(1)));
union {
Native_vec_ data;
struct {
T x;
};
};
using value_type = T;
__attribute__((device))
HIP_vector_base() = default;
__attribute__((device))
explicit
constexpr
HIP_vector_base(T x_) noexcept : data{x_} {}
__attribute__((device))
constexpr
HIP_vector_base(const HIP_vector_base&) = default;
__attribute__((device))
constexpr
HIP_vector_base(HIP_vector_base&&) = default;
__attribute__((device))
~HIP_vector_base() = default;
__attribute__((device))
HIP_vector_base& operator=(const HIP_vector_base&) = default;
};
// Rank-2 specialization: two components (.x, .y) over a 2-lane ext_vector.
// The single-argument constructor broadcasts x_ to both lanes.
template<typename T>
struct HIP_vector_base<T, 2> {
using Native_vec_ = T __attribute__((ext_vector_type(2)));
union
{
Native_vec_ data;
struct {
T x;
T y;
};
};
using value_type = T;
__attribute__((device))
HIP_vector_base() = default;
__attribute__((device))
explicit
constexpr
HIP_vector_base(T x_) noexcept : data{x_, x_} {}
__attribute__((device))
constexpr
HIP_vector_base(T x_, T y_) noexcept : data{x_, y_} {}
__attribute__((device))
constexpr
HIP_vector_base(const HIP_vector_base&) = default;
__attribute__((device))
constexpr
HIP_vector_base(HIP_vector_base&&) = default;
__attribute__((device))
~HIP_vector_base() = default;
__attribute__((device))
HIP_vector_base& operator=(const HIP_vector_base&) = default;
};
// Rank-3 specialization. Unlike ranks 1/2/4 this does NOT use
// ext_vector_type(3); instead Native_vec_ is a hand-written wrapper over a
// plain T[3] that reimplements the compound-assignment and unary operators
// element-wise. NOTE(review): presumably this avoids the size-4 padding an
// ext_vector_type(3) would introduce — confirm against the vector-type ABI.
template<typename T>
struct HIP_vector_base<T, 3> {
struct Native_vec_ {
T d[3];
__attribute__((device))
Native_vec_() = default;
// Broadcast constructor: all three elements set to x_.
__attribute__((device))
explicit
constexpr
Native_vec_(T x_) noexcept : d{x_, x_, x_} {}
__attribute__((device))
constexpr
Native_vec_(T x_, T y_, T z_) noexcept : d{x_, y_, z_} {}
__attribute__((device))
constexpr
Native_vec_(const Native_vec_&) = default;
__attribute__((device))
constexpr
Native_vec_(Native_vec_&&) = default;
__attribute__((device))
~Native_vec_() = default;
__attribute__((device))
Native_vec_& operator=(const Native_vec_&) = default;
__attribute__((device))
Native_vec_& operator=(Native_vec_&&) = default;
// Element access mirrors ext_vector's operator[]; no bounds checking.
__attribute__((device))
T& operator[](unsigned int idx) noexcept { return d[idx]; }
__attribute__((device))
T operator[](unsigned int idx) const noexcept { return d[idx]; }
// Element-wise arithmetic compound assignments.
__attribute__((device))
Native_vec_& operator+=(const Native_vec_& x_) noexcept
{
for (auto i = 0u; i != 3u; ++i) d[i] += x_.d[i];
return *this;
}
__attribute__((device))
Native_vec_& operator-=(const Native_vec_& x_) noexcept
{
for (auto i = 0u; i != 3u; ++i) d[i] -= x_.d[i];
return *this;
}
__attribute__((device))
Native_vec_& operator*=(const Native_vec_& x_) noexcept
{
for (auto i = 0u; i != 3u; ++i) d[i] *= x_.d[i];
return *this;
}
__attribute__((device))
Native_vec_& operator/=(const Native_vec_& x_) noexcept
{
for (auto i = 0u; i != 3u; ++i) d[i] /= x_.d[i];
return *this;
}
// Unary minus: SFINAE-restricted to signed element types.
template<
typename U = T,
typename std::enable_if<std::is_signed<U>{}>::type* = nullptr>
__attribute__((device))
Native_vec_ operator-() const noexcept
{
auto r{*this};
for (auto&& x : r.d) x = -x;
return r;
}
// Bitwise operators: SFINAE-restricted to integral element types.
template<
typename U = T,
typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
__attribute__((device))
Native_vec_ operator~() const noexcept
{
auto r{*this};
for (auto&& x : r.d) x = ~x;
return r;
}
template<
typename U = T,
typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
__attribute__((device))
Native_vec_& operator%=(const Native_vec_& x_) noexcept
{
for (auto i = 0u; i != 3u; ++i) d[i] %= x_.d[i];
return *this;
}
template<
typename U = T,
typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
__attribute__((device))
Native_vec_& operator^=(const Native_vec_& x_) noexcept
{
for (auto i = 0u; i != 3u; ++i) d[i] ^= x_.d[i];
return *this;
}
template<
typename U = T,
typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
__attribute__((device))
Native_vec_& operator|=(const Native_vec_& x_) noexcept
{
for (auto i = 0u; i != 3u; ++i) d[i] |= x_.d[i];
return *this;
}
template<
typename U = T,
typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
__attribute__((device))
Native_vec_& operator&=(const Native_vec_& x_) noexcept
{
for (auto i = 0u; i != 3u; ++i) d[i] &= x_.d[i];
return *this;
}
template<
typename U = T,
typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
__attribute__((device))
Native_vec_& operator>>=(const Native_vec_& x_) noexcept
{
for (auto i = 0u; i != 3u; ++i) d[i] >>= x_.d[i];
return *this;
}
template<
typename U = T,
typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
__attribute__((device))
Native_vec_& operator<<=(const Native_vec_& x_) noexcept
{
for (auto i = 0u; i != 3u; ++i) d[i] <<= x_.d[i];
return *this;
}
// Comparison result type: a 4-lane int GCC-style vector (3 lanes used;
// the 4th is value-initialized to 0), matching ext_vector comparison shape.
using Vec3_cmp = int __attribute__((vector_size(4 * sizeof(int))));
__attribute__((device))
Vec3_cmp operator==(const Native_vec_& x_) const noexcept
{
return Vec3_cmp{d[0] == x_.d[0], d[1] == x_.d[1], d[2] == x_.d[2]};
}
};
union {
Native_vec_ data;
struct {
T x;
T y;
T z;
};
};
using value_type = T;
__attribute__((device))
HIP_vector_base() = default;
// Broadcast constructor: all three components set to x_.
__attribute__((device))
explicit
constexpr
HIP_vector_base(T x_) noexcept : data{x_, x_, x_} {}
__attribute__((device))
constexpr
HIP_vector_base(T x_, T y_, T z_) noexcept : data{x_, y_, z_} {}
__attribute__((device))
constexpr
HIP_vector_base(const HIP_vector_base&) = default;
__attribute__((device))
constexpr
HIP_vector_base(HIP_vector_base&&) = default;
__attribute__((device))
~HIP_vector_base() = default;
__attribute__((device))
HIP_vector_base& operator=(const HIP_vector_base&) = default;
// NOTE(review): rank 3 declares a defaulted move assignment while ranks
// 1/2/4 do not — presumably intentional, since Native_vec_ is a struct here.
__attribute__((device))
HIP_vector_base& operator=(HIP_vector_base&&) = default;
};
// Rank-4 specialization: four components (.x, .y, .z, .w) over a 4-lane
// ext_vector. The single-argument constructor broadcasts x_ to all lanes.
template<typename T>
struct HIP_vector_base<T, 4> {
using Native_vec_ = T __attribute__((ext_vector_type(4)));
union
{
Native_vec_ data;
struct {
T x;
T y;
T z;
T w;
};
};
using value_type = T;
__attribute__((device))
HIP_vector_base() = default;
__attribute__((device))
explicit
constexpr
HIP_vector_base(T x_) noexcept : data{x_, x_, x_, x_} {}
__attribute__((device))
constexpr
HIP_vector_base(T x_, T y_, T z_, T w_) noexcept : data{x_, y_, z_, w_} {}
__attribute__((device))
constexpr
HIP_vector_base(const HIP_vector_base&) = default;
__attribute__((device))
constexpr
HIP_vector_base(HIP_vector_base&&) = default;
__attribute__((device))
~HIP_vector_base() = default;
__attribute__((device))
HIP_vector_base& operator=(const HIP_vector_base&) = default;
};
// Public HIP short-vector wrapper (the type behind uchar4, float2, int3,
// ...). Inherits the lane storage (`data` plus the x/y/z/w accessors) from
// HIP_vector_base<T, rank> and layers element-wise arithmetic, bitwise,
// shift and compound-assignment operators on top. All member operators act
// lane-wise through the underlying Clang extended vector `data`.
template<typename T, unsigned int rank>
struct HIP_vector_type : public HIP_vector_base<T, rank> {
using HIP_vector_base<T, rank>::data;
using typename HIP_vector_base<T, rank>::Native_vec_;
__attribute__((device))
HIP_vector_type() = default;
// Broadcast constructor: any U convertible to T fills every lane.
template<
typename U,
typename std::enable_if<
std::is_convertible<U, T>::value>::type* = nullptr>
__attribute__((device))
explicit
constexpr
HIP_vector_type(U x_) noexcept
: HIP_vector_base<T, rank>{static_cast<T>(x_)}
{}
// Per-lane constructor: exactly `rank` arguments, each cast to T.
template<
typename... Us,
typename std::enable_if<
(rank > 1) && sizeof...(Us) == rank>::type* = nullptr>
__attribute__((device))
constexpr
HIP_vector_type(Us... xs) noexcept
: HIP_vector_base<T, rank>{static_cast<T>(xs)...}
{}
__attribute__((device))
constexpr
HIP_vector_type(const HIP_vector_type&) = default;
__attribute__((device))
constexpr
HIP_vector_type(HIP_vector_type&&) = default;
__attribute__((device))
~HIP_vector_type() = default;
__attribute__((device))
HIP_vector_type& operator=(const HIP_vector_type&) = default;
__attribute__((device))
HIP_vector_type& operator=(HIP_vector_type&&) = default;
// Pre-increment: adds 1 to every lane (via broadcast HIP_vector_type{1}).
__attribute__((device))
HIP_vector_type& operator++() noexcept
{
return *this += HIP_vector_type{1};
}
// Post-increment: returns the pre-increment value.
__attribute__((device))
HIP_vector_type operator++(int) noexcept
{
auto tmp(*this);
++*this;
return tmp;
}
// Pre-decrement: subtracts 1 from every lane.
__attribute__((device))
HIP_vector_type& operator--() noexcept
{
return *this -= HIP_vector_type{1};
}
// Post-decrement: returns the pre-decrement value.
__attribute__((device))
HIP_vector_type operator--(int) noexcept
{
auto tmp(*this);
--*this;
return tmp;
}
// Lane-wise compound addition on the native vector.
__attribute__((device))
HIP_vector_type& operator+=(const HIP_vector_type& x) noexcept
{
data += x.data;
return *this;
}
// Scalar form: broadcasts x to a vector, then delegates.
template<
typename U,
typename std::enable_if<
std::is_convertible<U, T>{}>::type* = nullptr>
__attribute__((device))
HIP_vector_type& operator+=(U x) noexcept
{
return *this += HIP_vector_type{x};
}
// Lane-wise compound subtraction.
__attribute__((device))
HIP_vector_type& operator-=(const HIP_vector_type& x) noexcept
{
data -= x.data;
return *this;
}
template<
typename U,
typename std::enable_if<
std::is_convertible<U, T>{}>::type* = nullptr>
__attribute__((device))
HIP_vector_type& operator-=(U x) noexcept
{
return *this -= HIP_vector_type{x};
}
// Lane-wise compound multiplication.
__attribute__((device))
HIP_vector_type& operator*=(const HIP_vector_type& x) noexcept
{
data *= x.data;
return *this;
}
// Hidden-friend vector*vector. NOTE(review): x is already a by-value
// copy; HIP_vector_type{ x } makes a second copy before *= — harmless
// but redundant.
friend __attribute__((device)) inline constexpr HIP_vector_type operator*(
HIP_vector_type x, const HIP_vector_type& y) noexcept
{
return HIP_vector_type{ x } *= y;
}
template<
typename U,
typename std::enable_if<
std::is_convertible<U, T>{}>::type* = nullptr>
__attribute__((device))
HIP_vector_type& operator*=(U x) noexcept
{
return *this *= HIP_vector_type{x};
}
// Hidden-friend vector/vector (same copy pattern as operator* above).
friend __attribute__((device)) inline constexpr HIP_vector_type operator/(
HIP_vector_type x, const HIP_vector_type& y) noexcept
{
return HIP_vector_type{ x } /= y;
}
// Lane-wise compound division.
__attribute__((device))
HIP_vector_type& operator/=(const HIP_vector_type& x) noexcept
{
data /= x.data;
return *this;
}
template<
typename U,
typename std::enable_if<
std::is_convertible<U, T>{}>::type* = nullptr>
__attribute__((device))
HIP_vector_type& operator/=(U x) noexcept
{
return *this /= HIP_vector_type{x};
}
// Unary minus: lane-wise negation, only for signed element types.
template<
typename U = T,
typename std::enable_if<std::is_signed<U>{}>::type* = nullptr>
__attribute__((device))
HIP_vector_type operator-() const noexcept
{
auto tmp(*this);
tmp.data = -tmp.data;
return tmp;
}
// Bitwise NOT: lane-wise, integral element types only.
template<
typename U = T,
typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
__attribute__((device))
HIP_vector_type operator~() const noexcept
{
HIP_vector_type r{*this};
r.data = ~r.data;
return r;
}
// Remaining compound operators (%=, ^=, |=, &=, >>=, <<=) are lane-wise
// and SFINAE-restricted to integral element types.
template<
typename U = T,
typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
__attribute__((device))
HIP_vector_type& operator%=(const HIP_vector_type& x) noexcept
{
data %= x.data;
return *this;
}
template<
typename U = T,
typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
__attribute__((device))
HIP_vector_type& operator^=(const HIP_vector_type& x) noexcept
{
data ^= x.data;
return *this;
}
template<
typename U = T,
typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
__attribute__((device))
HIP_vector_type& operator|=(const HIP_vector_type& x) noexcept
{
data |= x.data;
return *this;
}
template<
typename U = T,
typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
__attribute__((device))
HIP_vector_type& operator&=(const HIP_vector_type& x) noexcept
{
data &= x.data;
return *this;
}
template<
typename U = T,
typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
__attribute__((device))
HIP_vector_type& operator>>=(const HIP_vector_type& x) noexcept
{
data >>= x.data;
return *this;
}
template<
typename U = T,
typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
__attribute__((device))
HIP_vector_type& operator<<=(const HIP_vector_type& x) noexcept
{
data <<= x.data;
return *this;
}
};
// Non-member element-wise addition and subtraction. The mixed
// scalar/vector overloads broadcast the scalar through HIP_vector_type's
// converting constructor; all forms delegate to the compound-assignment
// members, so semantics are exactly the member operators' lane-wise ones.
template<typename T, unsigned int n>
__attribute__((device))
inline
constexpr
HIP_vector_type<T, n> operator+(
const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
{
return HIP_vector_type<T, n>{x} += y;
}
// vector + scalar
template<typename T, unsigned int n, typename U>
__attribute__((device))
inline
constexpr
HIP_vector_type<T, n> operator+(
const HIP_vector_type<T, n>& x, U y) noexcept
{
return HIP_vector_type<T, n>{x} += HIP_vector_type<T, n>{y};
}
// scalar + vector
template<typename T, unsigned int n, typename U>
__attribute__((device))
inline
constexpr
HIP_vector_type<T, n> operator+(
U x, const HIP_vector_type<T, n>& y) noexcept
{
return HIP_vector_type<T, n>{x} += y;
}
// vector - vector
template<typename T, unsigned int n>
__attribute__((device))
inline
constexpr
HIP_vector_type<T, n> operator-(
const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
{
return HIP_vector_type<T, n>{x} -= y;
}
// vector - scalar
template<typename T, unsigned int n, typename U>
__attribute__((device))
inline
constexpr
HIP_vector_type<T, n> operator-(
const HIP_vector_type<T, n>& x, U y) noexcept
{
return HIP_vector_type<T, n>{x} -= HIP_vector_type<T, n>{y};
}
// scalar - vector: broadcast x, then subtract y lane-wise.
template<typename T, unsigned int n, typename U>
__attribute__((device))
inline
constexpr
HIP_vector_type<T, n> operator-(
U x, const HIP_vector_type<T, n>& y) noexcept
{
return HIP_vector_type<T, n>{x} -= y;
}
// Mixed scalar/vector multiplication and division. The vector-vector
// forms of * and / are hidden friends inside HIP_vector_type; only the
// scalar-broadcast overloads appear here.
template<typename T, unsigned int n, typename U>
__attribute__((device))
inline
constexpr
HIP_vector_type<T, n> operator*(
const HIP_vector_type<T, n>& x, U y) noexcept
{
return HIP_vector_type<T, n>{x} *= HIP_vector_type<T, n>{y};
}
// scalar * vector
template<typename T, unsigned int n, typename U>
__attribute__((device))
inline
constexpr
HIP_vector_type<T, n> operator*(
U x, const HIP_vector_type<T, n>& y) noexcept
{
return HIP_vector_type<T, n>{x} *= y;
}
// vector / scalar
template<typename T, unsigned int n, typename U>
__attribute__((device))
inline
constexpr
HIP_vector_type<T, n> operator/(
const HIP_vector_type<T, n>& x, U y) noexcept
{
return HIP_vector_type<T, n>{x} /= HIP_vector_type<T, n>{y};
}
// scalar / vector: broadcast x, then divide lane-wise by y.
template<typename T, unsigned int n, typename U>
__attribute__((device))
inline
constexpr
HIP_vector_type<T, n> operator/(
U x, const HIP_vector_type<T, n>& y) noexcept
{
return HIP_vector_type<T, n>{x} /= y;
}
// Recursive constexpr scan over lanes [0, n] of an extended-vector mask.
// NOTE(review): despite the name, this returns true only when NO element
// in [0, n] is zero ("all non-zero"); it returns false as soon as some
// x[n] == 0. operator== below relies on that: a lane-wise == on extended
// vectors produces a non-zero mask value per equal lane, so "all lanes
// non-zero" means "all lanes equal". Recursion (rather than a loop) keeps
// the function C++11-constexpr-valid.
template<typename V>
__attribute__((device))
inline
constexpr
bool _hip_any_zero(const V& x, int n) noexcept
{
return
(n == -1) ? true : ((x[n] == 0) ? false : _hip_any_zero(x, n - 1));
}
// Equality: all lanes must compare equal. The lane-wise `==` on the
// native vectors yields a per-lane mask, and _hip_any_zero (see its note
// above: it actually tests "all non-zero") confirms every lane matched.
template<typename T, unsigned int n>
__attribute__((device))
inline
constexpr
bool operator==(
const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
{
return _hip_any_zero(x.data == y.data, n - 1);
}
// vector == scalar: scalar is broadcast, then compared lane-wise.
template<typename T, unsigned int n, typename U>
__attribute__((device))
inline
constexpr
bool operator==(const HIP_vector_type<T, n>& x, U y) noexcept
{
return x == HIP_vector_type<T, n>{y};
}
// scalar == vector
template<typename T, unsigned int n, typename U>
__attribute__((device))
inline
constexpr
bool operator==(U x, const HIP_vector_type<T, n>& y) noexcept
{
return HIP_vector_type<T, n>{x} == y;
}
// Inequality overloads: simple negation of the matching operator==.
template<typename T, unsigned int n>
__attribute__((device))
inline
constexpr
bool operator!=(
const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
{
return !(x == y);
}
template<typename T, unsigned int n, typename U>
__attribute__((device))
inline
constexpr
bool operator!=(const HIP_vector_type<T, n>& x, U y) noexcept
{
return !(x == y);
}
template<typename T, unsigned int n, typename U>
__attribute__((device))
inline
constexpr
bool operator!=(U x, const HIP_vector_type<T, n>& y) noexcept
{
return !(x == y);
}
// Integral-only element-wise %, ^, |, & — each with vector/vector,
// vector/scalar and scalar/vector overloads that delegate to the
// corresponding compound-assignment member (scalars are broadcast first).
// NOTE(review): the constraint `typename std::enable_if<...>* = nullptr`
// omits `::type`; a pointer to the enable_if struct itself is well-formed
// for either condition value, so this SFINAE guard may be vacuous. Misuse
// on non-integral T is still rejected by the constrained member operator
// it delegates to — confirm against upstream intent before tightening.
template<
typename T,
unsigned int n,
typename std::enable_if<std::is_integral<T>{}>* = nullptr>
__attribute__((device))
inline
constexpr
HIP_vector_type<T, n> operator%(
const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
{
return HIP_vector_type<T, n>{x} %= y;
}
// vector % scalar
template<
typename T,
unsigned int n,
typename U,
typename std::enable_if<std::is_integral<T>{}>* = nullptr>
__attribute__((device))
inline
constexpr
HIP_vector_type<T, n> operator%(
const HIP_vector_type<T, n>& x, U y) noexcept
{
return HIP_vector_type<T, n>{x} %= HIP_vector_type<T, n>{y};
}
// scalar % vector
template<
typename T,
unsigned int n,
typename U,
typename std::enable_if<std::is_integral<T>{}>* = nullptr>
__attribute__((device))
inline
constexpr
HIP_vector_type<T, n> operator%(
U x, const HIP_vector_type<T, n>& y) noexcept
{
return HIP_vector_type<T, n>{x} %= y;
}
// vector ^ vector
template<
typename T,
unsigned int n,
typename std::enable_if<std::is_integral<T>{}>* = nullptr>
__attribute__((device))
inline
constexpr
HIP_vector_type<T, n> operator^(
const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
{
return HIP_vector_type<T, n>{x} ^= y;
}
// vector ^ scalar
template<
typename T,
unsigned int n,
typename U,
typename std::enable_if<std::is_integral<T>{}>* = nullptr>
__attribute__((device))
inline
constexpr
HIP_vector_type<T, n> operator^(
const HIP_vector_type<T, n>& x, U y) noexcept
{
return HIP_vector_type<T, n>{x} ^= HIP_vector_type<T, n>{y};
}
// scalar ^ vector
template<
typename T,
unsigned int n,
typename U,
typename std::enable_if<std::is_integral<T>{}>* = nullptr>
__attribute__((device))
inline
constexpr
HIP_vector_type<T, n> operator^(
U x, const HIP_vector_type<T, n>& y) noexcept
{
return HIP_vector_type<T, n>{x} ^= y;
}
// vector | vector
template<
typename T,
unsigned int n,
typename std::enable_if<std::is_integral<T>{}>* = nullptr>
__attribute__((device))
inline
constexpr
HIP_vector_type<T, n> operator|(
const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
{
return HIP_vector_type<T, n>{x} |= y;
}
// vector | scalar
template<
typename T,
unsigned int n,
typename U,
typename std::enable_if<std::is_integral<T>{}>* = nullptr>
__attribute__((device))
inline
constexpr
HIP_vector_type<T, n> operator|(
const HIP_vector_type<T, n>& x, U y) noexcept
{
return HIP_vector_type<T, n>{x} |= HIP_vector_type<T, n>{y};
}
// scalar | vector
template<
typename T,
unsigned int n,
typename U,
typename std::enable_if<std::is_integral<T>{}>* = nullptr>
__attribute__((device))
inline
constexpr
HIP_vector_type<T, n> operator|(
U x, const HIP_vector_type<T, n>& y) noexcept
{
return HIP_vector_type<T, n>{x} |= y;
}
// vector & vector
template<
typename T,
unsigned int n,
typename std::enable_if<std::is_integral<T>{}>* = nullptr>
__attribute__((device))
inline
constexpr
HIP_vector_type<T, n> operator&(
const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
{
return HIP_vector_type<T, n>{x} &= y;
}
// vector & scalar
template<
typename T,
unsigned int n,
typename U,
typename std::enable_if<std::is_integral<T>{}>* = nullptr>
__attribute__((device))
inline
constexpr
HIP_vector_type<T, n> operator&(
const HIP_vector_type<T, n>& x, U y) noexcept
{
return HIP_vector_type<T, n>{x} &= HIP_vector_type<T, n>{y};
}
// scalar & vector
template<
typename T,
unsigned int n,
typename U,
typename std::enable_if<std::is_integral<T>{}>* = nullptr>
__attribute__((device))
inline
constexpr
HIP_vector_type<T, n> operator&(
U x, const HIP_vector_type<T, n>& y) noexcept
{
return HIP_vector_type<T, n>{x} &= y;
}
// Integral-only element-wise shifts, delegating to the compound-assignment
// members (scalars are broadcast first). The same note as for %/^/|/&
// applies: the `std::enable_if<...>*` constraint lacks `::type` and may
// be vacuous; the member operators' own constraints still apply.
template<
typename T,
unsigned int n,
typename std::enable_if<std::is_integral<T>{}>* = nullptr>
__attribute__((device))
inline
constexpr
HIP_vector_type<T, n> operator>>(
const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
{
return HIP_vector_type<T, n>{x} >>= y;
}
// vector >> scalar
template<
typename T,
unsigned int n,
typename U,
typename std::enable_if<std::is_integral<T>{}>* = nullptr>
__attribute__((device))
inline
constexpr
HIP_vector_type<T, n> operator>>(
const HIP_vector_type<T, n>& x, U y) noexcept
{
return HIP_vector_type<T, n>{x} >>= HIP_vector_type<T, n>{y};
}
// scalar >> vector
template<
typename T,
unsigned int n,
typename U,
typename std::enable_if<std::is_integral<T>{}>* = nullptr>
__attribute__((device))
inline
constexpr
HIP_vector_type<T, n> operator>>(
U x, const HIP_vector_type<T, n>& y) noexcept
{
return HIP_vector_type<T, n>{x} >>= y;
}
// vector << vector
template<
typename T,
unsigned int n,
typename std::enable_if<std::is_integral<T>{}>* = nullptr>
__attribute__((device))
inline
constexpr
HIP_vector_type<T, n> operator<<(
const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
{
return HIP_vector_type<T, n>{x} <<= y;
}
// vector << scalar
template<
typename T,
unsigned int n,
typename U,
typename std::enable_if<std::is_integral<T>{}>* = nullptr>
__attribute__((device))
inline
constexpr
HIP_vector_type<T, n> operator<<(
const HIP_vector_type<T, n>& x, U y) noexcept
{
return HIP_vector_type<T, n>{x} <<= HIP_vector_type<T, n>{y};
}
  2551. template<
  2552. typename T,
  2553. unsigned int n,
  2554. typename U,
  2555. typename std::enable_if<std::is_arithmetic<U>::value>::type,
  2556. typename std::enable_if<std::is_integral<T>{}>* = nullptr>
  2557. __attribute__((device))
  2558. inline
  2559. constexpr
  2560. HIP_vector_type<T, n> operator<<(
  2561. U x, const HIP_vector_type<T, n>& y) noexcept
  2562. {
  2563. return HIP_vector_type<T, n>{x} <<= y;
  2564. }
// __hipMapVector: converts a HIP vector of (U, rankU) into one of
// (T, rankT), static_cast-ing each copied lane and zero-filling any
// destination lanes the source does not provide. Overload selection is by
// the (rankT, rankU) pair via enable_if. NOTE(review): no overload here
// targets rankT == 3; presumably 3-vectors are handled elsewhere — confirm
// against the callers. The trailing semicolons after each body are stray
// but harmless at namespace scope.
template <typename T, unsigned int rankT, typename U, unsigned int rankU>
inline __attribute__((always_inline)) __attribute__((device)) typename std::enable_if<(rankT == 1 && rankU >= 1),
const HIP_vector_type<T, rankT>>::type
__hipMapVector(const HIP_vector_type<U, rankU>& u) {
return HIP_vector_type<T, rankT>(static_cast<T>(u.x));
};
// rank 1 -> rank 2: second lane zero-filled.
template <typename T, unsigned int rankT, typename U, unsigned int rankU>
inline __attribute__((always_inline)) __attribute__((device)) typename std::enable_if<(rankT == 2 && rankU == 1),
const HIP_vector_type<T, rankT>>::type
__hipMapVector(const HIP_vector_type<U, rankU>& u) {
return HIP_vector_type<T, rankT> (static_cast<T>(u.x), static_cast<T>(0));
};
// rank >= 2 -> rank 2: copy x and y.
template <typename T, unsigned int rankT, typename U, unsigned int rankU>
inline __attribute__((always_inline)) __attribute__((device)) typename std::enable_if<(rankT == 2 && rankU >= 2),
const HIP_vector_type<T, rankT>>::type
__hipMapVector(const HIP_vector_type<U, rankU>& u) {
return HIP_vector_type<T, rankT> (static_cast<T>(u.x), static_cast<T>(u.y));
};
// rank 1 -> rank 4: lanes y, z, w zero-filled.
template <typename T, unsigned int rankT, typename U, unsigned int rankU>
inline __attribute__((always_inline)) __attribute__((device)) typename std::enable_if<(rankT == 4 && rankU == 1),
const HIP_vector_type<T, rankT>>::type
__hipMapVector(const HIP_vector_type<U, rankU>& u) {
return HIP_vector_type<T, rankT> (static_cast<T>(u.x), static_cast<T>(0),
static_cast<T>(0), static_cast<T>(0));
};
// rank 2 -> rank 4: lanes z, w zero-filled.
template <typename T, unsigned int rankT, typename U, unsigned int rankU>
inline __attribute__((always_inline)) __attribute__((device)) typename std::enable_if<(rankT == 4 && rankU == 2),
const HIP_vector_type<T, rankT>>::type
__hipMapVector(const HIP_vector_type<U, rankU>& u) {
return HIP_vector_type<T, rankT>(static_cast<T>(u.x), static_cast<T>(u.y),
static_cast<T>(0), static_cast<T>(0));
};
// rank 4 -> rank 4: full copy of all four lanes.
template <typename T, unsigned int rankT, typename U, unsigned int rankU>
inline __attribute__((always_inline)) __attribute__((device)) typename std::enable_if<(rankT == 4 && rankU == 4),
const HIP_vector_type<T, rankT>>::type
__hipMapVector(const HIP_vector_type<U, rankU>& u) {
return HIP_vector_type<T, rankT> (static_cast<T>(u.x), static_cast<T>(u.y),
static_cast<T>(u.z), static_cast<T>(u.w));
};
  2604. # 1135 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_vector_types.h" 3
// CUDA-compatible short-vector aliases: each is an instantiation of
// HIP_vector_type<T, rank> for rank 1..4 over the standard element types.
using uchar1 = HIP_vector_type<unsigned char, 1>; using uchar2 = HIP_vector_type<unsigned char, 2>; using uchar3 = HIP_vector_type<unsigned char, 3>; using uchar4 = HIP_vector_type<unsigned char, 4>;;
using char1 = HIP_vector_type<char, 1>; using char2 = HIP_vector_type<char, 2>; using char3 = HIP_vector_type<char, 3>; using char4 = HIP_vector_type<char, 4>;;
using ushort1 = HIP_vector_type<unsigned short, 1>; using ushort2 = HIP_vector_type<unsigned short, 2>; using ushort3 = HIP_vector_type<unsigned short, 3>; using ushort4 = HIP_vector_type<unsigned short, 4>;;
using short1 = HIP_vector_type<short, 1>; using short2 = HIP_vector_type<short, 2>; using short3 = HIP_vector_type<short, 3>; using short4 = HIP_vector_type<short, 4>;;
using uint1 = HIP_vector_type<unsigned int, 1>; using uint2 = HIP_vector_type<unsigned int, 2>; using uint3 = HIP_vector_type<unsigned int, 3>; using uint4 = HIP_vector_type<unsigned int, 4>;;
using int1 = HIP_vector_type<int, 1>; using int2 = HIP_vector_type<int, 2>; using int3 = HIP_vector_type<int, 3>; using int4 = HIP_vector_type<int, 4>;;
using ulong1 = HIP_vector_type<unsigned long, 1>; using ulong2 = HIP_vector_type<unsigned long, 2>; using ulong3 = HIP_vector_type<unsigned long, 3>; using ulong4 = HIP_vector_type<unsigned long, 4>;;
using long1 = HIP_vector_type<long, 1>; using long2 = HIP_vector_type<long, 2>; using long3 = HIP_vector_type<long, 3>; using long4 = HIP_vector_type<long, 4>;;
using ulonglong1 = HIP_vector_type<unsigned long long, 1>; using ulonglong2 = HIP_vector_type<unsigned long long, 2>; using ulonglong3 = HIP_vector_type<unsigned long long, 3>; using ulonglong4 = HIP_vector_type<unsigned long long, 4>;;
using longlong1 = HIP_vector_type<long long, 1>; using longlong2 = HIP_vector_type<long long, 2>; using longlong3 = HIP_vector_type<long long, 3>; using longlong4 = HIP_vector_type<long long, 4>;;
using float1 = HIP_vector_type<float, 1>; using float2 = HIP_vector_type<float, 2>; using float3 = HIP_vector_type<float, 3>; using float4 = HIP_vector_type<float, 4>;;
using double1 = HIP_vector_type<double, 1>; using double2 = HIP_vector_type<double, 2>; using double3 = HIP_vector_type<double, 3>; using double4 = HIP_vector_type<double, 4>;;
  2617. # 2117 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_vector_types.h" 3
  2618. static inline __attribute__((device)) uchar1 make_uchar1(unsigned char x) { uchar1 r{x}; return r; };
  2619. static inline __attribute__((device)) uchar2 make_uchar2(unsigned char x, unsigned char y) { uchar2 r{x, y}; return r; };
  2620. static inline __attribute__((device)) uchar3 make_uchar3(unsigned char x, unsigned char y, unsigned char z) { uchar3 r{x, y, z}; return r; };
  2621. static inline __attribute__((device)) uchar4 make_uchar4(unsigned char x, unsigned char y, unsigned char z, unsigned char w) { uchar4 r{x, y, z, w}; return r; };
  2622. static inline __attribute__((device)) char1 make_char1(signed char x) { char1 r{x}; return r; };
  2623. static inline __attribute__((device)) char2 make_char2(signed char x, signed char y) { char2 r{x, y}; return r; };
  2624. static inline __attribute__((device)) char3 make_char3(signed char x, signed char y, signed char z) { char3 r{x, y, z}; return r; };
  2625. static inline __attribute__((device)) char4 make_char4(signed char x, signed char y, signed char z, signed char w) { char4 r{x, y, z, w}; return r; };
  2626. static inline __attribute__((device)) ushort1 make_ushort1(unsigned short x) { ushort1 r{x}; return r; };
  2627. static inline __attribute__((device)) ushort2 make_ushort2(unsigned short x, unsigned short y) { ushort2 r{x, y}; return r; };
  2628. static inline __attribute__((device)) ushort3 make_ushort3(unsigned short x, unsigned short y, unsigned short z) { ushort3 r{x, y, z}; return r; };
  2629. static inline __attribute__((device)) ushort4 make_ushort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w) { ushort4 r{x, y, z, w}; return r; };
  2630. static inline __attribute__((device)) short1 make_short1(signed short x) { short1 r{x}; return r; };
  2631. static inline __attribute__((device)) short2 make_short2(signed short x, signed short y) { short2 r{x, y}; return r; };
  2632. static inline __attribute__((device)) short3 make_short3(signed short x, signed short y, signed short z) { short3 r{x, y, z}; return r; };
  2633. static inline __attribute__((device)) short4 make_short4(signed short x, signed short y, signed short z, signed short w) { short4 r{x, y, z, w}; return r; };
  2634. static inline __attribute__((device)) uint1 make_uint1(unsigned int x) { uint1 r{x}; return r; };
  2635. static inline __attribute__((device)) uint2 make_uint2(unsigned int x, unsigned int y) { uint2 r{x, y}; return r; };
  2636. static inline __attribute__((device)) uint3 make_uint3(unsigned int x, unsigned int y, unsigned int z) { uint3 r{x, y, z}; return r; };
  2637. static inline __attribute__((device)) uint4 make_uint4(unsigned int x, unsigned int y, unsigned int z, unsigned int w) { uint4 r{x, y, z, w}; return r; };
  2638. static inline __attribute__((device)) int1 make_int1(signed int x) { int1 r{x}; return r; };
  2639. static inline __attribute__((device)) int2 make_int2(signed int x, signed int y) { int2 r{x, y}; return r; };
  2640. static inline __attribute__((device)) int3 make_int3(signed int x, signed int y, signed int z) { int3 r{x, y, z}; return r; };
  2641. static inline __attribute__((device)) int4 make_int4(signed int x, signed int y, signed int z, signed int w) { int4 r{x, y, z, w}; return r; };
  2642. static inline __attribute__((device)) float1 make_float1(float x) { float1 r{x}; return r; };
  2643. static inline __attribute__((device)) float2 make_float2(float x, float y) { float2 r{x, y}; return r; };
  2644. static inline __attribute__((device)) float3 make_float3(float x, float y, float z) { float3 r{x, y, z}; return r; };
  2645. static inline __attribute__((device)) float4 make_float4(float x, float y, float z, float w) { float4 r{x, y, z, w}; return r; };
  2646. static inline __attribute__((device)) double1 make_double1(double x) { double1 r{x}; return r; };
  2647. static inline __attribute__((device)) double2 make_double2(double x, double y) { double2 r{x, y}; return r; };
  2648. static inline __attribute__((device)) double3 make_double3(double x, double y, double z) { double3 r{x, y, z}; return r; };
  2649. static inline __attribute__((device)) double4 make_double4(double x, double y, double z, double w) { double4 r{x, y, z, w}; return r; };
  2650. static inline __attribute__((device)) ulong1 make_ulong1(unsigned long x) { ulong1 r{x}; return r; };
  2651. static inline __attribute__((device)) ulong2 make_ulong2(unsigned long x, unsigned long y) { ulong2 r{x, y}; return r; };
  2652. static inline __attribute__((device)) ulong3 make_ulong3(unsigned long x, unsigned long y, unsigned long z) { ulong3 r{x, y, z}; return r; };
  2653. static inline __attribute__((device)) ulong4 make_ulong4(unsigned long x, unsigned long y, unsigned long z, unsigned long w) { ulong4 r{x, y, z, w}; return r; };
  2654. static inline __attribute__((device)) long1 make_long1(signed long x) { long1 r{x}; return r; };
  2655. static inline __attribute__((device)) long2 make_long2(signed long x, signed long y) { long2 r{x, y}; return r; };
  2656. static inline __attribute__((device)) long3 make_long3(signed long x, signed long y, signed long z) { long3 r{x, y, z}; return r; };
  2657. static inline __attribute__((device)) long4 make_long4(signed long x, signed long y, signed long z, signed long w) { long4 r{x, y, z, w}; return r; };
  2658. static inline __attribute__((device)) ulonglong1 make_ulonglong1(unsigned long long x) { ulonglong1 r{x}; return r; };
  2659. static inline __attribute__((device)) ulonglong2 make_ulonglong2(unsigned long long x, unsigned long long y) { ulonglong2 r{x, y}; return r; };
  2660. static inline __attribute__((device)) ulonglong3 make_ulonglong3(unsigned long long x, unsigned long long y, unsigned long long z) { ulonglong3 r{x, y, z}; return r; };
  2661. static inline __attribute__((device)) ulonglong4 make_ulonglong4(unsigned long long x, unsigned long long y, unsigned long long z, unsigned long long w) { ulonglong4 r{x, y, z, w}; return r; };
  2662. static inline __attribute__((device)) longlong1 make_longlong1(signed long long x) { longlong1 r{x}; return r; };
  2663. static inline __attribute__((device)) longlong2 make_longlong2(signed long long x, signed long long y) { longlong2 r{x, y}; return r; };
  2664. static inline __attribute__((device)) longlong3 make_longlong3(signed long long x, signed long long y, signed long long z) { longlong3 r{x, y, z}; return r; };
  2665. static inline __attribute__((device)) longlong4 make_longlong4(signed long long x, signed long long y, signed long long z, signed long long w) { longlong4 r{x, y, z, w}; return r; };
  2666. # 28 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/hip_ldg.h" 2 3
  2667. __attribute__((device)) inline static char __ldg(const char* ptr) { return *ptr; }
  2668. __attribute__((device)) inline static char2 __ldg(const char2* ptr) { return *ptr; }
  2669. __attribute__((device)) inline static char4 __ldg(const char4* ptr) { return *ptr; }
  2670. __attribute__((device)) inline static signed char __ldg(const signed char* ptr) { return ptr[0]; }
  2671. __attribute__((device)) inline static unsigned char __ldg(const unsigned char* ptr) { return ptr[0]; }
  2672. __attribute__((device)) inline static short __ldg(const short* ptr) { return ptr[0]; }
  2673. __attribute__((device)) inline static short2 __ldg(const short2* ptr) { return ptr[0]; }
  2674. __attribute__((device)) inline static short4 __ldg(const short4* ptr) { return ptr[0]; }
  2675. __attribute__((device)) inline static unsigned short __ldg(const unsigned short* ptr) { return ptr[0]; }
  2676. __attribute__((device)) inline static int __ldg(const int* ptr) { return ptr[0]; }
  2677. __attribute__((device)) inline static int2 __ldg(const int2* ptr) { return ptr[0]; }
  2678. __attribute__((device)) inline static int4 __ldg(const int4* ptr) { return ptr[0]; }
  2679. __attribute__((device)) inline static unsigned int __ldg(const unsigned int* ptr) { return ptr[0]; }
  2680. __attribute__((device)) inline static long __ldg(const long* ptr) { return ptr[0]; }
  2681. __attribute__((device)) inline static unsigned long __ldg(const unsigned long* ptr) { return ptr[0]; }
  2682. __attribute__((device)) inline static long long __ldg(const long long* ptr) { return ptr[0]; }
  2683. __attribute__((device)) inline static longlong2 __ldg(const longlong2* ptr) { return ptr[0]; }
  2684. __attribute__((device)) inline static unsigned long long __ldg(const unsigned long long* ptr) { return ptr[0]; }
  2685. __attribute__((device)) inline static uchar2 __ldg(const uchar2* ptr) { return ptr[0]; }
  2686. __attribute__((device)) inline static uchar4 __ldg(const uchar4* ptr) { return ptr[0]; }
  2687. __attribute__((device)) inline static ushort2 __ldg(const ushort2* ptr) { return ptr[0]; }
  2688. __attribute__((device)) inline static uint2 __ldg(const uint2* ptr) { return ptr[0]; }
  2689. __attribute__((device)) inline static uint4 __ldg(const uint4* ptr) { return ptr[0]; }
  2690. __attribute__((device)) inline static ulonglong2 __ldg(const ulonglong2* ptr) { return ptr[0]; }
  2691. __attribute__((device)) inline static float __ldg(const float* ptr) { return ptr[0]; }
  2692. __attribute__((device)) inline static float2 __ldg(const float2* ptr) { return ptr[0]; }
  2693. __attribute__((device)) inline static float4 __ldg(const float4* ptr) { return ptr[0]; }
  2694. __attribute__((device)) inline static double __ldg(const double* ptr) { return ptr[0]; }
  2695. __attribute__((device)) inline static double2 __ldg(const double2* ptr) { return ptr[0]; }
  2696. # 125 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_runtime.h" 2 3
  2697. # 250 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_runtime.h" 3
// OCKL device-library entry points that back the HIP coordinate builtins
// (threadIdx / blockIdx / blockDim / gridDim). The argument selects the
// dimension: 0 = x, 1 = y, 2 = z (see the functors below).
// __attribute__((const)): result depends only on the argument, so calls
// may be CSE'd by the compiler.
extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_local_id(unsigned int);
extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_group_id(unsigned int);
extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_local_size(unsigned int);
extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_num_groups(unsigned int);
// Functor: dimension index -> workgroup id (backs blockIdx).
struct __HIP_BlockIdx {
__attribute__((device))
std::uint32_t operator()(std::uint32_t x) const noexcept { return __ockl_get_group_id(x); }
};
// Functor: dimension index -> workgroup size (backs blockDim).
struct __HIP_BlockDim {
__attribute__((device))
std::uint32_t operator()(std::uint32_t x) const noexcept {
return __ockl_get_local_size(x);
}
};
// Functor: dimension index -> number of workgroups (backs gridDim).
struct __HIP_GridDim {
__attribute__((device))
std::uint32_t operator()(std::uint32_t x) const noexcept {
return __ockl_get_num_groups(x);
}
};
// Functor: dimension index -> workitem id within the group (backs threadIdx).
struct __HIP_ThreadIdx {
__attribute__((device))
std::uint32_t operator()(std::uint32_t x) const noexcept {
return __ockl_get_local_id(x);
}
};
// CUDA-compatible dim3: three 32-bit extents, each defaulting to 1 so that
// dim3(n) means an n x 1 x 1 launch shape.
typedef struct dim3 {
uint32_t x;
uint32_t y;
uint32_t z;
constexpr __attribute__((device)) dim3(uint32_t _x = 1, uint32_t _y = 1, uint32_t _z = 1) : x(_x), y(_y), z(_z){};
} dim3;
extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_global_size(unsigned int);
// Lazy coordinate wrapper: threadIdx/blockIdx/blockDim/gridDim are objects
// whose .x/.y/.z members only call into OCKL when actually read (via the
// conversion operator to R). F is one of the functor types above.
template <typename F> struct __HIP_Coordinates {
// R is whatever the functor returns (std::uint32_t for all four functors).
using R = decltype(F{}(0));
struct __X {
__attribute__((device)) operator R() const noexcept { return F{}(0); }
// NOTE(review): intentionally does not mutate anything — the coordinate is
// read-only, so `+=` just yields the sum F{}(0) + rhs.
__attribute__((device)) R operator+=(const R& rhs) { return F{}(0) + rhs; }
};
struct __Y {
__attribute__((device)) operator R() const noexcept { return F{}(1); }
__attribute__((device)) R operator+=(const R& rhs) { return F{}(1) + rhs; }
};
struct __Z {
__attribute__((device)) operator R() const noexcept { return F{}(2); }
__attribute__((device)) R operator+=(const R& rhs) { return F{}(2) + rhs; }
};
// Weak so every translation unit can carry a definition without duplicate
// symbol errors at link time.
__attribute__((weak))
__attribute__((device)) static constexpr __X x{};
__attribute__((weak))
__attribute__((device)) static constexpr __Y y{};
__attribute__((weak))
__attribute__((device)) static constexpr __Z z{};
// Allow e.g. dim3 d = blockDim; — reads all three coordinates.
__attribute__((device)) operator dim3() const { return dim3(x, y, z); }
};
// Out-of-line definitions for the static constexpr members (needed when the
// members are odr-used, prior to C++17 implicit inline variables).
template <typename F>
constexpr typename __HIP_Coordinates<F>::__X __HIP_Coordinates<F>::x;
template <typename F>
constexpr typename __HIP_Coordinates<F>::__Y __HIP_Coordinates<F>::y;
template <typename F>
constexpr typename __HIP_Coordinates<F>::__Z __HIP_Coordinates<F>::z;
// Fast-path fusions: gridDim.<d> * blockDim.<d> (in either operand order)
// is computed with a single __ockl_get_global_size(<d>) call instead of two
// OCKL queries and a multiply. One overload pair per dimension.
inline
__attribute__((device))
std::uint32_t operator*(__HIP_Coordinates<__HIP_GridDim>::__X,
__HIP_Coordinates<__HIP_BlockDim>::__X) noexcept {
return __ockl_get_global_size(0);
}
inline
__attribute__((device))
std::uint32_t operator*(__HIP_Coordinates<__HIP_BlockDim>::__X,
__HIP_Coordinates<__HIP_GridDim>::__X) noexcept {
return __ockl_get_global_size(0);
}
inline
__attribute__((device))
std::uint32_t operator*(__HIP_Coordinates<__HIP_GridDim>::__Y,
__HIP_Coordinates<__HIP_BlockDim>::__Y) noexcept {
return __ockl_get_global_size(1);
}
inline
__attribute__((device))
std::uint32_t operator*(__HIP_Coordinates<__HIP_BlockDim>::__Y,
__HIP_Coordinates<__HIP_GridDim>::__Y) noexcept {
return __ockl_get_global_size(1);
}
inline
__attribute__((device))
std::uint32_t operator*(__HIP_Coordinates<__HIP_GridDim>::__Z,
__HIP_Coordinates<__HIP_BlockDim>::__Z) noexcept {
return __ockl_get_global_size(2);
}
inline
__attribute__((device))
std::uint32_t operator*(__HIP_Coordinates<__HIP_BlockDim>::__Z,
__HIP_Coordinates<__HIP_GridDim>::__Z) noexcept {
return __ockl_get_global_size(2);
}
// The global coordinate objects themselves; reading .x/.y/.z on any of them
// dispatches through the functor to the matching OCKL intrinsic.
static constexpr __HIP_Coordinates<__HIP_BlockDim> blockDim{};
static constexpr __HIP_Coordinates<__HIP_BlockIdx> blockIdx{};
static constexpr __HIP_Coordinates<__HIP_GridDim> gridDim{};
static constexpr __HIP_Coordinates<__HIP_ThreadIdx> threadIdx{};
// Redeclarations of the OCKL intrinsics (identical to the declarations
// earlier in this header; harmless duplicates from header concatenation).
extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_local_id(unsigned int);
extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_group_id(unsigned int);
extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_local_size(unsigned int);
extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_num_groups(unsigned int);
  2803. # 63 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/hip-on-vdi/include/hip/hip_runtime.h" 2 3
  2804. # 73 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/hip-on-vdi/include/hip/hip_runtime.h" 3
  2805. # 1 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/hip-on-vdi/include/hip/hip_vector_types.h" 1 3
  2806. # 74 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/hip-on-vdi/include/hip/hip_runtime.h" 2 3
  2807. # 6 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/out/ubuntu-22.04/22.04/build/hip-on-rocclr/hipamd/src/hiprtc/hip_rtc_gen/hipRTC_header.h" 2
  2808. # 1 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/hip-on-vdi/include/hip/hip_bfloat16.h" 1 3
  2809. # 37 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/hip-on-vdi/include/hip/hip_bfloat16.h" 3
  2810. # 1 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_bfloat16.h" 1 3
  2811. # 55 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_bfloat16.h" 3
  2812. #pragma clang diagnostic push
  2813. #pragma clang diagnostic ignored "-Wshadow"
// bfloat16: 1 sign bit, 8 exponent bits, 7 mantissa bits — exactly the upper
// 16 bits of an IEEE-754 binary32. Stored as the raw 16-bit pattern.
struct hip_bfloat16
{
__hip_uint16_t data;
// Tag type selecting the truncating (round-toward-zero-ish) constructor.
enum truncate_t
{
truncate
};
__attribute__((device)) hip_bfloat16() = default;
// Round-to-nearest-even conversion from float.
explicit __attribute__((device)) hip_bfloat16(float f)
: data(float_to_bfloat16(f))
{
}
// Truncating conversion from float (drops the low 16 bits).
explicit __attribute__((device)) hip_bfloat16(float f, truncate_t)
: data(truncate_float_to_bfloat16(f))
{
}
// Widen back to float: place the 16 stored bits in the high half of a
// 32-bit word (low mantissa bits become zero). Exact — no rounding.
__attribute__((device)) operator float() const
{
union
{
uint32_t int32;
float fp32;
} u = {uint32_t(data) << 16};
return u.fp32;
}
// Assignment from float uses round-to-nearest-even, like the 1-arg ctor.
__attribute__((device)) hip_bfloat16 &operator=(const float& f)
{
data = float_to_bfloat16(f);
return *this;
}
// Named factory: round-to-nearest-even.
static __attribute__((device)) hip_bfloat16 round_to_bfloat16(float f)
{
hip_bfloat16 output;
output.data = float_to_bfloat16(f);
return output;
}
// Named factory: truncation.
static __attribute__((device)) hip_bfloat16 round_to_bfloat16(float f, truncate_t)
{
hip_bfloat16 output;
output.data = truncate_float_to_bfloat16(f);
return output;
}
private:
static __attribute__((device)) __hip_uint16_t float_to_bfloat16(float f)
{
// Type-pun through a union to get the raw float bits.
union
{
float fp32;
uint32_t int32;
} u = {f};
// Exponent not all-ones => finite (or subnormal): round to nearest even
// by adding 0x7fff plus the LSB of the would-be result (ties-to-even).
if(~u.int32 & 0x7f800000)
{
# 136 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_bfloat16.h" 3
u.int32 += 0x7fff + ((u.int32 >> 16) & 1);
}
else if(u.int32 & 0xffff)
{
// NaN with payload only in the low 16 bits: set bit 16 so the value
// is still a NaN (nonzero mantissa) after truncation.
# 148 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_bfloat16.h" 3
u.int32 |= 0x10000;
}
return __hip_uint16_t(u.int32 >> 16);
}
static __attribute__((device)) __hip_uint16_t truncate_float_to_bfloat16(float f)
{
union
{
float fp32;
uint32_t int32;
} u = {f};
// Keep the top 16 bits; OR in 1 when the input is a NaN whose payload
// lives entirely in the discarded low bits, preserving NaN-ness.
return __hip_uint16_t(u.int32 >> 16) | (!(~u.int32 & 0x7f800000) && (u.int32 & 0xffff));
}
};
  2886. #pragma clang diagnostic pop
// Plain-C mirror of hip_bfloat16's layout: a single 16-bit payload.
typedef struct
{
__hip_uint16_t data;
} hip_bfloat16_public;
// Compile-time guarantees that hip_bfloat16 is layout-compatible with the
// C view above (standard layout + trivial).
static_assert(__hip_internal::is_standard_layout<hip_bfloat16>{},
"hip_bfloat16 is not a standard layout type, and thus is "
"incompatible with C.");
static_assert(__hip_internal::is_trivial<hip_bfloat16>{},
"hip_bfloat16 is not a trivial type, and thus is "
"incompatible with C.");
  2897. # 189 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_bfloat16.h" 3
  2898. inline __attribute__((device)) hip_bfloat16 operator+(hip_bfloat16 a)
  2899. {
  2900. return a;
  2901. }
  2902. inline __attribute__((device)) hip_bfloat16 operator-(hip_bfloat16 a)
  2903. {
  2904. a.data ^= 0x8000;
  2905. return a;
  2906. }
  2907. inline __attribute__((device)) hip_bfloat16 operator+(hip_bfloat16 a, hip_bfloat16 b)
  2908. {
  2909. return hip_bfloat16(float(a) + float(b));
  2910. }
  2911. inline __attribute__((device)) hip_bfloat16 operator-(hip_bfloat16 a, hip_bfloat16 b)
  2912. {
  2913. return hip_bfloat16(float(a) - float(b));
  2914. }
  2915. inline __attribute__((device)) hip_bfloat16 operator*(hip_bfloat16 a, hip_bfloat16 b)
  2916. {
  2917. return hip_bfloat16(float(a) * float(b));
  2918. }
  2919. inline __attribute__((device)) hip_bfloat16 operator/(hip_bfloat16 a, hip_bfloat16 b)
  2920. {
  2921. return hip_bfloat16(float(a) / float(b));
  2922. }
  2923. inline __attribute__((device)) bool operator<(hip_bfloat16 a, hip_bfloat16 b)
  2924. {
  2925. return float(a) < float(b);
  2926. }
  2927. inline __attribute__((device)) bool operator==(hip_bfloat16 a, hip_bfloat16 b)
  2928. {
  2929. return float(a) == float(b);
  2930. }
  2931. inline __attribute__((device)) bool operator>(hip_bfloat16 a, hip_bfloat16 b)
  2932. {
  2933. return b < a;
  2934. }
  2935. inline __attribute__((device)) bool operator<=(hip_bfloat16 a, hip_bfloat16 b)
  2936. {
  2937. return !(a > b);
  2938. }
  2939. inline __attribute__((device)) bool operator!=(hip_bfloat16 a, hip_bfloat16 b)
  2940. {
  2941. return !(a == b);
  2942. }
  2943. inline __attribute__((device)) bool operator>=(hip_bfloat16 a, hip_bfloat16 b)
  2944. {
  2945. return !(a < b);
  2946. }
  2947. inline __attribute__((device)) hip_bfloat16& operator+=(hip_bfloat16& a, hip_bfloat16 b)
  2948. {
  2949. return a = a + b;
  2950. }
  2951. inline __attribute__((device)) hip_bfloat16& operator-=(hip_bfloat16& a, hip_bfloat16 b)
  2952. {
  2953. return a = a - b;
  2954. }
  2955. inline __attribute__((device)) hip_bfloat16& operator*=(hip_bfloat16& a, hip_bfloat16 b)
  2956. {
  2957. return a = a * b;
  2958. }
  2959. inline __attribute__((device)) hip_bfloat16& operator/=(hip_bfloat16& a, hip_bfloat16 b)
  2960. {
  2961. return a = a / b;
  2962. }
  2963. inline __attribute__((device)) hip_bfloat16& operator++(hip_bfloat16& a)
  2964. {
  2965. return a += hip_bfloat16(1.0f);
  2966. }
  2967. inline __attribute__((device)) hip_bfloat16& operator--(hip_bfloat16& a)
  2968. {
  2969. return a -= hip_bfloat16(1.0f);
  2970. }
  2971. inline __attribute__((device)) hip_bfloat16 operator++(hip_bfloat16& a, int)
  2972. {
  2973. hip_bfloat16 orig = a;
  2974. ++a;
  2975. return orig;
  2976. }
  2977. inline __attribute__((device)) hip_bfloat16 operator--(hip_bfloat16& a, int)
  2978. {
  2979. hip_bfloat16 orig = a;
  2980. --a;
  2981. return orig;
  2982. }
  2983. namespace std
  2984. {
  2985. constexpr __attribute__((device)) bool isinf(hip_bfloat16 a)
  2986. {
  2987. return !(~a.data & 0x7f80) && !(a.data & 0x7f);
  2988. }
  2989. constexpr __attribute__((device)) bool isnan(hip_bfloat16 a)
  2990. {
  2991. return !(~a.data & 0x7f80) && +(a.data & 0x7f);
  2992. }
  2993. constexpr __attribute__((device)) bool iszero(hip_bfloat16 a)
  2994. {
  2995. return !(a.data & 0x7fff);
  2996. }
  2997. }
  2998. # 38 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/hip-on-vdi/include/hip/hip_bfloat16.h" 2 3
  2999. # 7 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/out/ubuntu-22.04/22.04/build/hip-on-rocclr/hipamd/src/hiprtc/hip_rtc_gen/hipRTC_header.h" 2
  3000. #pragma clang diagnostic push
  3001. #pragma clang diagnostic ignored "-Wreserved-id-macro"
  3002. #pragma clang diagnostic ignored "-Wc++98-compat-pedantic"
  3003. #pragma clang diagnostic ignored "-Wreserved-macro-identifier"
  3004. #pragma clang diagnostic ignored "-Wundef"
// HIP keyword macros: map the CUDA-style qualifiers onto clang attributes.
#define __device__ __attribute__((device))
#define __host__ __attribute__((host))
#define __global__ __attribute__((global))
#define __constant__ __attribute__((constant))
#define __shared__ __attribute__((shared))
#define __align__(x) __attribute__((aligned(x)))
// Only define __noinline__ when the compiler does not already treat it as a
// keyword (cuda_noinline_keyword feature).
#if !defined(__has_feature) || !__has_feature(cuda_noinline_keyword)
#define __noinline__ __attribute__((noinline))
#endif
#define __forceinline__ inline __attribute__((always_inline))
#if __HIP_NO_IMAGE_SUPPORT
#define __hip_img_chk__ __attribute__((unavailable("The image/texture API not supported on the device")))
#else
#define __hip_img_chk__
#endif
// __launch_bounds__(max[, minBlocks]) dispatches on arity via select_impl_:
// one argument -> impl0 (flat workgroup size only), two -> impl1 (adds
// waves-per-EU occupancy hint).
#define launch_bounds_impl0(requiredMaxThreadsPerBlock) \
__attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock)))
#define launch_bounds_impl1(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor) \
__attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock), \
amdgpu_waves_per_eu(minBlocksPerMultiprocessor)))
#define select_impl_(_1, _2, impl_, ...) impl_
#define __launch_bounds__(...) \
select_impl_(__VA_ARGS__, launch_bounds_impl1, launch_bounds_impl0)(__VA_ARGS__)
// Pre-define the include guards of headers whose contents this builtin
// header already provides, so user #includes of them become no-ops
// (presumably — guard names match the headers seen below; verify).
#define HIP_INCLUDE_HIP_HIP_RUNTIME_H
#define _HIP_BFLOAT16_H_
#define HIP_INCLUDE_HIP_MATH_FUNCTIONS_H
#define HIP_INCLUDE_HIP_HIP_VECTOR_TYPES_H
// HIPRTC compiles without libc headers, so supply the handful of std types
// kernels commonly use, unless the user opted out via __HIP_NO_STD_DEFS__.
#if !__HIP_NO_STD_DEFS__
#if defined(__HIPRTC_PTRDIFF_T_IS_LONG_LONG__) && __HIPRTC_PTRDIFF_T_IS_LONG_LONG__==1
typedef long long ptrdiff_t;
#else
typedef __PTRDIFF_TYPE__ ptrdiff_t;
#endif
typedef long clock_t;
namespace std {
using ::ptrdiff_t;
using ::clock_t;
}
#endif // __HIP_NO_STD_DEFS__
  3044. #pragma clang diagnostic pop/*
  3045. Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
  3046. Permission is hereby granted, free of charge, to any person obtaining a copy
  3047. of this software and associated documentation files (the "Software"), to deal
  3048. in the Software without restriction, including without limitation the rights
  3049. to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  3050. copies of the Software, and to permit persons to whom the Software is
  3051. furnished to do so, subject to the following conditions:
  3052. The above copyright notice and this permission notice shall be included in
  3053. all copies or substantial portions of the Software.
  3054. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  3055. IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  3056. FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  3057. AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  3058. LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  3059. OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  3060. THE SOFTWARE.
  3061. */
#ifndef HIP_INCLUDE_HIP_HIP_COMMON_H
#define HIP_INCLUDE_HIP_HIP_COMMON_H
#if defined(__clang__)
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wreserved-macro-identifier"
#endif
// Common code included at start of every hip file.
// Auto enable __HIP_PLATFORM_AMD__ if compiling on AMD platform
// Other compiler (GCC,ICC,etc) need to set one of these macros explicitly
#if defined(__clang__) && defined(__HIP__)
#ifndef __HIP_PLATFORM_AMD__
#define __HIP_PLATFORM_AMD__
#endif
#endif // defined(__clang__) && defined(__HIP__)
// Auto enable __HIP_PLATFORM_NVIDIA__ if compiling with NVIDIA platform
#if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__) && !defined(__HIP__))
#ifndef __HIP_PLATFORM_NVIDIA__
#define __HIP_PLATFORM_NVIDIA__
#endif
#ifdef __CUDACC__
#define __HIPCC__
#endif
#endif //__NVCC__
// Auto enable __HIP_DEVICE_COMPILE__ if compiled in HCC or NVCC device path
#if (defined(__HCC_ACCELERATOR__) && __HCC_ACCELERATOR__ != 0) || \
(defined(__CUDA_ARCH__) && __CUDA_ARCH__ != 0)
#define __HIP_DEVICE_COMPILE__ 1
#endif
// Visibility macros for the public API (default visibility under GCC/clang).
#ifdef __GNUC__
#define HIP_PUBLIC_API __attribute__ ((visibility ("default")))
#define HIP_INTERNAL_EXPORTED_API __attribute__ ((visibility ("default")))
#else
#define HIP_PUBLIC_API
#define HIP_INTERNAL_EXPORTED_API
#endif
// Host pass (not device compilation): advertise no device architecture
// capabilities — all feature macros are 0.
#if __HIP_DEVICE_COMPILE__ == 0
// 32-bit Atomics
#define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (0)
#define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (0)
#define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (0)
#define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (0)
#define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (0)
// 64-bit Atomics
#define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ (0)
#define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ (0)
// Doubles
#define __HIP_ARCH_HAS_DOUBLES__ (0)
// Warp cross-lane operations
#define __HIP_ARCH_HAS_WARP_VOTE__ (0)
#define __HIP_ARCH_HAS_WARP_BALLOT__ (0)
#define __HIP_ARCH_HAS_WARP_SHUFFLE__ (0)
#define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ (0)
// Sync
#define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ (0)
#define __HIP_ARCH_HAS_SYNC_THREAD_EXT__ (0)
// Misc
#define __HIP_ARCH_HAS_SURFACE_FUNCS__ (0)
#define __HIP_ARCH_HAS_3DGRID__ (0)
#define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ (0)
#endif
#if defined(__clang__)
#pragma clang diagnostic pop
#endif
#endif
  3126. /*
  3127. Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
  3128. Permission is hereby granted, free of charge, to any person obtaining a copy
  3129. of this software and associated documentation files (the "Software"), to deal
  3130. in the Software without restriction, including without limitation the rights
  3131. to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  3132. copies of the Software, and to permit persons to whom the Software is
  3133. furnished to do so, subject to the following conditions:
  3134. The above copyright notice and this permission notice shall be included in
  3135. all copies or substantial portions of the Software.
  3136. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  3137. IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  3138. FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  3139. AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  3140. LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  3141. OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  3142. THE SOFTWARE.
  3143. */
#ifndef HIP_INCLUDE_HIP_LIBRARY_TYPES_H
#define HIP_INCLUDE_HIP_LIBRARY_TYPES_H
#if !defined(__HIPCC_RTC__)
#include <hip/hip_common.h>
#endif
#if defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_NVIDIA__)
// Scalar element-type tags used by math libraries. Naming appears to follow
// CUDA's cudaDataType: R_/C_ = real/complex, number = bit width, suffix
// F/I/U/BF = float/signed int/unsigned int/bfloat — verify against docs.
typedef enum hipDataType {
HIP_R_32F = 0,
HIP_R_64F = 1,
HIP_R_16F = 2,
HIP_R_8I = 3,
HIP_C_32F = 4,
HIP_C_64F = 5,
HIP_C_16F = 6,
HIP_C_8I = 7,
HIP_R_8U = 8,
HIP_C_8U = 9,
HIP_R_32I = 10,
HIP_C_32I = 11,
HIP_R_32U = 12,
HIP_C_32U = 13,
HIP_R_16BF = 14,
HIP_C_16BF = 15,
HIP_R_4I = 16,
HIP_C_4I = 17,
HIP_R_4U = 18,
HIP_C_4U = 19,
HIP_R_16I = 20,
HIP_C_16I = 21,
HIP_R_16U = 22,
HIP_C_16U = 23,
HIP_R_64I = 24,
HIP_C_64I = 25,
HIP_R_64U = 26,
HIP_C_64U = 27,
// HIP specific Data Types
HIP_R_8F_E4M3_FNUZ = 1000,
HIP_R_8F_E5M2_FNUZ = 1001
} hipDataType;
// Selector for querying a library's version components.
typedef enum hipLibraryPropertyType {
HIP_LIBRARY_MAJOR_VERSION,
HIP_LIBRARY_MINOR_VERSION,
HIP_LIBRARY_PATCH_LEVEL
} hipLibraryPropertyType;
#elif !defined(__HIP_PLATFORM_AMD__) && defined(__HIP_PLATFORM_NVIDIA__)
#include "library_types.h"
#else
#error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__");
#endif
#endif
  3194. /*
  3195. Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
  3196. Permission is hereby granted, free of charge, to any person obtaining a copy
  3197. of this software and associated documentation files (the "Software"), to deal
  3198. in the Software without restriction, including without limitation the rights
  3199. to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  3200. copies of the Software, and to permit persons to whom the Software is
  3201. furnished to do so, subject to the following conditions:
  3202. The above copyright notice and this permission notice shall be included in
  3203. all copies or substantial portions of the Software.
  3204. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  3205. IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  3206. FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  3207. AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  3208. LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  3209. OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  3210. THE SOFTWARE.
  3211. */
#ifndef HIP_INCLUDE_HIP_DRIVER_TYPES_H
#define HIP_INCLUDE_HIP_DRIVER_TYPES_H
#if !defined(__HIPCC_RTC__)
#include <hip/hip_common.h>
#endif
#if !defined(__HIP_PLATFORM_AMD__) && defined(__HIP_PLATFORM_NVIDIA__)
#include "driver_types.h"
#elif defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_NVIDIA__)
#if !defined(__HIPCC_RTC__)
#ifndef __cplusplus
#include <stdbool.h>
#endif
#endif // !defined(__HIPCC_RTC__)
// Opaque device pointer (driver-API style).
typedef void* hipDeviceptr_t;
// Interpretation of a texture/array channel.
typedef enum hipChannelFormatKind {
hipChannelFormatKindSigned = 0,
hipChannelFormatKindUnsigned = 1,
hipChannelFormatKindFloat = 2,
hipChannelFormatKindNone = 3
}hipChannelFormatKind;
// Channel format descriptor; x..w are per-channel values (presumably bits
// per channel, as in CUDA's cudaChannelFormatDesc — verify).
typedef struct hipChannelFormatDesc {
int x;
int y;
int z;
int w;
enum hipChannelFormatKind f;
}hipChannelFormatDesc;
// Texture reference set/get flags (driver-API style bitmasks).
#define HIP_TRSA_OVERRIDE_FORMAT 0x01
#define HIP_TRSF_READ_AS_INTEGER 0x01
#define HIP_TRSF_NORMALIZED_COORDINATES 0x02
#define HIP_TRSF_SRGB 0x10
// Opaque array handles.
typedef struct hipArray* hipArray_t;
typedef const struct hipArray* hipArray_const_t;
// Element formats for driver-API arrays.
typedef enum hipArray_Format {
HIP_AD_FORMAT_UNSIGNED_INT8 = 0x01,
HIP_AD_FORMAT_UNSIGNED_INT16 = 0x02,
HIP_AD_FORMAT_UNSIGNED_INT32 = 0x03,
HIP_AD_FORMAT_SIGNED_INT8 = 0x08,
HIP_AD_FORMAT_SIGNED_INT16 = 0x09,
HIP_AD_FORMAT_SIGNED_INT32 = 0x0a,
HIP_AD_FORMAT_HALF = 0x10,
HIP_AD_FORMAT_FLOAT = 0x20
}hipArray_Format;
// 2D array creation descriptor.
typedef struct HIP_ARRAY_DESCRIPTOR {
size_t Width;
size_t Height;
enum hipArray_Format Format;
unsigned int NumChannels;
}HIP_ARRAY_DESCRIPTOR;
// 3D array creation descriptor (adds Depth and creation Flags).
typedef struct HIP_ARRAY3D_DESCRIPTOR {
size_t Width;
size_t Height;
size_t Depth;
enum hipArray_Format Format;
unsigned int NumChannels;
unsigned int Flags;
}HIP_ARRAY3D_DESCRIPTOR;
#if !defined(__HIPCC_RTC__)
// 2D memcpy descriptor (driver-API style). Source and destination each
// carry a memory type plus host/device/array variants — only the member
// matching the memory type is used. Pitches and offsets are in bytes for X,
// rows for Y.
typedef struct hip_Memcpy2D {
size_t srcXInBytes;
size_t srcY;
hipMemoryType srcMemoryType;
const void* srcHost;
hipDeviceptr_t srcDevice;
hipArray_t srcArray;
size_t srcPitch;
size_t dstXInBytes;
size_t dstY;
hipMemoryType dstMemoryType;
void* dstHost;
hipDeviceptr_t dstDevice;
hipArray_t dstArray;
size_t dstPitch;
size_t WidthInBytes;
size_t Height;
} hip_Memcpy2D;
#endif // !defined(__HIPCC_RTC__)
// Mipmapped array descriptor: backing storage pointer plus geometry,
// mip level range, and channel format information.
typedef struct hipMipmappedArray {
void* data;
struct hipChannelFormatDesc desc;
unsigned int type;
unsigned int width;
unsigned int height;
unsigned int depth;
unsigned int min_mipmap_level;
unsigned int max_mipmap_level;
unsigned int flags;
enum hipArray_Format format;
unsigned int num_channels;
} hipMipmappedArray;
typedef struct hipMipmappedArray* hipMipmappedArray_t;
// Legacy alias for hipMipmappedArray_t.
typedef hipMipmappedArray_t hipmipmappedArray;
typedef const struct hipMipmappedArray* hipMipmappedArray_const_t;
  3305. /**
  3306. * hip resource types
  3307. */
  3308. typedef enum hipResourceType {
  3309. hipResourceTypeArray = 0x00,
  3310. hipResourceTypeMipmappedArray = 0x01,
  3311. hipResourceTypeLinear = 0x02,
  3312. hipResourceTypePitch2D = 0x03
  3313. }hipResourceType;
  3314. typedef enum HIPresourcetype_enum {
  3315. HIP_RESOURCE_TYPE_ARRAY = 0x00, /**< Array resoure */
  3316. HIP_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01, /**< Mipmapped array resource */
  3317. HIP_RESOURCE_TYPE_LINEAR = 0x02, /**< Linear resource */
  3318. HIP_RESOURCE_TYPE_PITCH2D = 0x03 /**< Pitch 2D resource */
  3319. } HIPresourcetype, hipResourcetype;
  3320. /**
  3321. * hip address modes
  3322. */
  3323. typedef enum HIPaddress_mode_enum {
  3324. HIP_TR_ADDRESS_MODE_WRAP = 0,
  3325. HIP_TR_ADDRESS_MODE_CLAMP = 1,
  3326. HIP_TR_ADDRESS_MODE_MIRROR = 2,
  3327. HIP_TR_ADDRESS_MODE_BORDER = 3
  3328. } HIPaddress_mode;
  3329. /**
  3330. * hip filter modes
  3331. */
  3332. typedef enum HIPfilter_mode_enum {
  3333. HIP_TR_FILTER_MODE_POINT = 0,
  3334. HIP_TR_FILTER_MODE_LINEAR = 1
  3335. } HIPfilter_mode;
  3336. /**
  3337. * Texture descriptor
  3338. */
  3339. typedef struct HIP_TEXTURE_DESC_st {
  3340. HIPaddress_mode addressMode[3]; /**< Address modes */
  3341. HIPfilter_mode filterMode; /**< Filter mode */
  3342. unsigned int flags; /**< Flags */
  3343. unsigned int maxAnisotropy; /**< Maximum anisotropy ratio */
  3344. HIPfilter_mode mipmapFilterMode; /**< Mipmap filter mode */
  3345. float mipmapLevelBias; /**< Mipmap level bias */
  3346. float minMipmapLevelClamp; /**< Mipmap minimum level clamp */
  3347. float maxMipmapLevelClamp; /**< Mipmap maximum level clamp */
  3348. float borderColor[4]; /**< Border Color */
  3349. int reserved[12];
  3350. } HIP_TEXTURE_DESC;
  3351. /**
  3352. * hip texture resource view formats
  3353. */
// Runtime-API resource view formats; values parallel the driver-API
// HIPresourceViewFormat_enum defined below.
typedef enum hipResourceViewFormat {
hipResViewFormatNone = 0x00, /**< No resource view format (use underlying resource format) */
hipResViewFormatUnsignedChar1 = 0x01, /**< 1 channel unsigned 8-bit integers */
hipResViewFormatUnsignedChar2 = 0x02, /**< 2 channel unsigned 8-bit integers */
hipResViewFormatUnsignedChar4 = 0x03, /**< 4 channel unsigned 8-bit integers */
hipResViewFormatSignedChar1 = 0x04, /**< 1 channel signed 8-bit integers */
hipResViewFormatSignedChar2 = 0x05, /**< 2 channel signed 8-bit integers */
hipResViewFormatSignedChar4 = 0x06, /**< 4 channel signed 8-bit integers */
hipResViewFormatUnsignedShort1 = 0x07, /**< 1 channel unsigned 16-bit integers */
hipResViewFormatUnsignedShort2 = 0x08, /**< 2 channel unsigned 16-bit integers */
hipResViewFormatUnsignedShort4 = 0x09, /**< 4 channel unsigned 16-bit integers */
hipResViewFormatSignedShort1 = 0x0a, /**< 1 channel signed 16-bit integers */
hipResViewFormatSignedShort2 = 0x0b, /**< 2 channel signed 16-bit integers */
hipResViewFormatSignedShort4 = 0x0c, /**< 4 channel signed 16-bit integers */
hipResViewFormatUnsignedInt1 = 0x0d, /**< 1 channel unsigned 32-bit integers */
hipResViewFormatUnsignedInt2 = 0x0e, /**< 2 channel unsigned 32-bit integers */
hipResViewFormatUnsignedInt4 = 0x0f, /**< 4 channel unsigned 32-bit integers */
hipResViewFormatSignedInt1 = 0x10, /**< 1 channel signed 32-bit integers */
hipResViewFormatSignedInt2 = 0x11, /**< 2 channel signed 32-bit integers */
hipResViewFormatSignedInt4 = 0x12, /**< 4 channel signed 32-bit integers */
hipResViewFormatHalf1 = 0x13, /**< 1 channel 16-bit floating point */
hipResViewFormatHalf2 = 0x14, /**< 2 channel 16-bit floating point */
hipResViewFormatHalf4 = 0x15, /**< 4 channel 16-bit floating point */
hipResViewFormatFloat1 = 0x16, /**< 1 channel 32-bit floating point */
hipResViewFormatFloat2 = 0x17, /**< 2 channel 32-bit floating point */
hipResViewFormatFloat4 = 0x18, /**< 4 channel 32-bit floating point */
hipResViewFormatUnsignedBlockCompressed1 = 0x19, /**< Block compressed 1 */
hipResViewFormatUnsignedBlockCompressed2 = 0x1a, /**< Block compressed 2 */
hipResViewFormatUnsignedBlockCompressed3 = 0x1b, /**< Block compressed 3 */
hipResViewFormatUnsignedBlockCompressed4 = 0x1c, /**< Block compressed 4 unsigned */
hipResViewFormatSignedBlockCompressed4 = 0x1d, /**< Block compressed 4 signed */
hipResViewFormatUnsignedBlockCompressed5 = 0x1e, /**< Block compressed 5 unsigned */
hipResViewFormatSignedBlockCompressed5 = 0x1f, /**< Block compressed 5 signed */
hipResViewFormatUnsignedBlockCompressed6H = 0x20, /**< Block compressed 6H unsigned */
hipResViewFormatSignedBlockCompressed6H = 0x21, /**< Block compressed 6H signed */
hipResViewFormatUnsignedBlockCompressed7 = 0x22 /**< Block compressed 7 */
}hipResourceViewFormat;
/**
 * HIP texture resource view formats (driver-API flavour).
 * Values parallel the hipResViewFormat* runtime enum above.
 */
typedef enum HIPresourceViewFormat_enum
{
    HIP_RES_VIEW_FORMAT_NONE = 0x00,          /**< No resource view format (use underlying resource format) */
    HIP_RES_VIEW_FORMAT_UINT_1X8 = 0x01,      /**< 1 channel unsigned 8-bit integers */
    HIP_RES_VIEW_FORMAT_UINT_2X8 = 0x02,      /**< 2 channel unsigned 8-bit integers */
    HIP_RES_VIEW_FORMAT_UINT_4X8 = 0x03,      /**< 4 channel unsigned 8-bit integers */
    HIP_RES_VIEW_FORMAT_SINT_1X8 = 0x04,      /**< 1 channel signed 8-bit integers */
    HIP_RES_VIEW_FORMAT_SINT_2X8 = 0x05,      /**< 2 channel signed 8-bit integers */
    HIP_RES_VIEW_FORMAT_SINT_4X8 = 0x06,      /**< 4 channel signed 8-bit integers */
    HIP_RES_VIEW_FORMAT_UINT_1X16 = 0x07,     /**< 1 channel unsigned 16-bit integers */
    HIP_RES_VIEW_FORMAT_UINT_2X16 = 0x08,     /**< 2 channel unsigned 16-bit integers */
    HIP_RES_VIEW_FORMAT_UINT_4X16 = 0x09,     /**< 4 channel unsigned 16-bit integers */
    HIP_RES_VIEW_FORMAT_SINT_1X16 = 0x0a,     /**< 1 channel signed 16-bit integers */
    HIP_RES_VIEW_FORMAT_SINT_2X16 = 0x0b,     /**< 2 channel signed 16-bit integers */
    HIP_RES_VIEW_FORMAT_SINT_4X16 = 0x0c,     /**< 4 channel signed 16-bit integers */
    HIP_RES_VIEW_FORMAT_UINT_1X32 = 0x0d,     /**< 1 channel unsigned 32-bit integers */
    HIP_RES_VIEW_FORMAT_UINT_2X32 = 0x0e,     /**< 2 channel unsigned 32-bit integers */
    HIP_RES_VIEW_FORMAT_UINT_4X32 = 0x0f,     /**< 4 channel unsigned 32-bit integers */
    HIP_RES_VIEW_FORMAT_SINT_1X32 = 0x10,     /**< 1 channel signed 32-bit integers */
    HIP_RES_VIEW_FORMAT_SINT_2X32 = 0x11,     /**< 2 channel signed 32-bit integers */
    HIP_RES_VIEW_FORMAT_SINT_4X32 = 0x12,     /**< 4 channel signed 32-bit integers */
    HIP_RES_VIEW_FORMAT_FLOAT_1X16 = 0x13,    /**< 1 channel 16-bit floating point */
    HIP_RES_VIEW_FORMAT_FLOAT_2X16 = 0x14,    /**< 2 channel 16-bit floating point */
    HIP_RES_VIEW_FORMAT_FLOAT_4X16 = 0x15,    /**< 4 channel 16-bit floating point */
    HIP_RES_VIEW_FORMAT_FLOAT_1X32 = 0x16,    /**< 1 channel 32-bit floating point */
    HIP_RES_VIEW_FORMAT_FLOAT_2X32 = 0x17,    /**< 2 channel 32-bit floating point */
    HIP_RES_VIEW_FORMAT_FLOAT_4X32 = 0x18,    /**< 4 channel 32-bit floating point */
    HIP_RES_VIEW_FORMAT_UNSIGNED_BC1 = 0x19,  /**< Block compressed 1 */
    HIP_RES_VIEW_FORMAT_UNSIGNED_BC2 = 0x1a,  /**< Block compressed 2 */
    HIP_RES_VIEW_FORMAT_UNSIGNED_BC3 = 0x1b,  /**< Block compressed 3 */
    HIP_RES_VIEW_FORMAT_UNSIGNED_BC4 = 0x1c,  /**< Block compressed 4 unsigned */
    HIP_RES_VIEW_FORMAT_SIGNED_BC4 = 0x1d,    /**< Block compressed 4 signed */
    HIP_RES_VIEW_FORMAT_UNSIGNED_BC5 = 0x1e,  /**< Block compressed 5 unsigned */
    HIP_RES_VIEW_FORMAT_SIGNED_BC5 = 0x1f,    /**< Block compressed 5 signed */
    HIP_RES_VIEW_FORMAT_UNSIGNED_BC6H = 0x20, /**< Block compressed 6 unsigned half-float */
    HIP_RES_VIEW_FORMAT_SIGNED_BC6H = 0x21,   /**< Block compressed 6 signed half-float */
    HIP_RES_VIEW_FORMAT_UNSIGNED_BC7 = 0x22   /**< Block compressed 7 */
} HIPresourceViewFormat;
  3429. /**
  3430. * HIP resource descriptor
  3431. */
typedef struct hipResourceDesc {
    enum hipResourceType resType;             ///< Resource type (selects which res member is used)
    union {
        struct {
            hipArray_t array;                 ///< HIP array
        } array;
        struct {
            hipMipmappedArray_t mipmap;       ///< HIP mipmapped array
        } mipmap;
        struct {
            void* devPtr;                     ///< Device pointer
            struct hipChannelFormatDesc desc; ///< Channel format description
            size_t sizeInBytes;               ///< Size in bytes
        } linear;
        struct {
            void* devPtr;                     ///< Device pointer
            struct hipChannelFormatDesc desc; ///< Channel format description
            size_t width;                     ///< Width of the 2D region
            size_t height;                    ///< Height of the 2D region
            size_t pitchInBytes;              ///< Pitch between two rows in bytes
        } pitch2D;
    } res;                                    ///< Type-specific resource description
}hipResourceDesc;
/**
 * HIP resource descriptor (driver-API flavour of ::hipResourceDesc).
 */
typedef struct HIP_RESOURCE_DESC_st
{
    HIPresourcetype resType;                /**< Resource type */
    union {
        struct {
            hipArray_t hArray;              /**< HIP array */
        } array;
        struct {
            hipMipmappedArray_t hMipmappedArray; /**< HIP mipmapped array */
        } mipmap;
        struct {
            hipDeviceptr_t devPtr;          /**< Device pointer */
            hipArray_Format format;         /**< Array format */
            unsigned int numChannels;       /**< Channels per array element */
            size_t sizeInBytes;             /**< Size in bytes */
        } linear;
        struct {
            hipDeviceptr_t devPtr;          /**< Device pointer */
            hipArray_Format format;         /**< Array format */
            unsigned int numChannels;       /**< Channels per array element */
            size_t width;                   /**< Width of the array in elements */
            size_t height;                  /**< Height of the array in elements */
            size_t pitchInBytes;            /**< Pitch between two rows in bytes */
        } pitch2D;
        struct {
            int reserved[32];               /**< Reserved */
        } reserved;
    } res;                                  /**< Type-specific resource description */
    unsigned int flags;                     /**< Flags (must be zero) */
} HIP_RESOURCE_DESC;
  3485. /**
  3486. * hip resource view descriptor
  3487. */
struct hipResourceViewDesc {
    enum hipResourceViewFormat format; ///< Resource view format
    size_t width;                      ///< Width of the resource view
    size_t height;                     ///< Height of the resource view
    size_t depth;                      ///< Depth of the resource view
    unsigned int firstMipmapLevel;     ///< First defined mipmap level
    unsigned int lastMipmapLevel;      ///< Last defined mipmap level
    unsigned int firstLayer;           ///< First layer index
    unsigned int lastLayer;            ///< Last layer index
};
  3498. /**
  3499. * Resource view descriptor
  3500. */
typedef struct HIP_RESOURCE_VIEW_DESC_st
{
    HIPresourceViewFormat format;   /**< Resource view format */
    size_t width;                   /**< Width of the resource view */
    size_t height;                  /**< Height of the resource view */
    size_t depth;                   /**< Depth of the resource view */
    unsigned int firstMipmapLevel;  /**< First defined mipmap level */
    unsigned int lastMipmapLevel;   /**< Last defined mipmap level */
    unsigned int firstLayer;        /**< First layer index */
    unsigned int lastLayer;         /**< Last layer index */
    unsigned int reserved[16];      /**< Reserved; NOTE(review): presumably must be zeroed — confirm */
} HIP_RESOURCE_VIEW_DESC;
  3513. /**
  3514. * Memory copy types
  3515. *
  3516. */
  3517. #if !defined(__HIPCC_RTC__)
typedef enum hipMemcpyKind {
    hipMemcpyHostToHost = 0,     ///< Host-to-Host Copy
    hipMemcpyHostToDevice = 1,   ///< Host-to-Device Copy
    hipMemcpyDeviceToHost = 2,   ///< Device-to-Host Copy
    hipMemcpyDeviceToDevice = 3, ///< Device-to-Device Copy
    hipMemcpyDefault =
        4                        ///< Runtime will automatically determine copy-kind based on virtual addresses.
} hipMemcpyKind;
/** Pitched pointer: a pointer plus the layout of the 2D allocation it addresses. */
typedef struct hipPitchedPtr {
    void* ptr;    ///< Pointer to the allocated memory
    size_t pitch; ///< Pitch of one row, in bytes
    size_t xsize; ///< Logical width
    size_t ysize; ///< Logical height
}hipPitchedPtr;
/** 3D extent (size of a region). */
typedef struct hipExtent {
    size_t width;  // Width in elements when referring to array memory, in bytes when referring to
                   // linear memory
    size_t height;
    size_t depth;
}hipExtent;
/** 3D position/offset. */
typedef struct hipPos {
    size_t x;
    size_t y;
    size_t z;
}hipPos;
/** Parameter block for 3D memory copies (runtime API). */
typedef struct hipMemcpy3DParms {
    hipArray_t srcArray;         ///< Source array
    struct hipPos srcPos;        ///< Offset into the source
    struct hipPitchedPtr srcPtr; ///< Source pitched pointer (linear memory)
    hipArray_t dstArray;         ///< Destination array
    struct hipPos dstPos;        ///< Offset into the destination
    struct hipPitchedPtr dstPtr; ///< Destination pitched pointer (linear memory)
    struct hipExtent extent;     ///< Size of the region to copy
    enum hipMemcpyKind kind;     ///< Direction of the copy
} hipMemcpy3DParms;
/**
 * Driver-API style 3D memory copy parameters.
 */
typedef struct HIP_MEMCPY3D {
    size_t srcXInBytes;          ///< Source X offset, in bytes
    size_t srcY;                 ///< Source Y offset, in rows
    size_t srcZ;                 ///< Source Z offset
    size_t srcLOD;               ///< Source LOD (level of detail)
    hipMemoryType srcMemoryType; ///< Memory type of the source
    const void* srcHost;         ///< Source host pointer
    hipDeviceptr_t srcDevice;    ///< Source device pointer
    hipArray_t srcArray;         ///< Source array
    size_t srcPitch;             ///< Source row pitch, in bytes
    size_t srcHeight;            ///< Source height (rows per 2D slice)
    size_t dstXInBytes;          ///< Destination X offset, in bytes
    size_t dstY;                 ///< Destination Y offset, in rows
    size_t dstZ;                 ///< Destination Z offset
    size_t dstLOD;               ///< Destination LOD (level of detail)
    hipMemoryType dstMemoryType; ///< Memory type of the destination
    void* dstHost;               ///< Destination host pointer
    hipDeviceptr_t dstDevice;    ///< Destination device pointer
    hipArray_t dstArray;         ///< Destination array
    size_t dstPitch;             ///< Destination row pitch, in bytes
    size_t dstHeight;            ///< Destination height (rows per 2D slice)
    size_t WidthInBytes;         ///< Width of the copied region, in bytes
    size_t Height;               ///< Height of the copied region, in rows
    size_t Depth;                ///< Depth of the copied region
} HIP_MEMCPY3D;
  3578. static inline struct hipPitchedPtr make_hipPitchedPtr(void* d, size_t p, size_t xsz,
  3579. size_t ysz) {
  3580. struct hipPitchedPtr s;
  3581. s.ptr = d;
  3582. s.pitch = p;
  3583. s.xsize = xsz;
  3584. s.ysize = ysz;
  3585. return s;
  3586. }
  3587. static inline struct hipPos make_hipPos(size_t x, size_t y, size_t z) {
  3588. struct hipPos p;
  3589. p.x = x;
  3590. p.y = y;
  3591. p.z = z;
  3592. return p;
  3593. }
  3594. static inline struct hipExtent make_hipExtent(size_t w, size_t h, size_t d) {
  3595. struct hipExtent e;
  3596. e.width = w;
  3597. e.height = h;
  3598. e.depth = d;
  3599. return e;
  3600. }
/**
 * Attributes that can be queried for a kernel function.
 * Names mirror the CUDA CUfunction_attribute set.
 */
typedef enum hipFunction_attribute {
    HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,            ///< Maximum number of threads per block
    HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,                ///< Statically allocated shared memory, in bytes
    HIP_FUNC_ATTRIBUTE_CONST_SIZE_BYTES,                 ///< Constant memory usage, in bytes
    HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES,                 ///< Local (per-thread) memory usage, in bytes
    HIP_FUNC_ATTRIBUTE_NUM_REGS,                         ///< Registers used per thread
    HIP_FUNC_ATTRIBUTE_PTX_VERSION,                      ///< PTX version (NVIDIA-compatibility value)
    HIP_FUNC_ATTRIBUTE_BINARY_VERSION,                   ///< Binary architecture version
    HIP_FUNC_ATTRIBUTE_CACHE_MODE_CA,                    ///< Whether the function was compiled with cache-all mode
    HIP_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,    ///< Maximum dynamically allocated shared memory, in bytes
    HIP_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT, ///< Preferred shared memory carveout
    HIP_FUNC_ATTRIBUTE_MAX                               ///< Sentinel: number of attributes
} hipFunction_attribute;
/**
 * Attributes that can be queried about a device/host pointer
 * through the driver-style pointer-attribute API.
 */
typedef enum hipPointer_attribute {
    HIP_POINTER_ATTRIBUTE_CONTEXT = 1,       ///< The context on which a pointer was allocated
                                             ///< @warning - not supported in HIP
    HIP_POINTER_ATTRIBUTE_MEMORY_TYPE,       ///< memory type describing location of a pointer
    HIP_POINTER_ATTRIBUTE_DEVICE_POINTER,    ///< address at which the pointer is allocated on device
    HIP_POINTER_ATTRIBUTE_HOST_POINTER,      ///< address at which the pointer is allocated on host
    HIP_POINTER_ATTRIBUTE_P2P_TOKENS,        ///< A pair of tokens for use with linux kernel interface
                                             ///< @warning - not supported in HIP
    HIP_POINTER_ATTRIBUTE_SYNC_MEMOPS,       ///< Synchronize every synchronous memory operation
                                             ///< initiated on this region
    HIP_POINTER_ATTRIBUTE_BUFFER_ID,         ///< Unique ID for an allocated memory region
    HIP_POINTER_ATTRIBUTE_IS_MANAGED,        ///< Indicates if the pointer points to managed memory
    HIP_POINTER_ATTRIBUTE_DEVICE_ORDINAL,    ///< device ordinal of a device on which a pointer
                                             ///< was allocated or registered
    HIP_POINTER_ATTRIBUTE_IS_LEGACY_HIP_IPC_CAPABLE, ///< if this pointer maps to an allocation
                                             ///< that is suitable for hipIpcGetMemHandle
                                             ///< @warning - not supported in HIP
    HIP_POINTER_ATTRIBUTE_RANGE_START_ADDR,  ///< Starting address for this requested pointer
    HIP_POINTER_ATTRIBUTE_RANGE_SIZE,        ///< Size of the address range for this requested pointer
    HIP_POINTER_ATTRIBUTE_MAPPED,            ///< tells if this pointer is in a valid address range
                                             ///< that is mapped to a backing allocation
    HIP_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES, ///< Bitmask of allowed hipmemAllocationHandleType
                                             ///< for this allocation @warning - not supported in HIP
    HIP_POINTER_ATTRIBUTE_IS_GPU_DIRECT_RDMA_CAPABLE, ///< returns if the memory referenced by
                                             ///< this pointer can be used with the GPUDirect RDMA API
                                             ///< @warning - not supported in HIP
    HIP_POINTER_ATTRIBUTE_ACCESS_FLAGS,      ///< Returns the access flags the device associated with
                                             ///< for the corresponding memory referenced by the ptr
    HIP_POINTER_ATTRIBUTE_MEMPOOL_HANDLE     ///< Returns the mempool handle for the allocation if
                                             ///< it was allocated from a mempool
                                             ///< @warning - not supported in HIP
} hipPointer_attribute;
  3646. #endif // !defined(__HIPCC_RTC__)
  3647. #else
  3648. #error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__");
  3649. #endif
  3650. #endif
  3651. /*
  3652. Copyright (c) 2022 - 2023 Advanced Micro Devices, Inc. All rights reserved.
  3653. Permission is hereby granted, free of charge, to any person obtaining a copy
  3654. of this software and associated documentation files (the "Software"), to deal
  3655. in the Software without restriction, including without limitation the rights
  3656. to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  3657. copies of the Software, and to permit persons to whom the Software is
  3658. furnished to do so, subject to the following conditions:
  3659. The above copyright notice and this permission notice shall be included in
  3660. all copies or substantial portions of the Software.
  3661. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  3662. IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  3663. FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  3664. AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  3665. LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  3666. OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  3667. THE SOFTWARE.
  3668. */
  3669. /**
  3670. * @file surface_types.h
  3671. * @brief Defines surface types for HIP runtime.
  3672. */
  3673. #ifndef HIP_INCLUDE_HIP_SURFACE_TYPES_H
  3674. #define HIP_INCLUDE_HIP_SURFACE_TYPES_H
  3675. #if defined(__clang__)
  3676. #pragma clang diagnostic push
  3677. #pragma clang diagnostic ignored "-Wreserved-identifier"
  3678. #endif
  3679. #if !defined(__HIPCC_RTC__)
  3680. #include <hip/driver_types.h>
  3681. #endif
  3682. /**
  3683. * An opaque value that represents a hip surface object
  3684. */
  3685. struct __hip_surface;
  3686. typedef struct __hip_surface* hipSurfaceObject_t;
  3687. /**
  3688. * hip surface reference
  3689. */
struct surfaceReference {
    hipSurfaceObject_t surfaceObject; ///< Surface object backing this reference
};
  3693. /**
  3694. * hip surface boundary modes
  3695. */
enum hipSurfaceBoundaryMode {
    hipBoundaryModeZero = 0,  ///< Out-of-range accesses read as zero
    hipBoundaryModeTrap = 1,  ///< Out-of-range accesses cause an error (trap)
    hipBoundaryModeClamp = 2  ///< Out-of-range accesses are clamped to the edge
};
  3701. #if defined(__clang__)
  3702. #pragma clang diagnostic pop
  3703. #endif
  3704. #endif /* !HIP_INCLUDE_HIP_SURFACE_TYPES_H */
  3705. /*
  3706. Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
  3707. Permission is hereby granted, free of charge, to any person obtaining a copy
  3708. of this software and associated documentation files (the "Software"), to deal
  3709. in the Software without restriction, including without limitation the rights
  3710. to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  3711. copies of the Software, and to permit persons to whom the Software is
  3712. furnished to do so, subject to the following conditions:
  3713. The above copyright notice and this permission notice shall be included in
  3714. all copies or substantial portions of the Software.
  3715. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  3716. IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  3717. FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  3718. AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  3719. LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  3720. OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  3721. THE SOFTWARE.
  3722. */
  3723. #ifndef HIP_INCLUDE_HIP_AMD_DETAIL_CHANNEL_DESCRIPTOR_H
  3724. #define HIP_INCLUDE_HIP_AMD_DETAIL_CHANNEL_DESCRIPTOR_H
  3725. #if !defined(__HIPCC_RTC__)
  3726. #include <hip/hip_common.h>
  3727. #include <hip/driver_types.h>
  3728. #include <hip/amd_detail/amd_hip_vector_types.h>
  3729. #endif
  3730. #ifdef __cplusplus
  3731. extern "C" HIP_PUBLIC_API
  3732. hipChannelFormatDesc hipCreateChannelDesc(int x, int y, int z, int w, hipChannelFormatKind f);
  3733. static inline hipChannelFormatDesc hipCreateChannelDescHalf() {
  3734. int e = (int)sizeof(unsigned short) * 8;
  3735. return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindFloat);
  3736. }
  3737. static inline hipChannelFormatDesc hipCreateChannelDescHalf1() {
  3738. int e = (int)sizeof(unsigned short) * 8;
  3739. return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindFloat);
  3740. }
  3741. static inline hipChannelFormatDesc hipCreateChannelDescHalf2() {
  3742. int e = (int)sizeof(unsigned short) * 8;
  3743. return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindFloat);
  3744. }
  3745. static inline hipChannelFormatDesc hipCreateChannelDescHalf4() {
  3746. int e = (int)sizeof(unsigned short) * 8;
  3747. return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindFloat);
  3748. }
  3749. template <typename T>
  3750. static inline hipChannelFormatDesc hipCreateChannelDesc() {
  3751. return hipCreateChannelDesc(0, 0, 0, 0, hipChannelFormatKindNone);
  3752. }
  3753. template <>
  3754. inline hipChannelFormatDesc hipCreateChannelDesc<char>() {
  3755. int e = (int)sizeof(char) * 8;
  3756. return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
  3757. }
  3758. template <>
  3759. inline hipChannelFormatDesc hipCreateChannelDesc<signed char>() {
  3760. int e = (int)sizeof(signed char) * 8;
  3761. return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
  3762. }
  3763. template <>
  3764. inline hipChannelFormatDesc hipCreateChannelDesc<unsigned char>() {
  3765. int e = (int)sizeof(unsigned char) * 8;
  3766. return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
  3767. }
  3768. template <>
  3769. inline hipChannelFormatDesc hipCreateChannelDesc<uchar1>() {
  3770. int e = (int)sizeof(unsigned char) * 8;
  3771. return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
  3772. }
  3773. template <>
  3774. inline hipChannelFormatDesc hipCreateChannelDesc<char1>() {
  3775. int e = (int)sizeof(signed char) * 8;
  3776. return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
  3777. }
  3778. template <>
  3779. inline hipChannelFormatDesc hipCreateChannelDesc<uchar2>() {
  3780. int e = (int)sizeof(unsigned char) * 8;
  3781. return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindUnsigned);
  3782. }
  3783. template <>
  3784. inline hipChannelFormatDesc hipCreateChannelDesc<char2>() {
  3785. int e = (int)sizeof(signed char) * 8;
  3786. return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindSigned);
  3787. }
  3788. #ifndef __GNUC__ // vector3 is the same as vector4
  3789. template <>
  3790. inline hipChannelFormatDesc hipCreateChannelDesc<uchar3>() {
  3791. int e = (int)sizeof(unsigned char) * 8;
  3792. return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindUnsigned);
  3793. }
  3794. template <>
  3795. inline hipChannelFormatDesc hipCreateChannelDesc<char3>() {
  3796. int e = (int)sizeof(signed char) * 8;
  3797. return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindSigned);
  3798. }
  3799. #endif
  3800. template <>
  3801. inline hipChannelFormatDesc hipCreateChannelDesc<uchar4>() {
  3802. int e = (int)sizeof(unsigned char) * 8;
  3803. return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindUnsigned);
  3804. }
  3805. template <>
  3806. inline hipChannelFormatDesc hipCreateChannelDesc<char4>() {
  3807. int e = (int)sizeof(signed char) * 8;
  3808. return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindSigned);
  3809. }
  3810. template <>
  3811. inline hipChannelFormatDesc hipCreateChannelDesc<unsigned short>() {
  3812. int e = (int)sizeof(unsigned short) * 8;
  3813. return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
  3814. }
  3815. template <>
  3816. inline hipChannelFormatDesc hipCreateChannelDesc<signed short>() {
  3817. int e = (int)sizeof(signed short) * 8;
  3818. return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
  3819. }
  3820. template <>
  3821. inline hipChannelFormatDesc hipCreateChannelDesc<ushort1>() {
  3822. int e = (int)sizeof(unsigned short) * 8;
  3823. return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
  3824. }
  3825. template <>
  3826. inline hipChannelFormatDesc hipCreateChannelDesc<short1>() {
  3827. int e = (int)sizeof(signed short) * 8;
  3828. return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
  3829. }
  3830. template <>
  3831. inline hipChannelFormatDesc hipCreateChannelDesc<ushort2>() {
  3832. int e = (int)sizeof(unsigned short) * 8;
  3833. return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindUnsigned);
  3834. }
  3835. template <>
  3836. inline hipChannelFormatDesc hipCreateChannelDesc<short2>() {
  3837. int e = (int)sizeof(signed short) * 8;
  3838. return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindSigned);
  3839. }
  3840. #ifndef __GNUC__
  3841. template <>
  3842. inline hipChannelFormatDesc hipCreateChannelDesc<ushort3>() {
  3843. int e = (int)sizeof(unsigned short) * 8;
  3844. return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindUnsigned);
  3845. }
  3846. template <>
  3847. inline hipChannelFormatDesc hipCreateChannelDesc<short3>() {
  3848. int e = (int)sizeof(signed short) * 8;
  3849. return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindSigned);
  3850. }
  3851. #endif
  3852. template <>
  3853. inline hipChannelFormatDesc hipCreateChannelDesc<ushort4>() {
  3854. int e = (int)sizeof(unsigned short) * 8;
  3855. return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindUnsigned);
  3856. }
  3857. template <>
  3858. inline hipChannelFormatDesc hipCreateChannelDesc<short4>() {
  3859. int e = (int)sizeof(signed short) * 8;
  3860. return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindSigned);
  3861. }
  3862. template <>
  3863. inline hipChannelFormatDesc hipCreateChannelDesc<unsigned int>() {
  3864. int e = (int)sizeof(unsigned int) * 8;
  3865. return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
  3866. }
  3867. template <>
  3868. inline hipChannelFormatDesc hipCreateChannelDesc<signed int>() {
  3869. int e = (int)sizeof(signed int) * 8;
  3870. return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
  3871. }
  3872. template <>
  3873. inline hipChannelFormatDesc hipCreateChannelDesc<uint1>() {
  3874. int e = (int)sizeof(unsigned int) * 8;
  3875. return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
  3876. }
  3877. template <>
  3878. inline hipChannelFormatDesc hipCreateChannelDesc<int1>() {
  3879. int e = (int)sizeof(signed int) * 8;
  3880. return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
  3881. }
  3882. template <>
  3883. inline hipChannelFormatDesc hipCreateChannelDesc<uint2>() {
  3884. int e = (int)sizeof(unsigned int) * 8;
  3885. return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindUnsigned);
  3886. }
  3887. template <>
  3888. inline hipChannelFormatDesc hipCreateChannelDesc<int2>() {
  3889. int e = (int)sizeof(signed int) * 8;
  3890. return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindSigned);
  3891. }
  3892. #ifndef __GNUC__
  3893. template <>
  3894. inline hipChannelFormatDesc hipCreateChannelDesc<uint3>() {
  3895. int e = (int)sizeof(unsigned int) * 8;
  3896. return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindUnsigned);
  3897. }
  3898. template <>
  3899. inline hipChannelFormatDesc hipCreateChannelDesc<int3>() {
  3900. int e = (int)sizeof(signed int) * 8;
  3901. return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindSigned);
  3902. }
  3903. #endif
  3904. template <>
  3905. inline hipChannelFormatDesc hipCreateChannelDesc<uint4>() {
  3906. int e = (int)sizeof(unsigned int) * 8;
  3907. return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindUnsigned);
  3908. }
  3909. template <>
  3910. inline hipChannelFormatDesc hipCreateChannelDesc<int4>() {
  3911. int e = (int)sizeof(signed int) * 8;
  3912. return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindSigned);
  3913. }
  3914. template <>
  3915. inline hipChannelFormatDesc hipCreateChannelDesc<float>() {
  3916. int e = (int)sizeof(float) * 8;
  3917. return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindFloat);
  3918. }
  3919. template <>
  3920. inline hipChannelFormatDesc hipCreateChannelDesc<float1>() {
  3921. int e = (int)sizeof(float) * 8;
  3922. return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindFloat);
  3923. }
  3924. template <>
  3925. inline hipChannelFormatDesc hipCreateChannelDesc<float2>() {
  3926. int e = (int)sizeof(float) * 8;
  3927. return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindFloat);
  3928. }
  3929. #ifndef __GNUC__
  3930. template <>
  3931. inline hipChannelFormatDesc hipCreateChannelDesc<float3>() {
  3932. int e = (int)sizeof(float) * 8;
  3933. return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindFloat);
  3934. }
  3935. #endif
  3936. template <>
  3937. inline hipChannelFormatDesc hipCreateChannelDesc<float4>() {
  3938. int e = (int)sizeof(float) * 8;
  3939. return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindFloat);
  3940. }
  3941. #if !defined(__LP64__)
  3942. template <>
  3943. inline hipChannelFormatDesc hipCreateChannelDesc<unsigned long>() {
  3944. int e = (int)sizeof(unsigned long) * 8;
  3945. return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
  3946. }
  3947. template <>
  3948. inline hipChannelFormatDesc hipCreateChannelDesc<signed long>() {
  3949. int e = (int)sizeof(signed long) * 8;
  3950. return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
  3951. }
  3952. template <>
  3953. inline hipChannelFormatDesc hipCreateChannelDesc<ulong1>() {
  3954. int e = (int)sizeof(unsigned long) * 8;
  3955. return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
  3956. }
  3957. template <>
  3958. inline hipChannelFormatDesc hipCreateChannelDesc<long1>() {
  3959. int e = (int)sizeof(signed long) * 8;
  3960. return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
  3961. }
  3962. template <>
  3963. inline hipChannelFormatDesc hipCreateChannelDesc<ulong2>() {
  3964. int e = (int)sizeof(unsigned long) * 8;
  3965. return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindUnsigned);
  3966. }
  3967. template <>
  3968. inline hipChannelFormatDesc hipCreateChannelDesc<long2>() {
  3969. int e = (int)sizeof(signed long) * 8;
  3970. return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindSigned);
  3971. }
  3972. #ifndef __GNUC__
  3973. template <>
  3974. inline hipChannelFormatDesc hipCreateChannelDesc<ulong3>() {
  3975. int e = (int)sizeof(unsigned long) * 8;
  3976. return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindUnsigned);
  3977. }
  3978. template <>
  3979. inline hipChannelFormatDesc hipCreateChannelDesc<long3>() {
  3980. int e = (int)sizeof(signed long) * 8;
  3981. return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindSigned);
  3982. }
  3983. #endif
  3984. template <>
  3985. inline hipChannelFormatDesc hipCreateChannelDesc<ulong4>() {
  3986. int e = (int)sizeof(unsigned long) * 8;
  3987. return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindUnsigned);
  3988. }
  3989. template <>
  3990. inline hipChannelFormatDesc hipCreateChannelDesc<long4>() {
  3991. int e = (int)sizeof(signed long) * 8;
  3992. return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindSigned);
  3993. }
  3994. #endif /* !__LP64__ */
  3995. #else
  3996. struct hipChannelFormatDesc hipCreateChannelDesc(int x, int y, int z, int w,
  3997. enum hipChannelFormatKind f);
  3998. #endif /* __cplusplus */
  3999. #endif /* !HIP_INCLUDE_HIP_AMD_DETAIL_CHANNEL_DESCRIPTOR_H */
  4000. /*
  4001. Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
  4002. Permission is hereby granted, free of charge, to any person obtaining a copy
  4003. of this software and associated documentation files (the "Software"), to deal
  4004. in the Software without restriction, including without limitation the rights
  4005. to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  4006. copies of the Software, and to permit persons to whom the Software is
  4007. furnished to do so, subject to the following conditions:
  4008. The above copyright notice and this permission notice shall be included in
  4009. all copies or substantial portions of the Software.
  4010. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  4011. IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  4012. FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  4013. AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  4014. LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  4015. OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  4016. THE SOFTWARE.
  4017. */
  4018. #ifndef HIP_INCLUDE_HIP_TEXTURE_TYPES_H
  4019. #define HIP_INCLUDE_HIP_TEXTURE_TYPES_H
  4020. #if defined(__clang__)
  4021. #pragma clang diagnostic push
  4022. #pragma clang diagnostic ignored "-Wreserved-identifier"
  4023. #pragma clang diagnostic ignored "-Wreserved-macro-identifier"
  4024. #pragma clang diagnostic ignored "-Wc++98-compat"
  4025. #endif
  4026. #if !defined(__HIPCC_RTC__)
  4027. #include <hip/hip_common.h>
  4028. #endif
  4029. #if !defined(__HIP_PLATFORM_AMD__) && defined(__HIP_PLATFORM_NVIDIA__)
  4030. #include "texture_types.h"
  4031. #elif defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_NVIDIA__)
  4032. /*******************************************************************************
  4033. * *
  4034. * *
  4035. * *
  4036. *******************************************************************************/
  4037. #if !defined(__HIPCC_RTC__)
  4038. #include <limits.h>
  4039. #include <hip/channel_descriptor.h>
  4040. #include <hip/driver_types.h>
  4041. #endif // !defined(__HIPCC_RTC__)
  4042. #define hipTextureType1D 0x01
  4043. #define hipTextureType2D 0x02
  4044. #define hipTextureType3D 0x03
  4045. #define hipTextureTypeCubemap 0x0C
  4046. #define hipTextureType1DLayered 0xF1
  4047. #define hipTextureType2DLayered 0xF2
  4048. #define hipTextureTypeCubemapLayered 0xFC
  4049. /**
  4050. * Should be same as HSA_IMAGE_OBJECT_SIZE_DWORD/HSA_SAMPLER_OBJECT_SIZE_DWORD
  4051. */
  4052. #define HIP_IMAGE_OBJECT_SIZE_DWORD 12
  4053. #define HIP_SAMPLER_OBJECT_SIZE_DWORD 8
  4054. #define HIP_SAMPLER_OBJECT_OFFSET_DWORD HIP_IMAGE_OBJECT_SIZE_DWORD
  4055. #define HIP_TEXTURE_OBJECT_SIZE_DWORD (HIP_IMAGE_OBJECT_SIZE_DWORD + HIP_SAMPLER_OBJECT_SIZE_DWORD)
  4056. /**
  4057. * An opaque value that represents a hip texture object
  4058. */
  4059. struct __hip_texture;
  4060. typedef struct __hip_texture* hipTextureObject_t;
  4061. /**
  4062. * hip texture address modes
  4063. */
enum hipTextureAddressMode {
    hipAddressModeWrap = 0,   ///< Wrap (repeat) out-of-range coordinates
    hipAddressModeClamp = 1,  ///< Clamp coordinates to the valid range
    hipAddressModeMirror = 2, ///< Mirror (reflect) out-of-range coordinates
    hipAddressModeBorder = 3  ///< Out-of-range reads return the border color
};
/**
 * hip texture filter modes
 */
enum hipTextureFilterMode {
    hipFilterModePoint = 0, ///< Nearest-neighbour (point) sampling
    hipFilterModeLinear = 1 ///< Linear interpolation between texels
};
/**
 * hip texture read modes
 */
enum hipTextureReadMode {
    hipReadModeElementType = 0,    ///< Return texels in their stored element type
    hipReadModeNormalizedFloat = 1 ///< Return texels converted to normalized float
};
  4078. /**
  4079. * hip texture reference
  4080. */
typedef struct textureReference {
    int normalized;                             ///< Nonzero if coordinates are normalized
    enum hipTextureReadMode readMode;           // used only for driver API's
    enum hipTextureFilterMode filterMode;       ///< Filtering mode used when sampling
    enum hipTextureAddressMode addressMode[3];  // Texture address mode for up to 3 dimensions
    struct hipChannelFormatDesc channelDesc;    ///< Channel format of the bound resource
    int sRGB;                                   // Perform sRGB->linear conversion during texture read
    unsigned int maxAnisotropy;                 // Limit to the anisotropy ratio
    enum hipTextureFilterMode mipmapFilterMode; ///< Filtering mode used between mipmap levels
    float mipmapLevelBias;                      ///< Offset applied to the computed mipmap level
    float minMipmapLevelClamp;                  ///< Lower clamp on the mipmap level
    float maxMipmapLevelClamp;                  ///< Upper clamp on the mipmap level
    hipTextureObject_t textureObject;           ///< Underlying texture object
    int numChannels;                            ///< Number of channels (driver API)
    enum hipArray_Format format;                ///< Array format (driver API)
}textureReference;
  4097. /**
  4098. * hip texture descriptor
  4099. */
typedef struct hipTextureDesc {
    enum hipTextureAddressMode addressMode[3];  // Texture address mode for up to 3 dimensions
    enum hipTextureFilterMode filterMode;       ///< Filtering mode used when sampling
    enum hipTextureReadMode readMode;           ///< Element-type or normalized-float reads
    int sRGB;                                   // Perform sRGB->linear conversion during texture read
    float borderColor[4];                       ///< Border color used by hipAddressModeBorder
    int normalizedCoords;                       ///< Nonzero if coordinates are normalized
    unsigned int maxAnisotropy;                 ///< Limit to the anisotropy ratio
    enum hipTextureFilterMode mipmapFilterMode; ///< Filtering mode used between mipmap levels
    float mipmapLevelBias;                      ///< Offset applied to the computed mipmap level
    float minMipmapLevelClamp;                  ///< Lower clamp on the mipmap level
    float maxMipmapLevelClamp;                  ///< Upper clamp on the mipmap level
}hipTextureDesc;
  4113. #if __cplusplus
  4114. /*******************************************************************************
  4115. * *
  4116. * *
  4117. * *
  4118. *******************************************************************************/
  4119. #if __HIP__
  4120. #define __HIP_TEXTURE_ATTRIB __attribute__((device_builtin_texture_type))
  4121. #else
  4122. #define __HIP_TEXTURE_ATTRIB
  4123. #endif
typedef textureReference* hipTexRef;  // pointer alias for a texture reference
  4125. template <class T, int texType = hipTextureType1D,
  4126. enum hipTextureReadMode mode = hipReadModeElementType>
  4127. struct __HIP_TEXTURE_ATTRIB texture : public textureReference {
  4128. texture(int norm = 0, enum hipTextureFilterMode fMode = hipFilterModePoint,
  4129. enum hipTextureAddressMode aMode = hipAddressModeClamp) {
  4130. normalized = norm;
  4131. readMode = mode;
  4132. filterMode = fMode;
  4133. addressMode[0] = aMode;
  4134. addressMode[1] = aMode;
  4135. addressMode[2] = aMode;
  4136. channelDesc = hipCreateChannelDesc<T>();
  4137. sRGB = 0;
  4138. textureObject = nullptr;
  4139. maxAnisotropy = 0;
  4140. mipmapLevelBias = 0;
  4141. minMipmapLevelClamp = 0;
  4142. maxMipmapLevelClamp = 0;
  4143. }
  4144. texture(int norm, enum hipTextureFilterMode fMode, enum hipTextureAddressMode aMode,
  4145. struct hipChannelFormatDesc desc) {
  4146. normalized = norm;
  4147. readMode = mode;
  4148. filterMode = fMode;
  4149. addressMode[0] = aMode;
  4150. addressMode[1] = aMode;
  4151. addressMode[2] = aMode;
  4152. channelDesc = desc;
  4153. sRGB = 0;
  4154. textureObject = nullptr;
  4155. maxAnisotropy = 0;
  4156. mipmapLevelBias = 0;
  4157. minMipmapLevelClamp = 0;
  4158. maxMipmapLevelClamp = 0;
  4159. }
  4160. };
  4161. #endif /* __cplusplus */
  4162. #else
  4163. #error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__");
  4164. #endif
  4165. #if defined(__clang__)
  4166. #pragma clang diagnostic pop
  4167. #endif
  4168. #endif
  4169. /*
  4170. Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
  4171. Permission is hereby granted, free of charge, to any person obtaining a copy
  4172. of this software and associated documentation files (the "Software"), to deal
  4173. in the Software without restriction, including without limitation the rights
  4174. to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  4175. copies of the Software, and to permit persons to whom the Software is
  4176. furnished to do so, subject to the following conditions:
  4177. The above copyright notice and this permission notice shall be included in
  4178. all copies or substantial portions of the Software.
  4179. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  4180. IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  4181. FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  4182. AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  4183. LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  4184. OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  4185. THE SOFTWARE.
  4186. */
  4187. #pragma once
  4188. #if !defined(__HIPCC_RTC__)
  4189. #include <hip/hip_vector_types.h>
  4190. #endif
extern "C" {
// Image and sampler descriptors live in the AMDGCN constant address space (4).
#define ADDRESS_SPACE_CONSTANT __attribute__((address_space(4)))
// Parameter conventions used throughout (as seen at the call sites below):
//   i = image descriptor, s = sampler descriptor, c = coordinate(s),
//   l = mip level, f = cube face, p = texel payload,
//   dx/dy = derivatives for explicit-gradient sampling.
// --- integer-coordinate texel loads ---
__device__ float4::Native_vec_ __ockl_image_load_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, int c);
__device__ float4::Native_vec_ __ockl_image_load_1Db(unsigned int ADDRESS_SPACE_CONSTANT*i, int c);
__device__ float4::Native_vec_ __ockl_image_load_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c);
__device__ float4::Native_vec_ __ockl_image_load_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c);
__device__ float4::Native_vec_ __ockl_image_load_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c);
__device__ float4::Native_vec_ __ockl_image_load_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c);
__device__ float4::Native_vec_ __ockl_image_load_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int f);
__device__ float4::Native_vec_ __ockl_image_load_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int f);
// --- loads from an explicit mip level ---
__device__ float4::Native_vec_ __ockl_image_load_lod_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, int c, int l);
__device__ float4::Native_vec_ __ockl_image_load_lod_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int l);
__device__ float4::Native_vec_ __ockl_image_load_lod_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int l);
__device__ float4::Native_vec_ __ockl_image_load_lod_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int l);
__device__ float4::Native_vec_ __ockl_image_load_lod_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int l);
__device__ float4::Native_vec_ __ockl_image_load_lod_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int f, int l);
__device__ float4::Native_vec_ __ockl_image_load_lod_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int f, int l);
// --- texel stores ---
__device__ void __ockl_image_store_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, int c, float4::Native_vec_ p);
__device__ void __ockl_image_store_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, float4::Native_vec_ p);
__device__ void __ockl_image_store_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, float4::Native_vec_ p);
__device__ void __ockl_image_store_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, float4::Native_vec_ p);
__device__ void __ockl_image_store_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, float4::Native_vec_ p);
__device__ void __ockl_image_store_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int f, float4::Native_vec_ p);
__device__ void __ockl_image_store_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int f, float4::Native_vec_ p);
// --- stores to an explicit mip level ---
__device__ void __ockl_image_store_lod_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, int c, int l, float4::Native_vec_ p);
__device__ void __ockl_image_store_lod_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int l, float4::Native_vec_ p);
__device__ void __ockl_image_store_lod_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int l, float4::Native_vec_ p);
__device__ void __ockl_image_store_lod_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int l, float4::Native_vec_ p);
__device__ void __ockl_image_store_lod_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int l, float4::Native_vec_ p);
__device__ void __ockl_image_store_lod_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int f, int l, float4::Native_vec_ p);
__device__ void __ockl_image_store_lod_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int f, int l, float4::Native_vec_ p);
// --- float-coordinate sampling through a sampler descriptor ---
__device__ float4::Native_vec_ __ockl_image_sample_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float c);
__device__ float4::Native_vec_ __ockl_image_sample_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c);
__device__ float4::Native_vec_ __ockl_image_sample_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c);
__device__ float4::Native_vec_ __ockl_image_sample_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c);
__device__ float4::Native_vec_ __ockl_image_sample_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c);
__device__ float4::Native_vec_ __ockl_image_sample_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c);
__device__ float4::Native_vec_ __ockl_image_sample_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c);
// --- sampling with explicit gradients ---
__device__ float4::Native_vec_ __ockl_image_sample_grad_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float c, float dx, float dy);
__device__ float4::Native_vec_ __ockl_image_sample_grad_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c, float dx, float dy);
__device__ float4::Native_vec_ __ockl_image_sample_grad_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c, float2::Native_vec_ dx, float2::Native_vec_ dy);
__device__ float4::Native_vec_ __ockl_image_sample_grad_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float2::Native_vec_ dx, float2::Native_vec_ dy);
__device__ float4::Native_vec_ __ockl_image_sample_grad_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float4::Native_vec_ dx, float4::Native_vec_ dy);
// --- sampling from an explicit level of detail ---
__device__ float4::Native_vec_ __ockl_image_sample_lod_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float c, float l);
__device__ float4::Native_vec_ __ockl_image_sample_lod_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c, float l);
__device__ float4::Native_vec_ __ockl_image_sample_lod_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c, float l);
__device__ float4::Native_vec_ __ockl_image_sample_lod_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float l);
__device__ float4::Native_vec_ __ockl_image_sample_lod_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float l);
__device__ float4::Native_vec_ __ockl_image_sample_lod_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float l);
__device__ float4::Native_vec_ __ockl_image_sample_lod_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float l);
// --- 2D gather, one function per channel (r/g/b/a) ---
__device__ float4::Native_vec_ __ockl_image_gather4r_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c);
__device__ float4::Native_vec_ __ockl_image_gather4g_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c);
__device__ float4::Native_vec_ __ockl_image_gather4b_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c);
__device__ float4::Native_vec_ __ockl_image_gather4a_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c);
// --- image metadata queries ---
__device__ int __ockl_image_channel_data_type_1D(unsigned int ADDRESS_SPACE_CONSTANT* i);
__device__ int __ockl_image_channel_data_type_1Da(unsigned int ADDRESS_SPACE_CONSTANT* i);
__device__ int __ockl_image_channel_data_type_1Db(unsigned int ADDRESS_SPACE_CONSTANT* i);
__device__ int __ockl_image_channel_data_type_2D(unsigned int ADDRESS_SPACE_CONSTANT* i);
__device__ int __ockl_image_channel_data_type_2Da(unsigned int ADDRESS_SPACE_CONSTANT* i);
__device__ int __ockl_image_channel_data_type_2Dad(unsigned int ADDRESS_SPACE_CONSTANT* i);
__device__ int __ockl_image_channel_data_type_2Dd(unsigned int ADDRESS_SPACE_CONSTANT* i);
__device__ int __ockl_image_channel_data_type_3D(unsigned int ADDRESS_SPACE_CONSTANT* i);
__device__ int __ockl_image_channel_data_type_CM(unsigned int ADDRESS_SPACE_CONSTANT* i);
__device__ int __ockl_image_channel_data_type_CMa(unsigned int ADDRESS_SPACE_CONSTANT* i);
__device__ int __ockl_image_channel_order_1D(unsigned int ADDRESS_SPACE_CONSTANT* i);
__device__ int __ockl_image_channel_order_1Da(unsigned int ADDRESS_SPACE_CONSTANT* i);
__device__ int __ockl_image_channel_order_1Db(unsigned int ADDRESS_SPACE_CONSTANT* i);
__device__ int __ockl_image_channel_order_2D(unsigned int ADDRESS_SPACE_CONSTANT* i);
__device__ int __ockl_image_channel_order_2Da(unsigned int ADDRESS_SPACE_CONSTANT* i);
__device__ int __ockl_image_channel_order_2Dad(unsigned int ADDRESS_SPACE_CONSTANT* i);
__device__ int __ockl_image_channel_order_2Dd(unsigned int ADDRESS_SPACE_CONSTANT* i);
__device__ int __ockl_image_channel_order_3D(unsigned int ADDRESS_SPACE_CONSTANT* i);
__device__ int __ockl_image_channel_order_CM(unsigned int ADDRESS_SPACE_CONSTANT* i);
__device__ int __ockl_image_channel_order_CMa(unsigned int ADDRESS_SPACE_CONSTANT* i);
}
  4266. /*
  4267. Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
  4268. Permission is hereby granted, free of charge, to any person obtaining a copy
  4269. of this software and associated documentation files (the "Software"), to deal
  4270. in the Software without restriction, including without limitation the rights
  4271. to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  4272. copies of the Software, and to permit persons to whom the Software is
  4273. furnished to do so, subject to the following conditions:
  4274. The above copyright notice and this permission notice shall be included in
  4275. all copies or substantial portions of the Software.
  4276. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  4277. IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  4278. FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  4279. AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  4280. LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  4281. OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  4282. THE SOFTWARE.
  4283. */
  4284. #pragma once
  4285. #if defined(__cplusplus)
  4286. #if !defined(__HIPCC_RTC__)
  4287. #include <hip/hip_vector_types.h>
  4288. #include <hip/hip_texture_types.h>
  4289. #include <hip/amd_detail/ockl_image.h>
  4290. #include <type_traits>
  4291. #endif // !defined(__HIPCC_RTC__)
// Expands the enclosing function's texture argument `t` into the descriptor
// pointers the __ockl_image_* intrinsics expect: `i` is the image descriptor
// (the textureObject handle reinterpreted as a constant-address-space dword
// pointer) and `s` is the sampler descriptor located
// HIP_SAMPLER_OBJECT_OFFSET_DWORD dwords past `i`.
#define TEXTURE_PARAMETERS_INIT \
unsigned int ADDRESS_SPACE_CONSTANT* i = (unsigned int ADDRESS_SPACE_CONSTANT*)t.textureObject; \
unsigned int ADDRESS_SPACE_CONSTANT* s = i + HIP_SAMPLER_OBJECT_OFFSET_DWORD;
// Trait: true for the scalar types permitted as texture/surface channels
// (8/16/32-bit signed and unsigned integers, and float). Note: long/double
// and the explicitly `signed char` type are intentionally excluded.
template<typename T>
struct __hip_is_tex_surf_scalar_channel_type
{
    static constexpr bool value =
        std::is_same<T, char>::value ||
        std::is_same<T, unsigned char>::value ||
        std::is_same<T, short>::value ||
        std::is_same<T, unsigned short>::value ||
        std::is_same<T, int>::value ||
        std::is_same<T, unsigned int>::value ||
        std::is_same<T, float>::value;
};
// Trait: true for any valid texture/surface channel type. Primary template
// handles scalars; the HIP_vector_type specialization below handles vectors.
template<typename T>
struct __hip_is_tex_surf_channel_type
{
    static constexpr bool value =
        __hip_is_tex_surf_scalar_channel_type<T>::value;
};
// Vector specialization: valid when the element is a valid scalar channel
// and the rank is 1, 2 or 4 (3-component vectors are not supported).
template<
    typename T,
    unsigned int rank>
struct __hip_is_tex_surf_channel_type<HIP_vector_type<T, rank>>
{
    static constexpr bool value =
        __hip_is_tex_surf_scalar_channel_type<T>::value &&
        ((rank == 1) ||
         (rank == 2) ||
         (rank == 4));
};
// Trait: true for scalar channel types usable with hipReadModeNormalizedFloat
// (only 8- and 16-bit integer channels can be read as normalized floats).
template<typename T>
struct __hip_is_tex_normalized_channel_type
{
    static constexpr bool value =
        std::is_same<T, char>::value ||
        std::is_same<T, unsigned char>::value ||
        std::is_same<T, short>::value ||
        std::is_same<T, unsigned short>::value;
};
// Vector specialization of the normalized-read trait: element must be a
// normalizable scalar and the rank must be 1, 2 or 4.
template<
    typename T,
    unsigned int rank>
struct __hip_is_tex_normalized_channel_type<HIP_vector_type<T, rank>>
{
    static constexpr bool value =
        __hip_is_tex_normalized_channel_type<T>::value &&
        ((rank == 1) ||
         (rank == 2) ||
         (rank == 4));
};
// Primary template for the tex* return-type map. It is only instantiated
// when no specialization matches (invalid channel-type/read-mode pair); the
// alias __hip_tex_ret_t passes Enable = bool, so the static_assert below
// fails and produces a readable diagnostic.
template <
    typename T,
    hipTextureReadMode readMode,
    typename Enable = void>
struct __hip_tex_ret
{
    static_assert(std::is_same<Enable, void>::value, "Invalid channel type!");
};
  4352. /*
  4353. * Map from device function return U to scalar texture type T
  4354. */
  4355. template<typename T, typename U>
  4356. __forceinline__ __device__
  4357. typename std::enable_if<
  4358. __hip_is_tex_surf_scalar_channel_type<T>::value, const T>::type
  4359. __hipMapFrom(const U &u) {
  4360. if constexpr (sizeof(T) < sizeof(float)) {
  4361. union {
  4362. U u;
  4363. int i;
  4364. } d = { u };
  4365. return static_cast<T>(d.i);
  4366. } else { // sizeof(T) == sizeof(float)
  4367. union {
  4368. U u;
  4369. T t;
  4370. } d = { u };
  4371. return d.t;
  4372. }
  4373. }
  4374. /*
  4375. * Map from device function return U to vector texture type T
  4376. */
  4377. template<typename T, typename U>
  4378. __forceinline__ __device__
  4379. typename std::enable_if<
  4380. __hip_is_tex_surf_scalar_channel_type<typename T::value_type>::value, const T>::type
  4381. __hipMapFrom(const U &u) {
  4382. if constexpr (sizeof(typename T::value_type) < sizeof(float)) {
  4383. union {
  4384. U u;
  4385. int4 i4;
  4386. } d = { u };
  4387. return __hipMapVector<typename T::value_type, sizeof(T)/sizeof(typename T::value_type)>(d.i4);
  4388. } else { // sizeof(typename T::value_type) == sizeof(float)
  4389. union {
  4390. U u;
  4391. T t;
  4392. } d = { u };
  4393. return d.t;
  4394. }
  4395. }
  4396. /*
  4397. * Map from scalar texture type T to device function input U
  4398. */
  4399. template<typename U, typename T>
  4400. __forceinline__ __device__
  4401. typename std::enable_if<
  4402. __hip_is_tex_surf_scalar_channel_type<T>::value, const U>::type
  4403. __hipMapTo(const T &t) {
  4404. if constexpr (sizeof(T) < sizeof(float)) {
  4405. union {
  4406. U u;
  4407. int i;
  4408. } d = { 0 };
  4409. d.i = static_cast<int>(t);
  4410. return d.u;
  4411. } else { // sizeof(T) == sizeof(float)
  4412. union {
  4413. U u;
  4414. T t;
  4415. } d = { 0 };
  4416. d.t = t;
  4417. return d.u;
  4418. }
  4419. }
  4420. /*
  4421. * Map from vector texture type T to device function input U
  4422. */
  4423. template<typename U, typename T>
  4424. __forceinline__ __device__
  4425. typename std::enable_if<
  4426. __hip_is_tex_surf_scalar_channel_type<typename T::value_type>::value, const U>::type
  4427. __hipMapTo(const T &t) {
  4428. if constexpr (sizeof(typename T::value_type) < sizeof(float)) {
  4429. union {
  4430. U u;
  4431. int4 i4;
  4432. } d = { 0 };
  4433. d.i4 = __hipMapVector<int, 4>(t);
  4434. return d.u;
  4435. } else { // sizeof(typename T::value_type) == sizeof(float)
  4436. union {
  4437. U u;
  4438. T t;
  4439. } d = { 0 };
  4440. d.t = t;
  4441. return d.u;
  4442. }
  4443. }
// Convenience alias: passing Enable = bool selects the matching
// specialization (or triggers the primary template's static_assert).
template <
    typename T,
    hipTextureReadMode readMode>
using __hip_tex_ret_t = typename __hip_tex_ret<T, readMode, bool>::type;
// Element-type reads of a scalar channel return the channel type unchanged.
template <typename T>
struct __hip_tex_ret<
    T,
    hipReadModeElementType,
    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value, bool>::type>
{
    using type = T;
};
// Element-type reads of a vector channel map each element through the scalar
// rule and keep the rank.
template<
    typename T,
    unsigned int rank>
struct __hip_tex_ret<
    HIP_vector_type<T, rank>,
    hipReadModeElementType,
    typename std::enable_if<__hip_is_tex_surf_channel_type<HIP_vector_type<T, rank>>::value, bool>::type>
{
    using type = HIP_vector_type<__hip_tex_ret_t<T, hipReadModeElementType>, rank>;
};
// Normalized-float reads of a (narrow integer) scalar channel return float.
template<typename T>
struct __hip_tex_ret<
    T,
    hipReadModeNormalizedFloat,
    typename std::enable_if<__hip_is_tex_normalized_channel_type<T>::value, bool>::type>
{
    using type = float;
};
// Normalized-float reads of a vector channel return a float vector of the
// same rank.
template<
    typename T,
    unsigned int rank>
struct __hip_tex_ret<
    HIP_vector_type<T, rank>,
    hipReadModeNormalizedFloat,
    typename std::enable_if<__hip_is_tex_normalized_channel_type<HIP_vector_type<T, rank>>::value, bool>::type>
{
    using type = HIP_vector_type<__hip_tex_ret_t<T, hipReadModeNormalizedFloat>, rank>;
};
  4484. template <typename T, hipTextureReadMode readMode>
  4485. static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1Dfetch(texture<T, hipTextureType1D, readMode> t, int x)
  4486. {
  4487. TEXTURE_PARAMETERS_INIT;
  4488. auto tmp = __ockl_image_load_1Db(i, x);
  4489. return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
  4490. }
  4491. template <typename T, hipTextureReadMode readMode>
  4492. static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1D(texture<T, hipTextureType1D, readMode> t, float x)
  4493. {
  4494. TEXTURE_PARAMETERS_INIT;
  4495. auto tmp = __ockl_image_sample_1D(i, s, x);
  4496. return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
  4497. }
  4498. template <typename T, hipTextureReadMode readMode>
  4499. static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2D(texture<T, hipTextureType2D, readMode> t, float x, float y)
  4500. {
  4501. TEXTURE_PARAMETERS_INIT;
  4502. auto tmp = __ockl_image_sample_2D(i, s, float2(x, y).data);
  4503. return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
  4504. }
  4505. template <typename T, hipTextureReadMode readMode>
  4506. static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1DLayered(texture<T, hipTextureType1DLayered, readMode> t, float x, int layer)
  4507. {
  4508. TEXTURE_PARAMETERS_INIT;
  4509. auto tmp = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
  4510. return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
  4511. }
  4512. template <typename T, hipTextureReadMode readMode>
  4513. static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2DLayered(texture<T, hipTextureType2DLayered, readMode> t, float x, float y, int layer)
  4514. {
  4515. TEXTURE_PARAMETERS_INIT;
  4516. auto tmp = __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data);
  4517. return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
  4518. }
  4519. template <typename T, hipTextureReadMode readMode>
  4520. static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex3D(texture<T, hipTextureType3D, readMode> t, float x, float y, float z)
  4521. {
  4522. TEXTURE_PARAMETERS_INIT;
  4523. auto tmp = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
  4524. return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
  4525. }
  4526. template <typename T, hipTextureReadMode readMode>
  4527. static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemap(texture<T, hipTextureTypeCubemap, readMode> t, float x, float y, float z)
  4528. {
  4529. TEXTURE_PARAMETERS_INIT;
  4530. auto tmp = __ockl_image_sample_CM(i, s, float4(x, y, z, 0.0f).data);
  4531. return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
  4532. }
  4533. template <typename T, hipTextureReadMode readMode>
  4534. static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1DLod(texture<T, hipTextureType1D, readMode> t, float x, float level)
  4535. {
  4536. TEXTURE_PARAMETERS_INIT;
  4537. auto tmp = __ockl_image_sample_lod_1D(i, s, x, level);
  4538. return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
  4539. }
  4540. template <typename T, hipTextureReadMode readMode>
  4541. static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2DLod(texture<T, hipTextureType2D, readMode> t, float x, float y, float level)
  4542. {
  4543. TEXTURE_PARAMETERS_INIT;
  4544. auto tmp = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
  4545. return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
  4546. }
  4547. template <typename T, hipTextureReadMode readMode>
  4548. static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1DLayeredLod(texture<T, hipTextureType1DLayered, readMode> t, float x, int layer, float level)
  4549. {
  4550. TEXTURE_PARAMETERS_INIT;
  4551. auto tmp = __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level);
  4552. return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
  4553. }
  4554. template <typename T, hipTextureReadMode readMode>
  4555. static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2DLayeredLod(texture<T, hipTextureType2DLayered, readMode> t, float x, float y, int layer, float level)
  4556. {
  4557. TEXTURE_PARAMETERS_INIT;
  4558. auto tmp = __ockl_image_sample_lod_2Da(i, s, float4(x, y, layer, 0.0f).data, level);
  4559. return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
  4560. }
  4561. template <typename T, hipTextureReadMode readMode>
  4562. static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex3DLod(texture<T, hipTextureType3D, readMode> t, float x, float y, float z, float level)
  4563. {
  4564. TEXTURE_PARAMETERS_INIT;
  4565. auto tmp = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, level);
  4566. return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
  4567. }
  4568. template <typename T, hipTextureReadMode readMode>
  4569. static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemapLod(texture<T, hipTextureTypeCubemap, readMode> t, float x, float y, float z, float level)
  4570. {
  4571. TEXTURE_PARAMETERS_INIT;
  4572. auto tmp = __ockl_image_sample_lod_CM(i, s, float4(x, y, z, 0.0f).data, level);
  4573. return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
  4574. }
  4575. template <typename T, hipTextureReadMode readMode>
  4576. static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemapLayered(texture<T, hipTextureTypeCubemapLayered, readMode> t, float x, float y, float z, int layer)
  4577. {
  4578. TEXTURE_PARAMETERS_INIT;
  4579. auto tmp = __ockl_image_sample_CMa(i, s, float4(x, y, z, layer).data);
  4580. return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
  4581. }
  4582. template <typename T, hipTextureReadMode readMode>
  4583. static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemapLayeredLod(texture<T, hipTextureTypeCubemapLayered, readMode> t, float x, float y, float z, int layer, float level)
  4584. {
  4585. TEXTURE_PARAMETERS_INIT;
  4586. auto tmp = __ockl_image_sample_lod_CMa(i, s, float4(x, y, z, layer).data, level);
  4587. return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
  4588. }
// Cubemap sampling with explicit gradients. Currently a stub: the
// __ockl_image_sample_grad_CM intrinsic does not exist in the device libs
// yet, so this returns a value-initialized (zero) result. The intended call
// is preserved below for when the intrinsic lands.
template <typename T, hipTextureReadMode readMode>
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemapGrad(texture<T, hipTextureTypeCubemap, readMode> t, float x, float y, float z, float4 dPdx, float4 dPdy)
{
TEXTURE_PARAMETERS_INIT;
// TODO missing in device libs.
// auto tmp = __ockl_image_sample_grad_CM(i, s, float4(x, y, z, 0.0f).data, float4(dPdx.x, dPdx.y, dPdx.z, 0.0f).data, float4(dPdy.x, dPdy.y, dPdy.z, 0.0f).data);
// return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
return {};
}
// Layered-cubemap sampling with explicit gradients. Currently a stub: the
// __ockl_image_sample_grad_CMa intrinsic does not exist in the device libs
// yet, so this returns a value-initialized (zero) result. The intended call
// is preserved below for when the intrinsic lands.
template <typename T, hipTextureReadMode readMode>
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemapLayeredGrad(texture<T, hipTextureTypeCubemapLayered, readMode> t, float x, float y, float z, int layer, float4 dPdx, float4 dPdy)
{
TEXTURE_PARAMETERS_INIT;
// TODO missing in device libs.
// auto tmp = __ockl_image_sample_grad_CMa(i, s, float4(x, y, z, layer).data, float4(dPdx.x, dPdx.y, dPdx.z, 0.0f).data, float4(dPdy.x, dPdy.y, dPdy.z, 0.0f).data);
// return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
return {};
}
  4607. template <typename T, hipTextureReadMode readMode>
  4608. static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1DGrad(texture<T, hipTextureType1D, readMode> t, float x, float dPdx, float dPdy)
  4609. {
  4610. TEXTURE_PARAMETERS_INIT;
  4611. auto tmp = __ockl_image_sample_grad_1D(i, s, x, dPdx, dPdy);
  4612. return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
  4613. }
  4614. template <typename T, hipTextureReadMode readMode>
  4615. static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2DGrad(texture<T, hipTextureType2D, readMode> t, float x, float y, float2 dPdx, float2 dPdy)
  4616. {
  4617. TEXTURE_PARAMETERS_INIT;
  4618. auto tmp = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, float2(dPdx.x, dPdx.y).data, float2(dPdy.x, dPdy.y).data);
  4619. return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
  4620. }
  4621. template <typename T, hipTextureReadMode readMode>
  4622. static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1DLayeredGrad(texture<T, hipTextureType1DLayered, readMode> t, float x, int layer, float dPdx, float dPdy)
  4623. {
  4624. TEXTURE_PARAMETERS_INIT;
  4625. auto tmp = __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dPdx, dPdy);
  4626. return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
  4627. }
  4628. template <typename T, hipTextureReadMode readMode>
  4629. static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2DLayeredGrad(texture<T, hipTextureType2DLayered, readMode> t, float x, float y, int layer, float2 dPdx, float2 dPdy)
  4630. {
  4631. TEXTURE_PARAMETERS_INIT;
  4632. auto tmp = __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, float2(dPdx.x, dPdx.y).data, float2(dPdy.x, dPdy.y).data);
  4633. return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
  4634. }
  4635. template <typename T, hipTextureReadMode readMode>
  4636. static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex3DGrad(texture<T, hipTextureType3D, readMode> t, float x, float y, float z, float4 dPdx, float4 dPdy)
  4637. {
  4638. TEXTURE_PARAMETERS_INIT;
  4639. auto tmp = __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, float4(dPdx.x, dPdx.y, dPdx.z, 0.0f).data, float4(dPdy.x, dPdy.y, dPdy.z, 0.0f).data);
  4640. return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
  4641. }
// Primary template for tex2Dgather's return-type map; like __hip_tex_ret,
// instantiation of the primary (no matching specialization) fires the
// static_assert with a readable diagnostic.
template <
    typename T,
    hipTextureReadMode readMode,
    typename Enable = void>
struct __hip_tex2dgather_ret
{
    static_assert(std::is_same<Enable, void>::value, "Invalid channel type!");
};
// Convenience alias; Enable = bool selects the matching specialization.
template <
    typename T,
    hipTextureReadMode readMode>
using __hip_tex2dgather_ret_t = typename __hip_tex2dgather_ret<T, readMode, bool>::type;
// Gather of a scalar channel returns 4 values of that channel type
// (one per gathered texel).
template <typename T>
struct __hip_tex2dgather_ret<
    T,
    hipReadModeElementType,
    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value, bool>::type>
{
    using type = HIP_vector_type<T, 4>;
};
// Gather of a vector channel returns 4 values of the element type: one
// selected component from each of the 4 gathered texels (rank is dropped).
template<
    typename T,
    unsigned int rank>
struct __hip_tex2dgather_ret<
    HIP_vector_type<T, rank>,
    hipReadModeElementType,
    typename std::enable_if<__hip_is_tex_surf_channel_type<HIP_vector_type<T, rank>>::value, bool>::type>
{
    using type = HIP_vector_type<T, 4>;
};
  4672. template <typename T>
  4673. struct __hip_tex2dgather_ret<
  4674. T,
  4675. hipReadModeNormalizedFloat,
  4676. typename std::enable_if<__hip_is_tex_normalized_channel_type<T>::value, bool>::type>
  4677. {
  4678. using type = float4;
  4679. };
// Gathers one channel from the four texels surrounding (x, y) (texture-reference API).
// comp selects the channel, CUDA-style: 0 = x/r (default), 1 = y/g, 2 = z/b, 3 = w/a.
template <typename T, hipTextureReadMode readMode>
static __forceinline__ __device__ __hip_img_chk__ __hip_tex2dgather_ret_t<T, readMode> tex2Dgather(texture<T, hipTextureType2D, readMode> t, float x, float y, int comp=0)
{
    TEXTURE_PARAMETERS_INIT;
    switch (comp) {
    case 1: {
        auto tmp = __ockl_image_gather4g_2D(i, s, float2(x, y).data);
        return __hipMapFrom<__hip_tex2dgather_ret_t<T, readMode>>(tmp);
    }
    case 2: {
        auto tmp = __ockl_image_gather4b_2D(i, s, float2(x, y).data);
        return __hipMapFrom<__hip_tex2dgather_ret_t<T, readMode>>(tmp);
    }
    case 3: {
        auto tmp = __ockl_image_gather4a_2D(i, s, float2(x, y).data);
        return __hipMapFrom<__hip_tex2dgather_ret_t<T, readMode>>(tmp);
    }
    default: {
        auto tmp = __ockl_image_gather4r_2D(i, s, float2(x, y).data);
        return __hipMapFrom<__hip_tex2dgather_ret_t<T, readMode>>(tmp);
    }
    }
    return {}; // Unreachable: every switch path returns (default included).
}
  4704. #endif
  4705. /*
  4706. Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
  4707. Permission is hereby granted, free of charge, to any person obtaining a copy
  4708. of this software and associated documentation files (the "Software"), to deal
  4709. in the Software without restriction, including without limitation the rights
  4710. to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  4711. copies of the Software, and to permit persons to whom the Software is
  4712. furnished to do so, subject to the following conditions:
  4713. The above copyright notice and this permission notice shall be included in
  4714. all copies or substantial portions of the Software.
  4715. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  4716. IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  4717. FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  4718. AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  4719. LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  4720. OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  4721. THE SOFTWARE.
  4722. */
  4723. #pragma once
  4724. #if defined(__cplusplus)
  4725. #if !defined(__HIPCC_RTC__)
  4726. #include <hip/hip_vector_types.h>
  4727. #include <hip/hip_texture_types.h>
  4728. #include <hip/amd_detail/texture_fetch_functions.h>
  4729. #include <hip/amd_detail/ockl_image.h>
  4730. #include <type_traits>
  4731. #endif // !defined(__HIPCC_RTC__)
  4732. #define TEXTURE_OBJECT_PARAMETERS_INIT \
  4733. unsigned int ADDRESS_SPACE_CONSTANT* i = (unsigned int ADDRESS_SPACE_CONSTANT*)textureObject; \
  4734. unsigned int ADDRESS_SPACE_CONSTANT* s = i + HIP_SAMPLER_OBJECT_OFFSET_DWORD;
// Fetches a single texel from a 1D buffer-backed texture object by integer index
// (no sampler filtering/addressing is applied — this is a raw load).
template <
    typename T,
    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex1Dfetch(hipTextureObject_t textureObject, int x)
{
    TEXTURE_OBJECT_PARAMETERS_INIT
    auto tmp = __ockl_image_load_1Db(i, x);
    return __hipMapFrom<T>(tmp);
}
// Out-parameter overload: stores the fetched texel through ptr.
template <
    typename T,
    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void tex1Dfetch(T *ptr, hipTextureObject_t textureObject, int x)
{
    *ptr = tex1Dfetch<T>(textureObject, x);
}
// Samples a 1D texture object at coordinate x using the object's sampler state.
template <
    typename T,
    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex1D(hipTextureObject_t textureObject, float x)
{
    TEXTURE_OBJECT_PARAMETERS_INIT
    auto tmp = __ockl_image_sample_1D(i, s, x);
    return __hipMapFrom<T>(tmp);
}
// Out-parameter overload: stores the sampled value through ptr.
template <
    typename T,
    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void tex1D(T *ptr, hipTextureObject_t textureObject, float x)
{
    *ptr = tex1D<T>(textureObject, x);
}
// Samples a 2D texture object at (x, y).
template <
    typename T,
    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex2D(hipTextureObject_t textureObject, float x, float y)
{
    TEXTURE_OBJECT_PARAMETERS_INIT
    auto tmp = __ockl_image_sample_2D(i, s, float2(x, y).data);
    return __hipMapFrom<T>(tmp);
}
// Out-parameter overload: stores the sampled value through ptr.
template <
    typename T,
    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void tex2D(T *ptr, hipTextureObject_t textureObject, float x, float y)
{
    *ptr = tex2D<T>(textureObject, x, y);
}
// Samples a 3D texture object at (x, y, z); w coordinate is unused.
template <
    typename T,
    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex3D(hipTextureObject_t textureObject, float x, float y, float z)
{
    TEXTURE_OBJECT_PARAMETERS_INIT
    auto tmp = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
    return __hipMapFrom<T>(tmp);
}
// Out-parameter overload: stores the sampled value through ptr.
template <
    typename T,
    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void tex3D(T *ptr, hipTextureObject_t textureObject, float x, float y, float z)
{
    *ptr = tex3D<T>(textureObject, x, y, z);
}
// Samples layer `layer` of a 1D layered texture object at coordinate x.
// The integer layer index travels in the second lane of the coordinate vector.
template <
    typename T,
    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex1DLayered(hipTextureObject_t textureObject, float x, int layer)
{
    TEXTURE_OBJECT_PARAMETERS_INIT
    auto tmp = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
    return __hipMapFrom<T>(tmp);
}
// Out-parameter overload: stores the sampled value through ptr.
template <
    typename T,
    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void tex1DLayered(T *ptr, hipTextureObject_t textureObject, float x, int layer)
{
    *ptr = tex1DLayered<T>(textureObject, x, layer);
}
// Samples layer `layer` of a 2D layered texture object at (x, y).
// Layer travels in the z lane; w is unused.
template <
    typename T,
    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex2DLayered(hipTextureObject_t textureObject, float x, float y, int layer)
{
    TEXTURE_OBJECT_PARAMETERS_INIT
    auto tmp = __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data);
    return __hipMapFrom<T>(tmp);
}
  4824. template <
  4825. typename T,
  4826. typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
  4827. static __device__ __hip_img_chk__ void tex2DLayered(T *ptr, hipTextureObject_t textureObject, float x, float y, int layer)
  4828. {
  4829. *ptr = tex1DLayered<T>(textureObject, x, y, layer);
  4830. }
// Samples a cubemap texture object along direction (x, y, z).
template <
    typename T,
    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T texCubemap(hipTextureObject_t textureObject, float x, float y, float z)
{
    TEXTURE_OBJECT_PARAMETERS_INIT
    auto tmp = __ockl_image_sample_CM(i, s, float4(x, y, z, 0.0f).data);
    return __hipMapFrom<T>(tmp);
}
// Out-parameter overload: stores the sampled value through ptr.
template <
    typename T,
    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void texCubemap(T *ptr, hipTextureObject_t textureObject, float x, float y, float z)
{
    *ptr = texCubemap<T>(textureObject, x, y, z);
}
// Samples layer `layer` of a layered cubemap texture object along (x, y, z).
// Layer travels in the w lane of the coordinate vector.
template <
    typename T,
    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T texCubemapLayered(hipTextureObject_t textureObject, float x, float y, float z, int layer)
{
    TEXTURE_OBJECT_PARAMETERS_INIT
    auto tmp = __ockl_image_sample_CMa(i, s, float4(x, y, z, layer).data);
    return __hipMapFrom<T>(tmp);
}
// Out-parameter overload: stores the sampled value through ptr.
template <
    typename T,
    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void texCubemapLayered(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, int layer)
{
    *ptr = texCubemapLayered<T>(textureObject, x, y, z, layer);
}
  4863. template <
  4864. typename T,
  4865. typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
  4866. static __device__ __hip_img_chk__ T tex2Dgather(hipTextureObject_t textureObject, float x, float y, int comp = 0)
  4867. {
  4868. TEXTURE_OBJECT_PARAMETERS_INIT
  4869. switch (comp) {
  4870. case 1: {
  4871. auto tmp = __ockl_image_gather4r_2D(i, s, float2(x, y).data);
  4872. return __hipMapFrom<T>(tmp);
  4873. break;
  4874. }
  4875. case 2: {
  4876. auto tmp = __ockl_image_gather4g_2D(i, s, float2(x, y).data);
  4877. return __hipMapFrom<T>(tmp);
  4878. break;
  4879. }
  4880. case 3: {
  4881. auto tmp = __ockl_image_gather4b_2D(i, s, float2(x, y).data);
  4882. return __hipMapFrom<T>(tmp);
  4883. break;
  4884. }
  4885. default: {
  4886. auto tmp = __ockl_image_gather4a_2D(i, s, float2(x, y).data);
  4887. return __hipMapFrom<T>(tmp);
  4888. break;
  4889. }
  4890. }
  4891. return {};
  4892. }
  4893. template <
  4894. typename T,
  4895. typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
  4896. static __device__ __hip_img_chk__ void tex2Dgather(T *ptr, hipTextureObject_t textureObject, float x, float y, int comp = 0)
  4897. {
  4898. *ptr = texCubemapLayered<T>(textureObject, x, y, comp);
  4899. }
// Samples a 1D texture object at x from an explicit mipmap level.
template <
    typename T,
    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex1DLod(hipTextureObject_t textureObject, float x, float level)
{
    TEXTURE_OBJECT_PARAMETERS_INIT
    auto tmp = __ockl_image_sample_lod_1D(i, s, x, level);
    return __hipMapFrom<T>(tmp);
}
// Out-parameter overload: stores the sampled value through ptr.
template <
    typename T,
    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void tex1DLod(T *ptr, hipTextureObject_t textureObject, float x, float level)
{
    *ptr = tex1DLod<T>(textureObject, x, level);
}
// Samples a 2D texture object at (x, y) from an explicit mipmap level.
template <
    typename T,
    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex2DLod(hipTextureObject_t textureObject, float x, float y, float level)
{
    TEXTURE_OBJECT_PARAMETERS_INIT
    auto tmp = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
    return __hipMapFrom<T>(tmp);
}
// Out-parameter overload: stores the sampled value through ptr.
template <
    typename T,
    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void tex2DLod(T *ptr, hipTextureObject_t textureObject, float x, float y, float level)
{
    *ptr = tex2DLod<T>(textureObject, x, y, level);
}
// Samples a 3D texture object at (x, y, z) from an explicit mipmap level.
template <
    typename T,
    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex3DLod(hipTextureObject_t textureObject, float x, float y, float z, float level)
{
    TEXTURE_OBJECT_PARAMETERS_INIT
    auto tmp = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, level);
    return __hipMapFrom<T>(tmp);
}
// Out-parameter overload: stores the sampled value through ptr.
template <
    typename T,
    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void tex3DLod(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, float level)
{
    *ptr = tex3DLod<T>(textureObject, x, y, z, level);
}
  4948. template <
  4949. typename T,
  4950. typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
  4951. static __device__ __hip_img_chk__ T tex1DLayeredLod(hipTextureObject_t textureObject, float x, int layer, float level)
  4952. {
  4953. TEXTURE_OBJECT_PARAMETERS_INIT
  4954. auto tmp = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
  4955. return __hipMapFrom<T>(tmp);
  4956. }
// Out-parameter overload: stores the sampled value through ptr.
template <
    typename T,
    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void tex1DLayeredLod(T *ptr, hipTextureObject_t textureObject, float x, int layer, float level)
{
    *ptr = tex1DLayeredLod<T>(textureObject, x, layer, level);
}
  4964. template <
  4965. typename T,
  4966. typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
  4967. static __device__ __hip_img_chk__ T tex2DLayeredLod(hipTextureObject_t textureObject, float x, float y, int layer, float level)
  4968. {
  4969. TEXTURE_OBJECT_PARAMETERS_INIT
  4970. auto tmp = __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data);
  4971. return __hipMapFrom<T>(tmp);
  4972. }
// Out-parameter overload: stores the sampled value through ptr.
template <
    typename T,
    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void tex2DLayeredLod(T *ptr, hipTextureObject_t textureObject, float x, float y, int layer, float level)
{
    *ptr = tex2DLayeredLod<T>(textureObject, x, y, layer, level);
}
// Samples a cubemap texture object along (x, y, z) from an explicit mipmap level.
template <
    typename T,
    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T texCubemapLod(hipTextureObject_t textureObject, float x, float y, float z, float level)
{
    TEXTURE_OBJECT_PARAMETERS_INIT
    auto tmp = __ockl_image_sample_lod_CM(i, s, float4(x, y, z, 0.0f).data, level);
    return __hipMapFrom<T>(tmp);
}
// Out-parameter overload: stores the sampled value through ptr.
template <
    typename T,
    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void texCubemapLod(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, float level)
{
    *ptr = texCubemapLod<T>(textureObject, x, y, z, level);
}
// Cubemap sampling with explicit gradients.
// NOT IMPLEMENTED: the required __ockl_image_sample_grad_CM intrinsic is missing
// from the device libs, so this currently returns a zero-initialized T.
template <
    typename T,
    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T texCubemapGrad(hipTextureObject_t textureObject, float x, float y, float z, float4 dPdx, float4 dPdy)
{
    TEXTURE_OBJECT_PARAMETERS_INIT
    // TODO missing in device libs.
    // auto tmp = __ockl_image_sample_grad_CM(i, s, float4(x, y, z, 0.0f).data, float4(dPdx.x, dPdx.y, dPdx.z, 0.0f).data, float4(dPdy.x, dPdy.y, dPdy.z, 0.0f).data);
    // return __hipMapFrom<T>(tmp);
    return {};
}
// Out-parameter overload (currently stores a zero value; see texCubemapGrad).
template <
    typename T,
    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void texCubemapGrad(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, float4 dPdx, float4 dPdy)
{
    *ptr = texCubemapGrad<T>(textureObject, x, y, z, dPdx, dPdy);
}
// Samples layer `layer` of a layered cubemap texture object along (x, y, z)
// from an explicit mipmap level. Layer travels in the w lane.
template <
    typename T,
    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T texCubemapLayeredLod(hipTextureObject_t textureObject, float x, float y, float z, int layer, float level)
{
    TEXTURE_OBJECT_PARAMETERS_INIT
    auto tmp = __ockl_image_sample_lod_CMa(i, s, float4(x, y, z, layer).data, level);
    return __hipMapFrom<T>(tmp);
}
// Out-parameter overload: stores the sampled value through ptr.
template <
    typename T,
    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void texCubemapLayeredLod(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, int layer, float level)
{
    *ptr = texCubemapLayeredLod<T>(textureObject, x, y, z, layer, level);
}
// Samples a 1D texture object at x with explicit gradients.
template <
    typename T,
    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex1DGrad(hipTextureObject_t textureObject, float x, float dPdx, float dPdy)
{
    TEXTURE_OBJECT_PARAMETERS_INIT
    auto tmp = __ockl_image_sample_grad_1D(i, s, x, dPdx, dPdy);
    return __hipMapFrom<T>(tmp);
}
// Out-parameter overload: stores the sampled value through ptr.
template <
    typename T,
    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void tex1DGrad(T *ptr, hipTextureObject_t textureObject, float x, float dPdx, float dPdy)
{
    *ptr = tex1DGrad<T>(textureObject, x, dPdx, dPdy);
}
// Samples a 2D texture object at (x, y) with explicit gradients.
template <
    typename T,
    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex2DGrad(hipTextureObject_t textureObject, float x, float y, float2 dPdx, float2 dPdy)
{
    TEXTURE_OBJECT_PARAMETERS_INIT
    auto tmp = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, float2(dPdx.x, dPdx.y).data, float2(dPdy.x, dPdy.y).data);
    return __hipMapFrom<T>(tmp);
}
// Out-parameter overload: stores the sampled value through ptr.
template <
    typename T,
    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void tex2DGrad(T *ptr, hipTextureObject_t textureObject, float x, float y, float2 dPdx, float2 dPdy)
{
    *ptr = tex2DGrad<T>(textureObject, x, y, dPdx, dPdy);
}
// Samples a 3D texture object at (x, y, z) with explicit gradients.
// Only the x/y/z components of dPdx/dPdy are used.
template <
    typename T,
    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex3DGrad(hipTextureObject_t textureObject, float x, float y, float z, float4 dPdx, float4 dPdy)
{
    TEXTURE_OBJECT_PARAMETERS_INIT
    auto tmp = __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, float4(dPdx.x, dPdx.y, dPdx.z, 0.0f).data, float4(dPdy.x, dPdy.y, dPdy.z, 0.0f).data);
    return __hipMapFrom<T>(tmp);
}
// Out-parameter overload: stores the sampled value through ptr.
template <
    typename T,
    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void tex3DGrad(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, float4 dPdx, float4 dPdy)
{
    *ptr = tex3DGrad<T>(textureObject, x, y, z, dPdx, dPdy);
}
// Samples layer `layer` of a 1D layered texture object at x with explicit gradients.
template <
    typename T,
    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex1DLayeredGrad(hipTextureObject_t textureObject, float x, int layer, float dPdx, float dPdy)
{
    TEXTURE_OBJECT_PARAMETERS_INIT
    auto tmp = __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dPdx, dPdy);
    return __hipMapFrom<T>(tmp);
}
// Out-parameter overload: stores the sampled value through ptr.
template <
    typename T,
    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void tex1DLayeredGrad(T *ptr, hipTextureObject_t textureObject, float x, int layer, float dPdx, float dPdy)
{
    *ptr = tex1DLayeredGrad<T>(textureObject, x, layer, dPdx, dPdy);
}
// Samples layer `layer` of a 2D layered texture object at (x, y) with explicit gradients.
template <
    typename T,
    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex2DLayeredGrad(hipTextureObject_t textureObject, float x, float y, int layer, float2 dPdx, float2 dPdy)
{
    TEXTURE_OBJECT_PARAMETERS_INIT
    auto tmp = __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, float2(dPdx.x, dPdx.y).data, float2(dPdy.x, dPdy.y).data);
    return __hipMapFrom<T>(tmp);
}
// Out-parameter overload: stores the sampled value through ptr.
template <
    typename T,
    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void tex2DLayeredGrad(T *ptr, hipTextureObject_t textureObject, float x, float y, int layer, float2 dPdx, float2 dPdy)
{
    *ptr = tex2DLayeredGrad<T>(textureObject, x, y, layer, dPdx, dPdy);
}
// Layered-cubemap sampling with explicit gradients.
// NOT IMPLEMENTED: the required __ockl_image_sample_grad_CMa intrinsic is missing
// from the device libs, so this currently returns a zero-initialized T.
template <
    typename T,
    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T texCubemapLayeredGrad(hipTextureObject_t textureObject, float x, float y, float z, int layer, float4 dPdx, float4 dPdy)
{
    TEXTURE_OBJECT_PARAMETERS_INIT
    // TODO missing in device libs.
    // auto tmp = __ockl_image_sample_grad_CMa(i, s, float4(x, y, z, layer).data, float4(dPdx.x, dPdx.y, dPdx.z, 0.0f).data, float4(dPdy.x, dPdy.y, dPdy.z, 0.0f).data);
    // return __hipMapFrom<T>(tmp);
    return {};
}
// Out-parameter overload (currently stores a zero value; see texCubemapLayeredGrad).
template <
    typename T,
    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void texCubemapLayeredGrad(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, int layer, float4 dPdx, float4 dPdy)
{
    *ptr = texCubemapLayeredGrad<T>(textureObject, x, y, z, layer, dPdx, dPdy);
}
  5128. #endif
  5129. /*
  5130. Copyright (c) 2018 - 2023 Advanced Micro Devices, Inc. All rights reserved.
  5131. Permission is hereby granted, free of charge, to any person obtaining a copy
  5132. of this software and associated documentation files (the "Software"), to deal
  5133. in the Software without restriction, including without limitation the rights
  5134. to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  5135. copies of the Software, and to permit persons to whom the Software is
  5136. furnished to do so, subject to the following conditions:
  5137. The above copyright notice and this permission notice shall be included in
  5138. all copies or substantial portions of the Software.
  5139. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  5140. IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  5141. FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  5142. AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  5143. LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  5144. OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  5145. THE SOFTWARE.
  5146. */
  5147. #ifndef HIP_INCLUDE_HIP_AMD_DETAIL_SURFACE_FUNCTIONS_H
  5148. #define HIP_INCLUDE_HIP_AMD_DETAIL_SURFACE_FUNCTIONS_H
  5149. #if defined(__cplusplus)
  5150. #if !defined(__HIPCC_RTC__)
  5151. #include <hip/surface_types.h>
  5152. #include <hip/hip_vector_types.h>
  5153. #include <hip/amd_detail/texture_fetch_functions.h>
  5154. #include <hip/amd_detail/ockl_image.h>
  5155. #endif
  5156. #if defined(__HIPCC_RTC__)
  5157. #define __HOST_DEVICE__ __device__
  5158. #else
  5159. #define __HOST_DEVICE__ __host__ __device__
  5160. #endif
  5161. #define __HIP_SURFACE_OBJECT_PARAMETERS_INIT \
  5162. unsigned int ADDRESS_SPACE_CONSTANT* i = (unsigned int ADDRESS_SPACE_CONSTANT*)surfObj;
  5163. // CUDA is using byte address, need map to pixel address for HIP
// CUDA is using byte address, need map to pixel address for HIP
// Converts a CUDA-style byte offset x into a pixel index, given the image's
// channel data type (`format`) and channel order (`order`).
// LUT encoding for both tables: a value of 3 means "divide by 3" (a 3-byte /
// 3-channel case, not a power of two); any other value is a right-shift amount
// (log2 of the byte size / channel count).
static __HOST_DEVICE__ __forceinline__ int __hipGetPixelAddr(int x, int format, int order) {
/*
* use below format index to generate format LUT
typedef enum {
HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT8 = 0,
HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT16 = 1,
HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT8 = 2,
HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT16 = 3,
HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT24 = 4,
HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555 = 5,
HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565 = 6,
HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_101010 = 7,
HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT8 = 8,
HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT16 = 9,
HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT32 = 10,
HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8 = 11,
HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16 = 12,
HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32 = 13,
HSA_EXT_IMAGE_CHANNEL_TYPE_HALF_FLOAT = 14,
HSA_EXT_IMAGE_CHANNEL_TYPE_FLOAT = 15
} hsa_ext_image_channel_type_t;
*/
// Per-format shift (or /3 for the 24-bit UNORM_INT24 entry) to undo the
// per-channel byte size.
static const int FormatLUT[] = { 0, 1, 0, 1, 3, 1, 1, 1, 0, 1, 2, 0, 1, 2, 1, 2 };
x = FormatLUT[format] == 3 ? x / FormatLUT[format] : x >> FormatLUT[format];
/*
* use below order index to generate order LUT
typedef enum {
HSA_EXT_IMAGE_CHANNEL_ORDER_A = 0,
HSA_EXT_IMAGE_CHANNEL_ORDER_R = 1,
HSA_EXT_IMAGE_CHANNEL_ORDER_RX = 2,
HSA_EXT_IMAGE_CHANNEL_ORDER_RG = 3,
HSA_EXT_IMAGE_CHANNEL_ORDER_RGX = 4,
HSA_EXT_IMAGE_CHANNEL_ORDER_RA = 5,
HSA_EXT_IMAGE_CHANNEL_ORDER_RGB = 6,
HSA_EXT_IMAGE_CHANNEL_ORDER_RGBX = 7,
HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA = 8,
HSA_EXT_IMAGE_CHANNEL_ORDER_BGRA = 9,
HSA_EXT_IMAGE_CHANNEL_ORDER_ARGB = 10,
HSA_EXT_IMAGE_CHANNEL_ORDER_ABGR = 11,
HSA_EXT_IMAGE_CHANNEL_ORDER_SRGB = 12,
HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBX = 13,
HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBA = 14,
HSA_EXT_IMAGE_CHANNEL_ORDER_SBGRA = 15,
HSA_EXT_IMAGE_CHANNEL_ORDER_INTENSITY = 16,
HSA_EXT_IMAGE_CHANNEL_ORDER_LUMINANCE = 17,
HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH = 18,
HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH_STENCIL = 19
} hsa_ext_image_channel_order_t;
*/
// Per-order shift (or /3 for 3-channel orders such as RGB/RGX/SRGB) to undo the
// channel count.
static const int OrderLUT[] = { 0, 0, 1, 1, 3, 1, 3, 2, 2, 2, 2, 2, 3, 2, 2, 2, 0, 0, 0, 0 };
return x = OrderLUT[order] == 3 ? x / OrderLUT[order] : x >> OrderLUT[order];
}
// Reads one texel from a 1D surface object at byte offset x into *data.
// NOTE(review): boundaryMode is accepted for CUDA API compatibility but is not
// applied here — out-of-range behavior comes from the image runtime; confirm.
template <
    typename T,
    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void surf1Dread(T* data, hipSurfaceObject_t surfObj, int x,
                                                  int boundaryMode = hipBoundaryModeZero) {
    __HIP_SURFACE_OBJECT_PARAMETERS_INIT
    // Convert the CUDA-style byte address to a pixel index for this image format.
    x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_1D(i), __ockl_image_channel_order_1D(i));
    auto tmp = __ockl_image_load_1D(i, x);
    *data = __hipMapFrom<T>(tmp);
}
// Writes one texel to a 1D surface object at byte offset x.
template <
    typename T,
    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void surf1Dwrite(T data, hipSurfaceObject_t surfObj, int x) {
    __HIP_SURFACE_OBJECT_PARAMETERS_INIT
    // Convert the CUDA-style byte address to a pixel index for this image format.
    x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_1D(i), __ockl_image_channel_order_1D(i));
    auto tmp = __hipMapTo<float4::Native_vec_>(data);
    __ockl_image_store_1D(i, x, tmp);
}
// Reads one texel from a 2D surface object at (byte offset x, row y) into *data.
template <
    typename T,
    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void surf2Dread(T* data, hipSurfaceObject_t surfObj, int x, int y) {
    __HIP_SURFACE_OBJECT_PARAMETERS_INIT
    // Only x is byte-addressed in the CUDA API; y is already a row index.
    x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));
    auto tmp = __ockl_image_load_2D(i, int2(x, y).data);
    *data = __hipMapFrom<T>(tmp);
}
// Writes one texel to a 2D surface object at (byte offset x, row y).
template <
    typename T,
    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void surf2Dwrite(T data, hipSurfaceObject_t surfObj, int x, int y) {
    __HIP_SURFACE_OBJECT_PARAMETERS_INIT
    x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));
    auto tmp = __hipMapTo<float4::Native_vec_>(data);
    __ockl_image_store_2D(i, int2(x, y).data, tmp);
}
// Reads one texel from a 3D surface object at (byte offset x, y, z) into *data.
template <
    typename T,
    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void surf3Dread(T* data, hipSurfaceObject_t surfObj, int x, int y, int z) {
    __HIP_SURFACE_OBJECT_PARAMETERS_INIT
    x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_3D(i), __ockl_image_channel_order_3D(i));
    auto tmp = __ockl_image_load_3D(i, int4(x, y, z, 0).data);
    *data = __hipMapFrom<T>(tmp);
}
// Writes one texel to a 3D surface object at (byte offset x, y, z).
template <
    typename T,
    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void surf3Dwrite(T data, hipSurfaceObject_t surfObj, int x, int y, int z) {
    __HIP_SURFACE_OBJECT_PARAMETERS_INIT
    x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_3D(i), __ockl_image_channel_order_3D(i));
    auto tmp = __hipMapTo<float4::Native_vec_>(data);
    __ockl_image_store_3D(i, int4(x, y, z, 0).data, tmp);
}
// Reads one texel from layer `layer` of a 1D layered surface object into *data.
// NOTE(review): the layer index is passed through the lod slot of
// __ockl_image_load_lod_1D — confirm this matches the 1D-array image layout.
template <
    typename T,
    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void surf1DLayeredread(T* data, hipSurfaceObject_t surfObj, int x, int layer) {
    __HIP_SURFACE_OBJECT_PARAMETERS_INIT
    x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_1D(i), __ockl_image_channel_order_1D(i));
    auto tmp = __ockl_image_load_lod_1D(i, x, layer);
    *data = __hipMapFrom<T>(tmp);
}
// Writes one texel to layer `layer` of a 1D layered surface object.
// NOTE(review): layer is routed through the lod slot — see surf1DLayeredread.
template <
    typename T,
    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void surf1DLayeredwrite(T data, hipSurfaceObject_t surfObj, int x, int layer) {
    __HIP_SURFACE_OBJECT_PARAMETERS_INIT
    x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_1D(i), __ockl_image_channel_order_1D(i));
    auto tmp = __hipMapTo<float4::Native_vec_>(data);
    __ockl_image_store_lod_1D(i, x, layer, tmp);
}
// Reads one texel from layer `layer` of a 2D layered surface object into *data.
// NOTE(review): layer is routed through the lod slot of the ockl load — confirm.
template <
    typename T,
    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void surf2DLayeredread(T* data, hipSurfaceObject_t surfObj, int x, int y, int layer) {
    __HIP_SURFACE_OBJECT_PARAMETERS_INIT
    x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));
    auto tmp = __ockl_image_load_lod_2D(i, int2(x, y).data, layer);
    *data = __hipMapFrom<T>(tmp);
}
// Writes one texel to layer `layer` of a 2D layered surface object.
// NOTE(review): layer is routed through the lod slot of the ockl store — confirm.
template <
    typename T,
    typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void surf2DLayeredwrite(T data, hipSurfaceObject_t surfObj, int x, int y, int layer) {
    __HIP_SURFACE_OBJECT_PARAMETERS_INIT
    x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));
    auto tmp = __hipMapTo<float4::Native_vec_>(data);
    __ockl_image_store_lod_2D(i, int2(x, y).data, layer, tmp);
}
  5307. template <
  5308. typename T,
  5309. typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
  5310. static __device__ __hip_img_chk__ void surfCubemapread(T* data, hipSurfaceObject_t surfObj, int x, int y, int face) {
  5311. __HIP_SURFACE_OBJECT_PARAMETERS_INIT
  5312. x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));
  5313. auto tmp = __ockl_image_load_CM(i, int2(x, y).data, face);
  5314. *data = __hipMapFrom<T>(tmp);
  5315. }
  5316. template <
  5317. typename T,
  5318. typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
  5319. static __device__ __hip_img_chk__ void surfCubemapwrite(T data, hipSurfaceObject_t surfObj, int x, int y, int face) {
  5320. __HIP_SURFACE_OBJECT_PARAMETERS_INIT
  5321. x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));
  5322. auto tmp = __hipMapTo<float4::Native_vec_>(data);
  5323. __ockl_image_store_CM(i, int2(x, y).data, face, tmp);
  5324. }
  5325. template <
  5326. typename T,
  5327. typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
  5328. static __device__ __hip_img_chk__ void surfCubemapLayeredread(T* data, hipSurfaceObject_t surfObj, int x, int y, int face,
  5329. int layer) {
  5330. __HIP_SURFACE_OBJECT_PARAMETERS_INIT
  5331. x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));
  5332. auto tmp = __ockl_image_load_lod_CM(i, int2(x, y).data, face, layer);
  5333. *data = __hipMapFrom<T>(tmp);
  5334. }
  5335. template <
  5336. typename T,
  5337. typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
  5338. static __device__ __hip_img_chk__ void surfCubemapLayeredwrite(T* data, hipSurfaceObject_t surfObj, int x, int y, int face,
  5339. int layer) {
  5340. __HIP_SURFACE_OBJECT_PARAMETERS_INIT
  5341. x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));
  5342. auto tmp = __hipMapTo<float4::Native_vec_>(data);
  5343. __ockl_image_store_lod_CM(i, int2(x, y).data, face, layer, tmp);
  5344. }
  5345. #endif
  5346. #endif
  5347. /*
  5348. Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
  5349. Permission is hereby granted, free of charge, to any person obtaining a copy
  5350. of this software and associated documentation files (the "Software"), to deal
  5351. in the Software without restriction, including without limitation the rights
  5352. to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  5353. copies of the Software, and to permit persons to whom the Software is
  5354. furnished to do so, subject to the following conditions:
  5355. The above copyright notice and this permission notice shall be included in
  5356. all copies or substantial portions of the Software.
  5357. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  5358. IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  5359. FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  5360. AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  5361. LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  5362. OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  5363. THE SOFTWARE.
  5364. */
  5365. #ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COMPLEX_H
  5366. #define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COMPLEX_H
  5367. #if !defined(__HIPCC_RTC__)
  5368. #include "hip/amd_detail/amd_hip_vector_types.h"
  5369. #endif
  5370. #if defined(__HIPCC_RTC__)
  5371. #define __HOST_DEVICE__ __device__
  5372. #else
  5373. #define __HOST_DEVICE__ __host__ __device__
  5374. // TODO: Clang has a bug which allows device functions to call std functions
  5375. // when std functions are introduced into default namespace by using statement.
  5376. // math.h may be included after this bug is fixed.
  5377. #if __cplusplus
  5378. #include <cmath>
  5379. #else
  5380. #include "math.h"
  5381. #endif
  5382. #endif // !defined(__HIPCC_RTC__)
#if __cplusplus
// Operator-overload generator macros for the complex types below (hipFloatComplex,
// hipDoubleComplex). Each macro expands to a static inline host/device operator
// working component-wise on .x (real) and .y (imaginary).
// Unary negation: flips the sign of both components.
#define COMPLEX_NEG_OP_OVERLOAD(type) \
__HOST_DEVICE__ static inline type operator-(const type& op) { \
type ret; \
ret.x = -op.x; \
ret.y = -op.y; \
return ret; \
}
// Equality: exact component-wise comparison (usual floating-point == caveats apply).
#define COMPLEX_EQ_OP_OVERLOAD(type) \
__HOST_DEVICE__ static inline bool operator==(const type& lhs, const type& rhs) { \
return lhs.x == rhs.x && lhs.y == rhs.y; \
}
// Inequality: defined as the negation of operator==.
#define COMPLEX_NE_OP_OVERLOAD(type) \
__HOST_DEVICE__ static inline bool operator!=(const type& lhs, const type& rhs) { \
return !(lhs == rhs); \
}
// Component-wise complex addition.
#define COMPLEX_ADD_OP_OVERLOAD(type) \
__HOST_DEVICE__ static inline type operator+(const type& lhs, const type& rhs) { \
type ret; \
ret.x = lhs.x + rhs.x; \
ret.y = lhs.y + rhs.y; \
return ret; \
}
// Component-wise complex subtraction.
#define COMPLEX_SUB_OP_OVERLOAD(type) \
__HOST_DEVICE__ static inline type operator-(const type& lhs, const type& rhs) { \
type ret; \
ret.x = lhs.x - rhs.x; \
ret.y = lhs.y - rhs.y; \
return ret; \
}
// Complex multiplication: (a+bi)(c+di) = (ac-bd) + (ad+bc)i.
#define COMPLEX_MUL_OP_OVERLOAD(type) \
__HOST_DEVICE__ static inline type operator*(const type& lhs, const type& rhs) { \
type ret; \
ret.x = lhs.x * rhs.x - lhs.y * rhs.y; \
ret.y = lhs.x * rhs.y + lhs.y * rhs.x; \
return ret; \
}
// Complex division via multiplication by the conjugate, divided by |rhs|^2.
// Naive formula: no scaling, so it can overflow/underflow for extreme magnitudes.
#define COMPLEX_DIV_OP_OVERLOAD(type) \
__HOST_DEVICE__ static inline type operator/(const type& lhs, const type& rhs) { \
type ret; \
ret.x = (lhs.x * rhs.x + lhs.y * rhs.y); \
ret.y = (rhs.x * lhs.y - lhs.x * rhs.y); \
ret.x = ret.x / (rhs.x * rhs.x + rhs.y * rhs.y); \
ret.y = ret.y / (rhs.x * rhs.x + rhs.y * rhs.y); \
return ret; \
}
// In-place addition.
#define COMPLEX_ADD_PREOP_OVERLOAD(type) \
__HOST_DEVICE__ static inline type& operator+=(type& lhs, const type& rhs) { \
lhs.x += rhs.x; \
lhs.y += rhs.y; \
return lhs; \
}
// In-place subtraction.
#define COMPLEX_SUB_PREOP_OVERLOAD(type) \
__HOST_DEVICE__ static inline type& operator-=(type& lhs, const type& rhs) { \
lhs.x -= rhs.x; \
lhs.y -= rhs.y; \
return lhs; \
}
// In-place multiplication; copies lhs first because both components are needed
// after lhs.x has been overwritten.
#define COMPLEX_MUL_PREOP_OVERLOAD(type) \
__HOST_DEVICE__ static inline type& operator*=(type& lhs, const type& rhs) { \
type temp{lhs}; \
lhs.x = rhs.x * temp.x - rhs.y * temp.y; \
lhs.y = rhs.y * temp.x + rhs.x * temp.y; \
return lhs; \
}
// In-place division; computes into a temporary so lhs is read consistently.
#define COMPLEX_DIV_PREOP_OVERLOAD(type) \
__HOST_DEVICE__ static inline type& operator/=(type& lhs, const type& rhs) { \
type temp; \
temp.x = (lhs.x*rhs.x + lhs.y * rhs.y) / (rhs.x*rhs.x + rhs.y*rhs.y); \
temp.y = (lhs.y * rhs.x - lhs.x * rhs.y) / (rhs.x*rhs.x + rhs.y*rhs.y); \
lhs = temp; \
return lhs; \
}
// Complex-times-scalar product (scalar on the right-hand side only).
#define COMPLEX_SCALAR_PRODUCT(type, type1) \
__HOST_DEVICE__ static inline type operator*(const type& lhs, type1 rhs) { \
type ret; \
ret.x = lhs.x * rhs; \
ret.y = lhs.y * rhs; \
return ret; \
}
#endif
  5464. typedef float2 hipFloatComplex;
  5465. __HOST_DEVICE__ static inline float hipCrealf(hipFloatComplex z) { return z.x; }
  5466. __HOST_DEVICE__ static inline float hipCimagf(hipFloatComplex z) { return z.y; }
  5467. __HOST_DEVICE__ static inline hipFloatComplex make_hipFloatComplex(float a, float b) {
  5468. hipFloatComplex z;
  5469. z.x = a;
  5470. z.y = b;
  5471. return z;
  5472. }
  5473. __HOST_DEVICE__ static inline hipFloatComplex hipConjf(hipFloatComplex z) {
  5474. hipFloatComplex ret;
  5475. ret.x = z.x;
  5476. ret.y = -z.y;
  5477. return ret;
  5478. }
  5479. __HOST_DEVICE__ static inline float hipCsqabsf(hipFloatComplex z) {
  5480. return z.x * z.x + z.y * z.y;
  5481. }
  5482. __HOST_DEVICE__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q) {
  5483. return make_hipFloatComplex(p.x + q.x, p.y + q.y);
  5484. }
  5485. __HOST_DEVICE__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q) {
  5486. return make_hipFloatComplex(p.x - q.x, p.y - q.y);
  5487. }
  5488. __HOST_DEVICE__ static inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q) {
  5489. return make_hipFloatComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y);
  5490. }
  5491. __HOST_DEVICE__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q) {
  5492. float sqabs = hipCsqabsf(q);
  5493. hipFloatComplex ret;
  5494. ret.x = (p.x * q.x + p.y * q.y) / sqabs;
  5495. ret.y = (p.y * q.x - p.x * q.y) / sqabs;
  5496. return ret;
  5497. }
  5498. __HOST_DEVICE__ static inline float hipCabsf(hipFloatComplex z) { return sqrtf(hipCsqabsf(z)); }
  5499. typedef double2 hipDoubleComplex;
  5500. __HOST_DEVICE__ static inline double hipCreal(hipDoubleComplex z) { return z.x; }
  5501. __HOST_DEVICE__ static inline double hipCimag(hipDoubleComplex z) { return z.y; }
  5502. __HOST_DEVICE__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b) {
  5503. hipDoubleComplex z;
  5504. z.x = a;
  5505. z.y = b;
  5506. return z;
  5507. }
  5508. __HOST_DEVICE__ static inline hipDoubleComplex hipConj(hipDoubleComplex z) {
  5509. hipDoubleComplex ret;
  5510. ret.x = z.x;
  5511. ret.y = -z.y;
  5512. return ret;
  5513. }
  5514. __HOST_DEVICE__ static inline double hipCsqabs(hipDoubleComplex z) {
  5515. return z.x * z.x + z.y * z.y;
  5516. }
  5517. __HOST_DEVICE__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q) {
  5518. return make_hipDoubleComplex(p.x + q.x, p.y + q.y);
  5519. }
  5520. __HOST_DEVICE__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q) {
  5521. return make_hipDoubleComplex(p.x - q.x, p.y - q.y);
  5522. }
  5523. __HOST_DEVICE__ static inline hipDoubleComplex hipCmul(hipDoubleComplex p, hipDoubleComplex q) {
  5524. return make_hipDoubleComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y);
  5525. }
  5526. __HOST_DEVICE__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q) {
  5527. double sqabs = hipCsqabs(q);
  5528. hipDoubleComplex ret;
  5529. ret.x = (p.x * q.x + p.y * q.y) / sqabs;
  5530. ret.y = (p.y * q.x - p.x * q.y) / sqabs;
  5531. return ret;
  5532. }
  5533. __HOST_DEVICE__ static inline double hipCabs(hipDoubleComplex z) { return sqrt(hipCsqabs(z)); }
#if __cplusplus
// Instantiate the full operator set for the single-precision complex type.
COMPLEX_NEG_OP_OVERLOAD(hipFloatComplex)
COMPLEX_EQ_OP_OVERLOAD(hipFloatComplex)
COMPLEX_NE_OP_OVERLOAD(hipFloatComplex)
COMPLEX_ADD_OP_OVERLOAD(hipFloatComplex)
COMPLEX_SUB_OP_OVERLOAD(hipFloatComplex)
COMPLEX_MUL_OP_OVERLOAD(hipFloatComplex)
COMPLEX_DIV_OP_OVERLOAD(hipFloatComplex)
COMPLEX_ADD_PREOP_OVERLOAD(hipFloatComplex)
COMPLEX_SUB_PREOP_OVERLOAD(hipFloatComplex)
COMPLEX_MUL_PREOP_OVERLOAD(hipFloatComplex)
COMPLEX_DIV_PREOP_OVERLOAD(hipFloatComplex)
// complex * scalar for every arithmetic scalar type (scalar on the right only).
COMPLEX_SCALAR_PRODUCT(hipFloatComplex, unsigned short)
COMPLEX_SCALAR_PRODUCT(hipFloatComplex, signed short)
COMPLEX_SCALAR_PRODUCT(hipFloatComplex, unsigned int)
COMPLEX_SCALAR_PRODUCT(hipFloatComplex, signed int)
COMPLEX_SCALAR_PRODUCT(hipFloatComplex, float)
COMPLEX_SCALAR_PRODUCT(hipFloatComplex, unsigned long)
COMPLEX_SCALAR_PRODUCT(hipFloatComplex, signed long)
COMPLEX_SCALAR_PRODUCT(hipFloatComplex, double)
COMPLEX_SCALAR_PRODUCT(hipFloatComplex, signed long long)
COMPLEX_SCALAR_PRODUCT(hipFloatComplex, unsigned long long)
// Instantiate the full operator set for the double-precision complex type.
COMPLEX_NEG_OP_OVERLOAD(hipDoubleComplex)
COMPLEX_EQ_OP_OVERLOAD(hipDoubleComplex)
COMPLEX_NE_OP_OVERLOAD(hipDoubleComplex)
COMPLEX_ADD_OP_OVERLOAD(hipDoubleComplex)
COMPLEX_SUB_OP_OVERLOAD(hipDoubleComplex)
COMPLEX_MUL_OP_OVERLOAD(hipDoubleComplex)
COMPLEX_DIV_OP_OVERLOAD(hipDoubleComplex)
COMPLEX_ADD_PREOP_OVERLOAD(hipDoubleComplex)
COMPLEX_SUB_PREOP_OVERLOAD(hipDoubleComplex)
COMPLEX_MUL_PREOP_OVERLOAD(hipDoubleComplex)
COMPLEX_DIV_PREOP_OVERLOAD(hipDoubleComplex)
COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, unsigned short)
COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, signed short)
COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, unsigned int)
COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, signed int)
COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, float)
COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, unsigned long)
COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, signed long)
COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, double)
COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, signed long long)
COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, unsigned long long)
#endif
  5578. typedef hipFloatComplex hipComplex;
  5579. __HOST_DEVICE__ static inline hipComplex make_hipComplex(float x, float y) {
  5580. return make_hipFloatComplex(x, y);
  5581. }
  5582. __HOST_DEVICE__ static inline hipFloatComplex hipComplexDoubleToFloat(hipDoubleComplex z) {
  5583. return make_hipFloatComplex((float)z.x, (float)z.y);
  5584. }
  5585. __HOST_DEVICE__ static inline hipDoubleComplex hipComplexFloatToDouble(hipFloatComplex z) {
  5586. return make_hipDoubleComplex((double)z.x, (double)z.y);
  5587. }
  5588. __HOST_DEVICE__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r) {
  5589. float real = (p.x * q.x) + r.x;
  5590. float imag = (q.x * p.y) + r.y;
  5591. real = -(p.y * q.y) + real;
  5592. imag = (p.x * q.y) + imag;
  5593. return make_hipComplex(real, imag);
  5594. }
  5595. __HOST_DEVICE__ static inline hipDoubleComplex hipCfma(hipDoubleComplex p, hipDoubleComplex q,
  5596. hipDoubleComplex r) {
  5597. double real = (p.x * q.x) + r.x;
  5598. double imag = (q.x * p.y) + r.y;
  5599. real = -(p.y * q.y) + real;
  5600. imag = (p.x * q.y) + imag;
  5601. return make_hipDoubleComplex(real, imag);
  5602. }
  5603. #endif //HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COMPLEX_H
  5604. /*
  5605. Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
  5606. Permission is hereby granted, free of charge, to any person obtaining a copy
  5607. of this software and associated documentation files (the "Software"), to deal
  5608. in the Software without restriction, including without limitation the rights
  5609. to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  5610. copies of the Software, and to permit persons to whom the Software is
  5611. furnished to do so, subject to the following conditions:
  5612. The above copyright notice and this permission notice shall be included in
  5613. all copies or substantial portions of the Software.
  5614. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  5615. IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  5616. FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  5617. AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  5618. LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  5619. OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  5620. THE SOFTWARE.
  5621. */
#ifndef AMD_HIP_MATH_CONSTANTS_H
#define AMD_HIP_MATH_CONSTANTS_H
// single precision constants
// Bit-pattern constants are materialized through __int_as_float so the exact
// IEEE-754 encoding is preserved (INF, NaN, smallest denormal, largest normal).
#define HIP_INF_F __int_as_float(0x7f800000U)
#define HIP_NAN_F __int_as_float(0x7fffffffU)
#define HIP_MIN_DENORM_F __int_as_float(0x00000001U)
#define HIP_MAX_NORMAL_F __int_as_float(0x7f7fffffU)
#define HIP_NEG_ZERO_F __int_as_float(0x80000000U)
#define HIP_ZERO_F 0.0F
#define HIP_ONE_F 1.0F
#define HIP_SQRT_HALF_F 0.707106781F
// _HI_/_LO_ pairs split a constant into a leading part and a correction term
// for extended-precision arithmetic.
#define HIP_SQRT_HALF_HI_F 0.707106781F
#define HIP_SQRT_HALF_LO_F 1.210161749e-08F
#define HIP_SQRT_TWO_F 1.414213562F
#define HIP_THIRD_F 0.333333333F
#define HIP_PIO4_F 0.785398163F
#define HIP_PIO2_F 1.570796327F
#define HIP_3PIO4_F 2.356194490F
#define HIP_2_OVER_PI_F 0.636619772F
#define HIP_SQRT_2_OVER_PI_F 0.797884561F
#define HIP_PI_F 3.141592654F
#define HIP_L2E_F 1.442695041F
#define HIP_L2T_F 3.321928094F
#define HIP_LG2_F 0.301029996F
#define HIP_LGE_F 0.434294482F
#define HIP_LN2_F 0.693147181F
#define HIP_LNT_F 2.302585093F
#define HIP_LNPI_F 1.144729886F
#define HIP_TWO_TO_M126_F 1.175494351e-38F
#define HIP_TWO_TO_126_F 8.507059173e37F
#define HIP_NORM_HUGE_F 3.402823466e38F
#define HIP_TWO_TO_23_F 8388608.0F
#define HIP_TWO_TO_24_F 16777216.0F
#define HIP_TWO_TO_31_F 2147483648.0F
#define HIP_TWO_TO_32_F 4294967296.0F
// remquo quotient fields: low 3 bits of the quotient are reported.
#define HIP_REMQUO_BITS_F 3U
#define HIP_REMQUO_MASK_F (~((~0U)<<HIP_REMQUO_BITS_F))
// NOTE(review): presumably the argument-magnitude threshold beyond which
// single-precision trig reduction loses precision — confirm against the ocml sources.
#define HIP_TRIG_PLOSS_F 105615.0F
// double precision constants
#define HIP_INF __longlong_as_double(0x7ff0000000000000ULL)
// NOTE: this NaN pattern has the sign bit set (a negative quiet NaN),
// matching the corresponding CUDA math-constant definition.
#define HIP_NAN __longlong_as_double(0xfff8000000000000ULL)
#define HIP_NEG_ZERO __longlong_as_double(0x8000000000000000ULL)
#define HIP_MIN_DENORM __longlong_as_double(0x0000000000000001ULL)
#define HIP_ZERO 0.0
#define HIP_ONE 1.0
#define HIP_SQRT_TWO 1.4142135623730951e+0
#define HIP_SQRT_HALF 7.0710678118654757e-1
#define HIP_SQRT_HALF_HI 7.0710678118654757e-1
#define HIP_SQRT_HALF_LO (-4.8336466567264567e-17)
#define HIP_THIRD 3.3333333333333333e-1
#define HIP_TWOTHIRD 6.6666666666666667e-1
#define HIP_PIO4 7.8539816339744828e-1
#define HIP_PIO4_HI 7.8539816339744828e-1
#define HIP_PIO4_LO 3.0616169978683830e-17
#define HIP_PIO2 1.5707963267948966e+0
#define HIP_PIO2_HI 1.5707963267948966e+0
#define HIP_PIO2_LO 6.1232339957367660e-17
#define HIP_3PIO4 2.3561944901923448e+0
#define HIP_2_OVER_PI 6.3661977236758138e-1
#define HIP_PI 3.1415926535897931e+0
#define HIP_PI_HI 3.1415926535897931e+0
#define HIP_PI_LO 1.2246467991473532e-16
#define HIP_SQRT_2PI 2.5066282746310007e+0
#define HIP_SQRT_2PI_HI 2.5066282746310007e+0
#define HIP_SQRT_2PI_LO (-1.8328579980459167e-16)
#define HIP_SQRT_PIO2 1.2533141373155003e+0
#define HIP_SQRT_PIO2_HI 1.2533141373155003e+0
#define HIP_SQRT_PIO2_LO (-9.1642899902295834e-17)
#define HIP_SQRT_2OPI 7.9788456080286536e-1
#define HIP_L2E 1.4426950408889634e+0
#define HIP_L2E_HI 1.4426950408889634e+0
#define HIP_L2E_LO 2.0355273740931033e-17
#define HIP_L2T 3.3219280948873622e+0
#define HIP_LG2 3.0102999566398120e-1
#define HIP_LG2_HI 3.0102999566398120e-1
#define HIP_LG2_LO (-2.8037281277851704e-18)
#define HIP_LGE 4.3429448190325182e-1
#define HIP_LGE_HI 4.3429448190325182e-1
#define HIP_LGE_LO 1.09831965021676510e-17
#define HIP_LN2 6.9314718055994529e-1
#define HIP_LN2_HI 6.9314718055994529e-1
#define HIP_LN2_LO 2.3190468138462996e-17
#define HIP_LNT 2.3025850929940459e+0
#define HIP_LNT_HI 2.3025850929940459e+0
#define HIP_LNT_LO (-2.1707562233822494e-16)
#define HIP_LNPI 1.1447298858494002e+0
// ln(2)/lg(2) scaled by key exponent bounds (1024/1025/1075) for exp/log range checks.
#define HIP_LN2_X_1024 7.0978271289338397e+2
#define HIP_LN2_X_1025 7.1047586007394398e+2
#define HIP_LN2_X_1075 7.4513321910194122e+2
#define HIP_LG2_X_1024 3.0825471555991675e+2
#define HIP_LG2_X_1075 3.2360724533877976e+2
#define HIP_TWO_TO_23 8388608.0
#define HIP_TWO_TO_52 4503599627370496.0
#define HIP_TWO_TO_53 9007199254740992.0
#define HIP_TWO_TO_54 18014398509481984.0
#define HIP_TWO_TO_M54 5.5511151231257827e-17
#define HIP_TWO_TO_M1022 2.22507385850720140e-308
#define HIP_TRIG_PLOSS 2147483648.0
// 1.5 * 2^52 — presumably the classic add-then-subtract double->int rounding
// trick constant; confirm against usage in the math library.
#define HIP_DBL2INT_CVT 6755399441055744.0
#endif
  5722. /*
  5723. Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
  5724. Permission is hereby granted, free of charge, to any person obtaining a copy
  5725. of this software and associated documentation files (the "Software"), to deal
  5726. in the Software without restriction, including without limitation the rights
  5727. to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  5728. copies of the Software, and to permit persons to whom the Software is
  5729. furnished to do so, subject to the following conditions:
  5730. The above copyright notice and this permission notice shall be included in
  5731. all copies or substantial portions of the Software.
  5732. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  5733. IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  5734. FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  5735. AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  5736. LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  5737. OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  5738. THE SOFTWARE.
  5739. */
  5740. #pragma once
  5741. #if !defined(__HIPCC_RTC__)
  5742. #include "host_defines.h"
  5743. #include "amd_hip_vector_types.h" // For Native_vec_
  5744. #endif
  5745. #if defined(__cplusplus)
  5746. extern "C" {
  5747. #endif
  5748. // DOT FUNCTIONS
  5749. #if defined(__clang__) && defined(__HIP__)
  5750. __device__
  5751. __attribute__((const))
  5752. int __ockl_sdot2(
  5753. HIP_vector_base<short, 2>::Native_vec_,
  5754. HIP_vector_base<short, 2>::Native_vec_,
  5755. int, bool);
  5756. __device__
  5757. __attribute__((const))
  5758. unsigned int __ockl_udot2(
  5759. HIP_vector_base<unsigned short, 2>::Native_vec_,
  5760. HIP_vector_base<unsigned short, 2>::Native_vec_,
  5761. unsigned int, bool);
  5762. __device__
  5763. __attribute__((const))
  5764. int __ockl_sdot4(
  5765. HIP_vector_base<char, 4>::Native_vec_,
  5766. HIP_vector_base<char, 4>::Native_vec_,
  5767. int, bool);
  5768. __device__
  5769. __attribute__((const))
  5770. unsigned int __ockl_udot4(
  5771. HIP_vector_base<unsigned char, 4>::Native_vec_,
  5772. HIP_vector_base<unsigned char, 4>::Native_vec_,
  5773. unsigned int, bool);
  5774. __device__
  5775. __attribute__((const))
  5776. int __ockl_sdot8(int, int, int, bool);
  5777. __device__
  5778. __attribute__((const))
  5779. unsigned int __ockl_udot8(unsigned int, unsigned int, unsigned int, bool);
  5780. #endif
  5781. #if !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
  5782. // BEGIN FLOAT
  5783. __device__
  5784. __attribute__((const))
  5785. float __ocml_acos_f32(float);
  5786. __device__
  5787. __attribute__((pure))
  5788. float __ocml_acosh_f32(float);
  5789. __device__
  5790. __attribute__((const))
  5791. float __ocml_asin_f32(float);
  5792. __device__
  5793. __attribute__((pure))
  5794. float __ocml_asinh_f32(float);
  5795. __device__
  5796. __attribute__((const))
  5797. float __ocml_atan2_f32(float, float);
  5798. __device__
  5799. __attribute__((const))
  5800. float __ocml_atan_f32(float);
  5801. __device__
  5802. __attribute__((pure))
  5803. float __ocml_atanh_f32(float);
  5804. __device__
  5805. __attribute__((pure))
  5806. float __ocml_cbrt_f32(float);
  5807. __device__
  5808. __attribute__((const))
  5809. float __ocml_ceil_f32(float);
  5810. __device__
  5811. __attribute__((const))
  5812. __device__
  5813. float __ocml_copysign_f32(float, float);
  5814. __device__
  5815. float __ocml_cos_f32(float);
  5816. __device__
  5817. float __ocml_native_cos_f32(float);
  5818. __device__
  5819. __attribute__((pure))
  5820. __device__
  5821. float __ocml_cosh_f32(float);
  5822. __device__
  5823. float __ocml_cospi_f32(float);
  5824. __device__
  5825. float __ocml_i0_f32(float);
  5826. __device__
  5827. float __ocml_i1_f32(float);
  5828. __device__
  5829. __attribute__((pure))
  5830. float __ocml_erfc_f32(float);
  5831. __device__
  5832. __attribute__((pure))
  5833. float __ocml_erfcinv_f32(float);
  5834. __device__
  5835. __attribute__((pure))
  5836. float __ocml_erfcx_f32(float);
  5837. __device__
  5838. __attribute__((pure))
  5839. float __ocml_erf_f32(float);
  5840. __device__
  5841. __attribute__((pure))
  5842. float __ocml_erfinv_f32(float);
  5843. __device__
  5844. __attribute__((pure))
  5845. float __ocml_exp10_f32(float);
  5846. __device__
  5847. __attribute__((pure))
  5848. float __ocml_native_exp10_f32(float);
  5849. __device__
  5850. __attribute__((pure))
  5851. float __ocml_exp2_f32(float);
  5852. __device__
  5853. __attribute__((pure))
  5854. float __ocml_exp_f32(float);
  5855. __device__
  5856. __attribute__((pure))
  5857. float __ocml_native_exp_f32(float);
  5858. __device__
  5859. __attribute__((pure))
  5860. float __ocml_expm1_f32(float);
  5861. __device__
  5862. __attribute__((const))
  5863. float __ocml_fabs_f32(float);
  5864. __device__
  5865. __attribute__((const))
  5866. float __ocml_fdim_f32(float, float);
  5867. __device__
  5868. __attribute__((const))
  5869. float __ocml_floor_f32(float);
  5870. __device__
  5871. __attribute__((const))
  5872. float __ocml_fma_f32(float, float, float);
  5873. __device__
  5874. __attribute__((const))
  5875. float __ocml_fmax_f32(float, float);
  5876. __device__
  5877. __attribute__((const))
  5878. float __ocml_fmin_f32(float, float);
  5879. __device__
  5880. __attribute__((const))
  5881. __device__
  5882. float __ocml_fmod_f32(float, float);
  5883. __device__
  5884. float __ocml_frexp_f32(float, __attribute__((address_space(5))) int*);
  5885. __device__
  5886. __attribute__((const))
  5887. float __ocml_hypot_f32(float, float);
  5888. __device__
  5889. __attribute__((const))
  5890. int __ocml_ilogb_f32(float);
  5891. __device__
  5892. __attribute__((const))
  5893. int __ocml_isfinite_f32(float);
  5894. __device__
  5895. __attribute__((const))
  5896. int __ocml_isinf_f32(float);
  5897. __device__
  5898. __attribute__((const))
  5899. int __ocml_isnan_f32(float);
  5900. __device__
  5901. float __ocml_j0_f32(float);
  5902. __device__
  5903. float __ocml_j1_f32(float);
  5904. __device__
  5905. __attribute__((const))
  5906. float __ocml_ldexp_f32(float, int);
  5907. __device__
  5908. float __ocml_lgamma_f32(float);
  5909. __device__
  5910. __attribute__((pure))
  5911. float __ocml_log10_f32(float);
  5912. __device__
  5913. __attribute__((pure))
  5914. float __ocml_native_log10_f32(float);
  5915. __device__
  5916. __attribute__((pure))
  5917. float __ocml_log1p_f32(float);
  5918. __device__
  5919. __attribute__((pure))
  5920. float __ocml_log2_f32(float);
  5921. __device__
  5922. __attribute__((pure))
  5923. float __ocml_native_log2_f32(float);
  5924. __device__
  5925. __attribute__((const))
  5926. float __ocml_logb_f32(float);
  5927. __device__
  5928. __attribute__((pure))
  5929. float __ocml_log_f32(float);
  5930. __device__
  5931. __attribute__((pure))
  5932. float __ocml_native_log_f32(float);
  5933. __device__
  5934. float __ocml_modf_f32(float, __attribute__((address_space(5))) float*);
  5935. __device__
  5936. __attribute__((const))
  5937. float __ocml_nearbyint_f32(float);
  5938. __device__
  5939. __attribute__((const))
  5940. float __ocml_nextafter_f32(float, float);
  5941. __device__
  5942. __attribute__((const))
  5943. float __ocml_len3_f32(float, float, float);
  5944. __device__
  5945. __attribute__((const))
  5946. float __ocml_len4_f32(float, float, float, float);
  5947. __device__
  5948. __attribute__((pure))
  5949. float __ocml_ncdf_f32(float);
  5950. __device__
  5951. __attribute__((pure))
  5952. float __ocml_ncdfinv_f32(float);
  5953. __device__
  5954. __attribute__((pure))
  5955. float __ocml_pow_f32(float, float);
  5956. __device__
  5957. __attribute__((pure))
  5958. float __ocml_pown_f32(float, int);
  5959. __device__
  5960. __attribute__((pure))
  5961. float __ocml_rcbrt_f32(float);
  5962. __device__
  5963. __attribute__((const))
  5964. float __ocml_remainder_f32(float, float);
  5965. __device__
  5966. float __ocml_remquo_f32(float, float, __attribute__((address_space(5))) int*);
  5967. __device__
  5968. __attribute__((const))
  5969. float __ocml_rhypot_f32(float, float);
  5970. __device__
  5971. __attribute__((const))
  5972. float __ocml_rint_f32(float);
  5973. __device__
  5974. __attribute__((const))
  5975. float __ocml_rlen3_f32(float, float, float);
  5976. __device__
  5977. __attribute__((const))
  5978. float __ocml_rlen4_f32(float, float, float, float);
  5979. __device__
  5980. __attribute__((const))
  5981. float __ocml_round_f32(float);
  5982. __device__
  5983. __attribute__((pure))
  5984. float __ocml_rsqrt_f32(float);
  5985. __device__
  5986. __attribute__((const))
  5987. float __ocml_scalb_f32(float, float);
  5988. __device__
  5989. __attribute__((const))
  5990. float __ocml_scalbn_f32(float, int);
  5991. __device__
  5992. __attribute__((const))
  5993. int __ocml_signbit_f32(float);
  5994. __device__
  5995. float __ocml_sincos_f32(float, __attribute__((address_space(5))) float*);
  5996. __device__
  5997. float __ocml_sincospi_f32(float, __attribute__((address_space(5))) float*);
  5998. __device__
  5999. float __ocml_sin_f32(float);
  6000. __device__
  6001. float __ocml_native_sin_f32(float);
  6002. __device__
  6003. __attribute__((pure))
  6004. float __ocml_sinh_f32(float);
  6005. __device__
  6006. float __ocml_sinpi_f32(float);
  6007. __device__
  6008. __attribute__((const))
  6009. float __ocml_sqrt_f32(float);
  6010. __device__
  6011. __attribute__((const))
  6012. float __ocml_native_sqrt_f32(float);
  6013. __device__
  6014. float __ocml_tan_f32(float);
  6015. __device__
  6016. __attribute__((pure))
  6017. float __ocml_tanh_f32(float);
  6018. __device__
  6019. float __ocml_tgamma_f32(float);
  6020. __device__
  6021. __attribute__((const))
  6022. float __ocml_trunc_f32(float);
  6023. __device__
  6024. float __ocml_y0_f32(float);
  6025. __device__
  6026. float __ocml_y1_f32(float);
  6027. // BEGIN INTRINSICS
  6028. __device__
  6029. __attribute__((const))
  6030. float __ocml_add_rte_f32(float, float);
  6031. __device__
  6032. __attribute__((const))
  6033. float __ocml_add_rtn_f32(float, float);
  6034. __device__
  6035. __attribute__((const))
  6036. float __ocml_add_rtp_f32(float, float);
  6037. __device__
  6038. __attribute__((const))
  6039. float __ocml_add_rtz_f32(float, float);
  6040. __device__
  6041. __attribute__((const))
  6042. float __ocml_sub_rte_f32(float, float);
  6043. __device__
  6044. __attribute__((const))
  6045. float __ocml_sub_rtn_f32(float, float);
  6046. __device__
  6047. __attribute__((const))
  6048. float __ocml_sub_rtp_f32(float, float);
  6049. __device__
  6050. __attribute__((const))
  6051. float __ocml_sub_rtz_f32(float, float);
  6052. __device__
  6053. __attribute__((const))
  6054. float __ocml_mul_rte_f32(float, float);
  6055. __device__
  6056. __attribute__((const))
  6057. float __ocml_mul_rtn_f32(float, float);
  6058. __device__
  6059. __attribute__((const))
  6060. float __ocml_mul_rtp_f32(float, float);
  6061. __device__
  6062. __attribute__((const))
  6063. float __ocml_mul_rtz_f32(float, float);
  6064. __device__
  6065. __attribute__((const))
  6066. float __ocml_div_rte_f32(float, float);
  6067. __device__
  6068. __attribute__((const))
  6069. float __ocml_div_rtn_f32(float, float);
  6070. __device__
  6071. __attribute__((const))
  6072. float __ocml_div_rtp_f32(float, float);
  6073. __device__
  6074. __attribute__((const))
  6075. float __ocml_div_rtz_f32(float, float);
  6076. __device__
  6077. __attribute__((const))
  6078. float __ocml_sqrt_rte_f32(float);
  6079. __device__
  6080. __attribute__((const))
  6081. float __ocml_sqrt_rtn_f32(float);
  6082. __device__
  6083. __attribute__((const))
  6084. float __ocml_sqrt_rtp_f32(float);
  6085. __device__
  6086. __attribute__((const))
  6087. float __ocml_sqrt_rtz_f32(float);
  6088. __device__
  6089. __attribute__((const))
  6090. float __ocml_fma_rte_f32(float, float, float);
  6091. __device__
  6092. __attribute__((const))
  6093. float __ocml_fma_rtn_f32(float, float, float);
  6094. __device__
  6095. __attribute__((const))
  6096. float __ocml_fma_rtp_f32(float, float, float);
  6097. __device__
  6098. __attribute__((const))
  6099. float __ocml_fma_rtz_f32(float, float, float);
  6100. // END INTRINSICS
  6101. // END FLOAT
  6102. // BEGIN DOUBLE
  6103. __device__
  6104. __attribute__((const))
  6105. double __ocml_acos_f64(double);
  6106. __device__
  6107. __attribute__((pure))
  6108. double __ocml_acosh_f64(double);
  6109. __device__
  6110. __attribute__((const))
  6111. double __ocml_asin_f64(double);
  6112. __device__
  6113. __attribute__((pure))
  6114. double __ocml_asinh_f64(double);
  6115. __device__
  6116. __attribute__((const))
  6117. double __ocml_atan2_f64(double, double);
  6118. __device__
  6119. __attribute__((const))
  6120. double __ocml_atan_f64(double);
  6121. __device__
  6122. __attribute__((pure))
  6123. double __ocml_atanh_f64(double);
  6124. __device__
  6125. __attribute__((pure))
  6126. double __ocml_cbrt_f64(double);
  6127. __device__
  6128. __attribute__((const))
  6129. double __ocml_ceil_f64(double);
  6130. __device__
  6131. __attribute__((const))
  6132. double __ocml_copysign_f64(double, double);
  6133. __device__
  6134. double __ocml_cos_f64(double);
  6135. __device__
  6136. __attribute__((pure))
  6137. double __ocml_cosh_f64(double);
  6138. __device__
  6139. double __ocml_cospi_f64(double);
  6140. __device__
  6141. double __ocml_i0_f64(double);
  6142. __device__
  6143. double __ocml_i1_f64(double);
  6144. __device__
  6145. __attribute__((pure))
  6146. double __ocml_erfc_f64(double);
  6147. __device__
  6148. __attribute__((pure))
  6149. double __ocml_erfcinv_f64(double);
  6150. __device__
  6151. __attribute__((pure))
  6152. double __ocml_erfcx_f64(double);
  6153. __device__
  6154. __attribute__((pure))
  6155. double __ocml_erf_f64(double);
  6156. __device__
  6157. __attribute__((pure))
  6158. double __ocml_erfinv_f64(double);
  6159. __device__
  6160. __attribute__((pure))
  6161. double __ocml_exp10_f64(double);
  6162. __device__
  6163. __attribute__((pure))
  6164. double __ocml_exp2_f64(double);
  6165. __device__
  6166. __attribute__((pure))
  6167. double __ocml_exp_f64(double);
  6168. __device__
  6169. __attribute__((pure))
  6170. double __ocml_expm1_f64(double);
  6171. __device__
  6172. __attribute__((const))
  6173. double __ocml_fabs_f64(double);
  6174. __device__
  6175. __attribute__((const))
  6176. double __ocml_fdim_f64(double, double);
  6177. __device__
  6178. __attribute__((const))
  6179. double __ocml_floor_f64(double);
  6180. __device__
  6181. __attribute__((const))
  6182. double __ocml_fma_f64(double, double, double);
  6183. __device__
  6184. __attribute__((const))
  6185. double __ocml_fmax_f64(double, double);
  6186. __device__
  6187. __attribute__((const))
  6188. double __ocml_fmin_f64(double, double);
  6189. __device__
  6190. __attribute__((const))
  6191. double __ocml_fmod_f64(double, double);
  6192. __device__
  6193. double __ocml_frexp_f64(double, __attribute__((address_space(5))) int*);
  6194. __device__
  6195. __attribute__((const))
  6196. double __ocml_hypot_f64(double, double);
  6197. __device__
  6198. __attribute__((const))
  6199. int __ocml_ilogb_f64(double);
  6200. __device__
  6201. __attribute__((const))
  6202. int __ocml_isfinite_f64(double);
  6203. __device__
  6204. __attribute__((const))
  6205. int __ocml_isinf_f64(double);
  6206. __device__
  6207. __attribute__((const))
  6208. int __ocml_isnan_f64(double);
  6209. __device__
  6210. double __ocml_j0_f64(double);
  6211. __device__
  6212. double __ocml_j1_f64(double);
  6213. __device__
  6214. __attribute__((const))
  6215. double __ocml_ldexp_f64(double, int);
  6216. __device__
  6217. double __ocml_lgamma_f64(double);
  6218. __device__
  6219. __attribute__((pure))
  6220. double __ocml_log10_f64(double);
  6221. __device__
  6222. __attribute__((pure))
  6223. double __ocml_log1p_f64(double);
  6224. __device__
  6225. __attribute__((pure))
  6226. double __ocml_log2_f64(double);
  6227. __device__
  6228. __attribute__((const))
  6229. double __ocml_logb_f64(double);
  6230. __device__
  6231. __attribute__((pure))
  6232. double __ocml_log_f64(double);
  6233. __device__
  6234. double __ocml_modf_f64(double, __attribute__((address_space(5))) double*);
  6235. __device__
  6236. __attribute__((const))
  6237. double __ocml_nearbyint_f64(double);
  6238. __device__
  6239. __attribute__((const))
  6240. double __ocml_nextafter_f64(double, double);
  6241. __device__
  6242. __attribute__((const))
  6243. double __ocml_len3_f64(double, double, double);
  6244. __device__
  6245. __attribute__((const))
  6246. double __ocml_len4_f64(double, double, double, double);
  6247. __device__
  6248. __attribute__((pure))
  6249. double __ocml_ncdf_f64(double);
  6250. __device__
  6251. __attribute__((pure))
  6252. double __ocml_ncdfinv_f64(double);
  6253. __device__
  6254. __attribute__((pure))
  6255. double __ocml_pow_f64(double, double);
  6256. __device__
  6257. __attribute__((pure))
  6258. double __ocml_pown_f64(double, int);
  6259. __device__
  6260. __attribute__((pure))
  6261. double __ocml_rcbrt_f64(double);
  6262. __device__
  6263. __attribute__((const))
  6264. double __ocml_remainder_f64(double, double);
  6265. __device__
  6266. double __ocml_remquo_f64(
  6267. double, double, __attribute__((address_space(5))) int*);
  6268. __device__
  6269. __attribute__((const))
  6270. double __ocml_rhypot_f64(double, double);
  6271. __device__
  6272. __attribute__((const))
  6273. double __ocml_rint_f64(double);
  6274. __device__
  6275. __attribute__((const))
  6276. double __ocml_rlen3_f64(double, double, double);
  6277. __device__
  6278. __attribute__((const))
  6279. double __ocml_rlen4_f64(double, double, double, double);
  6280. __device__
  6281. __attribute__((const))
  6282. double __ocml_round_f64(double);
  6283. __device__
  6284. __attribute__((pure))
  6285. double __ocml_rsqrt_f64(double);
  6286. __device__
  6287. __attribute__((const))
  6288. double __ocml_scalb_f64(double, double);
  6289. __device__
  6290. __attribute__((const))
  6291. double __ocml_scalbn_f64(double, int);
  6292. __device__
  6293. __attribute__((const))
  6294. int __ocml_signbit_f64(double);
  6295. __device__
  6296. double __ocml_sincos_f64(double, __attribute__((address_space(5))) double*);
  6297. __device__
  6298. double __ocml_sincospi_f64(double, __attribute__((address_space(5))) double*);
  6299. __device__
  6300. double __ocml_sin_f64(double);
  6301. __device__
  6302. __attribute__((pure))
  6303. double __ocml_sinh_f64(double);
  6304. __device__
  6305. double __ocml_sinpi_f64(double);
  6306. __device__
  6307. __attribute__((const))
  6308. double __ocml_sqrt_f64(double);
  6309. __device__
  6310. double __ocml_tan_f64(double);
  6311. __device__
  6312. __attribute__((pure))
  6313. double __ocml_tanh_f64(double);
  6314. __device__
  6315. double __ocml_tgamma_f64(double);
  6316. __device__
  6317. __attribute__((const))
  6318. double __ocml_trunc_f64(double);
  6319. __device__
  6320. double __ocml_y0_f64(double);
  6321. __device__
  6322. double __ocml_y1_f64(double);
  6323. // BEGIN INTRINSICS
  6324. __device__
  6325. __attribute__((const))
  6326. double __ocml_add_rte_f64(double, double);
  6327. __device__
  6328. __attribute__((const))
  6329. double __ocml_add_rtn_f64(double, double);
  6330. __device__
  6331. __attribute__((const))
  6332. double __ocml_add_rtp_f64(double, double);
  6333. __device__
  6334. __attribute__((const))
  6335. double __ocml_add_rtz_f64(double, double);
  6336. __device__
  6337. __attribute__((const))
  6338. double __ocml_sub_rte_f64(double, double);
  6339. __device__
  6340. __attribute__((const))
  6341. double __ocml_sub_rtn_f64(double, double);
  6342. __device__
  6343. __attribute__((const))
  6344. double __ocml_sub_rtp_f64(double, double);
  6345. __device__
  6346. __attribute__((const))
  6347. double __ocml_sub_rtz_f64(double, double);
  6348. __device__
  6349. __attribute__((const))
  6350. double __ocml_mul_rte_f64(double, double);
  6351. __device__
  6352. __attribute__((const))
  6353. double __ocml_mul_rtn_f64(double, double);
  6354. __device__
  6355. __attribute__((const))
  6356. double __ocml_mul_rtp_f64(double, double);
  6357. __device__
  6358. __attribute__((const))
  6359. double __ocml_mul_rtz_f64(double, double);
  6360. __device__
  6361. __attribute__((const))
  6362. double __ocml_div_rte_f64(double, double);
  6363. __device__
  6364. __attribute__((const))
  6365. double __ocml_div_rtn_f64(double, double);
  6366. __device__
  6367. __attribute__((const))
  6368. double __ocml_div_rtp_f64(double, double);
  6369. __device__
  6370. __attribute__((const))
  6371. double __ocml_div_rtz_f64(double, double);
  6372. __device__
  6373. __attribute__((const))
  6374. double __ocml_sqrt_rte_f64(double);
  6375. __device__
  6376. __attribute__((const))
  6377. double __ocml_sqrt_rtn_f64(double);
  6378. __device__
  6379. __attribute__((const))
  6380. double __ocml_sqrt_rtp_f64(double);
  6381. __device__
  6382. __attribute__((const))
  6383. double __ocml_sqrt_rtz_f64(double);
  6384. __device__
  6385. __attribute__((const))
  6386. double __ocml_fma_rte_f64(double, double, double);
  6387. __device__
  6388. __attribute__((const))
  6389. double __ocml_fma_rtn_f64(double, double, double);
  6390. __device__
  6391. __attribute__((const))
  6392. double __ocml_fma_rtp_f64(double, double, double);
  6393. __device__
  6394. __attribute__((const))
  6395. double __ocml_fma_rtz_f64(double, double, double);
  6396. // END INTRINSICS
  6397. // END DOUBLE
  6398. #endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
  6399. #if defined(__cplusplus)
  6400. } // extern "C"
  6401. #endif
  6402. /*
  6403. Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
  6404. Permission is hereby granted, free of charge, to any person obtaining a copy
  6405. of this software and associated documentation files (the "Software"), to deal
  6406. in the Software without restriction, including without limitation the rights
  6407. to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  6408. copies of the Software, and to permit persons to whom the Software is
  6409. furnished to do so, subject to the following conditions:
  6410. The above copyright notice and this permission notice shall be included in
  6411. all copies or substantial portions of the Software.
  6412. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  6413. IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  6414. FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  6415. AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  6416. LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  6417. OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  6418. THE SOFTWARE.
  6419. */
  6420. /**
  6421. * @file amd_detail/device_library_decls.h
  6422. * @brief Contains declarations for types and functions in device library.
  6423. * Uses int64_t and uint64_t instead of long, long long, unsigned
  6424. * long and unsigned long long types for device library API
  6425. * declarations.
  6426. */
  6427. #ifndef HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_LIBRARY_DECLS_H
  6428. #define HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_LIBRARY_DECLS_H
  6429. #if !defined(__HIPCC_RTC__)
  6430. #include "hip/amd_detail/host_defines.h"
  6431. #endif
  6432. typedef unsigned char uchar;
  6433. typedef unsigned short ushort;
  6434. typedef unsigned int uint;
  6435. typedef unsigned long ulong;
  6436. typedef unsigned long long ullong;
  6437. extern "C" __device__ __attribute__((const)) bool __ockl_wfany_i32(int);
  6438. extern "C" __device__ __attribute__((const)) bool __ockl_wfall_i32(int);
  6439. extern "C" __device__ uint __ockl_activelane_u32(void);
  6440. extern "C" __device__ __attribute__((const)) uint __ockl_mul24_u32(uint, uint);
  6441. extern "C" __device__ __attribute__((const)) int __ockl_mul24_i32(int, int);
  6442. extern "C" __device__ __attribute__((const)) uint __ockl_mul_hi_u32(uint, uint);
  6443. extern "C" __device__ __attribute__((const)) int __ockl_mul_hi_i32(int, int);
  6444. extern "C" __device__ __attribute__((const)) uint __ockl_sadd_u32(uint, uint, uint);
  6445. extern "C" __device__ __attribute__((const)) uchar __ockl_clz_u8(uchar);
  6446. extern "C" __device__ __attribute__((const)) ushort __ockl_clz_u16(ushort);
  6447. extern "C" __device__ __attribute__((const)) uint __ockl_clz_u32(uint);
  6448. extern "C" __device__ __attribute__((const)) uint64_t __ockl_clz_u64(uint64_t);
  6449. extern "C" __device__ __attribute__((const)) float __ocml_floor_f32(float);
  6450. extern "C" __device__ __attribute__((const)) float __ocml_rint_f32(float);
  6451. extern "C" __device__ __attribute__((const)) float __ocml_ceil_f32(float);
  6452. extern "C" __device__ __attribute__((const)) float __ocml_trunc_f32(float);
  6453. extern "C" __device__ __attribute__((const)) float __ocml_fmin_f32(float, float);
  6454. extern "C" __device__ __attribute__((const)) float __ocml_fmax_f32(float, float);
  6455. extern "C" __device__ __attribute__((const)) float __ocml_cvtrtn_f32_f64(double);
  6456. extern "C" __device__ __attribute__((const)) float __ocml_cvtrtp_f32_f64(double);
  6457. extern "C" __device__ __attribute__((const)) float __ocml_cvtrtz_f32_f64(double);
  6458. extern "C" __device__ __attribute__((const)) _Float16 __ocml_cvtrtn_f16_f32(float);
  6459. extern "C" __device__ __attribute__((const)) _Float16 __ocml_cvtrtp_f16_f32(float);
  6460. extern "C" __device__ __attribute__((const)) _Float16 __ocml_cvtrtz_f16_f32(float);
  6461. extern "C" __device__ __attribute__((const)) float __ocml_cvtrtn_f32_s32(int);
  6462. extern "C" __device__ __attribute__((const)) float __ocml_cvtrtp_f32_s32(int);
  6463. extern "C" __device__ __attribute__((const)) float __ocml_cvtrtz_f32_s32(int);
  6464. extern "C" __device__ __attribute__((const)) float __ocml_cvtrtn_f32_u32(uint32_t);
  6465. extern "C" __device__ __attribute__((const)) float __ocml_cvtrtp_f32_u32(uint32_t);
  6466. extern "C" __device__ __attribute__((const)) float __ocml_cvtrtz_f32_u32(uint32_t);
  6467. extern "C" __device__ __attribute__((const)) float __ocml_cvtrtn_f32_s64(int64_t);
  6468. extern "C" __device__ __attribute__((const)) float __ocml_cvtrtp_f32_s64(int64_t);
  6469. extern "C" __device__ __attribute__((const)) float __ocml_cvtrtz_f32_s64(int64_t);
  6470. extern "C" __device__ __attribute__((const)) float __ocml_cvtrtn_f32_u64(uint64_t);
  6471. extern "C" __device__ __attribute__((const)) float __ocml_cvtrtp_f32_u64(uint64_t);
  6472. extern "C" __device__ __attribute__((const)) float __ocml_cvtrtz_f32_u64(uint64_t);
  6473. extern "C" __device__ __attribute__((const)) double __ocml_cvtrtn_f64_s64(int64_t);
  6474. extern "C" __device__ __attribute__((const)) double __ocml_cvtrtp_f64_s64(int64_t);
  6475. extern "C" __device__ __attribute__((const)) double __ocml_cvtrtz_f64_s64(int64_t);
  6476. extern "C" __device__ __attribute__((const)) double __ocml_cvtrtn_f64_u64(uint64_t);
  6477. extern "C" __device__ __attribute__((const)) double __ocml_cvtrtp_f64_u64(uint64_t);
  6478. extern "C" __device__ __attribute__((const)) double __ocml_cvtrtz_f64_u64(uint64_t);
  6479. extern "C" __device__ __attribute__((convergent)) void __ockl_gws_init(uint nwm1, uint rid);
  6480. extern "C" __device__ __attribute__((convergent)) void __ockl_gws_barrier(uint nwm1, uint rid);
  6481. extern "C" __device__ __attribute__((const)) uint32_t __ockl_lane_u32();
  6482. extern "C" __device__ __attribute__((const)) int __ockl_grid_is_valid(void);
  6483. extern "C" __device__ __attribute__((convergent)) void __ockl_grid_sync(void);
  6484. extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_num_grids(void);
  6485. extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_grid_rank(void);
  6486. extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_size(void);
  6487. extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_thread_rank(void);
  6488. extern "C" __device__ __attribute__((const)) int __ockl_multi_grid_is_valid(void);
  6489. extern "C" __device__ __attribute__((convergent)) void __ockl_multi_grid_sync(void);
  6490. extern "C" __device__ void __ockl_atomic_add_noret_f32(float*, float);
  6491. extern "C" __device__ __attribute__((convergent)) int __ockl_wgred_add_i32(int a);
  6492. extern "C" __device__ __attribute__((convergent)) int __ockl_wgred_and_i32(int a);
  6493. extern "C" __device__ __attribute__((convergent)) int __ockl_wgred_or_i32(int a);
  6494. extern "C" __device__ uint64_t __ockl_fprintf_stderr_begin();
  6495. extern "C" __device__ uint64_t __ockl_fprintf_append_args(uint64_t msg_desc, uint32_t num_args,
  6496. uint64_t value0, uint64_t value1,
  6497. uint64_t value2, uint64_t value3,
  6498. uint64_t value4, uint64_t value5,
  6499. uint64_t value6, uint32_t is_last);
  6500. extern "C" __device__ uint64_t __ockl_fprintf_append_string_n(uint64_t msg_desc, const char* data,
  6501. uint64_t length, uint32_t is_last);
  6502. // Introduce local address space
  6503. #define __local __attribute__((address_space(3)))
  6504. #ifdef __HIP_DEVICE_COMPILE__
// Reinterpret a 32-bit value as a pointer in the workgroup-local (LDS)
// address space (address_space(3), via the __local macro defined above).
__device__ inline static __local void* __to_local(unsigned x) { return (__local void*)x; }
  6506. #endif //__HIP_DEVICE_COMPILE__
  6507. // Using hip.amdgcn.bc - sync threads
  6508. #define __CLK_LOCAL_MEM_FENCE 0x01
  6509. typedef unsigned __cl_mem_fence_flags;
  6510. #endif
  6511. /*
  6512. Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
  6513. Permission is hereby granted, free of charge, to any person obtaining a copy
  6514. of this software and associated documentation files (the "Software"), to deal
  6515. in the Software without restriction, including without limitation the rights
  6516. to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  6517. copies of the Software, and to permit persons to whom the Software is
  6518. furnished to do so, subject to the following conditions:
  6519. The above copyright notice and this permission notice shall be included in
  6520. all copies or substantial portions of the Software.
  6521. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  6522. IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  6523. FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  6524. AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  6525. LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  6526. OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  6527. THE SOFTWARE.
  6528. */
  6529. #ifndef HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_FUNCTIONS_H
  6530. #define HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_FUNCTIONS_H
  6531. #if !defined(__HIPCC_RTC__)
  6532. #include <hip/amd_detail/amd_hip_common.h>
  6533. #include "host_defines.h"
  6534. #include "math_fwd.h"
  6535. #include <hip/hip_runtime_api.h>
  6536. #include <stddef.h>
  6537. #include <hip/hip_vector_types.h>
  6538. #include <hip/amd_detail/device_library_decls.h>
  6539. #endif // !defined(__HIPCC_RTC__)
#if defined(__clang__) && defined(__HIP__)
// Device-side printf is supplied by the toolchain when compiling HIP with clang.
extern "C" __device__ int printf(const char *fmt, ...);
#else
// Fallback for other compilers: accept any argument list and do nothing,
// so device code that calls printf still compiles.
template <typename... All>
static inline __device__ void printf(const char* format, All... all) {}
#endif // defined(__clang__) && defined(__HIP__)
  6546. extern "C" __device__ unsigned long long __ockl_steadyctr_u64();
  6547. /*
  6548. Integer Intrinsics
  6549. */
// Integer intrinsic functions: __popc, __clz, __ffs, __brev
// Population count: number of set bits in a 32-bit value.
__device__ static inline unsigned int __popc(unsigned int input) {
return __builtin_popcount(input);
}
// Population count of a 64-bit value.
__device__ static inline unsigned int __popcll(unsigned long long int input) {
return __builtin_popcountll(input);
}
// Count leading zeros of a 32-bit value, via the OCKL device library.
// NOTE(review): result for input == 0 depends on __ockl_clz_u32 — presumably 32.
__device__ static inline int __clz(int input) {
return __ockl_clz_u32((uint)input);
}
// Count leading zeros of a 64-bit value, via the OCKL device library.
__device__ static inline int __clzll(long long int input) {
return __ockl_clz_u64((uint64_t)input);
}
// Find-first-set: 1-based position of the least-significant set bit;
// returns 0 for input == 0 (the -1 from the ternary cancels the +1).
__device__ static inline unsigned int __ffs(unsigned int input) {
return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
}
// 64-bit find-first-set; 0 when no bit is set.
__device__ static inline unsigned int __ffsll(unsigned long long int input) {
return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
}
// Signed overload of __ffs; the bit pattern of the int is examined as-is.
__device__ static inline unsigned int __ffs(int input) {
return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
}
// Signed 64-bit overload of __ffsll.
__device__ static inline unsigned int __ffsll(long long int input) {
return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
}
  6575. // Given a 32/64-bit value exec mask and an integer value base (between 0 and WAVEFRONT_SIZE),
  6576. // find the n-th (given by offset) set bit in the exec mask from the base bit, and return the bit position.
  6577. // If not found, return -1.
  6578. __device__ static int32_t __fns64(uint64_t mask, uint32_t base, int32_t offset) {
  6579. uint64_t temp_mask = mask;
  6580. int32_t temp_offset = offset;
  6581. if (offset == 0) {
  6582. temp_mask &= (1 << base);
  6583. temp_offset = 1;
  6584. }
  6585. else if (offset < 0) {
  6586. temp_mask = __builtin_bitreverse64(mask);
  6587. base = 63 - base;
  6588. temp_offset = -offset;
  6589. }
  6590. temp_mask = temp_mask & ((~0ULL) << base);
  6591. if (__builtin_popcountll(temp_mask) < temp_offset)
  6592. return -1;
  6593. int32_t total = 0;
  6594. for (int i = 0x20; i > 0; i >>= 1) {
  6595. uint64_t temp_mask_lo = temp_mask & ((1ULL << i) - 1);
  6596. int32_t pcnt = __builtin_popcountll(temp_mask_lo);
  6597. if (pcnt < temp_offset) {
  6598. temp_mask = temp_mask >> i;
  6599. temp_offset -= pcnt;
  6600. total += i;
  6601. }
  6602. else {
  6603. temp_mask = temp_mask_lo;
  6604. }
  6605. }
  6606. if (offset < 0)
  6607. return 63 - total;
  6608. else
  6609. return total;
  6610. }
  6611. __device__ static int32_t __fns32(uint64_t mask, uint32_t base, int32_t offset) {
  6612. uint64_t temp_mask = mask;
  6613. int32_t temp_offset = offset;
  6614. if (offset == 0) {
  6615. temp_mask &= (1 << base);
  6616. temp_offset = 1;
  6617. }
  6618. else if (offset < 0) {
  6619. temp_mask = __builtin_bitreverse64(mask);
  6620. base = 63 - base;
  6621. temp_offset = -offset;
  6622. }
  6623. temp_mask = temp_mask & ((~0ULL) << base);
  6624. if (__builtin_popcountll(temp_mask) < temp_offset)
  6625. return -1;
  6626. int32_t total = 0;
  6627. for (int i = 0x20; i > 0; i >>= 1) {
  6628. uint64_t temp_mask_lo = temp_mask & ((1ULL << i) - 1);
  6629. int32_t pcnt = __builtin_popcountll(temp_mask_lo);
  6630. if (pcnt < temp_offset) {
  6631. temp_mask = temp_mask >> i;
  6632. temp_offset -= pcnt;
  6633. total += i;
  6634. }
  6635. else {
  6636. temp_mask = temp_mask_lo;
  6637. }
  6638. }
  6639. if (offset < 0)
  6640. return 63 - total;
  6641. else
  6642. return total;
  6643. }
// Reverse the bit order of a 32-bit value (bit 0 <-> bit 31, etc.).
__device__ static inline unsigned int __brev(unsigned int input) {
return __builtin_bitreverse32(input);
}
// Reverse the bit order of a 64-bit value.
__device__ static inline unsigned long long int __brevll(unsigned long long int input) {
return __builtin_bitreverse64(input);
}
  6650. __device__ static inline unsigned int __lastbit_u32_u64(uint64_t input) {
  6651. return input == 0 ? -1 : __builtin_ctzl(input);
  6652. }
  6653. __device__ static inline unsigned int __bitextract_u32(unsigned int src0, unsigned int src1, unsigned int src2) {
  6654. uint32_t offset = src1 & 31;
  6655. uint32_t width = src2 & 31;
  6656. return width == 0 ? 0 : (src0 << (32 - offset - width)) >> (32 - width);
  6657. }
  6658. __device__ static inline uint64_t __bitextract_u64(uint64_t src0, unsigned int src1, unsigned int src2) {
  6659. uint64_t offset = src1 & 63;
  6660. uint64_t width = src2 & 63;
  6661. return width == 0 ? 0 : (src0 << (64 - offset - width)) >> (64 - width);
  6662. }
  6663. __device__ static inline unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int src2, unsigned int src3) {
  6664. uint32_t offset = src2 & 31;
  6665. uint32_t width = src3 & 31;
  6666. uint32_t mask = (1 << width) - 1;
  6667. return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
  6668. }
  6669. __device__ static inline uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1, unsigned int src2, unsigned int src3) {
  6670. uint64_t offset = src2 & 63;
  6671. uint64_t width = src3 & 63;
  6672. uint64_t mask = (1ULL << width) - 1;
  6673. return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
  6674. }
// Left funnel shift: high 32 bits of ((hi:lo) << (shift & 31)), built on
// the AMD alignbit builtin. Returns hi unchanged for a zero shift.
__device__ inline unsigned int __funnelshift_l(unsigned int lo, unsigned int hi, unsigned int shift)
{
uint32_t mask_shift = shift & 31;
return mask_shift == 0 ? hi : __builtin_amdgcn_alignbit(hi, lo, 32 - mask_shift);
}
// As __funnelshift_l, but the shift is clamped to [0, 32] instead of masked.
__device__ inline unsigned int __funnelshift_lc(unsigned int lo, unsigned int hi, unsigned int shift)
{
uint32_t min_shift = shift >= 32 ? 32 : shift;
return min_shift == 0 ? hi : __builtin_amdgcn_alignbit(hi, lo, 32 - min_shift);
}
// Right funnel shift: 32 bits of (hi:lo) starting at bit `shift`.
// NOTE(review): the builtin presumably masks the offset to 5 bits, making
// shift effectively modulo 32 here — confirm against alignbit semantics.
__device__ inline unsigned int __funnelshift_r(unsigned int lo, unsigned int hi, unsigned int shift)
{
return __builtin_amdgcn_alignbit(hi, lo, shift);
}
// As __funnelshift_r, but clamped: returns hi for shift >= 32.
__device__ inline unsigned int __funnelshift_rc(unsigned int lo, unsigned int hi, unsigned int shift)
{
return shift >= 32 ? hi : __builtin_amdgcn_alignbit(hi, lo, shift);
}
  6693. __device__ static unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s);
  6694. __device__ static unsigned int __hadd(int x, int y);
  6695. __device__ static int __mul24(int x, int y);
  6696. __device__ static long long int __mul64hi(long long int x, long long int y);
  6697. __device__ static int __mulhi(int x, int y);
  6698. __device__ static int __rhadd(int x, int y);
  6699. __device__ static unsigned int __sad(int x, int y,unsigned int z);
  6700. __device__ static unsigned int __uhadd(unsigned int x, unsigned int y);
  6701. __device__ static int __umul24(unsigned int x, unsigned int y);
  6702. __device__ static unsigned long long int __umul64hi(unsigned long long int x, unsigned long long int y);
  6703. __device__ static unsigned int __umulhi(unsigned int x, unsigned int y);
  6704. __device__ static unsigned int __urhadd(unsigned int x, unsigned int y);
  6705. __device__ static unsigned int __usad(unsigned int x, unsigned int y, unsigned int z);
// 4 bytes viewable either as four unsigned chars or as one 32-bit word;
// used by __byte_perm below to pick apart the selector.
struct ucharHolder {
union {
unsigned char c[4];
unsigned int ui;
};
} __attribute__((aligned(4)));
// 8 bytes viewable either as two 32-bit words or as eight unsigned chars;
// used by __byte_perm to index bytes across the {y, x} pair.
struct uchar2Holder {
union {
unsigned int ui[2];
unsigned char c[8];
};
} __attribute__((aligned(8)));
  6718. __device__
  6719. static inline unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s) {
  6720. struct uchar2Holder cHoldVal;
  6721. struct ucharHolder cHoldKey;
  6722. cHoldKey.ui = s;
  6723. cHoldVal.ui[0] = x;
  6724. cHoldVal.ui[1] = y;
  6725. unsigned int result;
  6726. result = cHoldVal.c[cHoldKey.c[0] & 0x07];
  6727. result += (cHoldVal.c[(cHoldKey.c[0] & 0x70) >> 4] << 8);
  6728. result += (cHoldVal.c[cHoldKey.c[1] & 0x07] << 16);
  6729. result += (cHoldVal.c[(cHoldKey.c[1] & 0x70) >> 4] << 24);
  6730. return result;
  6731. }
  6732. __device__ static inline unsigned int __hadd(int x, int y) {
  6733. int z = x + y;
  6734. int sign = z & 0x8000000;
  6735. int value = z & 0x7FFFFFFF;
  6736. return ((value) >> 1 || sign);
  6737. }
// 24-bit signed multiply via the OCKL device library (product of the low
// 24 bits of x and y).
__device__ static inline int __mul24(int x, int y) {
return __ockl_mul24_i32(x, y);
}
  6741. __device__ static inline long long __mul64hi(long long int x, long long int y) {
  6742. ulong x0 = (ulong)x & 0xffffffffUL;
  6743. long x1 = x >> 32;
  6744. ulong y0 = (ulong)y & 0xffffffffUL;
  6745. long y1 = y >> 32;
  6746. ulong z0 = x0*y0;
  6747. long t = x1*y0 + (z0 >> 32);
  6748. long z1 = t & 0xffffffffL;
  6749. long z2 = t >> 32;
  6750. z1 = x0*y1 + z1;
  6751. return x1*y1 + z2 + (z1 >> 32);
  6752. }
// High 32 bits of the signed 64-bit product x * y, via the OCKL library.
__device__ static inline int __mulhi(int x, int y) {
return __ockl_mul_hi_i32(x, y);
}
  6756. __device__ static inline int __rhadd(int x, int y) {
  6757. int z = x + y + 1;
  6758. int sign = z & 0x8000000;
  6759. int value = z & 0x7FFFFFFF;
  6760. return ((value) >> 1 || sign);
  6761. }
  6762. __device__ static inline unsigned int __sad(int x, int y, unsigned int z) {
  6763. return x > y ? x - y + z : y - x + z;
  6764. }
  6765. __device__ static inline unsigned int __uhadd(unsigned int x, unsigned int y) {
  6766. return (x + y) >> 1;
  6767. }
// 24-bit unsigned multiply via the OCKL device library.
// NOTE(review): declared to return int although __ockl_mul24_u32 returns
// uint — this mirrors the forward declaration earlier in the file.
__device__ static inline int __umul24(unsigned int x, unsigned int y) {
return __ockl_mul24_u32(x, y);
}
  6771. __device__
  6772. static inline unsigned long long __umul64hi(unsigned long long int x, unsigned long long int y) {
  6773. ulong x0 = x & 0xffffffffUL;
  6774. ulong x1 = x >> 32;
  6775. ulong y0 = y & 0xffffffffUL;
  6776. ulong y1 = y >> 32;
  6777. ulong z0 = x0*y0;
  6778. ulong t = x1*y0 + (z0 >> 32);
  6779. ulong z1 = t & 0xffffffffUL;
  6780. ulong z2 = t >> 32;
  6781. z1 = x0*y1 + z1;
  6782. return x1*y1 + z2 + (z1 >> 32);
  6783. }
// High 32 bits of the unsigned 64-bit product x * y, via the OCKL library.
__device__ static inline unsigned int __umulhi(unsigned int x, unsigned int y) {
return __ockl_mul_hi_u32(x, y);
}
  6787. __device__ static inline unsigned int __urhadd(unsigned int x, unsigned int y) {
  6788. return (x + y + 1) >> 1;
  6789. }
// Unsigned sum of absolute difference, |x - y| + z, via the OCKL library.
__device__ static inline unsigned int __usad(unsigned int x, unsigned int y, unsigned int z) {
return __ockl_sadd_u32(x, y, z);
}
// Lane index of the calling thread within its wavefront, computed by
// counting set bits below this lane across the full (-1) mask with the
// mbcnt_lo/mbcnt_hi builtin pair.
__device__ static inline unsigned int __lane_id() {
return __builtin_amdgcn_mbcnt_hi(
-1, __builtin_amdgcn_mbcnt_lo(-1, 0));
}
// Thin wrappers over the AMD mbcnt builtins: count set bits of x below
// the calling lane (low / high 32 lanes), added to y.
__device__
static inline unsigned int __mbcnt_lo(unsigned int x, unsigned int y) {return __builtin_amdgcn_mbcnt_lo(x,y);};
__device__
static inline unsigned int __mbcnt_hi(unsigned int x, unsigned int y) {return __builtin_amdgcn_mbcnt_hi(x,y);};
  6801. /*
  6802. HIP specific device functions
  6803. */
  6804. #if !defined(__HIPCC_RTC__)
  6805. #include "amd_warp_functions.h"
  6806. #endif
  6807. #define MASK1 0x00ff00ff
  6808. #define MASK2 0xff00ff00
// Packed byte-wise add: the MASK1 lanes (bytes 0 and 2) and MASK2 lanes
// (bytes 1 and 3) are added separately and re-masked, so carries cannot
// spill between adjacent byte lanes.
// NOTE(review): only the .w member of char4 is read and written, which
// assumes .w aliases a packed 32-bit word — verify against the char4
// definition in hip_vector_types.
__device__ static inline char4 __hip_hc_add8pk(char4 in1, char4 in2) {
char4 out;
unsigned one1 = in1.w & MASK1;
unsigned one2 = in2.w & MASK1;
out.w = (one1 + one2) & MASK1;
one1 = in1.w & MASK2;
one2 = in2.w & MASK2;
out.w = out.w | ((one1 + one2) & MASK2);
return out;
}
// Packed 8-bit subtract: same masked-lane scheme as __hip_hc_add8pk so
// borrows cannot propagate between byte lanes.
// NOTE(review): as with the add variant, only `.w` is processed and the
// remaining components of `out` stay uninitialized — confirm intended.
__device__ static inline char4 __hip_hc_sub8pk(char4 in1, char4 in2) {
    char4 out;
    unsigned one1 = in1.w & MASK1;
    unsigned one2 = in2.w & MASK1;
    out.w = (one1 - one2) & MASK1;
    one1 = in1.w & MASK2;
    one2 = in2.w & MASK2;
    out.w = out.w | ((one1 - one2) & MASK2);
    return out;
}
// Packed 8-bit multiply using the same masked-lane scheme.
// NOTE(review): only `.w` is processed (see __hip_hc_add8pk); also, unlike
// addition, products of masked odd-byte operands can interact across the
// mask boundaries — confirm the intended packed-multiply semantics.
__device__ static inline char4 __hip_hc_mul8pk(char4 in1, char4 in2) {
    char4 out;
    unsigned one1 = in1.w & MASK1;
    unsigned one2 = in2.w & MASK1;
    out.w = (one1 * one2) & MASK1;
    one1 = in1.w & MASK2;
    one2 = in2.w & MASK2;
    out.w = out.w | ((one1 * one2) & MASK2);
    return out;
}
// double -> float conversions with explicit IEEE rounding modes:
// _rd = toward -inf, _rn = to nearest even (the plain cast's default
// mode), _ru = toward +inf, _rz = toward zero.
__device__ static inline float __double2float_rd(double x) {
    return __ocml_cvtrtn_f32_f64(x);
}
__device__ static inline float __double2float_rn(double x) { return x; }
__device__ static inline float __double2float_ru(double x) {
    return __ocml_cvtrtp_f32_f64(x);
}
__device__ static inline float __double2float_rz(double x) {
    return __ocml_cvtrtz_f32_f64(x);
}
// Reinterpret the high/low 32-bit halves of a double's bit pattern as int.
// memcpy is the strict-aliasing-safe way to type-pun.
// NOTE(review): tmp[1]/tmp[0] as high/low assumes little-endian memory
// layout — true for all supported AMDGPU targets, but worth noting.
__device__ static inline int __double2hiint(double x) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");
    int tmp[2];
    __builtin_memcpy(tmp, &x, sizeof(tmp));
    return tmp[1];
}
__device__ static inline int __double2loint(double x) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");
    int tmp[2];
    __builtin_memcpy(tmp, &x, sizeof(tmp));
    return tmp[0];
}
// double -> integer conversions with explicit rounding modes:
// _rd rounds toward -inf (floor), _rn rounds to nearest even (rint),
// _ru rounds toward +inf (ceil), _rz rounds toward zero (C++ cast
// truncation). The rounding is applied in double precision first, then
// the cast performs the (already exact) narrowing.
__device__ static inline int __double2int_rd(double x) { return (int)__ocml_floor_f64(x); }
__device__ static inline int __double2int_rn(double x) { return (int)__ocml_rint_f64(x); }
__device__ static inline int __double2int_ru(double x) { return (int)__ocml_ceil_f64(x); }
__device__ static inline int __double2int_rz(double x) { return (int)x; }
__device__ static inline long long int __double2ll_rd(double x) {
    return (long long)__ocml_floor_f64(x);
}
__device__ static inline long long int __double2ll_rn(double x) {
    return (long long)__ocml_rint_f64(x);
}
__device__ static inline long long int __double2ll_ru(double x) {
    return (long long)__ocml_ceil_f64(x);
}
__device__ static inline long long int __double2ll_rz(double x) { return (long long)x; }
__device__ static inline unsigned int __double2uint_rd(double x) {
    return (unsigned int)__ocml_floor_f64(x);
}
__device__ static inline unsigned int __double2uint_rn(double x) {
    return (unsigned int)__ocml_rint_f64(x);
}
__device__ static inline unsigned int __double2uint_ru(double x) {
    return (unsigned int)__ocml_ceil_f64(x);
}
__device__ static inline unsigned int __double2uint_rz(double x) { return (unsigned int)x; }
__device__ static inline unsigned long long int __double2ull_rd(double x) {
    return (unsigned long long int)__ocml_floor_f64(x);
}
__device__ static inline unsigned long long int __double2ull_rn(double x) {
    return (unsigned long long int)__ocml_rint_f64(x);
}
__device__ static inline unsigned long long int __double2ull_ru(double x) {
    return (unsigned long long int)__ocml_ceil_f64(x);
}
__device__ static inline unsigned long long int __double2ull_rz(double x) {
    return (unsigned long long int)x;
}
// Bit-cast a double to long long (no value conversion); memcpy avoids
// strict-aliasing violations.
__device__ static inline long long int __double_as_longlong(double x) {
    static_assert(sizeof(long long) == sizeof(double), "");
    long long tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));
    return tmp;
}
  6903. /*
  6904. __device__ unsigned short __float2half_rn(float x);
  6905. __device__ float __half2float(unsigned short);
The above device function signatures are not valid in HIP.
  6907. Use
  6908. __device__ __half __float2half_rn(float x);
  6909. __device__ float __half2float(__half);
  6910. from hip_fp16.h
  6911. CUDA implements half as unsigned short whereas, HIP doesn't.
  6912. */
// float -> integer conversions with explicit rounding modes:
// _rd = toward -inf (floor), _rn = to nearest even (rint),
// _ru = toward +inf (ceil), _rz = toward zero (trunc / C++ cast).
__device__ static inline int __float2int_rd(float x) { return (int)__ocml_floor_f32(x); }
__device__ static inline int __float2int_rn(float x) { return (int)__ocml_rint_f32(x); }
__device__ static inline int __float2int_ru(float x) { return (int)__ocml_ceil_f32(x); }
__device__ static inline int __float2int_rz(float x) { return (int)__ocml_trunc_f32(x); }
__device__ static inline long long int __float2ll_rd(float x) {
    return (long long int)__ocml_floor_f32(x);
}
__device__ static inline long long int __float2ll_rn(float x) {
    return (long long int)__ocml_rint_f32(x);
}
__device__ static inline long long int __float2ll_ru(float x) {
    return (long long int)__ocml_ceil_f32(x);
}
__device__ static inline long long int __float2ll_rz(float x) { return (long long int)x; }
__device__ static inline unsigned int __float2uint_rd(float x) {
    return (unsigned int)__ocml_floor_f32(x);
}
__device__ static inline unsigned int __float2uint_rn(float x) {
    return (unsigned int)__ocml_rint_f32(x);
}
__device__ static inline unsigned int __float2uint_ru(float x) {
    return (unsigned int)__ocml_ceil_f32(x);
}
__device__ static inline unsigned int __float2uint_rz(float x) { return (unsigned int)x; }
__device__ static inline unsigned long long int __float2ull_rd(float x) {
    return (unsigned long long int)__ocml_floor_f32(x);
}
__device__ static inline unsigned long long int __float2ull_rn(float x) {
    return (unsigned long long int)__ocml_rint_f32(x);
}
__device__ static inline unsigned long long int __float2ull_ru(float x) {
    return (unsigned long long int)__ocml_ceil_f32(x);
}
__device__ static inline unsigned long long int __float2ull_rz(float x) {
    return (unsigned long long int)x;
}
// Bit-cast a float to int / unsigned int (no value conversion); memcpy is
// the strict-aliasing-safe type-punning idiom.
__device__ static inline int __float_as_int(float x) {
    static_assert(sizeof(int) == sizeof(float), "");
    int tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));
    return tmp;
}
__device__ static inline unsigned int __float_as_uint(float x) {
    static_assert(sizeof(unsigned int) == sizeof(float), "");
    unsigned int tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));
    return tmp;
}
// Assemble a double from its high and low 32-bit halves. `lo` is routed
// through uint32_t so sign extension of a negative low half cannot leak
// into the high word.
__device__ static inline double __hiloint2double(int hi, int lo) {
    static_assert(sizeof(double) == sizeof(uint64_t), "");
    uint64_t tmp0 = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
    double tmp1;
    __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
// int -> double is exact (53-bit mantissa covers all 32-bit ints), so a
// plain cast suffices. int -> float can be inexact, hence the explicit
// rounding-mode variants (_rd toward -inf, _rn nearest even via cast,
// _ru toward +inf, _rz toward zero).
__device__ static inline double __int2double_rn(int x) { return (double)x; }
__device__ static inline float __int2float_rd(int x) {
    return __ocml_cvtrtn_f32_s32(x);
}
__device__ static inline float __int2float_rn(int x) { return (float)x; }
__device__ static inline float __int2float_ru(int x) {
    return __ocml_cvtrtp_f32_s32(x);
}
__device__ static inline float __int2float_rz(int x) {
    return __ocml_cvtrtz_f32_s32(x);
}
// Bit-cast an int to float (no value conversion), via aliasing-safe memcpy.
__device__ static inline float __int_as_float(int x) {
    static_assert(sizeof(float) == sizeof(int), "");
    float tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));
    return tmp;
}
// long long -> double/float conversions; 64-bit values need not fit the
// target mantissa, so each rounding mode gets an explicit OCML conversion
// (_rd toward -inf, _rn nearest even via cast, _ru toward +inf,
// _rz toward zero).
__device__ static inline double __ll2double_rd(long long int x) {
    return __ocml_cvtrtn_f64_s64(x);
}
__device__ static inline double __ll2double_rn(long long int x) { return (double)x; }
__device__ static inline double __ll2double_ru(long long int x) {
    return __ocml_cvtrtp_f64_s64(x);
}
__device__ static inline double __ll2double_rz(long long int x) {
    return __ocml_cvtrtz_f64_s64(x);
}
__device__ static inline float __ll2float_rd(long long int x) {
    return __ocml_cvtrtn_f32_s64(x);
}
__device__ static inline float __ll2float_rn(long long int x) { return (float)x; }
__device__ static inline float __ll2float_ru(long long int x) {
    return __ocml_cvtrtp_f32_s64(x);
}
__device__ static inline float __ll2float_rz(long long int x) {
    return __ocml_cvtrtz_f32_s64(x);
}
// Bit-cast a long long to double (no value conversion), via memcpy.
__device__ static inline double __longlong_as_double(long long int x) {
    static_assert(sizeof(double) == sizeof(long long), "");
    double tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));
    return tmp;
}
// unsigned int -> double is exact, so a plain cast suffices; the float
// variants carry explicit rounding modes (_rd toward -inf, _rn nearest
// even via cast, _ru toward +inf, _rz toward zero).
__device__ static inline double __uint2double_rn(unsigned int x) { return (double)x; }
__device__ static inline float __uint2float_rd(unsigned int x) {
    return __ocml_cvtrtn_f32_u32(x);
}
__device__ static inline float __uint2float_rn(unsigned int x) { return (float)x; }
__device__ static inline float __uint2float_ru(unsigned int x) {
    return __ocml_cvtrtp_f32_u32(x);
}
__device__ static inline float __uint2float_rz(unsigned int x) {
    return __ocml_cvtrtz_f32_u32(x);
}
// Bit-cast an unsigned int to float (no value conversion), via memcpy.
__device__ static inline float __uint_as_float(unsigned int x) {
    static_assert(sizeof(float) == sizeof(unsigned int), "");
    float tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));
    return tmp;
}
// unsigned long long -> double/float conversions with explicit rounding
// modes (_rd toward -inf, _rn nearest even via cast, _ru toward +inf,
// _rz toward zero); 64-bit magnitudes can exceed the target mantissa.
__device__ static inline double __ull2double_rd(unsigned long long int x) {
    return __ocml_cvtrtn_f64_u64(x);
}
__device__ static inline double __ull2double_rn(unsigned long long int x) { return (double)x; }
__device__ static inline double __ull2double_ru(unsigned long long int x) {
    return __ocml_cvtrtp_f64_u64(x);
}
__device__ static inline double __ull2double_rz(unsigned long long int x) {
    return __ocml_cvtrtz_f64_u64(x);
}
__device__ static inline float __ull2float_rd(unsigned long long int x) {
    return __ocml_cvtrtn_f32_u64(x);
}
__device__ static inline float __ull2float_rn(unsigned long long int x) { return (float)x; }
__device__ static inline float __ull2float_ru(unsigned long long int x) {
    return __ocml_cvtrtp_f32_u64(x);
}
__device__ static inline float __ull2float_rz(unsigned long long int x) {
    return __ocml_cvtrtz_f32_u64(x);
}
#if defined(__clang__) && defined(__HIP__)
// Clock functions — declarations are always visible; the definitions below
// exist only in device compilation (__HIP_DEVICE_COMPILE__).
__device__ long long int __clock64();
__device__ long long int __clock();
__device__ long long int clock64();
__device__ long long int clock();
__device__ long long int wall_clock64();
// hip.amdgcn.bc - named sync
__device__ void __named_sync();
#ifdef __HIP_DEVICE_COMPILE__
// Clock function to return GPU core cycle count.
// GPU can change its core clock frequency at runtime. The maximum frequency can be queried
// through hipDeviceAttributeClockRate attribute.
__device__
inline __attribute((always_inline))
long long int __clock64() {
#if __has_builtin(__builtin_amdgcn_s_memtime)
// Exists on gfx8, gfx9, gfx10.1, gfx10.2, gfx10.3
return (long long int) __builtin_amdgcn_s_memtime();
#else
// Subject to change when better solution available
return (long long int) __builtin_readcyclecounter();
#endif
}
// Alias of __clock64 (both report the core cycle counter).
__device__
inline __attribute((always_inline))
long long int __clock() { return __clock64(); }
// Clock function to return wall clock count at a constant frequency that can be queried
// through hipDeviceAttributeWallClockRate attribute.
__device__
inline __attribute__((always_inline))
long long int wall_clock64() {
return (long long int) __ockl_steadyctr_u64();
}
// Public CUDA-compatible names forwarding to the internal implementations.
__device__
inline __attribute__((always_inline))
long long int clock64() { return __clock64(); }
__device__
inline __attribute__((always_inline))
long long int clock() { return __clock(); }
// hip.amdgcn.bc - named sync: a plain workgroup barrier.
__device__
inline
void __named_sync() { __builtin_amdgcn_s_barrier(); }
#endif // __HIP_DEVICE_COMPILE__
// Warp vote functions: __all / __any reduce a per-lane predicate across the
// wavefront; __ballot packs each lane's predicate into a bitmask.
__device__
inline
int __all(int predicate) {
return __ockl_wfall_i32(predicate);
}
__device__
inline
int __any(int predicate) {
return __ockl_wfany_i32(predicate);
}
// XXX from llvm/include/llvm/IR/InstrTypes.h
// (integer-compare predicate code for "not equal", consumed by uicmp)
#define ICMP_NE 33
// Bit i of the result is set iff lane i's predicate is nonzero
// (uicmp compares each lane's predicate against 0 with ICMP_NE).
__device__
inline
unsigned long long int __ballot(int predicate) {
return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
}
__device__
inline
unsigned long long int __ballot64(int predicate) {
return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
}
// hip.amdgcn.bc - lanemask
// Returns the set of currently-active lanes with an id strictly greater
// than the caller's lane (intersection of the ballot with bits above lane).
__device__
inline
uint64_t __lanemask_gt()
{
uint32_t lane = __ockl_lane_u32();
// Highest lane: shifting by lane+1 == 64 would be UB, so answer directly.
if (lane == 63)
return 0;
uint64_t ballot = __ballot64(1);
uint64_t mask = (~((uint64_t)0)) << (lane + 1);  // bits (lane, 63]
return mask & ballot;
}
  7128. __device__
  7129. inline
  7130. uint64_t __lanemask_lt()
  7131. {
  7132. uint32_t lane = __ockl_lane_u32();
  7133. int64_t ballot = __ballot64(1);
  7134. uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1;
  7135. return mask & ballot;
  7136. }
  7137. __device__
  7138. inline
  7139. uint64_t __lanemask_eq()
  7140. {
  7141. uint32_t lane = __ockl_lane_u32();
  7142. int64_t mask = ((uint64_t)1 << lane);
  7143. return mask;
  7144. }
// Address-space cast helper: LDS (local) pointers are implicitly
// convertible to generic here, so this is an identity function.
__device__ inline void* __local_to_generic(void* p) { return p; }
#ifdef __HIP_DEVICE_COMPILE__
// Returns the base of dynamically-allocated shared memory: the dynamic
// region starts immediately after the statically-sized group segment.
__device__
inline
void* __get_dynamicgroupbaseptr()
{
// Get group segment base pointer.
return (char*)__local_to_generic((void*)__to_local(__builtin_amdgcn_groupstaticsize()));
}
#else
// Host-side pass: declaration only; resolved at device compilation.
__device__
void* __get_dynamicgroupbaseptr();
#endif // __HIP_DEVICE_COMPILE__
// Legacy-named alias kept for source compatibility.
__device__
inline
void *__amdgcn_get_dynamicgroupbaseptr() {
return __get_dynamicgroupbaseptr();
}
// Memory Fence Functions
// Sequentially-consistent fences at increasing visibility scopes:
//   __threadfence        — agent scope (all threads on this device)
//   __threadfence_block  — workgroup scope (threads in this block)
//   __threadfence_system — empty scope string = system scope (device + host)
__device__
inline
static void __threadfence()
{
__builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent");
}
__device__
inline
static void __threadfence_block()
{
__builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
}
__device__
inline
static void __threadfence_system()
{
__builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "");
}
// abort
// Device-side abort: traps the wavefront. Declared weak so a user-provided
// definition can override it without a duplicate-symbol error.
__device__
inline
__attribute__((weak))
void abort() {
return __builtin_trap();
}
// The noinline attribute helps encapsulate the printf expansion,
// which otherwise has a performance impact just by increasing the
// size of the calling function. Additionally, the weak attribute
// allows the function to exist as a global although its definition is
// included in every compilation unit.
#if defined(_WIN32) || defined(_WIN64)
// Windows CRT assert hook: message formatting is not implemented (no
// wchar_t printf transport), so this only traps.
extern "C" __device__ __attribute__((noinline)) __attribute__((weak))
void _wassert(const wchar_t *_msg, const wchar_t *_file, unsigned _line) {
// FIXME: Need `wchar_t` support to generate assertion message.
__builtin_trap();
}
#else /* defined(_WIN32) || defined(_WIN64) */
// glibc-style assert hook: streams the formatted assertion message to the
// host via the OCKL hostcall printf transport, then traps the wavefront.
extern "C" __device__ __attribute__((noinline)) __attribute__((weak))
void __assert_fail(const char *assertion,
const char *file,
unsigned int line,
const char *function)
{
const char fmt[] = "%s:%u: %s: Device-side assertion `%s' failed.\n";
// strlen is not available as a built-in yet, so we create our own
// loop in a macro. With a string literal argument, the compiler
// usually manages to replace the loop with a constant.
//
// The macro does not check for null pointer, since all the string
// arguments are defined to be constant literals when called from
// the assert() macro.
//
// NOTE: The loop below includes the null terminator in the length
// as required by append_string_n().
#define __hip_get_string_length(LEN, STR) \
do { \
const char *tmp = STR; \
while (*tmp++); \
LEN = tmp - STR; \
} while (0)
auto msg = __ockl_fprintf_stderr_begin();
int len = 0;
__hip_get_string_length(len, fmt);
msg = __ockl_fprintf_append_string_n(msg, fmt, len, 0);
__hip_get_string_length(len, file);
msg = __ockl_fprintf_append_string_n(msg, file, len, 0);
msg = __ockl_fprintf_append_args(msg, 1, line, 0, 0, 0, 0, 0, 0, 0);
__hip_get_string_length(len, function);
msg = __ockl_fprintf_append_string_n(msg, function, len, 0);
__hip_get_string_length(len, assertion);
__ockl_fprintf_append_string_n(msg, assertion, len, /* is_last = */ 1);
#undef __hip_get_string_length
__builtin_trap();
}
// CUDA-compatible entry point; arguments are intentionally dropped.
extern "C" __device__ __attribute__((noinline)) __attribute__((weak))
void __assertfail()
{
// ignore all the args for now.
__builtin_trap();
}
#endif /* defined(_WIN32) || defined(_WIN64) */
// Workgroup barrier. When fence flags are requested, the barrier is
// bracketed by release/acquire workgroup fences so memory operations on
// either side are ordered across the barrier; otherwise it is a plain
// execution barrier.
__device__ inline static void __work_group_barrier(__cl_mem_fence_flags flags) {
if (flags) {
__builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup");
__builtin_amdgcn_s_barrier();
__builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
} else {
__builtin_amdgcn_s_barrier();
}
}
// OpenCL-style wrapper taking the fence flags as a plain int.
__device__
inline
static void __barrier(int n)
{
__work_group_barrier((__cl_mem_fence_flags)n);
}
// Block-wide barrier with local-memory fencing (CUDA __syncthreads).
// Marked convergent: all threads of the block must reach it.
__device__
inline
__attribute__((convergent))
void __syncthreads()
{
__barrier(__CLK_LOCAL_MEM_FENCE);
}
// Barrier variants that additionally reduce a per-thread predicate across
// the workgroup; `!!predicate` normalizes any nonzero value to 1 before
// the add/and/or reduction.
__device__
inline
__attribute__((convergent))
int __syncthreads_count(int predicate)
{
return __ockl_wgred_add_i32(!!predicate);
}
__device__
inline
__attribute__((convergent))
int __syncthreads_and(int predicate)
{
return __ockl_wgred_and_i32(!!predicate);
}
__device__
inline
__attribute__((convergent))
int __syncthreads_or(int predicate)
{
return __ockl_wgred_or_i32(!!predicate);
}
  7288. // hip.amdgcn.bc - device routine
  7289. /*
  7290. HW_ID Register bit structure for RDNA2 & RDNA3
  7291. WAVE_ID 4:0 Wave id within the SIMD.
  7292. SIMD_ID 9:8 SIMD_ID within the WGP: [0] = row, [1] = column.
  7293. WGP_ID 13:10 Physical WGP ID.
  7294. SA_ID 16 Shader Array ID
  7295. SE_ID 20:18 Shader Engine the wave is assigned to for gfx11
  7296. SE_ID 19:18 Shader Engine the wave is assigned to for gfx10
  7297. DP_RATE 31:29 Number of double-precision float units per SIMD
  7298. HW_ID Register bit structure for GCN and CDNA
  7299. WAVE_ID 3:0 Wave buffer slot number. 0-9.
  7300. SIMD_ID 5:4 SIMD which the wave is assigned to within the CU.
  7301. PIPE_ID 7:6 Pipeline from which the wave was dispatched.
  7302. CU_ID 11:8 Compute Unit the wave is assigned to.
  7303. SH_ID 12 Shader Array (within an SE) the wave is assigned to.
  7304. SE_ID 15:13 Shader Engine the wave is assigned to for gfx908, gfx90a, gfx940-942
  7305. 14:13 Shader Engine the wave is assigned to for Vega.
  7306. TG_ID 19:16 Thread-group ID
  7307. VM_ID 23:20 Virtual Memory ID
  7308. QUEUE_ID 26:24 Queue from which this wave was dispatched.
  7309. STATE_ID 29:27 State ID (graphics only, not compute).
  7310. ME_ID 31:30 Micro-engine ID.
  7311. XCC_ID Register bit structure for gfx940
  7312. XCC_ID 3:0 XCC the wave is assigned to.
  7313. */
  7314. #if (defined (__GFX10__) || defined (__GFX11__))
  7315. #define HW_ID 23
  7316. #else
  7317. #define HW_ID 4
  7318. #endif
  7319. #if (defined(__GFX10__) || defined(__GFX11__))
  7320. #define HW_ID_WGP_ID_SIZE 4
  7321. #define HW_ID_WGP_ID_OFFSET 10
  7322. #else
  7323. #define HW_ID_CU_ID_SIZE 4
  7324. #define HW_ID_CU_ID_OFFSET 8
  7325. #endif
  7326. #if (defined(__gfx908__) || defined(__gfx90a__) || \
  7327. defined(__GFX11__))
  7328. #define HW_ID_SE_ID_SIZE 3
  7329. #else //4 SEs/XCC for gfx940-942
  7330. #define HW_ID_SE_ID_SIZE 2
  7331. #endif
  7332. #if (defined(__GFX10__) || defined(__GFX11__))
  7333. #define HW_ID_SE_ID_OFFSET 18
  7334. #define HW_ID_SA_ID_OFFSET 16
  7335. #define HW_ID_SA_ID_SIZE 1
  7336. #else
  7337. #define HW_ID_SE_ID_OFFSET 13
  7338. #endif
  7339. #if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
  7340. #define XCC_ID 20
  7341. #define XCC_ID_XCC_ID_SIZE 4
  7342. #define XCC_ID_XCC_ID_OFFSET 0
  7343. #endif
  7344. #if (!defined(__HIP_NO_IMAGE_SUPPORT) && \
  7345. (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)))
  7346. #define __HIP_NO_IMAGE_SUPPORT 1
  7347. #endif
  7348. /*
  7349. Encoding of parameter bitmask
  7350. HW_ID 5:0 HW_ID
  7351. OFFSET 10:6 Range: 0..31
  7352. SIZE 15:11 Range: 1..32
  7353. */
  7354. #define GETREG_IMMED(SZ,OFF,REG) (((SZ) << 11) | ((OFF) << 6) | (REG))
  7355. /*
  7356. __smid returns the wave's assigned Compute Unit and Shader Engine.
  7357. The Compute Unit, CU_ID returned in bits 3:0, and Shader Engine, SE_ID in bits 5:4.
  7358. Note: the results vary over time.
  7359. SZ minus 1 since SIZE is 1-based.
  7360. */
// Reads the wave's physical placement from the HW_ID (and, on gfx940-942,
// XCC_ID) registers and packs it into one integer. Field layout differs by
// architecture (see the HW_ID comment block above):
//   gfx10/gfx11:   [SE | SA | WGP]
//   gfx940-942:    [XCC | SE | CU]
//   other GCN/CDNA:[SE | CU]
// NOTE: results vary over time as waves migrate between schedulers.
__device__
inline
unsigned __smid(void)
{
// GETREG_IMMED takes SIZE-1 because the SIZE field is 1-based.
unsigned se_id = __builtin_amdgcn_s_getreg(
GETREG_IMMED(HW_ID_SE_ID_SIZE-1, HW_ID_SE_ID_OFFSET, HW_ID));
#if (defined(__GFX10__) || defined(__GFX11__))
unsigned wgp_id = __builtin_amdgcn_s_getreg(
GETREG_IMMED(HW_ID_WGP_ID_SIZE - 1, HW_ID_WGP_ID_OFFSET, HW_ID));
unsigned sa_id = __builtin_amdgcn_s_getreg(
GETREG_IMMED(HW_ID_SA_ID_SIZE - 1, HW_ID_SA_ID_OFFSET, HW_ID));
#else
#if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
unsigned xcc_id = __builtin_amdgcn_s_getreg(
GETREG_IMMED(XCC_ID_XCC_ID_SIZE - 1, XCC_ID_XCC_ID_OFFSET, XCC_ID));
#endif
unsigned cu_id = __builtin_amdgcn_s_getreg(
GETREG_IMMED(HW_ID_CU_ID_SIZE - 1, HW_ID_CU_ID_OFFSET, HW_ID));
#endif
#if (defined(__GFX10__) || defined(__GFX11__))
unsigned temp = se_id;
temp = (temp << HW_ID_SA_ID_SIZE) | sa_id;
temp = (temp << HW_ID_WGP_ID_SIZE) | wgp_id;
return temp;
//TODO : CU Mode impl
#elif (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
unsigned temp = xcc_id;
temp = (temp << HW_ID_SE_ID_SIZE) | se_id;
temp = (temp << HW_ID_CU_ID_SIZE) | cu_id;
return temp;
#else
return (se_id << HW_ID_CU_ID_SIZE) + cu_id;
#endif
}
  7395. /**
  7396. * Map HIP_DYNAMIC_SHARED to "extern __shared__" for compatibility with old HIP applications
  7397. * To be removed in a future release.
  7398. */
  7399. #define HIP_DYNAMIC_SHARED(type, var) extern __shared__ type var[];
  7400. #define HIP_DYNAMIC_SHARED_ATTRIBUTE
  7401. #endif //defined(__clang__) && defined(__HIP__)
// loop unrolling
// Byte-wise memcpy, manually unrolled 4 bytes per iteration; the trailing
// switch drains the 0-3 remaining bytes. Regions must not overlap.
// Returns dst (standard memcpy contract).
static inline __device__ void* __hip_hc_memcpy(void* dst, const void* src, size_t size) {
auto dstPtr = static_cast<unsigned char*>(dst);
auto srcPtr = static_cast<const unsigned char*>(src);
while (size >= 4u) {
dstPtr[0] = srcPtr[0];
dstPtr[1] = srcPtr[1];
dstPtr[2] = srcPtr[2];
dstPtr[3] = srcPtr[3];
size -= 4u;
srcPtr += 4u;
dstPtr += 4u;
}
// Intentional fallthrough: case N copies byte N-1 then falls into N-1.
switch (size) {
case 3:
dstPtr[2] = srcPtr[2];
/* fallthrough */
case 2:
dstPtr[1] = srcPtr[1];
/* fallthrough */
case 1:
dstPtr[0] = srcPtr[0];
}
return dst;
}
// Byte-wise memset, manually unrolled 4 bytes per iteration; the trailing
// switch drains the 0-3 remaining bytes. Returns dst.
static inline __device__ void* __hip_hc_memset(void* dst, unsigned char val, size_t size) {
auto dstPtr = static_cast<unsigned char*>(dst);
while (size >= 4u) {
dstPtr[0] = val;
dstPtr[1] = val;
dstPtr[2] = val;
dstPtr[3] = val;
size -= 4u;
dstPtr += 4u;
}
// Intentional fallthrough: case N writes byte N-1 then falls into N-1.
switch (size) {
case 3:
dstPtr[2] = val;
/* fallthrough */
case 2:
dstPtr[1] = val;
/* fallthrough */
case 1:
dstPtr[0] = val;
}
return dst;
}
#ifndef __OPENMP_AMDGCN__
// Device-side libc replacements forwarding to the unrolled helpers above.
// memset narrows val to unsigned char per the C memset contract.
static inline __device__ void* memcpy(void* dst, const void* src, size_t size) {
return __hip_hc_memcpy(dst, src, size);
}
static inline __device__ void* memset(void* ptr, int val, size_t size) {
unsigned char val8 = static_cast<unsigned char>(val);
return __hip_hc_memset(ptr, val8, size);
}
#endif // !__OPENMP_AMDGCN__
  7454. #endif
  7455. /*
  7456. Copyright (c) 2022 - 2023 Advanced Micro Devices, Inc. All rights reserved.
  7457. Permission is hereby granted, free of charge, to any person obtaining a copy
  7458. of this software and associated documentation files (the "Software"), to deal
  7459. in the Software without restriction, including without limitation the rights
  7460. to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  7461. copies of the Software, and to permit persons to whom the Software is
  7462. furnished to do so, subject to the following conditions:
  7463. The above copyright notice and this permission notice shall be included in
  7464. all copies or substantial portions of the Software.
  7465. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  7466. IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  7467. FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  7468. AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  7469. LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  7470. OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  7471. THE SOFTWARE.
  7472. */
  7473. #ifndef HIP_INCLUDE_HIP_AMD_DETAIL_WARP_FUNCTIONS_H
  7474. #define HIP_INCLUDE_HIP_AMD_DETAIL_WARP_FUNCTIONS_H
// LDS cross-lane data movement wrappers. bpermute is a "pull" (each lane
// reads from the lane selected by index); permute is a "push" (each lane
// writes to the lane selected by index). The union performs aliasing-safe
// reinterpretation between the builtin's int and the unsigned/float APIs.
__device__ static inline unsigned __hip_ds_bpermute(int index, unsigned src) {
union { int i; unsigned u; float f; } tmp; tmp.u = src;
tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
return tmp.u;
}
__device__ static inline float __hip_ds_bpermutef(int index, float src) {
union { int i; unsigned u; float f; } tmp; tmp.f = src;
tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
return tmp.f;
}
__device__ static inline unsigned __hip_ds_permute(int index, unsigned src) {
union { int i; unsigned u; float f; } tmp; tmp.u = src;
tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
return tmp.u;
}
__device__ static inline float __hip_ds_permutef(int index, float src) {
union { int i; unsigned u; float f; } tmp; tmp.f = src;
tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
return tmp.f;
}
// ds_swizzle requires the pattern as an immediate, so the legacy macro API
// routes through a template whose non-type parameter makes the pattern a
// compile-time constant.
#define __hip_ds_swizzle(src, pattern) __hip_ds_swizzle_N<(pattern)>((src))
#define __hip_ds_swizzlef(src, pattern) __hip_ds_swizzlef_N<(pattern)>((src))
template <int pattern>
__device__ static inline unsigned __hip_ds_swizzle_N(unsigned int src) {
union { int i; unsigned u; float f; } tmp; tmp.u = src;
tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
return tmp.u;
}
template <int pattern>
__device__ static inline float __hip_ds_swizzlef_N(float src) {
union { int i; unsigned u; float f; } tmp; tmp.f = src;
tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
return tmp.f;
}
// mov_dpp requires all control operands as immediates; as with swizzle,
// the macro forwards them into template non-type parameters.
#define __hip_move_dpp(src, dpp_ctrl, row_mask, bank_mask, bound_ctrl) \
__hip_move_dpp_N<(dpp_ctrl), (row_mask), (bank_mask), (bound_ctrl)>((src))
template <int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl>
__device__ static inline int __hip_move_dpp_N(int src) {
return __builtin_amdgcn_mov_dpp(src, dpp_ctrl, row_mask, bank_mask,
bound_ctrl);
}
// Wavefront width for the current compilation target (32 or 64).
static constexpr int warpSize = __AMDGCN_WAVEFRONT_SIZE;
// CUDA-style shuffle: every lane reads `var` from lane `src_lane` within
// its width-sized sub-group. Requires width to be a power of two.
__device__
inline
int __shfl(int var, int src_lane, int width = warpSize) {
int self = __lane_id();
// Wrap src_lane into the sub-group, offset by the sub-group base lane;
// bpermute addresses lanes in bytes, hence the <<2.
int index = (src_lane & (width - 1)) + (self & ~(width-1));
return __builtin_amdgcn_ds_bpermute(index<<2, var);
}
// 32-bit unsigned/float overloads: reinterpret through a union and defer
// to the int shuffle.
__device__
inline
unsigned int __shfl(unsigned int var, int src_lane, int width = warpSize) {
union { int i; unsigned u; float f; } tmp; tmp.u = var;
tmp.i = __shfl(tmp.i, src_lane, width);
return tmp.u;
}
__device__
inline
float __shfl(float var, int src_lane, int width = warpSize) {
union { int i; unsigned u; float f; } tmp; tmp.f = var;
tmp.i = __shfl(tmp.i, src_lane, width);
return tmp.f;
}
// 64-bit double overload: split into two 32-bit halves, shuffle each, and
// reassemble (tmp[1] is the high word on the little-endian targets this
// header supports).
__device__
inline
double __shfl(double var, int src_lane, int width = warpSize) {
static_assert(sizeof(double) == 2 * sizeof(int), "");
static_assert(sizeof(double) == sizeof(uint64_t), "");
int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
tmp[0] = __shfl(tmp[0], src_lane, width);
tmp[1] = __shfl(tmp[1], src_lane, width);
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
}
// long / unsigned long overloads. `long` is 64-bit on the LP64 targets
// (two-halves shuffle) but 32-bit under MSVC (LLP64), where a single int
// shuffle suffices — hence the _MSC_VER split.
__device__
inline
long __shfl(long var, int src_lane, int width = warpSize)
{
#ifndef _MSC_VER
static_assert(sizeof(long) == 2 * sizeof(int), "");
static_assert(sizeof(long) == sizeof(uint64_t), "");
int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
tmp[0] = __shfl(tmp[0], src_lane, width);
tmp[1] = __shfl(tmp[1], src_lane, width);
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
#else
static_assert(sizeof(long) == sizeof(int), "");
return static_cast<long>(__shfl(static_cast<int>(var), src_lane, width));
#endif
}
__device__
inline
unsigned long __shfl(unsigned long var, int src_lane, int width = warpSize) {
#ifndef _MSC_VER
static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
tmp[0] = __shfl(tmp[0], src_lane, width);
tmp[1] = __shfl(tmp[1], src_lane, width);
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
#else
static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
return static_cast<unsigned long>(__shfl(static_cast<unsigned int>(var), src_lane, width));
#endif
}
  7585. __device__
  7586. inline
  7587. long long __shfl(long long var, int src_lane, int width = warpSize)
  7588. {
  7589. static_assert(sizeof(long long) == 2 * sizeof(int), "");
  7590. static_assert(sizeof(long long) == sizeof(uint64_t), "");
  7591. int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
  7592. tmp[0] = __shfl(tmp[0], src_lane, width);
  7593. tmp[1] = __shfl(tmp[1], src_lane, width);
  7594. uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
  7595. long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
  7596. return tmp1;
  7597. }
  7598. __device__
  7599. inline
  7600. unsigned long long __shfl(unsigned long long var, int src_lane, int width = warpSize) {
  7601. static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
  7602. static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
  7603. unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
  7604. tmp[0] = __shfl(tmp[0], src_lane, width);
  7605. tmp[1] = __shfl(tmp[1], src_lane, width);
  7606. uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
  7607. unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
  7608. return tmp1;
  7609. }
__device__
inline
int __shfl_up(int var, unsigned int lane_delta, int width = warpSize) {
// Read the value held by the lane `lane_delta` positions below the caller
// inside its width-wide subsection; if that source lane would fall before
// the subsection start, the caller keeps its own value.
int self = __lane_id();
int index = self - lane_delta;
// `self & ~(width-1)` is the first lane of the caller's subsection
// (width is assumed to be a power of two); clamp out-of-range sources
// back to the caller's own lane.
index = (index < (self & ~(width-1)))?self:index;
// ds_bpermute addresses lanes by byte offset, hence the <<2 on the
// dword lane index.
return __builtin_amdgcn_ds_bpermute(index<<2, var);
}
  7618. __device__
  7619. inline
  7620. unsigned int __shfl_up(unsigned int var, unsigned int lane_delta, int width = warpSize) {
  7621. union { int i; unsigned u; float f; } tmp; tmp.u = var;
  7622. tmp.i = __shfl_up(tmp.i, lane_delta, width);
  7623. return tmp.u;
  7624. }
  7625. __device__
  7626. inline
  7627. float __shfl_up(float var, unsigned int lane_delta, int width = warpSize) {
  7628. union { int i; unsigned u; float f; } tmp; tmp.f = var;
  7629. tmp.i = __shfl_up(tmp.i, lane_delta, width);
  7630. return tmp.f;
  7631. }
  7632. __device__
  7633. inline
  7634. double __shfl_up(double var, unsigned int lane_delta, int width = warpSize) {
  7635. static_assert(sizeof(double) == 2 * sizeof(int), "");
  7636. static_assert(sizeof(double) == sizeof(uint64_t), "");
  7637. int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
  7638. tmp[0] = __shfl_up(tmp[0], lane_delta, width);
  7639. tmp[1] = __shfl_up(tmp[1], lane_delta, width);
  7640. uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
  7641. double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
  7642. return tmp1;
  7643. }
__device__
inline
long __shfl_up(long var, unsigned int lane_delta, int width = warpSize)
{
// `long` is 64-bit on LP64 targets but 32-bit on Windows (LLP64).
#ifndef _MSC_VER
static_assert(sizeof(long) == 2 * sizeof(int), "");
static_assert(sizeof(long) == sizeof(uint64_t), "");
// Shuffle the two 32-bit halves independently, then reassemble the bits.
int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
tmp[0] = __shfl_up(tmp[0], lane_delta, width);
tmp[1] = __shfl_up(tmp[1], lane_delta, width);
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
#else
// 32-bit long: forward directly to the int overload.
static_assert(sizeof(long) == sizeof(int), "");
return static_cast<long>(__shfl_up(static_cast<int>(var), lane_delta, width));
#endif
}
__device__
inline
unsigned long __shfl_up(unsigned long var, unsigned int lane_delta, int width = warpSize)
{
// `unsigned long` is 64-bit on LP64 targets but 32-bit on Windows (LLP64).
#ifndef _MSC_VER
static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
// Shuffle the two 32-bit halves independently, then reassemble the bits.
unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
tmp[0] = __shfl_up(tmp[0], lane_delta, width);
tmp[1] = __shfl_up(tmp[1], lane_delta, width);
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
#else
// 32-bit unsigned long: forward directly to the unsigned int overload.
static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
return static_cast<unsigned long>(__shfl_up(static_cast<unsigned int>(var), lane_delta, width));
#endif
}
  7680. __device__
  7681. inline
  7682. long long __shfl_up(long long var, unsigned int lane_delta, int width = warpSize)
  7683. {
  7684. static_assert(sizeof(long long) == 2 * sizeof(int), "");
  7685. static_assert(sizeof(long long) == sizeof(uint64_t), "");
  7686. int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
  7687. tmp[0] = __shfl_up(tmp[0], lane_delta, width);
  7688. tmp[1] = __shfl_up(tmp[1], lane_delta, width);
  7689. uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
  7690. long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
  7691. return tmp1;
  7692. }
  7693. __device__
  7694. inline
  7695. unsigned long long __shfl_up(unsigned long long var, unsigned int lane_delta, int width = warpSize)
  7696. {
  7697. static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
  7698. static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
  7699. unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
  7700. tmp[0] = __shfl_up(tmp[0], lane_delta, width);
  7701. tmp[1] = __shfl_up(tmp[1], lane_delta, width);
  7702. uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
  7703. unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
  7704. return tmp1;
  7705. }
__device__
inline
int __shfl_down(int var, unsigned int lane_delta, int width = warpSize) {
// Read the value held by the lane `lane_delta` positions above the caller
// inside its width-wide subsection; lanes whose source would cross the end
// of the subsection keep their own value.
int self = __lane_id();
int index = self + lane_delta;
// `self & (width-1)` is the caller's position within its subsection
// (width is assumed to be a power of two); out-of-range sources are
// clamped back to the caller's own lane.
index = (int)((self&(width-1))+lane_delta) >= width?self:index;
// ds_bpermute addresses lanes by byte offset, hence the <<2.
return __builtin_amdgcn_ds_bpermute(index<<2, var);
}
  7714. __device__
  7715. inline
  7716. unsigned int __shfl_down(unsigned int var, unsigned int lane_delta, int width = warpSize) {
  7717. union { int i; unsigned u; float f; } tmp; tmp.u = var;
  7718. tmp.i = __shfl_down(tmp.i, lane_delta, width);
  7719. return tmp.u;
  7720. }
  7721. __device__
  7722. inline
  7723. float __shfl_down(float var, unsigned int lane_delta, int width = warpSize) {
  7724. union { int i; unsigned u; float f; } tmp; tmp.f = var;
  7725. tmp.i = __shfl_down(tmp.i, lane_delta, width);
  7726. return tmp.f;
  7727. }
  7728. __device__
  7729. inline
  7730. double __shfl_down(double var, unsigned int lane_delta, int width = warpSize) {
  7731. static_assert(sizeof(double) == 2 * sizeof(int), "");
  7732. static_assert(sizeof(double) == sizeof(uint64_t), "");
  7733. int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
  7734. tmp[0] = __shfl_down(tmp[0], lane_delta, width);
  7735. tmp[1] = __shfl_down(tmp[1], lane_delta, width);
  7736. uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
  7737. double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
  7738. return tmp1;
  7739. }
__device__
inline
long __shfl_down(long var, unsigned int lane_delta, int width = warpSize)
{
// `long` is 64-bit on LP64 targets but 32-bit on Windows (LLP64).
#ifndef _MSC_VER
static_assert(sizeof(long) == 2 * sizeof(int), "");
static_assert(sizeof(long) == sizeof(uint64_t), "");
// Shuffle the two 32-bit halves independently, then reassemble the bits.
int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
tmp[0] = __shfl_down(tmp[0], lane_delta, width);
tmp[1] = __shfl_down(tmp[1], lane_delta, width);
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
#else
// 32-bit long: forward directly to the int overload.
static_assert(sizeof(long) == sizeof(int), "");
return static_cast<long>(__shfl_down(static_cast<int>(var), lane_delta, width));
#endif
}
__device__
inline
unsigned long __shfl_down(unsigned long var, unsigned int lane_delta, int width = warpSize)
{
// `unsigned long` is 64-bit on LP64 targets but 32-bit on Windows (LLP64).
#ifndef _MSC_VER
static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
// Shuffle the two 32-bit halves independently, then reassemble the bits.
unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
tmp[0] = __shfl_down(tmp[0], lane_delta, width);
tmp[1] = __shfl_down(tmp[1], lane_delta, width);
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
#else
// 32-bit unsigned long: forward directly to the unsigned int overload.
static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
return static_cast<unsigned long>(__shfl_down(static_cast<unsigned int>(var), lane_delta, width));
#endif
}
  7776. __device__
  7777. inline
  7778. long long __shfl_down(long long var, unsigned int lane_delta, int width = warpSize)
  7779. {
  7780. static_assert(sizeof(long long) == 2 * sizeof(int), "");
  7781. static_assert(sizeof(long long) == sizeof(uint64_t), "");
  7782. int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
  7783. tmp[0] = __shfl_down(tmp[0], lane_delta, width);
  7784. tmp[1] = __shfl_down(tmp[1], lane_delta, width);
  7785. uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
  7786. long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
  7787. return tmp1;
  7788. }
  7789. __device__
  7790. inline
  7791. unsigned long long __shfl_down(unsigned long long var, unsigned int lane_delta, int width = warpSize)
  7792. {
  7793. static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
  7794. static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
  7795. unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
  7796. tmp[0] = __shfl_down(tmp[0], lane_delta, width);
  7797. tmp[1] = __shfl_down(tmp[1], lane_delta, width);
  7798. uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
  7799. unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
  7800. return tmp1;
  7801. }
__device__
inline
int __shfl_xor(int var, int lane_mask, int width = warpSize) {
// Read the value held by the lane whose id is the caller's id XOR
// `lane_mask`; if that lane lies past the end of the caller's width-wide
// subsection, the caller keeps its own value.
int self = __lane_id();
int index = self^lane_mask;
// `(self+width) & ~(width-1)` is the first lane after the caller's
// subsection (width is assumed to be a power of two).
index = index >= ((self+width)&~(width-1))?self:index;
// ds_bpermute addresses lanes by byte offset, hence the <<2.
return __builtin_amdgcn_ds_bpermute(index<<2, var);
}
  7810. __device__
  7811. inline
  7812. unsigned int __shfl_xor(unsigned int var, int lane_mask, int width = warpSize) {
  7813. union { int i; unsigned u; float f; } tmp; tmp.u = var;
  7814. tmp.i = __shfl_xor(tmp.i, lane_mask, width);
  7815. return tmp.u;
  7816. }
  7817. __device__
  7818. inline
  7819. float __shfl_xor(float var, int lane_mask, int width = warpSize) {
  7820. union { int i; unsigned u; float f; } tmp; tmp.f = var;
  7821. tmp.i = __shfl_xor(tmp.i, lane_mask, width);
  7822. return tmp.f;
  7823. }
  7824. __device__
  7825. inline
  7826. double __shfl_xor(double var, int lane_mask, int width = warpSize) {
  7827. static_assert(sizeof(double) == 2 * sizeof(int), "");
  7828. static_assert(sizeof(double) == sizeof(uint64_t), "");
  7829. int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
  7830. tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
  7831. tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
  7832. uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
  7833. double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
  7834. return tmp1;
  7835. }
__device__
inline
long __shfl_xor(long var, int lane_mask, int width = warpSize)
{
// `long` is 64-bit on LP64 targets but 32-bit on Windows (LLP64).
#ifndef _MSC_VER
static_assert(sizeof(long) == 2 * sizeof(int), "");
static_assert(sizeof(long) == sizeof(uint64_t), "");
// Shuffle the two 32-bit halves independently, then reassemble the bits.
int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
#else
// 32-bit long: forward directly to the int overload.
static_assert(sizeof(long) == sizeof(int), "");
return static_cast<long>(__shfl_xor(static_cast<int>(var), lane_mask, width));
#endif
}
__device__
inline
unsigned long __shfl_xor(unsigned long var, int lane_mask, int width = warpSize)
{
// `unsigned long` is 64-bit on LP64 targets but 32-bit on Windows (LLP64).
#ifndef _MSC_VER
static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
// Shuffle the two 32-bit halves independently, then reassemble the bits.
unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
#else
// 32-bit unsigned long: forward directly to the unsigned int overload.
static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
return static_cast<unsigned long>(__shfl_xor(static_cast<unsigned int>(var), lane_mask, width));
#endif
}
  7872. __device__
  7873. inline
  7874. long long __shfl_xor(long long var, int lane_mask, int width = warpSize)
  7875. {
  7876. static_assert(sizeof(long long) == 2 * sizeof(int), "");
  7877. static_assert(sizeof(long long) == sizeof(uint64_t), "");
  7878. int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
  7879. tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
  7880. tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
  7881. uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
  7882. long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
  7883. return tmp1;
  7884. }
  7885. __device__
  7886. inline
  7887. unsigned long long __shfl_xor(unsigned long long var, int lane_mask, int width = warpSize)
  7888. {
  7889. static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
  7890. static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
  7891. unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
  7892. tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
  7893. tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
  7894. uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
  7895. unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
  7896. return tmp1;
  7897. }
  7898. #endif
  7899. /*
  7900. Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
  7901. Permission is hereby granted, free of charge, to any person obtaining a copy
  7902. of this software and associated documentation files (the "Software"), to deal
  7903. in the Software without restriction, including without limitation the rights
  7904. to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  7905. copies of the Software, and to permit persons to whom the Software is
  7906. furnished to do so, subject to the following conditions:
  7907. The above copyright notice and this permission notice shall be included in
  7908. all copies or substantial portions of the Software.
  7909. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  7910. IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  7911. FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  7912. AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  7913. LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  7914. OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  7915. THE SOFTWARE.
  7916. */
  7917. /**
  7918. * @file amd_detail/hip_cooperative_groups_helper.h
  7919. *
  7920. * @brief Device side implementation of cooperative group feature.
  7921. *
  7922. * Defines helper constructs and APIs which aid the types and device API
  7923. * wrappers defined within `amd_detail/hip_cooperative_groups.h`.
  7924. */
  7925. #ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
  7926. #define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
  7927. #if __cplusplus
  7928. #if !defined(__HIPCC_RTC__)
  7929. #include <hip/amd_detail/amd_hip_runtime.h> // threadId, blockId
  7930. #include <hip/amd_detail/amd_device_functions.h>
  7931. #endif
// Fallback definitions used when the main runtime headers have not already
// provided these qualifiers (e.g. under hipRTC).
#if !defined(__align__)
#define __align__(x) __attribute__((aligned(x)))
#endif
// Qualifier applied to all cooperative-groups device functions.
#if !defined(__CG_QUALIFIER__)
#define __CG_QUALIFIER__ __device__ __forceinline__
#endif
// Qualifier for file-local cooperative-groups helpers.
#if !defined(__CG_STATIC_QUALIFIER__)
#define __CG_STATIC_QUALIFIER__ __device__ static __forceinline__
#endif
#if !defined(_CG_STATIC_CONST_DECL_)
#define _CG_STATIC_CONST_DECL_ static constexpr
#endif
// One bit per lane of the wavefront: 32-bit mask on wave32, 64-bit on wave64.
#if __AMDGCN_WAVEFRONT_SIZE == 32
using lane_mask = unsigned int;
#else
using lane_mask = unsigned long long int;
#endif
  7949. namespace cooperative_groups {
  7950. /* Global scope */
// True when `size` is a power of two (also true for size == 0).
template <unsigned int size>
using is_power_of_2 = std::integral_constant<bool, (size & (size - 1)) == 0>;
// True when `size` does not exceed the compile-time wavefront size.
template <unsigned int size>
using is_valid_wavefront = std::integral_constant<bool, (size <= __AMDGCN_WAVEFRONT_SIZE)>;
// A valid tile size is a power of two no larger than the wavefront.
template <unsigned int size>
using is_valid_tile_size =
std::integral_constant<bool, is_power_of_2<size>::value && is_valid_wavefront<size>::value>;
// Element types supported by tiled-group collectives: integral or
// floating-point only.
template <typename T>
using is_valid_type =
std::integral_constant<bool, std::is_integral<T>::value || std::is_floating_point<T>::value>;
  7961. namespace internal {
  7962. /**
  7963. * @brief Enums representing different cooperative group types
  7964. * @note This enum is only applicable on Linux.
  7965. *
  7966. */
// Discriminator stored in thread_group::_type identifying which derived
// cooperative group a generic thread_group object represents.
typedef enum {
cg_invalid,
cg_multi_grid,
cg_grid,
cg_workgroup,
cg_tiled_group,
cg_coalesced_group
} group_type;
  7975. /**
  7976. * @ingroup CooperativeG
  7977. * @{
  7978. * This section describes the cooperative groups functions of HIP runtime API.
  7979. *
  7980. * The cooperative groups provides flexible thread parallel programming algorithms, threads
  7981. * cooperate and share data to perform collective computations.
  7982. *
* @note Cooperative groups feature is implemented on Linux, under development
  7984. * on Windows.
  7985. *
  7986. */
  7987. /**
  7988. *
  7989. * @brief Functionalities related to multi-grid cooperative group type
  7990. * @note The following cooperative groups functions are only applicable on Linux.
  7991. *
  7992. */
// Thin wrappers over the OCKL multi-grid runtime entry points; each forwards
// directly to the corresponding __ockl_multi_grid_* call.
namespace multi_grid {
__CG_STATIC_QUALIFIER__ uint32_t num_grids() {
return static_cast<uint32_t>(__ockl_multi_grid_num_grids()); }
__CG_STATIC_QUALIFIER__ uint32_t grid_rank() {
return static_cast<uint32_t>(__ockl_multi_grid_grid_rank()); }
__CG_STATIC_QUALIFIER__ uint32_t size() { return static_cast<uint32_t>(__ockl_multi_grid_size()); }
__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
return static_cast<uint32_t>(__ockl_multi_grid_thread_rank()); }
__CG_STATIC_QUALIFIER__ bool is_valid() { return static_cast<bool>(__ockl_multi_grid_is_valid()); }
__CG_STATIC_QUALIFIER__ void sync() { __ockl_multi_grid_sync(); }
} // namespace multi_grid
  8004. /**
  8005. * @brief Functionalities related to grid cooperative group type
  8006. * @note The following cooperative groups functions are only applicable on Linux.
  8007. */
// Grid-wide helpers: size and per-thread rank across the entire kernel
// launch, plus the OCKL-backed validity check and barrier.
namespace grid {
// Total number of threads in the launch: product of grid and block extents.
__CG_STATIC_QUALIFIER__ uint32_t size() {
return static_cast<uint32_t>((blockDim.z * gridDim.z) * (blockDim.y * gridDim.y) *
(blockDim.x * gridDim.x));
}
// Linearized global rank of the calling thread within the whole grid.
__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
// Compute global id of the workgroup to which the current thread belongs to
uint32_t blkIdx = static_cast<uint32_t>((blockIdx.z * gridDim.y * gridDim.x) +
(blockIdx.y * gridDim.x) + (blockIdx.x));
// Compute total number of threads being passed to reach current workgroup
// within grid
uint32_t num_threads_till_current_workgroup =
static_cast<uint32_t>(blkIdx * (blockDim.x * blockDim.y * blockDim.z));
// Compute thread local rank within current workgroup
uint32_t local_thread_rank = static_cast<uint32_t>((threadIdx.z * blockDim.y * blockDim.x) +
(threadIdx.y * blockDim.x) + (threadIdx.x));
return (num_threads_till_current_workgroup + local_thread_rank);
}
__CG_STATIC_QUALIFIER__ bool is_valid() { return static_cast<bool>(__ockl_grid_is_valid()); }
// Grid-wide barrier provided by the OCKL runtime.
__CG_STATIC_QUALIFIER__ void sync() { __ockl_grid_sync(); }
} // namespace grid
  8029. /**
  8030. * @brief Functionalities related to `workgroup` (thread_block in CUDA terminology)
  8031. * cooperative group type
  8032. * @note The following cooperative groups functions are only applicable on Linux.
  8033. */
// Workgroup (CUDA thread-block) helpers built from the block/thread builtin
// index and dimension variables.
namespace workgroup {
// 3D index of this workgroup within the grid.
__CG_STATIC_QUALIFIER__ dim3 group_index() {
return (dim3(static_cast<uint32_t>(blockIdx.x), static_cast<uint32_t>(blockIdx.y),
static_cast<uint32_t>(blockIdx.z)));
}
// 3D index of the calling thread within its workgroup.
__CG_STATIC_QUALIFIER__ dim3 thread_index() {
return (dim3(static_cast<uint32_t>(threadIdx.x), static_cast<uint32_t>(threadIdx.y),
static_cast<uint32_t>(threadIdx.z)));
}
// Number of threads in the workgroup.
__CG_STATIC_QUALIFIER__ uint32_t size() {
return (static_cast<uint32_t>(blockDim.x * blockDim.y * blockDim.z));
}
// Linearized rank of the calling thread within the workgroup (z-major).
__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
return (static_cast<uint32_t>((threadIdx.z * blockDim.y * blockDim.x) +
(threadIdx.y * blockDim.x) + (threadIdx.x)));
}
// A workgroup group is always valid.
__CG_STATIC_QUALIFIER__ bool is_valid() {
return true;
}
// Workgroup-wide barrier.
__CG_STATIC_QUALIFIER__ void sync() { __syncthreads(); }
// 3D dimensions of the workgroup.
__CG_STATIC_QUALIFIER__ dim3 block_dim() {
return (dim3(static_cast<uint32_t>(blockDim.x), static_cast<uint32_t>(blockDim.y),
static_cast<uint32_t>(blockDim.z)));
}
} // namespace workgroup
namespace tiled_group {
// enforce ordering for memory instructions
__CG_STATIC_QUALIFIER__ void sync() { __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "agent"); }
} // namespace tiled_group
  8063. namespace coalesced_group {
  8064. // enforce ordering for memory intructions
  8065. __CG_STATIC_QUALIFIER__ void sync() { __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "agent"); }
  8066. // Masked bit count
  8067. //
  8068. // For each thread, this function returns the number of active threads which
  8069. // have i-th bit of x set and come before the current thread.
  8070. __CG_STATIC_QUALIFIER__ unsigned int masked_bit_count(lane_mask x, unsigned int add = 0) {
  8071. unsigned int counter=0;
  8072. #if __AMDGCN_WAVEFRONT_SIZE == 32
  8073. counter = __builtin_amdgcn_mbcnt_lo(x, add);
  8074. #else
  8075. counter = __builtin_amdgcn_mbcnt_lo(static_cast<lane_mask>(x), add);
  8076. counter = __builtin_amdgcn_mbcnt_hi(static_cast<lane_mask>(x >> 32), counter);
  8077. #endif
  8078. return counter;
  8079. }
  8080. } // namespace coalesced_group
  8081. } // namespace internal
  8082. } // namespace cooperative_groups
  8083. /**
  8084. * @}
  8085. */
  8086. #endif // __cplusplus
  8087. #endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
  8088. /*
  8089. Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
  8090. Permission is hereby granted, free of charge, to any person obtaining a copy
  8091. of this software and associated documentation files (the "Software"), to deal
  8092. in the Software without restriction, including without limitation the rights
  8093. to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  8094. copies of the Software, and to permit persons to whom the Software is
  8095. furnished to do so, subject to the following conditions:
  8096. The above copyright notice and this permission notice shall be included in
  8097. all copies or substantial portions of the Software.
  8098. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  8099. IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  8100. FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  8101. AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  8102. LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  8103. OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  8104. THE SOFTWARE.
  8105. */
  8106. /**
  8107. * @file amd_detail/hip_cooperative_groups.h
  8108. *
  8109. * @brief Device side implementation of `Cooperative Group` feature.
  8110. *
  8111. * Defines new types and device API wrappers related to `Cooperative Group`
  8112. * feature, which the programmer can directly use in his kernel(s) in order to
  8113. * make use of this feature.
  8114. */
  8115. #ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
  8116. #define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
  8117. #if __cplusplus
  8118. #if !defined(__HIPCC_RTC__)
  8119. #include <hip/amd_detail/hip_cooperative_groups_helper.h>
  8120. #endif
  8121. #define __hip_abort() \
  8122. { abort(); }
  8123. #if defined(NDEBUG)
  8124. #define __hip_assert(COND)
  8125. #else
  8126. #define __hip_assert(COND) \
  8127. { \
  8128. if (!COND) { \
  8129. __hip_abort(); \
  8130. } \
  8131. }
  8132. #endif
  8133. namespace cooperative_groups {
  8134. /** @brief The base type of all cooperative group types
  8135. *
  8136. * \details Holds the key properties of a constructed cooperative group types
  8137. * object, like the group type, its size, etc
  8138. *
* @note Cooperative groups feature is implemented on Linux, under development
  8140. * on Windows.
  8141. */
  8142. class thread_group {
  8143. protected:
  8144. uint32_t _type; // thread_group type
  8145. uint32_t _size; // total number of threads in the tread_group
  8146. uint64_t _mask; // Lanemask for coalesced and tiled partitioned group types,
  8147. // LSB represents lane 0, and MSB represents lane 63
  8148. // Construct a thread group, and set thread group type and other essential
  8149. // thread group properties. This generic thread group is directly constructed
  8150. // only when the group is supposed to contain only the calling the thread
  8151. // (throurh the API - `this_thread()`), and in all other cases, this thread
  8152. // group object is a sub-object of some other derived thread group object
  8153. __CG_QUALIFIER__ thread_group(internal::group_type type, uint32_t size = static_cast<uint64_t>(0),
  8154. uint64_t mask = static_cast<uint64_t>(0)) {
  8155. _type = type;
  8156. _size = size;
  8157. _mask = mask;
  8158. }
  8159. struct _tiled_info {
  8160. bool is_tiled;
  8161. unsigned int size;
  8162. unsigned int meta_group_rank;
  8163. unsigned int meta_group_size;
  8164. };
  8165. struct _coalesced_info {
  8166. lane_mask member_mask;
  8167. unsigned int size;
  8168. struct _tiled_info tiled_info;
  8169. } coalesced_info;
  8170. friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
  8171. unsigned int tile_size);
  8172. friend class thread_block;
  8173. public:
  8174. // Total number of threads in the thread group, and this serves the purpose
  8175. // for all derived cooperative group types since their `size` is directly
  8176. // saved during the construction
  8177. __CG_QUALIFIER__ uint32_t size() const { return _size; }
  8178. __CG_QUALIFIER__ unsigned int cg_type() const { return _type; }
  8179. // Rank of the calling thread within [0, size())
  8180. __CG_QUALIFIER__ uint32_t thread_rank() const;
  8181. // Is this cooperative group type valid?
  8182. __CG_QUALIFIER__ bool is_valid() const;
  8183. // synchronize the threads in the thread group
  8184. __CG_QUALIFIER__ void sync() const;
  8185. };
  8186. /**
  8187. *-------------------------------------------------------------------------------------------------
  8188. *-------------------------------------------------------------------------------------------------
  8189. * @defgroup CooperativeG Cooperative Groups
  8190. * @ingroup API
  8191. * @{
  8192. * This section describes the cooperative groups functions of HIP runtime API.
  8193. *
  8194. * The cooperative groups provides flexible thread parallel programming algorithms, threads
  8195. * cooperate and share data to perform collective computations.
  8196. *
* @note Cooperative groups feature is implemented on Linux, under development
  8198. * on Windows.
  8199. *
  8200. */
  8201. /** \brief The multi-grid cooperative group type
  8202. *
  8203. * \details Represents an inter-device cooperative group type where the
* participating threads within the group span across multiple
* devices, running the (same) kernel on these devices
* @note The multi-grid cooperative group type is implemented on Linux, under development
  8207. * on Windows.
  8208. */
class multi_grid_group : public thread_group {
// Only these friend functions are allowed to construct an object of this class
// and access its resources
friend __CG_QUALIFIER__ multi_grid_group this_multi_grid();
protected:
// Construct multi-grid thread group (through the API this_multi_grid())
explicit __CG_QUALIFIER__ multi_grid_group(uint32_t size)
: thread_group(internal::cg_multi_grid, size) {}
public:
// Number of invocations participating in this multi-grid group. In other
// words, the number of GPUs
__CG_QUALIFIER__ uint32_t num_grids() { return internal::multi_grid::num_grids(); }
// Rank of this invocation. In other words, an ID number within the range
// [0, num_grids()) of the GPU, this kernel is running on
__CG_QUALIFIER__ uint32_t grid_rank() { return internal::multi_grid::grid_rank(); }
// Rank of the calling thread across all threads of the multi-grid launch.
__CG_QUALIFIER__ uint32_t thread_rank() const { return internal::multi_grid::thread_rank(); }
// Whether the multi-grid launch is valid for cooperative use.
__CG_QUALIFIER__ bool is_valid() const { return internal::multi_grid::is_valid(); }
// Barrier across every grid participating in the multi-grid launch.
__CG_QUALIFIER__ void sync() const { internal::multi_grid::sync(); }
};
  8228. /** @brief User exposed API interface to construct multi-grid cooperative
  8229. * group type object - `multi_grid_group`
  8230. *
  8231. * \details User is not allowed to directly construct an object of type
  8232. * `multi_grid_group`. Instead, he should construct it through this
  8233. * API function
* @note This multi-grid cooperative API type is implemented on Linux, under development
  8235. * on Windows.
  8236. */
  8237. __CG_QUALIFIER__ multi_grid_group this_multi_grid() {
  8238. return multi_grid_group(internal::multi_grid::size());
  8239. }
/** @brief The grid cooperative group type
 *
 * \details Represents an inter-workgroup cooperative group type where the
 * participating threads within the group spans across multiple
 * workgroups running the (same) kernel on the same device
 * @note This is implemented on Linux, under development
 * on Windows.
 */
class grid_group : public thread_group {
// Only these friend functions are allowed to construct an object of this class
// and access its resources
friend __CG_QUALIFIER__ grid_group this_grid();
protected:
// Construct grid thread group (through the API this_grid())
explicit __CG_QUALIFIER__ grid_group(uint32_t size) : thread_group(internal::cg_grid, size) {}
public:
// Rank of the calling thread within the whole grid
__CG_QUALIFIER__ uint32_t thread_rank() const { return internal::grid::thread_rank(); }
// Whether this object corresponds to a valid cooperative grid launch
__CG_QUALIFIER__ bool is_valid() const { return internal::grid::is_valid(); }
// Synchronize the threads of this grid group
__CG_QUALIFIER__ void sync() const { internal::grid::sync(); }
};
  8260. /** @brief User exposed API interface to construct grid cooperative group type
  8261. * object - `grid_group`
  8262. *
  8263. * \details User is not allowed to directly construct an object of type
  8264. * `multi_grid_group`. Instead, he should construct it through this
  8265. * API function
  8266. * @note This function is implemented on Linux, under developement
  8267. * on Windows.
  8268. */
  8269. __CG_QUALIFIER__ grid_group this_grid() { return grid_group(internal::grid::size()); }
/** @brief The workgroup (thread-block in CUDA terminology) cooperative group
 * type
 *
 * \details Represents an intra-workgroup cooperative group type where the
 * participating threads within the group are exactly the same threads
 * which are participated in the currently executing `workgroup`
 * @note This is implemented on Linux, under development
 * on Windows.
 */
class thread_block : public thread_group {
// Only these friend functions are allowed to construct an object of this
// class and access its resources
friend __CG_QUALIFIER__ thread_block this_thread_block();
friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
unsigned int tile_size);
friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_block& parent,
unsigned int tile_size);
protected:
// Construct a workgroup thread group (through the API this_thread_block())
explicit __CG_QUALIFIER__ thread_block(uint32_t size)
: thread_group(internal::cg_workgroup, size) {}
// Partition this block into tiles of tile_size threads. tile_size must be a
// non-zero power of 2 no larger than the wavefront size.
__CG_QUALIFIER__ thread_group new_tiled_group(unsigned int tile_size) const {
const bool pow2 = ((tile_size & (tile_size - 1)) == 0);
// Invalid tile size, assert
if (!tile_size || (tile_size > __AMDGCN_WAVEFRONT_SIZE) || !pow2) {
__hip_assert(false && "invalid tile size")
}
thread_group tiledGroup = thread_group(internal::cg_tiled_group, tile_size);
tiledGroup.coalesced_info.tiled_info.size = tile_size;
tiledGroup.coalesced_info.tiled_info.is_tiled = true;
// Rank of this tile among the tiles carved out of the parent block
tiledGroup.coalesced_info.tiled_info.meta_group_rank = thread_rank() / tile_size;
// Number of tiles, rounded up so a partial tail tile is counted
tiledGroup.coalesced_info.tiled_info.meta_group_size = (size() + tile_size - 1) / tile_size;
return tiledGroup;
}
public:
// 3-dimensional block index within the grid
__CG_STATIC_QUALIFIER__ dim3 group_index() { return internal::workgroup::group_index(); }
// 3-dimensional thread index within the block
__CG_STATIC_QUALIFIER__ dim3 thread_index() { return internal::workgroup::thread_index(); }
// Linear rank of the calling thread within this workgroup
__CG_STATIC_QUALIFIER__ uint32_t thread_rank() { return internal::workgroup::thread_rank(); }
// Number of threads in the workgroup
__CG_STATIC_QUALIFIER__ uint32_t size() { return internal::workgroup::size(); }
__CG_STATIC_QUALIFIER__ bool is_valid() { return internal::workgroup::is_valid(); }
// Synchronize the threads of this workgroup
__CG_STATIC_QUALIFIER__ void sync() { internal::workgroup::sync(); }
// 3-dimensional extent (block dimensions) of the workgroup
__CG_QUALIFIER__ dim3 group_dim() { return internal::workgroup::block_dim(); }
};
  8315. /** \brief User exposed API interface to construct workgroup cooperative
  8316. * group type object - `thread_block`.
  8317. *
  8318. * \details User is not allowed to directly construct an object of type
  8319. * `thread_block`. Instead, he should construct it through this API
  8320. * function.
  8321. * @note This function is implemented on Linux, under developement
  8322. * on Windows.
  8323. */
  8324. __CG_QUALIFIER__ thread_block this_thread_block() {
  8325. return thread_block(internal::workgroup::size());
  8326. }
/** \brief The tiled_group cooperative group type
 *
 * \details Represents one tiled thread group in a wavefront.
 * This group type also supports sub-wave level intrinsics.
 * @note This is implemented on Linux, under development
 * on Windows.
 */
class tiled_group : public thread_group {
private:
friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
unsigned int tile_size);
friend __CG_QUALIFIER__ tiled_group tiled_partition(const tiled_group& parent,
unsigned int tile_size);
// Re-partition this tile into smaller tiles of tile_size threads.
// tile_size must be a non-zero power of 2 no larger than the wavefront size.
__CG_QUALIFIER__ tiled_group new_tiled_group(unsigned int tile_size) const {
const bool pow2 = ((tile_size & (tile_size - 1)) == 0);
if (!tile_size || (tile_size > __AMDGCN_WAVEFRONT_SIZE) || !pow2) {
__hip_assert(false && "invalid tile size")
}
// The requested tile is not smaller than this group: nothing to split
if (size() <= tile_size) {
return *this;
}
tiled_group tiledGroup = tiled_group(tile_size);
tiledGroup.coalesced_info.tiled_info.is_tiled = true;
return tiledGroup;
}
protected:
// Construct a tiled group of tileSize threads (expected to be a power of 2)
explicit __CG_QUALIFIER__ tiled_group(unsigned int tileSize)
: thread_group(internal::cg_tiled_group, tileSize) {
coalesced_info.tiled_info.size = tileSize;
coalesced_info.tiled_info.is_tiled = true;
}
public:
// Number of threads in this tile
__CG_QUALIFIER__ unsigned int size() const { return (coalesced_info.tiled_info.size); }
// Rank of the calling thread within the tile; masking (rather than modulo)
// relies on the tile size being a power of 2
__CG_QUALIFIER__ unsigned int thread_rank() const {
return (internal::workgroup::thread_rank() & (coalesced_info.tiled_info.size - 1));
}
__CG_QUALIFIER__ void sync() const {
internal::tiled_group::sync();
}
};
  8367. /** \brief The coalesced_group cooperative group type
  8368. *
  8369. * \details Represents a active thread group in a wavefront.
  8370. * This group type also supports sub-wave level intrinsics.
  8371. * @note This is implemented on Linux, under developement
  8372. * on Windows.
  8373. */
  8374. class coalesced_group : public thread_group {
  8375. private:
  8376. friend __CG_QUALIFIER__ coalesced_group coalesced_threads();
  8377. friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent, unsigned int tile_size);
  8378. friend __CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tile_size);
  8379. __CG_QUALIFIER__ coalesced_group new_tiled_group(unsigned int tile_size) const {
  8380. const bool pow2 = ((tile_size & (tile_size - 1)) == 0);
  8381. if (!tile_size || (tile_size > size()) || !pow2) {
  8382. return coalesced_group(0);
  8383. }
  8384. // If a tiled group is passed to be partitioned further into a coalesced_group.
  8385. // prepare a mask for further partitioning it so that it stays coalesced.
  8386. if (coalesced_info.tiled_info.is_tiled) {
  8387. unsigned int base_offset = (thread_rank() & (~(tile_size - 1)));
  8388. unsigned int masklength = min(static_cast<unsigned int>(size()) - base_offset, tile_size);
  8389. lane_mask member_mask = static_cast<lane_mask>(-1) >> (__AMDGCN_WAVEFRONT_SIZE - masklength);
  8390. member_mask <<= (__lane_id() & ~(tile_size - 1));
  8391. coalesced_group coalesced_tile = coalesced_group(member_mask);
  8392. coalesced_tile.coalesced_info.tiled_info.is_tiled = true;
  8393. coalesced_tile.coalesced_info.tiled_info.meta_group_rank = thread_rank() / tile_size;
  8394. coalesced_tile.coalesced_info.tiled_info.meta_group_size = size() / tile_size;
  8395. return coalesced_tile;
  8396. }
  8397. // Here the parent coalesced_group is not partitioned.
  8398. else {
  8399. lane_mask member_mask = 0;
  8400. unsigned int tile_rank = 0;
  8401. int lanes_to_skip = ((thread_rank()) / tile_size) * tile_size;
  8402. for (unsigned int i = 0; i < __AMDGCN_WAVEFRONT_SIZE; i++) {
  8403. lane_mask active = coalesced_info.member_mask & (1 << i);
  8404. // Make sure the lane is active
  8405. if (active) {
  8406. if (lanes_to_skip <= 0 && tile_rank < tile_size) {
  8407. // Prepare a member_mask that is appropriate for a tile
  8408. member_mask |= active;
  8409. tile_rank++;
  8410. }
  8411. lanes_to_skip--;
  8412. }
  8413. }
  8414. coalesced_group coalesced_tile = coalesced_group(member_mask);
  8415. coalesced_tile.coalesced_info.tiled_info.meta_group_rank = thread_rank() / tile_size;
  8416. coalesced_tile.coalesced_info.tiled_info.meta_group_size =
  8417. (size() + tile_size - 1) / tile_size;
  8418. return coalesced_tile;
  8419. }
  8420. return coalesced_group(0);
  8421. }
  8422. protected:
  8423. // Constructor
  8424. explicit __CG_QUALIFIER__ coalesced_group(lane_mask member_mask)
  8425. : thread_group(internal::cg_coalesced_group) {
  8426. coalesced_info.member_mask = member_mask; // Which threads are active
  8427. coalesced_info.size = __popcll(coalesced_info.member_mask); // How many threads are active
  8428. coalesced_info.tiled_info.is_tiled = false; // Not a partitioned group
  8429. coalesced_info.tiled_info.meta_group_rank = 0;
  8430. coalesced_info.tiled_info.meta_group_size = 1;
  8431. }
  8432. public:
  8433. __CG_QUALIFIER__ unsigned int size() const {
  8434. return coalesced_info.size;
  8435. }
  8436. __CG_QUALIFIER__ unsigned int thread_rank() const {
  8437. return internal::coalesced_group::masked_bit_count(coalesced_info.member_mask);
  8438. }
  8439. __CG_QUALIFIER__ void sync() const {
  8440. internal::coalesced_group::sync();
  8441. }
  8442. __CG_QUALIFIER__ unsigned int meta_group_rank() const {
  8443. return coalesced_info.tiled_info.meta_group_rank;
  8444. }
  8445. __CG_QUALIFIER__ unsigned int meta_group_size() const {
  8446. return coalesced_info.tiled_info.meta_group_size;
  8447. }
  8448. template <class T>
  8449. __CG_QUALIFIER__ T shfl(T var, int srcRank) const {
  8450. static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
  8451. srcRank = srcRank % static_cast<int>(size());
  8452. int lane = (size() == __AMDGCN_WAVEFRONT_SIZE) ? srcRank
  8453. : (__AMDGCN_WAVEFRONT_SIZE == 64) ? __fns64(coalesced_info.member_mask, 0, (srcRank + 1))
  8454. : __fns32(coalesced_info.member_mask, 0, (srcRank + 1));
  8455. return __shfl(var, lane, __AMDGCN_WAVEFRONT_SIZE);
  8456. }
  8457. template <class T>
  8458. __CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const {
  8459. static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
  8460. // Note: The cuda implementation appears to use the remainder of lane_delta
  8461. // and WARP_SIZE as the shift value rather than lane_delta itself.
  8462. // This is not described in the documentation and is not done here.
  8463. if (size() == __AMDGCN_WAVEFRONT_SIZE) {
  8464. return __shfl_down(var, lane_delta, __AMDGCN_WAVEFRONT_SIZE);
  8465. }
  8466. int lane;
  8467. if (__AMDGCN_WAVEFRONT_SIZE == 64) {
  8468. lane = __fns64(coalesced_info.member_mask, __lane_id(), lane_delta + 1);
  8469. }
  8470. else {
  8471. lane = __fns32(coalesced_info.member_mask, __lane_id(), lane_delta + 1);
  8472. }
  8473. if (lane == -1) {
  8474. lane = __lane_id();
  8475. }
  8476. return __shfl(var, lane, __AMDGCN_WAVEFRONT_SIZE);
  8477. }
  8478. template <class T>
  8479. __CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const {
  8480. static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
  8481. // Note: The cuda implementation appears to use the remainder of lane_delta
  8482. // and WARP_SIZE as the shift value rather than lane_delta itself.
  8483. // This is not described in the documentation and is not done here.
  8484. if (size() == __AMDGCN_WAVEFRONT_SIZE) {
  8485. return __shfl_up(var, lane_delta, __AMDGCN_WAVEFRONT_SIZE);
  8486. }
  8487. int lane;
  8488. if (__AMDGCN_WAVEFRONT_SIZE == 64) {
  8489. lane = __fns64(coalesced_info.member_mask, __lane_id(), -(lane_delta + 1));
  8490. }
  8491. else if (__AMDGCN_WAVEFRONT_SIZE == 32) {
  8492. lane = __fns32(coalesced_info.member_mask, __lane_id(), -(lane_delta + 1));
  8493. }
  8494. if (lane == -1) {
  8495. lane = __lane_id();
  8496. }
  8497. return __shfl(var, lane, __AMDGCN_WAVEFRONT_SIZE);
  8498. }
  8499. };
  8500. /** \brief User exposed API to create coalesced groups.
  8501. *
  8502. * \details A collective operation that groups all active lanes into a new thread group.
  8503. * @note This function is implemented on Linux, under developement
  8504. * on Windows.
  8505. */
  8506. __CG_QUALIFIER__ coalesced_group coalesced_threads() {
  8507. return cooperative_groups::coalesced_group(__builtin_amdgcn_read_exec());
  8508. }
  8509. /**
  8510. * Implemenation of all publicly exposed base class APIs
  8511. * @note This function is implemented on Linux, under developement
  8512. * on Windows.
  8513. */
  8514. __CG_QUALIFIER__ uint32_t thread_group::thread_rank() const {
  8515. switch (this->_type) {
  8516. case internal::cg_multi_grid: {
  8517. return (static_cast<const multi_grid_group*>(this)->thread_rank());
  8518. }
  8519. case internal::cg_grid: {
  8520. return (static_cast<const grid_group*>(this)->thread_rank());
  8521. }
  8522. case internal::cg_workgroup: {
  8523. return (static_cast<const thread_block*>(this)->thread_rank());
  8524. }
  8525. case internal::cg_tiled_group: {
  8526. return (static_cast<const tiled_group*>(this)->thread_rank());
  8527. }
  8528. case internal::cg_coalesced_group: {
  8529. return (static_cast<const coalesced_group*>(this)->thread_rank());
  8530. }
  8531. default: {
  8532. __hip_assert(false && "invalid cooperative group type")
  8533. return -1;
  8534. }
  8535. }
  8536. }
  8537. /**
  8538. * Implemenation of all publicly exposed thread group API
  8539. * @note This function is implemented on Linux, under developement
  8540. * on Windows.
  8541. */
  8542. __CG_QUALIFIER__ bool thread_group::is_valid() const {
  8543. switch (this->_type) {
  8544. case internal::cg_multi_grid: {
  8545. return (static_cast<const multi_grid_group*>(this)->is_valid());
  8546. }
  8547. case internal::cg_grid: {
  8548. return (static_cast<const grid_group*>(this)->is_valid());
  8549. }
  8550. case internal::cg_workgroup: {
  8551. return (static_cast<const thread_block*>(this)->is_valid());
  8552. }
  8553. case internal::cg_tiled_group: {
  8554. return (static_cast<const tiled_group*>(this)->is_valid());
  8555. }
  8556. case internal::cg_coalesced_group: {
  8557. return (static_cast<const coalesced_group*>(this)->is_valid());
  8558. }
  8559. default: {
  8560. __hip_assert(false && "invalid cooperative group type")
  8561. return false;
  8562. }
  8563. }
  8564. }
  8565. /**
  8566. * Implemenation of all publicly exposed thread group sync API
  8567. * @note This function is implemented on Linux, under developement
  8568. * on Windows.
  8569. */
  8570. __CG_QUALIFIER__ void thread_group::sync() const {
  8571. switch (this->_type) {
  8572. case internal::cg_multi_grid: {
  8573. static_cast<const multi_grid_group*>(this)->sync();
  8574. break;
  8575. }
  8576. case internal::cg_grid: {
  8577. static_cast<const grid_group*>(this)->sync();
  8578. break;
  8579. }
  8580. case internal::cg_workgroup: {
  8581. static_cast<const thread_block*>(this)->sync();
  8582. break;
  8583. }
  8584. case internal::cg_tiled_group: {
  8585. static_cast<const tiled_group*>(this)->sync();
  8586. break;
  8587. }
  8588. case internal::cg_coalesced_group: {
  8589. static_cast<const coalesced_group*>(this)->sync();
  8590. break;
  8591. }
  8592. default: {
  8593. __hip_assert(false && "invalid cooperative group type")
  8594. }
  8595. }
  8596. }
  8597. /**
  8598. * Implemenation of publicly exposed `wrapper` API on top of basic cooperative
  8599. * group type APIs
  8600. * @note This function is implemented on Linux, under developement
  8601. * on Windows.
  8602. */
  8603. template <class CGTy> __CG_QUALIFIER__ uint32_t group_size(CGTy const& g) { return g.size(); }
  8604. /**
  8605. * Implemenation of publicly exposed `wrapper` API on top of basic cooperative
  8606. * group type APIs
  8607. * @note This function is implemented on Linux, under developement
  8608. * on Windows.
  8609. */
  8610. template <class CGTy> __CG_QUALIFIER__ uint32_t thread_rank(CGTy const& g) {
  8611. return g.thread_rank();
  8612. }
  8613. /**
  8614. * Implemenation of publicly exposed `wrapper` API on top of basic cooperative
  8615. * group type APIs
  8616. * @note This function is implemented on Linux, under developement
  8617. * on Windows.
  8618. */
  8619. template <class CGTy> __CG_QUALIFIER__ bool is_valid(CGTy const& g) { return g.is_valid(); }
  8620. /**
  8621. * Implemenation of publicly exposed `wrapper` API on top of basic cooperative
  8622. * group type APIs
  8623. * @note This function is implemented on Linux, under developement
  8624. * on Windows.
  8625. */
  8626. template <class CGTy> __CG_QUALIFIER__ void sync(CGTy const& g) { g.sync(); }
/**
 * template class tile_base
 * @note This class is implemented on Linux, under development
 * on Windows.
 */
template <unsigned int tileSize> class tile_base {
protected:
// Compile-time tile size; derived classes assert it is a power of 2
_CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;
public:
// Rank of the thread within this tile; masking (rather than modulo)
// relies on numThreads being a power of 2
_CG_STATIC_CONST_DECL_ unsigned int thread_rank() {
return (internal::workgroup::thread_rank() & (numThreads - 1));
}
// Number of threads within this tile
__CG_STATIC_QUALIFIER__ unsigned int size() { return numThreads; }
};
/**
 * template class thread_block_tile_base
 * @note This class is implemented on Linux, under development
 * on Windows.
 */
template <unsigned int size> class thread_block_tile_base : public tile_base<size> {
static_assert(is_valid_tile_size<size>::value,
"Tile size is either not a power of 2 or greater than the wavefront size");
using tile_base<size>::numThreads;
public:
// Synchronize the threads of this tile
__CG_STATIC_QUALIFIER__ void sync() {
internal::tiled_group::sync();
}
// Copy var from the lane with rank srcRank within the tile
template <class T> __CG_QUALIFIER__ T shfl(T var, int srcRank) const {
static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
return (__shfl(var, srcRank, numThreads));
}
// Copy var from the lane lane_delta ranks above the caller within the tile
template <class T> __CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const {
static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
return (__shfl_down(var, lane_delta, numThreads));
}
// Copy var from the lane lane_delta ranks below the caller within the tile
template <class T> __CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const {
static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
return (__shfl_up(var, lane_delta, numThreads));
}
// Copy var from the lane whose rank is the caller's rank XOR laneMask
template <class T> __CG_QUALIFIER__ T shfl_xor(T var, unsigned int laneMask) const {
static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
return (__shfl_xor(var, laneMask, numThreads));
}
};
/** \brief User exposed API that captures the state of the parent group pre-partition
 */
template <unsigned int tileSize, typename ParentCGTy>
class parent_group_info {
public:
// Returns the linear rank of the group within the set of tiles partitioned
// from a parent group (bounded by meta_group_size)
__CG_STATIC_QUALIFIER__ unsigned int meta_group_rank() {
return ParentCGTy::thread_rank() / tileSize;
}
// Returns the number of groups created when the parent group was partitioned.
// Rounded up so a partial tail tile is counted.
__CG_STATIC_QUALIFIER__ unsigned int meta_group_size() {
return (ParentCGTy::size() + tileSize - 1) / tileSize;
}
};
/** \brief Group type - thread_block_tile
 *
 * \details Represents one tile of thread group.
 * @note This type is implemented on Linux, under development
 * on Windows.
 */
template <unsigned int tileSize, class ParentCGTy>
class thread_block_tile_type : public thread_block_tile_base<tileSize>,
public tiled_group,
public parent_group_info<tileSize, ParentCGTy> {
_CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;
protected:
// Construct a tile of numThreads threads; the meta-group information is
// provided statically by parent_group_info<tileSize, ParentCGTy>
__CG_QUALIFIER__ thread_block_tile_type() : tiled_group(numThreads) {
coalesced_info.tiled_info.size = numThreads;
coalesced_info.tiled_info.is_tiled = true;
}
};
// Partial template specialization: the parent group type has been erased to
// void, so the meta-group information is carried at run time instead
template <unsigned int tileSize>
class thread_block_tile_type<tileSize, void> : public thread_block_tile_base<tileSize>,
public tiled_group
{
_CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;
typedef thread_block_tile_base<numThreads> tbtBase;
protected:
// Construct a tile whose meta-group rank/size are supplied by the caller
__CG_QUALIFIER__ thread_block_tile_type(unsigned int meta_group_rank, unsigned int meta_group_size)
: tiled_group(numThreads) {
coalesced_info.tiled_info.size = numThreads;
coalesced_info.tiled_info.is_tiled = true;
coalesced_info.tiled_info.meta_group_rank = meta_group_rank;
coalesced_info.tiled_info.meta_group_size = meta_group_size;
}
public:
using tbtBase::size;
using tbtBase::sync;
using tbtBase::thread_rank;
// Rank of this tile among the tiles partitioned from the parent group
__CG_QUALIFIER__ unsigned int meta_group_rank() const {
return coalesced_info.tiled_info.meta_group_rank;
}
// Number of tiles created when the parent group was partitioned
__CG_QUALIFIER__ unsigned int meta_group_size() const {
return coalesced_info.tiled_info.meta_group_size;
}
// end of operative group
/**
 * @}
 */
};
/** \brief User exposed API to partition groups.
 *
 * \details A collective operation that partitions the parent group into a one-dimensional,
 * row-major, tiling of subgroups.
 */
__CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent, unsigned int tile_size) {
// Dispatch on the dynamic group type; each group class implements its own
// new_tiled_group()
if (parent.cg_type() == internal::cg_tiled_group) {
const tiled_group* cg = static_cast<const tiled_group*>(&parent);
return cg->new_tiled_group(tile_size);
}
else if(parent.cg_type() == internal::cg_coalesced_group) {
const coalesced_group* cg = static_cast<const coalesced_group*>(&parent);
return cg->new_tiled_group(tile_size);
}
else {
// NOTE(review): all remaining group types are cast to thread_block here —
// confirm callers never pass grid/multi-grid groups to this overload
const thread_block* tb = static_cast<const thread_block*>(&parent);
return tb->new_tiled_group(tile_size);
}
}
  8754. // Thread block type overload
  8755. __CG_QUALIFIER__ thread_group tiled_partition(const thread_block& parent, unsigned int tile_size) {
  8756. return (parent.new_tiled_group(tile_size));
  8757. }
  8758. __CG_QUALIFIER__ tiled_group tiled_partition(const tiled_group& parent, unsigned int tile_size) {
  8759. return (parent.new_tiled_group(tile_size));
  8760. }
  8761. // If a coalesced group is passed to be partitioned, it should remain coalesced
  8762. __CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tile_size) {
  8763. return (parent.new_tiled_group(tile_size));
  8764. }
template <unsigned int size, class ParentCGTy> class thread_block_tile;
namespace impl {
template <unsigned int size, class ParentCGTy> class thread_block_tile_internal;
// Implementation base shared by the parent-typed and type-erased (void)
// variants of thread_block_tile
template <unsigned int size, class ParentCGTy>
class thread_block_tile_internal : public thread_block_tile_type<size, ParentCGTy> {
protected:
// Copy-convert from a tile with a different parent type, carrying over the
// runtime meta-group information
template <unsigned int tbtSize, class tbtParentT>
__CG_QUALIFIER__ thread_block_tile_internal(
const thread_block_tile_internal<tbtSize, tbtParentT>& g)
: thread_block_tile_type<size, ParentCGTy>(g.meta_group_rank(), g.meta_group_size()) {}
// Construct from a thread_block parent (meta-group info comes from the
// static parent_group_info path)
__CG_QUALIFIER__ thread_block_tile_internal(const thread_block& g)
: thread_block_tile_type<size, ParentCGTy>() {}
};
} // namespace impl
template <unsigned int size, class ParentCGTy>
class thread_block_tile : public impl::thread_block_tile_internal<size, ParentCGTy> {
protected:
// Constructed only through tiled_partition<size>(parent)
__CG_QUALIFIER__ thread_block_tile(const ParentCGTy& g)
: impl::thread_block_tile_internal<size, ParentCGTy>(g) {}
public:
// Implicit conversion to the type-erased (parent-less) tile type
__CG_QUALIFIER__ operator thread_block_tile<size, void>() const {
return thread_block_tile<size, void>(*this);
}
};
// Type-erased specialization: drops the parent group type so tiles from
// different parents share one type
template <unsigned int size>
class thread_block_tile<size, void> : public impl::thread_block_tile_internal<size, void> {
template <unsigned int, class ParentCGTy> friend class thread_block_tile;
protected:
public:
// Converting constructor from a parent-typed tile of the same size
template <class ParentCGTy>
__CG_QUALIFIER__ thread_block_tile(const thread_block_tile<size, ParentCGTy>& g)
: impl::thread_block_tile_internal<size, void>(g) {}
};
template <unsigned int size, class ParentCGTy = void> class thread_block_tile;
namespace impl {
template <unsigned int size, class ParentCGTy> struct tiled_partition_internal;
// Specialization for partitioning a thread_block parent
template <unsigned int size>
struct tiled_partition_internal<size, thread_block> : public thread_block_tile<size, thread_block> {
__CG_QUALIFIER__ tiled_partition_internal(const thread_block& g)
: thread_block_tile<size, thread_block>(g) {}
};
} // namespace impl
/** \brief User exposed API to partition groups.
 *
 * \details This constructs a templated class derived from thread_group.
 * The template defines tile size of the new thread group at compile time.
 */
template <unsigned int size, class ParentCGTy>
__CG_QUALIFIER__ thread_block_tile<size, ParentCGTy> tiled_partition(const ParentCGTy& g) {
static_assert(is_valid_tile_size<size>::value,
"Tiled partition with size > wavefront size. Currently not supported ");
return impl::tiled_partition_internal<size, ParentCGTy>(g);
}
  8818. } // namespace cooperative_groups
  8819. #endif // __cplusplus
  8820. #endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
  8821. /*
  8822. Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
  8823. Permission is hereby granted, free of charge, to any person obtaining a copy
  8824. of this software and associated documentation files (the "Software"), to deal
  8825. in the Software without restriction, including without limitation the rights
  8826. to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  8827. copies of the Software, and to permit persons to whom the Software is
  8828. furnished to do so, subject to the following conditions:
  8829. The above copyright notice and this permission notice shall be included in
  8830. all copies or substantial portions of the Software.
  8831. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  8832. IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  8833. FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  8834. AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  8835. LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  8836. OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  8837. THE SOFTWARE.
  8838. */
  8839. #pragma once
  8840. #ifdef __cplusplus
/**
 * @brief Unsafe floating point rmw atomic add.
 *
 * Performs a relaxed read-modify-write floating point atomic add with
 * device memory scope. Original value at \p addr is returned and
 * the value of \p addr is updated to have the original value plus \p value
 *
 * @note This operation currently only performs different operations for
 * the gfx90a target. Other devices continue to use safe atomics.
 *
 * It can be used to generate code that uses fast hardware floating point atomic
 * operations which may handle rounding and subnormal values differently than
 * non-atomic floating point operations.
 *
 * The operation is not always safe and can have undefined behavior unless
 * following condition are met:
 *
 * - \p addr is at least 4 bytes aligned
 * - If \p addr is a global segment address, it is in a coarse grain allocation.
 * Passing in global segment addresses in fine grain allocations will result in
 * undefined behavior and is not supported.
 *
 * @param [in,out] addr Pointer to value to be incremented by \p value.
 * @param [in] value Value by which \p addr is to be incremented.
 * @return Original value contained in \p addr.
 */
__device__ inline float unsafeAtomicAdd(float* addr, float value) {
#if defined(__gfx90a__) && \
__has_builtin(__builtin_amdgcn_is_shared) && \
__has_builtin(__builtin_amdgcn_is_private) && \
__has_builtin(__builtin_amdgcn_ds_atomic_fadd_f32) && \
__has_builtin(__builtin_amdgcn_global_atomic_fadd_f32)
// gfx90a: choose the atomic flavor based on which address segment the
// generic pointer refers to
if (__builtin_amdgcn_is_shared(
(const __attribute__((address_space(0))) void*)addr))
return __builtin_amdgcn_ds_atomic_fadd_f32(addr, value);
else if (__builtin_amdgcn_is_private(
(const __attribute__((address_space(0))) void*)addr)) {
// Private (scratch) memory is per-thread, so a plain read-modify-write
// suffices
float temp = *addr;
*addr = temp + value;
return temp;
}
else
return __builtin_amdgcn_global_atomic_fadd_f32(addr, value);
#elif __has_builtin(__hip_atomic_fetch_add)
// Other targets: safe agent-scope relaxed atomic add
return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else
return __atomic_fetch_add(addr, value, __ATOMIC_RELAXED);
#endif
}
/**
 * @brief Unsafe floating point rmw atomic max.
 *
 * Performs a relaxed read-modify-write floating point atomic max with
 * device memory scope. The original value at \p addr is returned and
 * the value at \p addr is replaced by \p val if greater.
 *
 * @note This operation is currently identical to that performed by
 * atomicMax and is included for completeness.
 *
 * @param [in,out] addr Pointer to value to be updated
 * @param [in] val Value used to update the value at \p addr.
 * @return Original value contained in \p addr.
 */
__device__ inline float unsafeAtomicMax(float* addr, float val) {
#if __has_builtin(__hip_atomic_load) && \
__has_builtin(__hip_atomic_compare_exchange_strong)
// CAS loop: retry while the stored value is still smaller than val.
// On CAS failure `value` is refreshed with the current contents of *addr.
float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
bool done = false;
while (!done && value < val) {
done = __hip_atomic_compare_exchange_strong(addr, &value, val,
__ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}
return value;
#else
// Fallback: run the CAS on the float's bit pattern through an unsigned int
unsigned int *uaddr = (unsigned int *)addr;
unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
bool done = false;
while (!done && __uint_as_float(value) < val) {
done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false,
__ATOMIC_RELAXED, __ATOMIC_RELAXED);
}
return __uint_as_float(value);
#endif
}
  8925. /**
  8926. * @brief Unsafe floating point rmw atomic min.
  8927. *
  8928. * Performs a relaxed read-modify-write floating point atomic min with
  8929. * device memory scope. The original value at \p addr is returned and
  8930. * the value at \p addr is replaced by \p val if lesser.
  8931. *
  8932. * @note This operation is currently identical to that performed by
  8933. * atomicMin and is included for completeness.
  8934. *
  8935. * @param [in,out] addr Pointer to value to be updated
  8936. * @param [in] val Value used to update the value at \p addr.
  8937. * @return Original value contained in \p addr.
  8938. */
  8939. __device__ inline float unsafeAtomicMin(float* addr, float val) {
  8940. #if __has_builtin(__hip_atomic_load) && \
  8941. __has_builtin(__hip_atomic_compare_exchange_strong)
  8942. float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  8943. bool done = false;
  8944. while (!done && value > val) {
  8945. done = __hip_atomic_compare_exchange_strong(addr, &value, val,
  8946. __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  8947. }
  8948. return value;
  8949. #else
  8950. unsigned int *uaddr = (unsigned int *)addr;
  8951. unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  8952. bool done = false;
  8953. while (!done && __uint_as_float(value) > val) {
  8954. done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false,
  8955. __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  8956. }
  8957. return __uint_as_float(value);
  8958. #endif
  8959. }
  8960. /**
  8961. * @brief Unsafe double precision rmw atomic add.
  8962. *
  8963. * Performs a relaxed read-modify-write double precision atomic add with
  8964. * device memory scope. Original value at \p addr is returned and
  8965. * the value of \p addr is updated to have the original value plus \p value
  8966. *
  8967. * @note This operation currently only performs different operations for
  8968. * the gfx90a target. Other devices continue to use safe atomics.
  8969. *
  8970. * It can be used to generate code that uses fast hardware floating point atomic
  8971. * operations which may handle rounding and subnormal values differently than
  8972. * non-atomic floating point operations.
  8973. *
  8974. * The operation is not always safe and can have undefined behavior unless
* the following conditions are met:
  8976. *
  8977. * - \p addr is at least 8 byte aligned
  8978. * - If \p addr is a global segment address, it is in a coarse grain allocation.
  8979. * Passing in global segment addresses in fine grain allocations will result in
  8980. * undefined behavior and are not supported.
  8981. *
  8982. * @param [in,out] addr Pointer to value to be updated.
* @param [in] value Value by which \p addr is to be incremented.
  8984. * @return Original value contained in \p addr.
  8985. */
  8986. __device__ inline double unsafeAtomicAdd(double* addr, double value) {
  8987. #if defined(__gfx90a__) && __has_builtin(__builtin_amdgcn_flat_atomic_fadd_f64)
  8988. return __builtin_amdgcn_flat_atomic_fadd_f64(addr, value);
  8989. #elif defined (__hip_atomic_fetch_add)
  8990. return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  8991. #else
  8992. return __atomic_fetch_add(addr, value, __ATOMIC_RELAXED);
  8993. #endif
  8994. }
  8995. /**
  8996. * @brief Unsafe double precision rmw atomic max.
  8997. *
  8998. * Performs a relaxed read-modify-write double precision atomic max with
  8999. * device memory scope. Original value at \p addr is returned and
  9000. * the value of \p addr is updated with \p val if greater.
  9001. *
  9002. * @note This operation currently only performs different operations for
* the gfx90a, gfx940, gfx941 and gfx942 targets. Other devices continue to use safe atomics.
  9004. *
  9005. * It can be used to generate code that uses fast hardware floating point atomic
  9006. * operations which may handle rounding and subnormal values differently than
  9007. * non-atomic floating point operations.
  9008. *
  9009. * The operation is not always safe and can have undefined behavior unless
* the following conditions are met:
  9011. *
  9012. * - \p addr is at least 8 byte aligned
  9013. * - If \p addr is a global segment address, it is in a coarse grain allocation.
  9014. * Passing in global segment addresses in fine grain allocations will result in
  9015. * undefined behavior and are not supported.
  9016. *
  9017. * @param [in,out] addr Pointer to value to be updated.
* @param [in] val Value used to update the contents at \p addr
  9019. * @return Original value contained at \p addr.
  9020. */
__device__ inline double unsafeAtomicMax(double* addr, double val) {
#if (defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) && \
    __has_builtin(__builtin_amdgcn_flat_atomic_fmax_f64)
  // Targets with a hardware FP64 flat atomic max: emit the builtin directly.
  return __builtin_amdgcn_flat_atomic_fmax_f64(addr, val);
#else
#if __has_builtin(__hip_atomic_load) && \
    __has_builtin(__hip_atomic_compare_exchange_strong)
  // Relaxed agent-scope CAS loop; on failure the CAS refreshes `value` with
  // the current contents, so the `value < val` test is re-evaluated each pass.
  double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  bool done = false;
  while (!done && value < val) {
    done = __hip_atomic_compare_exchange_strong(addr, &value, val,
        __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  }
  return value;
#else
  // Fallback: CAS on the 64-bit bit pattern via the GNU atomic builtins,
  // reinterpreting for the double comparison.
  unsigned long long *uaddr = (unsigned long long *)addr;
  unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  bool done = false;
  while (!done && __longlong_as_double(value) < val) {
    done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
        __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  }
  return __longlong_as_double(value);
#endif
#endif
}
  9047. /**
  9048. * @brief Unsafe double precision rmw atomic min.
  9049. *
  9050. * Performs a relaxed read-modify-write double precision atomic min with
  9051. * device memory scope. Original value at \p addr is returned and
  9052. * the value of \p addr is updated with \p val if lesser.
  9053. *
  9054. * @note This operation currently only performs different operations for
* the gfx90a, gfx940, gfx941 and gfx942 targets. Other devices continue to use safe atomics.
  9056. *
  9057. * It can be used to generate code that uses fast hardware floating point atomic
  9058. * operations which may handle rounding and subnormal values differently than
  9059. * non-atomic floating point operations.
  9060. *
  9061. * The operation is not always safe and can have undefined behavior unless
* the following conditions are met:
  9063. *
  9064. * - \p addr is at least 8 byte aligned
  9065. * - If \p addr is a global segment address, it is in a coarse grain allocation.
  9066. * Passing in global segment addresses in fine grain allocations will result in
  9067. * undefined behavior and are not supported.
  9068. *
  9069. * @param [in,out] addr Pointer to value to be updated.
* @param [in] val Value used to update the contents at \p addr
  9071. * @return Original value contained at \p addr.
  9072. */
__device__ inline double unsafeAtomicMin(double* addr, double val) {
#if (defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) && \
    __has_builtin(__builtin_amdgcn_flat_atomic_fmin_f64)
  // Targets with a hardware FP64 flat atomic min: emit the builtin directly.
  return __builtin_amdgcn_flat_atomic_fmin_f64(addr, val);
#else
#if __has_builtin(__hip_atomic_load) && \
    __has_builtin(__hip_atomic_compare_exchange_strong)
  // Relaxed agent-scope CAS loop; on failure the CAS refreshes `value` with
  // the current contents, so the `value > val` test is re-evaluated each pass.
  double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  bool done = false;
  while (!done && value > val) {
    done = __hip_atomic_compare_exchange_strong(addr, &value, val,
        __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  }
  return value;
#else
  // Fallback: CAS on the 64-bit bit pattern via the GNU atomic builtins,
  // reinterpreting for the double comparison.
  unsigned long long *uaddr = (unsigned long long *)addr;
  unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  bool done = false;
  while (!done && __longlong_as_double(value) > val) {
    done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
        __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  }
  return __longlong_as_double(value);
#endif
#endif
}
  9099. /**
  9100. * @brief Safe floating point rmw atomic add.
  9101. *
  9102. * Performs a relaxed read-modify-write floating point atomic add with
  9103. * device memory scope. Original value at \p addr is returned and
  9104. * the value of \p addr is updated to have the original value plus \p value
  9105. *
  9106. * @note This operation ensures that, on all targets, we produce safe atomics.
  9107. * This will be the case even when -munsafe-fp-atomics is passed into the compiler.
  9108. *
* @param [in,out] addr Pointer to value to be incremented by \p value.
* @param [in] value Value by which \p addr is to be incremented.
  9111. * @return Original value contained in \p addr.
  9112. */
__device__ inline float safeAtomicAdd(float* addr, float value) {
#if defined(__gfx908__) || defined(__gfx941__) \
    || ((defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx942__)) \
        && !__has_builtin(__hip_atomic_fetch_add))
  // On gfx908, we can generate unsafe FP32 atomic add that does not follow all
  // IEEE rules when -munsafe-fp-atomics is passed. Do a CAS loop emulation instead.
  // On gfx941, we can generate unsafe FP32 atomic add that may not always happen atomically,
  // so we need to force a CAS loop emulation to ensure safety.
  // On gfx90a, gfx940 and gfx942 if we do not have the __hip_atomic_fetch_add builtin, we
  // need to force a CAS loop here.
  float old_val;
#if __has_builtin(__hip_atomic_load)
  old_val = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else // !__has_builtin(__hip_atomic_load)
  old_val = __uint_as_float(__atomic_load_n(reinterpret_cast<unsigned int*>(addr), __ATOMIC_RELAXED));
#endif // __has_builtin(__hip_atomic_load)
  float expected, temp;
  do {
    temp = expected = old_val;
#if __has_builtin(__hip_atomic_compare_exchange_strong)
    __hip_atomic_compare_exchange_strong(addr, &expected, old_val + value, __ATOMIC_RELAXED,
                                         __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else // !__has_builtin(__hip_atomic_compare_exchange_strong)
    __atomic_compare_exchange_n(addr, &expected, old_val + value, false,
                                __ATOMIC_RELAXED, __ATOMIC_RELAXED);
#endif // __has_builtin(__hip_atomic_compare_exchange_strong)
    old_val = expected;
    // Compare bit patterns rather than float values — presumably so the loop
    // also terminates when the stored value is NaN (NaN != NaN as floats).
  } while (__float_as_uint(temp) != __float_as_uint(old_val));
  return old_val;
#elif defined(__gfx90a__)
  // On gfx90a, with the __hip_atomic_fetch_add builtin, relaxed system-scope
  // atomics will produce safe CAS loops, but are otherwise not different than
  // agent-scope atomics. This logic is only applicable for gfx90a, and should
  // not be assumed on other architectures.
  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#elif __has_builtin(__hip_atomic_fetch_add)
  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else
  return __atomic_fetch_add(addr, value, __ATOMIC_RELAXED);
#endif
}
  9154. /**
  9155. * @brief Safe floating point rmw atomic max.
  9156. *
  9157. * Performs a relaxed read-modify-write floating point atomic max with
  9158. * device memory scope. The original value at \p addr is returned and
  9159. * the value at \p addr is replaced by \p val if greater.
  9160. *
  9161. * @note This operation ensures that, on all targets, we produce safe atomics.
  9162. * This will be the case even when -munsafe-fp-atomics is passed into the compiler.
  9163. *
  9164. * @param [in,out] addr Pointer to value to be updated
  9165. * @param [in] val Value used to update the value at \p addr.
  9166. * @return Original value contained in \p addr.
  9167. */
  9168. __device__ inline float safeAtomicMax(float* addr, float val) {
  9169. #if __has_builtin(__hip_atomic_load) && \
  9170. __has_builtin(__hip_atomic_compare_exchange_strong)
  9171. float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  9172. bool done = false;
  9173. while (!done && value < val) {
  9174. done = __hip_atomic_compare_exchange_strong(addr, &value, val,
  9175. __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  9176. }
  9177. return value;
  9178. #else
  9179. unsigned int *uaddr = (unsigned int *)addr;
  9180. unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  9181. bool done = false;
  9182. while (!done && __uint_as_float(value) < val) {
  9183. done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false,
  9184. __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  9185. }
  9186. return __uint_as_float(value);
  9187. #endif
  9188. }
  9189. /**
  9190. * @brief Safe floating point rmw atomic min.
  9191. *
  9192. * Performs a relaxed read-modify-write floating point atomic min with
  9193. * device memory scope. The original value at \p addr is returned and
  9194. * the value at \p addr is replaced by \p val if lesser.
  9195. *
  9196. * @note This operation ensures that, on all targets, we produce safe atomics.
  9197. * This will be the case even when -munsafe-fp-atomics is passed into the compiler.
  9198. *
  9199. * @param [in,out] addr Pointer to value to be updated
  9200. * @param [in] val Value used to update the value at \p addr.
  9201. * @return Original value contained in \p addr.
  9202. */
  9203. __device__ inline float safeAtomicMin(float* addr, float val) {
  9204. #if __has_builtin(__hip_atomic_load) && \
  9205. __has_builtin(__hip_atomic_compare_exchange_strong)
  9206. float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  9207. bool done = false;
  9208. while (!done && value > val) {
  9209. done = __hip_atomic_compare_exchange_strong(addr, &value, val,
  9210. __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  9211. }
  9212. return value;
  9213. #else
  9214. unsigned int *uaddr = (unsigned int *)addr;
  9215. unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  9216. bool done = false;
  9217. while (!done && __uint_as_float(value) > val) {
  9218. done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false,
  9219. __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  9220. }
  9221. return __uint_as_float(value);
  9222. #endif
  9223. }
  9224. /**
  9225. * @brief Safe double precision rmw atomic add.
  9226. *
  9227. * Performs a relaxed read-modify-write double precision atomic add with
  9228. * device memory scope. Original value at \p addr is returned and
  9229. * the value of \p addr is updated to have the original value plus \p value
  9230. *
  9231. * @note This operation ensures that, on all targets, we produce safe atomics.
  9232. * This will be the case even when -munsafe-fp-atomics is passed into the compiler.
  9233. *
* @param [in,out] addr Pointer to value to be incremented by \p value.
* @param [in] value Value by which \p addr is to be incremented.
  9236. * @return Original value contained in \p addr.
  9237. */
__device__ inline double safeAtomicAdd(double* addr, double value) {
#if defined(__gfx90a__) && __has_builtin(__hip_atomic_fetch_add)
  // On gfx90a, with the __hip_atomic_fetch_add builtin, relaxed system-scope
  // atomics will produce safe CAS loops, but are otherwise not different than
  // agent-scope atomics. This logic is only applicable for gfx90a, and should
  // not be assumed on other architectures.
  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#elif defined(__gfx90a__)
  // On gfx90a, if we do not have the __hip_atomic_fetch_add builtin, we need to
  // force a CAS loop here.
  double old_val;
#if __has_builtin(__hip_atomic_load)
  old_val = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else // !__has_builtin(__hip_atomic_load)
  old_val = __longlong_as_double(__atomic_load_n(reinterpret_cast<unsigned long long*>(addr), __ATOMIC_RELAXED));
#endif // __has_builtin(__hip_atomic_load)
  double expected, temp;
  do {
    temp = expected = old_val;
#if __has_builtin(__hip_atomic_compare_exchange_strong)
    __hip_atomic_compare_exchange_strong(addr, &expected, old_val + value, __ATOMIC_RELAXED,
                                         __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else // !__has_builtin(__hip_atomic_compare_exchange_strong)
    __atomic_compare_exchange_n(addr, &expected, old_val + value, false,
                                __ATOMIC_RELAXED, __ATOMIC_RELAXED);
#endif // __has_builtin(__hip_atomic_compare_exchange_strong)
    old_val = expected;
    // Compare bit patterns rather than double values — presumably so the loop
    // also terminates when the stored value is NaN (NaN != NaN as doubles).
  } while (__double_as_longlong(temp) != __double_as_longlong(old_val));
  return old_val;
#else // !defined(__gfx90a__)
#if __has_builtin(__hip_atomic_fetch_add)
  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else // !__has_builtin(__hip_atomic_fetch_add)
  return __atomic_fetch_add(addr, value, __ATOMIC_RELAXED);
#endif // __has_builtin(__hip_atomic_fetch_add)
#endif
}
  9275. /**
  9276. * @brief Safe double precision rmw atomic max.
  9277. *
  9278. * Performs a relaxed read-modify-write double precision atomic max with
  9279. * device memory scope. Original value at \p addr is returned and
  9280. * the value of \p addr is updated with \p val if greater.
  9281. *
  9282. * @note This operation ensures that, on all targets, we produce safe atomics.
  9283. * This will be the case even when -munsafe-fp-atomics is passed into the compiler.
  9284. *
  9285. * @param [in,out] addr Pointer to value to be updated.
* @param [in] val Value used to update the contents at \p addr
  9287. * @return Original value contained at \p addr.
  9288. */
__device__ inline double safeAtomicMax(double* addr, double val) {
#if __has_builtin(__builtin_amdgcn_is_private)
  // Private (scratch) addresses are updated with a plain read-modify-write;
  // no atomic is used on this path.
  if (__builtin_amdgcn_is_private(
       (const __attribute__((address_space(0))) void*)addr)) {
    double old = *addr;
    *addr = __builtin_fmax(old, val);
    return old;
  } else {
  // NOTE: this `else` brace is closed by the matching #if block at the end of
  // the function; everything below is the non-private path.
#endif
#if __has_builtin(__hip_atomic_load) && \
    __has_builtin(__hip_atomic_compare_exchange_strong)
  // Relaxed agent-scope CAS loop; a failed CAS refreshes `value`.
  double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  bool done = false;
  while (!done && value < val) {
    done = __hip_atomic_compare_exchange_strong(addr, &value, val,
        __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  }
  return value;
#else
  // Fallback: CAS on the 64-bit bit pattern via the GNU atomic builtins.
  unsigned long long *uaddr = (unsigned long long *)addr;
  unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  bool done = false;
  while (!done && __longlong_as_double(value) < val) {
    done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
        __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  }
  return __longlong_as_double(value);
#endif
#if __has_builtin(__builtin_amdgcn_is_private)
  }
#endif
}
  9321. /**
  9322. * @brief Safe double precision rmw atomic min.
  9323. *
  9324. * Performs a relaxed read-modify-write double precision atomic min with
  9325. * device memory scope. Original value at \p addr is returned and
  9326. * the value of \p addr is updated with \p val if lesser.
  9327. *
  9328. * @note This operation ensures that, on all targets, we produce safe atomics.
  9329. * This will be the case even when -munsafe-fp-atomics is passed into the compiler.
  9330. *
  9331. * @param [in,out] addr Pointer to value to be updated.
* @param [in] val Value used to update the contents at \p addr
  9333. * @return Original value contained at \p addr.
  9334. */
__device__ inline double safeAtomicMin(double* addr, double val) {
#if __has_builtin(__builtin_amdgcn_is_private)
  // Private (scratch) addresses are updated with a plain read-modify-write;
  // no atomic is used on this path.
  if (__builtin_amdgcn_is_private(
       (const __attribute__((address_space(0))) void*)addr)) {
    double old = *addr;
    *addr = __builtin_fmin(old, val);
    return old;
  } else {
  // NOTE: this `else` brace is closed by the matching #if block at the end of
  // the function; everything below is the non-private path.
#endif
#if __has_builtin(__hip_atomic_load) && \
    __has_builtin(__hip_atomic_compare_exchange_strong)
  // Relaxed agent-scope CAS loop; a failed CAS refreshes `value`.
  double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  bool done = false;
  while (!done && value > val) {
    done = __hip_atomic_compare_exchange_strong(addr, &value, val,
        __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  }
  return value;
#else
  // Fallback: CAS on the 64-bit bit pattern via the GNU atomic builtins.
  unsigned long long *uaddr = (unsigned long long *)addr;
  unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  bool done = false;
  while (!done && __longlong_as_double(value) > val) {
    done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
        __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  }
  return __longlong_as_double(value);
#endif
#if __has_builtin(__builtin_amdgcn_is_private)
  }
#endif
}
  9367. #endif
  9368. /*
  9369. Copyright (c) 2015 - Present Advanced Micro Devices, Inc. All rights reserved.
  9370. Permission is hereby granted, free of charge, to any person obtaining a copy
  9371. of this software and associated documentation files (the "Software"), to deal
  9372. in the Software without restriction, including without limitation the rights
  9373. to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9374. copies of the Software, and to permit persons to whom the Software is
  9375. furnished to do so, subject to the following conditions:
  9376. The above copyright notice and this permission notice shall be included in
  9377. all copies or substantial portions of the Software.
  9378. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  9379. IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  9380. FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  9381. AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  9382. LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  9383. OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  9384. THE SOFTWARE.
  9385. */
  9386. #pragma once
  9387. #if !defined(__HIPCC_RTC__)
  9388. #include "amd_device_functions.h"
  9389. #endif
  9390. #if __has_builtin(__hip_atomic_compare_exchange_strong)
// Minimal compile-time type selector, equivalent to std::conditional —
// defined locally, presumably to avoid a <type_traits> dependency under
// hipRTC (TODO confirm).
template<bool B, typename T, typename F> struct Cond_t;
template<typename T, typename F> struct Cond_t<true, T, F> { using type = T; };
template<typename T, typename F> struct Cond_t<false, T, F> { using type = F; };
  9394. #if !__HIP_DEVICE_COMPILE__
  9395. //TODO: Remove this after compiler pre-defines the following Macros.
  9396. #define __HIP_MEMORY_SCOPE_SINGLETHREAD 1
  9397. #define __HIP_MEMORY_SCOPE_WAVEFRONT 2
  9398. #define __HIP_MEMORY_SCOPE_WORKGROUP 3
  9399. #define __HIP_MEMORY_SCOPE_AGENT 4
  9400. #define __HIP_MEMORY_SCOPE_SYSTEM 5
  9401. #endif
  9402. #if !defined(__HIPCC_RTC__)
  9403. #include "amd_hip_unsafe_atomics.h"
  9404. #endif
  9405. // Atomic expanders
// Expands an arbitrary read-modify-write operation `op` into a CAS loop over
// the integer image of T (unsigned int for 4-byte T, unsigned long long
// otherwise). If `p` is a shared (LDS) address, the fallback `f` is invoked
// instead. Returns the value observed immediately before the winning update.
template<
  int mem_order = __ATOMIC_SEQ_CST,
  int mem_scope= __HIP_MEMORY_SCOPE_SYSTEM,
  typename T,
  typename Op,
  typename F>
inline
__attribute__((always_inline, device))
T hip_cas_expander(T* p, T x, Op op, F f) noexcept
{
  using FP = __attribute__((address_space(0))) const void*;
  // Workaround: bind an extern declaration directly to the
  // llvm.amdgcn.is.shared intrinsic to test for an LDS address.
  __device__
  extern bool is_shared_workaround(FP) asm("llvm.amdgcn.is.shared");
  if (is_shared_workaround((FP)p))
    return f();
  using U = typename Cond_t<
    sizeof(T) == sizeof(unsigned int), unsigned int, unsigned long long>::type;
  auto q = reinterpret_cast<U*>(p);
  // Load / apply-op / try-exchange loop; a failed CAS refreshes tmp0 with the
  // current contents, so `op` is re-applied to fresh data each iteration.
  U tmp0{__hip_atomic_load(q, mem_order, mem_scope)};
  U tmp1;
  do {
    tmp1 = tmp0;
    op(reinterpret_cast<T&>(tmp1), x);
  } while (!__hip_atomic_compare_exchange_strong(q, &tmp0, tmp1, mem_order,
                                                 mem_order, mem_scope));
  return reinterpret_cast<const T&>(tmp0);
}
// Like hip_cas_expander, but specialized for extrema: the exchange is only
// attempted while cmp(x, current) holds (e.g. "x greater than current" for a
// max). If `p` is a shared (LDS) address, the fallback `f` is invoked instead.
// Returns the last observed value.
template<
  int mem_order = __ATOMIC_SEQ_CST,
  int mem_scope= __HIP_MEMORY_SCOPE_SYSTEM,
  typename T,
  typename Cmp,
  typename F>
inline
__attribute__((always_inline, device))
T hip_cas_extrema_expander(T* p, T x, Cmp cmp, F f) noexcept
{
  using FP = __attribute__((address_space(0))) const void*;
  // Workaround: bind an extern declaration directly to the
  // llvm.amdgcn.is.shared intrinsic to test for an LDS address.
  __device__
  extern bool is_shared_workaround(FP) asm("llvm.amdgcn.is.shared");
  if (is_shared_workaround((FP)p))
    return f();
  using U = typename Cond_t<
    sizeof(T) == sizeof(unsigned int), unsigned int, unsigned long long>::type;
  auto q = reinterpret_cast<U*>(p);
  // A failed CAS refreshes tmp, so cmp is re-evaluated against fresh data.
  U tmp{__hip_atomic_load(q, mem_order, mem_scope)};
  while (cmp(x, reinterpret_cast<const T&>(tmp)) &&
         !__hip_atomic_compare_exchange_strong(q, &tmp, x, mem_order, mem_order,
                                               mem_scope));
  return reinterpret_cast<const T&>(tmp);
}
// atomicCAS overload family: relaxed compare-and-swap. The __hip builtin
// writes the observed value back into `compare`, which is then returned —
// matching the CUDA-style atomicCAS contract of returning the original value
// at `address`. Plain overloads use agent (device) scope; the `_system`
// variants use system scope.
__device__
inline
int atomicCAS(int* address, int compare, int val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                       __HIP_MEMORY_SCOPE_AGENT);
  return compare;
}
__device__
inline
int atomicCAS_system(int* address, int compare, int val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                       __HIP_MEMORY_SCOPE_SYSTEM);
  return compare;
}
__device__
inline
unsigned int atomicCAS(unsigned int* address, unsigned int compare, unsigned int val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                       __HIP_MEMORY_SCOPE_AGENT);
  return compare;
}
__device__
inline
unsigned int atomicCAS_system(unsigned int* address, unsigned int compare, unsigned int val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                       __HIP_MEMORY_SCOPE_SYSTEM);
  return compare;
}
__device__
inline
unsigned long atomicCAS(unsigned long* address, unsigned long compare, unsigned long val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                       __HIP_MEMORY_SCOPE_AGENT);
  return compare;
}
__device__
inline
unsigned long atomicCAS_system(unsigned long* address, unsigned long compare, unsigned long val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                       __HIP_MEMORY_SCOPE_SYSTEM);
  return compare;
}
__device__
inline
unsigned long long atomicCAS(unsigned long long* address, unsigned long long compare,
                             unsigned long long val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                       __HIP_MEMORY_SCOPE_AGENT);
  return compare;
}
__device__
inline
unsigned long long atomicCAS_system(unsigned long long* address, unsigned long long compare,
                                    unsigned long long val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                       __HIP_MEMORY_SCOPE_SYSTEM);
  return compare;
}
// Floating-point CAS overloads: the builtin is applied directly to the
// float/double object.
__device__
inline
float atomicCAS(float* address, float compare, float val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                       __HIP_MEMORY_SCOPE_AGENT);
  return compare;
}
__device__
inline
float atomicCAS_system(float* address, float compare, float val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                       __HIP_MEMORY_SCOPE_SYSTEM);
  return compare;
}
__device__
inline
double atomicCAS(double* address, double compare, double val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                       __HIP_MEMORY_SCOPE_AGENT);
  return compare;
}
__device__
inline
double atomicCAS_system(double* address, double compare, double val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                       __HIP_MEMORY_SCOPE_SYSTEM);
  return compare;
}
// atomicAdd integer overload family: relaxed fetch-and-add returning the
// original value. Plain overloads use agent (device) scope; `_system`
// variants use system scope.
__device__
inline
int atomicAdd(int* address, int val) {
  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}
__device__
inline
int atomicAdd_system(int* address, int val) {
  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}
__device__
inline
unsigned int atomicAdd(unsigned int* address, unsigned int val) {
  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}
__device__
inline
unsigned int atomicAdd_system(unsigned int* address, unsigned int val) {
  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}
__device__
inline
unsigned long atomicAdd(unsigned long* address, unsigned long val) {
  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}
__device__
inline
unsigned long atomicAdd_system(unsigned long* address, unsigned long val) {
  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}
__device__
inline
unsigned long long atomicAdd(unsigned long long* address, unsigned long long val) {
  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}
__device__
inline
unsigned long long atomicAdd_system(unsigned long long* address, unsigned long long val) {
  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}
// FP32 atomicAdd: when the compiler signals -munsafe-fp-atomics via
// __AMDGCN_UNSAFE_FP_ATOMICS__, defer to unsafeAtomicAdd (hardware FP
// atomics where available); otherwise use the safe relaxed agent-scope
// fetch-and-add.
__device__
inline
float atomicAdd(float* address, float val) {
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
  return unsafeAtomicAdd(address, val);
#else
  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif
}
// System-scope variant: always the safe fetch-and-add path.
__device__
inline
float atomicAdd_system(float* address, float val) {
  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}
  9597. #if !defined(__HIPCC_RTC__)
  9598. DEPRECATED("use atomicAdd instead")
  9599. #endif // !defined(__HIPCC_RTC__)
  9600. __device__
  9601. inline
  9602. void atomicAddNoRet(float* address, float val)
  9603. {
  9604. __ockl_atomic_add_noret_f32(address, val);
  9605. }
  9606. __device__
  9607. inline
  9608. double atomicAdd(double* address, double val) {
  9609. #if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
  9610. return unsafeAtomicAdd(address, val);
  9611. #else
  9612. return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  9613. #endif
  9614. }
  9615. __device__
  9616. inline
  9617. double atomicAdd_system(double* address, double val) {
  9618. return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
  9619. }
// atomicSub — implemented as an atomic fetch-add of the negated value, so it
// atomically performs *address -= val and returns the previous value. For the
// unsigned overloads, -val relies on well-defined modular (two's-complement)
// wraparound of unsigned arithmetic. Floating-point overloads mirror
// atomicAdd, including the unsafe-FP-atomics fast path at agent scope.
9620. __device__
9621. inline
9622. int atomicSub(int* address, int val) {
9623. return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
9624. }
9625. __device__
9626. inline
9627. int atomicSub_system(int* address, int val) {
9628. return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
9629. }
9630. __device__
9631. inline
9632. unsigned int atomicSub(unsigned int* address, unsigned int val) {
9633. return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
9634. }
9635. __device__
9636. inline
9637. unsigned int atomicSub_system(unsigned int* address, unsigned int val) {
9638. return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
9639. }
9640. __device__
9641. inline
9642. unsigned long atomicSub(unsigned long* address, unsigned long val) {
9643. return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
9644. }
9645. __device__
9646. inline
9647. unsigned long atomicSub_system(unsigned long* address, unsigned long val) {
9648. return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
9649. }
9650. __device__
9651. inline
9652. unsigned long long atomicSub(unsigned long long* address, unsigned long long val) {
9653. return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
9654. }
9655. __device__
9656. inline
9657. unsigned long long atomicSub_system(unsigned long long* address, unsigned long long val) {
9658. return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
9659. }
9660. __device__
9661. inline
9662. float atomicSub(float* address, float val) {
9663. #if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
9664. return unsafeAtomicAdd(address, -val);
9665. #else
9666. return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
9667. #endif
9668. }
9669. __device__
9670. inline
9671. float atomicSub_system(float* address, float val) {
9672. return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
9673. }
9674. __device__
9675. inline
9676. double atomicSub(double* address, double val) {
9677. #if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
9678. return unsafeAtomicAdd(address, -val);
9679. #else
9680. return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
9681. #endif
9682. }
9683. __device__
9684. inline
9685. double atomicSub_system(double* address, double val) {
9686. return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
9687. }
// atomicExch — atomically stores val into *address and returns the previous
// value. Agent scope for the plain overloads, system scope for *_system;
// relaxed ordering throughout.
9688. __device__
9689. inline
9690. int atomicExch(int* address, int val) {
9691. return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
9692. }
9693. __device__
9694. inline
9695. int atomicExch_system(int* address, int val) {
9696. return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
9697. }
9698. __device__
9699. inline
9700. unsigned int atomicExch(unsigned int* address, unsigned int val) {
9701. return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
9702. }
9703. __device__
9704. inline
9705. unsigned int atomicExch_system(unsigned int* address, unsigned int val) {
9706. return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
9707. }
9708. __device__
9709. inline
9710. unsigned long atomicExch(unsigned long* address, unsigned long val) {
9711. return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
9712. }
9713. __device__
9714. inline
9715. unsigned long atomicExch_system(unsigned long* address, unsigned long val) {
9716. return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
9717. }
9718. __device__
9719. inline
9720. unsigned long long atomicExch(unsigned long long* address, unsigned long long val) {
9721. return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
9722. }
9723. __device__
9724. inline
9725. unsigned long long atomicExch_system(unsigned long long* address, unsigned long long val) {
9726. return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
9727. }
9728. __device__
9729. inline
9730. float atomicExch(float* address, float val) {
9731. return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
9732. }
9733. __device__
9734. inline
9735. float atomicExch_system(float* address, float val) {
9736. return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
9737. }
9738. __device__
9739. inline
9740. double atomicExch(double* address, double val) {
9741. return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
9742. }
9743. __device__
9744. inline
9745. double atomicExch_system(double* address, double val) {
9746. return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
9747. }
// atomicMin (int / unsigned int) — atomically stores min(*address, val) and
// returns the previous value. On gfx941 the native fetch_min is routed
// through hip_cas_extrema_expander: the comparator decides whether the stored
// value still needs replacing and the trailing lambda issues the native RMW.
// Other targets call __hip_atomic_fetch_min directly.
9748. __device__
9749. inline
9750. int atomicMin(int* address, int val) {
9751. #if defined(__gfx941__)
9752. return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
9753. address, val, [](int x, int y) { return x < y; }, [=]() {
9754. return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED,
9755. __HIP_MEMORY_SCOPE_AGENT);
9756. });
9757. #else
9758. return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
9759. #endif // __gfx941__
9760. }
9761. __device__
9762. inline
9763. int atomicMin_system(int* address, int val) {
9764. #if defined(__gfx941__)
9765. return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
9766. address, val, [](int x, int y) { return x < y; }, [=]() {
9767. return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED,
9768. __HIP_MEMORY_SCOPE_SYSTEM);
9769. });
9770. #else
9771. return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
9772. #endif // __gfx941__
9773. }
9774. __device__
9775. inline
9776. unsigned int atomicMin(unsigned int* address, unsigned int val) {
9777. #if defined(__gfx941__)
9778. return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
9779. address, val, [](unsigned int x, unsigned int y) { return x < y; }, [=]() {
9780. return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED,
9781. __HIP_MEMORY_SCOPE_AGENT);
9782. });
9783. #else
9784. return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
9785. #endif // __gfx941__
9786. }
9787. __device__
9788. inline
9789. unsigned int atomicMin_system(unsigned int* address, unsigned int val) {
9790. #if defined(__gfx941__)
9791. return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
9792. address, val, [](unsigned int x, unsigned int y) { return x < y; }, [=]() {
9793. return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED,
9794. __HIP_MEMORY_SCOPE_SYSTEM);
9795. });
9796. #else
9797. return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
9798. #endif // __gfx941__
9799. }
  9800. __device__
  9801. inline
  9802. unsigned long long atomicMin(unsigned long* address, unsigned long val) {
  9803. #if defined(__gfx941__)
  9804. return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
  9805. address,
  9806. val,
  9807. [](unsigned long x, unsigned long y) { return x < y; },
  9808. [=]() {
  9809. return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED,
  9810. __HIP_MEMORY_SCOPE_AGENT);
  9811. });
  9812. #else
  9813. return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  9814. #endif // __gfx941__
  9815. }
// atomicMin — 64-bit unsigned overloads (unsigned long system scope, plus
// unsigned long long agent and system scope). Each atomically stores
// min(*address, val) and returns the previous value, with the gfx941
// CAS-expander fallback as in the 32-bit overloads.
9816. __device__
9817. inline
9818. unsigned long atomicMin_system(unsigned long* address, unsigned long val) {
9819. #if defined(__gfx941__)
9820. return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
9821. address,
9822. val,
9823. [](unsigned long x, unsigned long y) { return x < y; },
9824. [=]() {
9825. return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED,
9826. __HIP_MEMORY_SCOPE_SYSTEM);
9827. });
9828. #else
9829. return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
9830. #endif // __gfx941__
9831. }
9832. __device__
9833. inline
9834. unsigned long long atomicMin(unsigned long long* address, unsigned long long val) {
9835. #if defined(__gfx941__)
9836. return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
9837. address,
9838. val,
9839. [](unsigned long long x, unsigned long long y) { return x < y; },
9840. [=]() {
9841. return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED,
9842. __HIP_MEMORY_SCOPE_AGENT);
9843. });
9844. #else
9845. return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
9846. #endif // __gfx941__
9847. }
9848. __device__
9849. inline
9850. unsigned long long atomicMin_system(unsigned long long* address, unsigned long long val) {
9851. #if defined(__gfx941__)
9852. return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
9853. address,
9854. val,
9855. [](unsigned long long x, unsigned long long y) { return x < y; },
9856. [=]() {
9857. return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED,
9858. __HIP_MEMORY_SCOPE_SYSTEM);
9859. });
9860. #else
9861. return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
9862. #endif // __gfx941__
9863. }
// atomicMin (long long) — signed 64-bit min; atomically stores
// min(*address, val) and returns the previous value. Same gfx941
// CAS-expander fallback as the other integer overloads.
9864. __device__
9865. inline
9866. long long atomicMin(long long* address, long long val) {
9867. #if defined(__gfx941__)
9868. return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
9869. address, val, [](long long x, long long y) { return x < y; },
9870. [=]() {
9871. return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
9872. });
9873. #else
9874. return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
9875. #endif // __gfx941__
9876. }
9877. __device__
9878. inline
9879. long long atomicMin_system(long long* address, long long val) {
9880. #if defined(__gfx941__)
9881. return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
9882. address, val, [](long long x, long long y) { return x < y; },
9883. [=]() {
9884. return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
9885. });
9886. #else
9887. return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
9888. #endif // __gfx941__
9889. }
// atomicMin (float, agent scope) — no native FP min RMW is assumed; the value
// is lowered via a compare-exchange loop that only attempts the store while
// the current value is still greater than val, and returns the last observed
// value. With unsafe FP atomics enabled it uses the hardware fast path; the
// second fallback reinterprets the float through an unsigned int alias when
// the __hip_atomic_* builtins are unavailable.
9890. __device__
9891. inline
9892. float atomicMin(float* addr, float val) {
9893. #if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
9894. return unsafeAtomicMin(addr, val);
9895. #else
9896. #if __has_builtin(__hip_atomic_load) && \
9897. __has_builtin(__hip_atomic_compare_exchange_strong)
9898. float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
9899. bool done = false;
9900. while (!done && value > val) {
9901. done = __hip_atomic_compare_exchange_strong(addr, &value, val,
9902. __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
9903. }
9904. return value;
9905. #else
9906. unsigned int *uaddr = (unsigned int *)addr;
9907. unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
9908. bool done = false;
9909. while (!done && __uint_as_float(value) > val) {
9910. done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false,
9911. __ATOMIC_RELAXED, __ATOMIC_RELAXED);
9912. }
9913. return __uint_as_float(value);
9914. #endif
9915. #endif
9916. }
// System-scope variant: seeds the loop with a system-scope load through an
// unsigned int alias, then retries atomicCAS_system until either the CAS
// succeeds or the observed value is already <= val.
9917. __device__
9918. inline
9919. float atomicMin_system(float* address, float val) {
9920. unsigned int* uaddr { reinterpret_cast<unsigned int*>(address) };
9921. #if __has_builtin(__hip_atomic_load)
9922. unsigned int tmp {__hip_atomic_load(uaddr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM)};
9923. #else
9924. unsigned int tmp {__atomic_load_n(uaddr, __ATOMIC_RELAXED)};
9925. #endif
9926. float value = __uint_as_float(tmp);
9927. while (val < value) {
9928. value = atomicCAS_system(address, value, val);
9929. }
9930. return value;
9931. }
// atomicMin (double) — same CAS-loop strategy as the float overload, using an
// unsigned long long alias for the bit-level fallback and
// __longlong_as_double/__double_as_longlong for the reinterpretation.
9932. __device__
9933. inline
9934. double atomicMin(double* addr, double val) {
9935. #if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
9936. return unsafeAtomicMin(addr, val);
9937. #else
9938. #if __has_builtin(__hip_atomic_load) && \
9939. __has_builtin(__hip_atomic_compare_exchange_strong)
9940. double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
9941. bool done = false;
9942. while (!done && value > val) {
9943. done = __hip_atomic_compare_exchange_strong(addr, &value, val,
9944. __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
9945. }
9946. return value;
9947. #else
9948. unsigned long long *uaddr = (unsigned long long *)addr;
9949. unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
9950. bool done = false;
9951. while (!done && __longlong_as_double(value) > val) {
9952. done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
9953. __ATOMIC_RELAXED, __ATOMIC_RELAXED);
9954. }
9955. return __longlong_as_double(value);
9956. #endif
9957. #endif
9958. }
// System-scope variant: seed with a system-scope load, then retry
// atomicCAS_system until the CAS lands or the stored value is already <= val.
9959. __device__
9960. inline
9961. double atomicMin_system(double* address, double val) {
9962. unsigned long long* uaddr { reinterpret_cast<unsigned long long*>(address) };
9963. #if __has_builtin(__hip_atomic_load)
9964. unsigned long long tmp {__hip_atomic_load(uaddr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM)};
9965. #else
9966. unsigned long long tmp {__atomic_load_n(uaddr, __ATOMIC_RELAXED)};
9967. #endif
9968. double value = __longlong_as_double(tmp);
9969. while (val < value) {
9970. value = atomicCAS_system(address, value, val);
9971. }
9972. return value;
9973. }
// atomicMax (int / unsigned int) — atomically stores max(*address, val) and
// returns the previous value. Mirrors atomicMin with the comparison flipped
// (the expander predicate is `y < x`), including the gfx941 CAS-expander
// fallback around the native __hip_atomic_fetch_max.
9974. __device__
9975. inline
9976. int atomicMax(int* address, int val) {
9977. #if defined(__gfx941__)
9978. return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
9979. address, val, [](int x, int y) { return y < x; }, [=]() {
9980. return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
9981. __HIP_MEMORY_SCOPE_AGENT);
9982. });
9983. #else
9984. return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
9985. #endif // __gfx941__
9986. }
9987. __device__
9988. inline
9989. int atomicMax_system(int* address, int val) {
9990. #if defined(__gfx941__)
9991. return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
9992. address, val, [](int x, int y) { return y < x; }, [=]() {
9993. return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
9994. __HIP_MEMORY_SCOPE_SYSTEM);
9995. });
9996. #else
9997. return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
9998. #endif // __gfx941__
9999. }
10000. __device__
10001. inline
10002. unsigned int atomicMax(unsigned int* address, unsigned int val) {
10003. #if defined(__gfx941__)
10004. return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
10005. address, val, [](unsigned int x, unsigned int y) { return y < x; }, [=]() {
10006. return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
10007. __HIP_MEMORY_SCOPE_AGENT);
10008. });
10009. #else
10010. return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
10011. #endif // __gfx941__
10012. }
10013. __device__
10014. inline
10015. unsigned int atomicMax_system(unsigned int* address, unsigned int val) {
10016. #if defined(__gfx941__)
10017. return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
10018. address, val, [](unsigned int x, unsigned int y) { return y < x; }, [=]() {
10019. return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
10020. __HIP_MEMORY_SCOPE_SYSTEM);
10021. });
10022. #else
10023. return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
10024. #endif // __gfx941__
10025. }
// atomicMax — 64-bit unsigned overloads (unsigned long and unsigned long
// long, agent and system scope). Atomically stores max(*address, val) and
// returns the previous value; gfx941 uses the CAS expander fallback.
10026. __device__
10027. inline
10028. unsigned long atomicMax(unsigned long* address, unsigned long val) {
10029. #if defined(__gfx941__)
10030. return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
10031. address,
10032. val,
10033. [](unsigned long x, unsigned long y) { return y < x; },
10034. [=]() {
10035. return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
10036. __HIP_MEMORY_SCOPE_AGENT);
10037. });
10038. #else
10039. return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
10040. #endif // __gfx941__
10041. }
10042. __device__
10043. inline
10044. unsigned long atomicMax_system(unsigned long* address, unsigned long val) {
10045. #if defined(__gfx941__)
10046. return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
10047. address,
10048. val,
10049. [](unsigned long x, unsigned long y) { return y < x; },
10050. [=]() {
10051. return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
10052. __HIP_MEMORY_SCOPE_SYSTEM);
10053. });
10054. #else
10055. return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
10056. #endif // __gfx941__
10057. }
10058. __device__
10059. inline
10060. unsigned long long atomicMax(unsigned long long* address, unsigned long long val) {
10061. #if defined(__gfx941__)
10062. return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
10063. address,
10064. val,
10065. [](unsigned long long x, unsigned long long y) { return y < x; },
10066. [=]() {
10067. return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
10068. __HIP_MEMORY_SCOPE_AGENT);
10069. });
10070. #else
10071. return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
10072. #endif // __gfx941__
10073. }
10074. __device__
10075. inline
10076. unsigned long long atomicMax_system(unsigned long long* address, unsigned long long val) {
10077. #if defined(__gfx941__)
10078. return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
10079. address,
10080. val,
10081. [](unsigned long long x, unsigned long long y) { return y < x; },
10082. [=]() {
10083. return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
10084. __HIP_MEMORY_SCOPE_SYSTEM);
10085. });
10086. #else
10087. return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
10088. #endif // __gfx941__
10089. }
// atomicMax (long long) — signed 64-bit max; atomically stores
// max(*address, val) and returns the previous value, with the gfx941
// CAS-expander fallback.
10090. __device__
10091. inline
10092. long long atomicMax(long long* address, long long val) {
10093. #if defined(__gfx941__)
10094. return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
10095. address, val, [](long long x, long long y) { return y < x; },
10096. [=]() {
10097. return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
10098. });
10099. #else
10100. return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
10101. #endif // __gfx941__
10102. }
10103. __device__
10104. inline
10105. long long atomicMax_system(long long* address, long long val) {
10106. #if defined(__gfx941__)
10107. return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
10108. address, val, [](long long x, long long y) { return y < x; },
10109. [=]() {
10110. return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
10111. });
10112. #else
10113. return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
10114. #endif // __gfx941__
10115. }
// atomicMax (float, agent scope) — CAS loop that only attempts the store
// while the current value is still less than val; returns the last observed
// value. Unsafe-FP-atomics fast path and uint-alias fallback mirror
// atomicMin(float*).
10116. __device__
10117. inline
10118. float atomicMax(float* addr, float val) {
10119. #if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
10120. return unsafeAtomicMax(addr, val);
10121. #else
10122. #if __has_builtin(__hip_atomic_load) && \
10123. __has_builtin(__hip_atomic_compare_exchange_strong)
10124. float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
10125. bool done = false;
10126. while (!done && value < val) {
10127. done = __hip_atomic_compare_exchange_strong(addr, &value, val,
10128. __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
10129. }
10130. return value;
10131. #else
10132. unsigned int *uaddr = (unsigned int *)addr;
10133. unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
10134. bool done = false;
10135. while (!done && __uint_as_float(value) < val) {
10136. done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false,
10137. __ATOMIC_RELAXED, __ATOMIC_RELAXED);
10138. }
10139. return __uint_as_float(value);
10140. #endif
10141. #endif
10142. }
// System-scope variant: seed with a system-scope load, then retry
// atomicCAS_system until the CAS lands or the stored value is already >= val.
10143. __device__
10144. inline
10145. float atomicMax_system(float* address, float val) {
10146. unsigned int* uaddr { reinterpret_cast<unsigned int*>(address) };
10147. #if __has_builtin(__hip_atomic_load)
10148. unsigned int tmp {__hip_atomic_load(uaddr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM)};
10149. #else
10150. unsigned int tmp {__atomic_load_n(uaddr, __ATOMIC_RELAXED)};
10151. #endif
10152. float value = __uint_as_float(tmp);
10153. while (value < val) {
10154. value = atomicCAS_system(address, value, val);
10155. }
10156. return value;
10157. }
// atomicMax (double) — same CAS-loop strategy as the float overload, using an
// unsigned long long alias and __longlong_as_double/__double_as_longlong for
// the bit-level fallback.
10158. __device__
10159. inline
10160. double atomicMax(double* addr, double val) {
10161. #if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
10162. return unsafeAtomicMax(addr, val);
10163. #else
10164. #if __has_builtin(__hip_atomic_load) && \
10165. __has_builtin(__hip_atomic_compare_exchange_strong)
10166. double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
10167. bool done = false;
10168. while (!done && value < val) {
10169. done = __hip_atomic_compare_exchange_strong(addr, &value, val,
10170. __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
10171. }
10172. return value;
10173. #else
10174. unsigned long long *uaddr = (unsigned long long *)addr;
10175. unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
10176. bool done = false;
10177. while (!done && __longlong_as_double(value) < val) {
10178. done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
10179. __ATOMIC_RELAXED, __ATOMIC_RELAXED);
10180. }
10181. return __longlong_as_double(value);
10182. #endif
10183. #endif
10184. }
// System-scope variant: retries atomicCAS_system until the CAS succeeds or
// the stored value is already >= val.
10185. __device__
10186. inline
10187. double atomicMax_system(double* address, double val) {
10188. unsigned long long* uaddr { reinterpret_cast<unsigned long long*>(address) };
10189. #if __has_builtin(__hip_atomic_load)
10190. unsigned long long tmp {__hip_atomic_load(uaddr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM)};
10191. #else
10192. unsigned long long tmp {__atomic_load_n(uaddr, __ATOMIC_RELAXED)};
10193. #endif
10194. double value = __longlong_as_double(tmp);
10195. while (value < val) {
10196. value = atomicCAS_system(address, value, val);
10197. }
10198. return value;
10199. }
// atomicInc — CUDA-style wrapping increment: atomically computes
// ((old >= val) ? 0 : old + 1), stores it, and returns old. On gfx941 the
// LLVM intrinsic is declared locally via __asm name binding and wrapped in
// the CAS expander (whose lambda mirrors the wrap-to-zero semantics); other
// targets use the __builtin_amdgcn_atomic_inc32 builtin at agent scope.
10200. __device__
10201. inline
10202. unsigned int atomicInc(unsigned int* address, unsigned int val)
10203. {
10204. #if defined(__gfx941__)
10205. __device__
10206. extern
10207. unsigned int __builtin_amdgcn_atomic_inc(
10208. unsigned int*,
10209. unsigned int,
10210. unsigned int,
10211. unsigned int,
10212. bool) __asm("llvm.amdgcn.atomic.inc.i32.p0i32");
10213. return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
10214. address,
10215. val,
10216. [](unsigned int& x, unsigned int y) { x = (x >= y) ? 0 : (x + 1); },
10217. [=]() {
10218. return
10219. __builtin_amdgcn_atomic_inc(address, val, __ATOMIC_RELAXED, 1, false);
10220. });
10221. #else
10222. return __builtin_amdgcn_atomic_inc32(address, val, __ATOMIC_RELAXED, "agent");
10223. #endif // __gfx941__
10224. }
// atomicDec — CUDA-style wrapping decrement: atomically computes
// ((old == 0 || old > val) ? val : old - 1), stores it, and returns old.
// Structured like atomicInc: gfx941 declares the LLVM intrinsic locally and
// routes through the CAS expander; other targets use
// __builtin_amdgcn_atomic_dec32 at agent scope.
10225. __device__
10226. inline
10227. unsigned int atomicDec(unsigned int* address, unsigned int val)
10228. {
10229. #if defined(__gfx941__)
10230. __device__
10231. extern
10232. unsigned int __builtin_amdgcn_atomic_dec(
10233. unsigned int*,
10234. unsigned int,
10235. unsigned int,
10236. unsigned int,
10237. bool) __asm("llvm.amdgcn.atomic.dec.i32.p0i32");
10238. return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
10239. address,
10240. val,
10241. [](unsigned int& x, unsigned int y) { x = (!x || x > y) ? y : (x - 1); },
10242. [=]() {
10243. return
10244. __builtin_amdgcn_atomic_dec(address, val, __ATOMIC_RELAXED, 1, false);
10245. });
10246. #else
10247. return __builtin_amdgcn_atomic_dec32(address, val, __ATOMIC_RELAXED, "agent");
10248. #endif // __gfx941__
10249. }
// atomicAnd (int / unsigned int) — atomically performs *address &= val and
// returns the previous value. gfx941 routes through hip_cas_expander (the
// mutating lambda applies the AND in the CAS fallback path); other targets
// call __hip_atomic_fetch_and directly.
10250. __device__
10251. inline
10252. int atomicAnd(int* address, int val) {
10253. #if defined(__gfx941__)
10254. return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
10255. address, val, [](int& x, int y) { x &= y; }, [=]() {
10256. return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED,
10257. __HIP_MEMORY_SCOPE_AGENT);
10258. });
10259. #else
10260. return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
10261. #endif // __gfx941__
10262. }
10263. __device__
10264. inline
10265. int atomicAnd_system(int* address, int val) {
10266. #if defined(__gfx941__)
10267. return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
10268. address, val, [](int& x, int y) { x &= y; }, [=]() {
10269. return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED,
10270. __HIP_MEMORY_SCOPE_SYSTEM);
10271. });
10272. #else
10273. return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
10274. #endif // __gfx941__
10275. }
10276. __device__
10277. inline
10278. unsigned int atomicAnd(unsigned int* address, unsigned int val) {
10279. #if defined(__gfx941__)
10280. return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
10281. address, val, [](unsigned int& x, unsigned int y) { x &= y; }, [=]() {
10282. return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED,
10283. __HIP_MEMORY_SCOPE_AGENT);
10284. });
10285. #else
10286. return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
10287. #endif // __gfx941__
10288. }
10289. __device__
10290. inline
10291. unsigned int atomicAnd_system(unsigned int* address, unsigned int val) {
10292. #if defined(__gfx941__)
10293. return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
10294. address, val, [](unsigned int& x, unsigned int y) { x &= y; }, [=]() {
10295. return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED,
10296. __HIP_MEMORY_SCOPE_SYSTEM);
10297. });
10298. #else
10299. return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
10300. #endif // __gfx941__
10301. }
// atomicAnd — 64-bit unsigned overloads (unsigned long and unsigned long
// long, agent and system scope). Atomically performs *address &= val and
// returns the previous value; gfx941 uses the CAS-expander fallback.
10302. __device__
10303. inline
10304. unsigned long atomicAnd(unsigned long* address, unsigned long val) {
10305. #if defined(__gfx941__)
10306. return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
10307. address, val, [](unsigned long& x, unsigned long y) { x &= y; }, [=]() {
10308. return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED,
10309. __HIP_MEMORY_SCOPE_AGENT);
10310. });
10311. #else
10312. return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
10313. #endif // __gfx941__
10314. }
10315. __device__
10316. inline
10317. unsigned long atomicAnd_system(unsigned long* address, unsigned long val) {
10318. #if defined(__gfx941__)
10319. return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
10320. address, val, [](unsigned long& x, unsigned long y) { x &= y; }, [=]() {
10321. return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED,
10322. __HIP_MEMORY_SCOPE_SYSTEM);
10323. });
10324. #else
10325. return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
10326. #endif // __gfx941__
10327. }
10328. __device__
10329. inline
10330. unsigned long long atomicAnd(unsigned long long* address, unsigned long long val) {
10331. #if defined(__gfx941__)
10332. return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
10333. address,
10334. val,
10335. [](unsigned long long& x, unsigned long long y) { x &= y; },
10336. [=]() {
10337. return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED,
10338. __HIP_MEMORY_SCOPE_AGENT);
10339. });
10340. #else
10341. return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
10342. #endif // __gfx941__
10343. }
10344. __device__
10345. inline
10346. unsigned long long atomicAnd_system(unsigned long long* address, unsigned long long val) {
10347. #if defined(__gfx941__)
10348. return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
10349. address,
10350. val,
10351. [](unsigned long long& x, unsigned long long y) { x &= y; },
10352. [=]() {
10353. return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED,
10354. __HIP_MEMORY_SCOPE_SYSTEM);
10355. });
10356. #else
10357. return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
10358. #endif // __gfx941__
10359. }
// atomicOr (int / unsigned int) — atomically performs *address |= val and
// returns the previous value. Same structure as atomicAnd: gfx941 routes
// through hip_cas_expander, other targets call __hip_atomic_fetch_or.
10360. __device__
10361. inline
10362. int atomicOr(int* address, int val) {
10363. #if defined(__gfx941__)
10364. return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
10365. address, val, [](int& x, int y) { x |= y; }, [=]() {
10366. return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED,
10367. __HIP_MEMORY_SCOPE_AGENT);
10368. });
10369. #else
10370. return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
10371. #endif // __gfx941__
10372. }
10373. __device__
10374. inline
10375. int atomicOr_system(int* address, int val) {
10376. #if defined(__gfx941__)
10377. return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
10378. address, val, [](int& x, int y) { x |= y; }, [=]() {
10379. return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED,
10380. __HIP_MEMORY_SCOPE_SYSTEM);
10381. });
10382. #else
10383. return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
10384. #endif // __gfx941__
10385. }
10386. __device__
10387. inline
10388. unsigned int atomicOr(unsigned int* address, unsigned int val) {
10389. #if defined(__gfx941__)
10390. return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
10391. address, val, [](unsigned int& x, unsigned int y) { x |= y; }, [=]() {
10392. return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED,
10393. __HIP_MEMORY_SCOPE_AGENT);
10394. });
10395. #else
10396. return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
10397. #endif // __gfx941__
10398. }
10399. __device__
10400. inline
10401. unsigned int atomicOr_system(unsigned int* address, unsigned int val) {
10402. #if defined(__gfx941__)
10403. return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
10404. address, val, [](unsigned int& x, unsigned int y) { x |= y; }, [=]() {
10405. return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED,
10406. __HIP_MEMORY_SCOPE_SYSTEM);
10407. });
10408. #else
10409. return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
10410. #endif // __gfx941__
10411. }
  10412. __device__
  10413. inline
  10414. unsigned long atomicOr(unsigned long* address, unsigned long val) {
  10415. #if defined(__gfx941__)
  10416. return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
  10417. address, val, [](unsigned long& x, unsigned long y) { x |= y; }, [=]() {
  10418. return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED,
  10419. __HIP_MEMORY_SCOPE_AGENT);
  10420. });
  10421. #else
  10422. return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  10423. #endif // __gfx941__
  10424. }
  10425. __device__
  10426. inline
  10427. unsigned long atomicOr_system(unsigned long* address, unsigned long val) {
  10428. #if defined(__gfx941__)
  10429. return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
  10430. address, val, [](unsigned long& x, unsigned long y) { x |= y; }, [=]() {
  10431. return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED,
  10432. __HIP_MEMORY_SCOPE_SYSTEM);
  10433. });
  10434. #else
  10435. return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
  10436. #endif // __gfx941__
  10437. }
  10438. __device__
  10439. inline
  10440. unsigned long long atomicOr(unsigned long long* address, unsigned long long val) {
  10441. #if defined(__gfx941__)
  10442. return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
  10443. address,
  10444. val,
  10445. [](unsigned long long& x, unsigned long long y) { x |= y; },
  10446. [=]() {
  10447. return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED,
  10448. __HIP_MEMORY_SCOPE_AGENT);
  10449. });
  10450. #else
  10451. return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  10452. #endif // __gfx941__
  10453. }
  10454. __device__
  10455. inline
  10456. unsigned long long atomicOr_system(unsigned long long* address, unsigned long long val) {
  10457. #if defined(__gfx941__)
  10458. return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
  10459. address,
  10460. val,
  10461. [](unsigned long long& x, unsigned long long y) { x |= y; },
  10462. [=]() {
  10463. return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED,
  10464. __HIP_MEMORY_SCOPE_SYSTEM);
  10465. });
  10466. #else
  10467. return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
  10468. #endif // __gfx941__
  10469. }
// atomicXor family: atomically performs *address ^= val and returns the
// prior value (relaxed ordering). Plain overloads are agent scope; *_system
// overloads are system scope. As with And/Or above, gfx941 builds route
// through hip_cas_expander (target-specific workaround).
__device__
inline
int atomicXor(int* address, int val) {
#if defined(__gfx941__)
return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
address, val, [](int& x, int y) { x ^= y; }, [=]() {
return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_AGENT);
});
#else
return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}
// int, system scope.
__device__
inline
int atomicXor_system(int* address, int val) {
#if defined(__gfx941__)
return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
address, val, [](int& x, int y) { x ^= y; }, [=]() {
return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_SYSTEM);
});
#else
return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}
// unsigned int, agent scope.
__device__
inline
unsigned int atomicXor(unsigned int* address, unsigned int val) {
#if defined(__gfx941__)
return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
address, val, [](unsigned int& x, unsigned int y) { x ^= y; }, [=]() {
return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_AGENT);
});
#else
return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}
// unsigned int, system scope.
__device__
inline
unsigned int atomicXor_system(unsigned int* address, unsigned int val) {
#if defined(__gfx941__)
return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
address, val, [](unsigned int& x, unsigned int y) { x ^= y; }, [=]() {
return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_SYSTEM);
});
#else
return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}
// unsigned long, agent scope.
__device__
inline
unsigned long atomicXor(unsigned long* address, unsigned long val) {
#if defined(__gfx941__)
return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
address, val, [](unsigned long& x, unsigned long y) { x ^= y; }, [=]() {
return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_AGENT);
});
#else
return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}
// unsigned long, system scope.
__device__
inline
unsigned long atomicXor_system(unsigned long* address, unsigned long val) {
#if defined(__gfx941__)
return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
address, val, [](unsigned long& x, unsigned long y) { x ^= y; }, [=]() {
return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_SYSTEM);
});
#else
return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}
// unsigned long long, agent scope.
__device__
inline
unsigned long long atomicXor(unsigned long long* address, unsigned long long val) {
#if defined(__gfx941__)
return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
address,
val,
[](unsigned long long& x, unsigned long long y) { x ^= y; },
[=]() {
return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_AGENT);
});
#else
return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}
  10564. __device__
  10565. inline
  10566. unsigned long long atomicXor_system(unsigned long long* address, unsigned long long val) {
  10567. return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
  10568. }
  10569. #else // __hip_atomic_compare_exchange_strong
  10570. __device__
  10571. inline
  10572. int atomicCAS(int* address, int compare, int val)
  10573. {
  10574. __atomic_compare_exchange_n(
  10575. address, &compare, val, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  10576. return compare;
  10577. }
  10578. __device__
  10579. inline
  10580. unsigned int atomicCAS(
  10581. unsigned int* address, unsigned int compare, unsigned int val)
  10582. {
  10583. __atomic_compare_exchange_n(
  10584. address, &compare, val, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  10585. return compare;
  10586. }
  10587. __device__
  10588. inline
  10589. unsigned long long atomicCAS(
  10590. unsigned long long* address,
  10591. unsigned long long compare,
  10592. unsigned long long val)
  10593. {
  10594. __atomic_compare_exchange_n(
  10595. address, &compare, val, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  10596. return compare;
  10597. }
  10598. __device__
  10599. inline
  10600. int atomicAdd(int* address, int val)
  10601. {
  10602. return __atomic_fetch_add(address, val, __ATOMIC_RELAXED);
  10603. }
  10604. __device__
  10605. inline
  10606. unsigned int atomicAdd(unsigned int* address, unsigned int val)
  10607. {
  10608. return __atomic_fetch_add(address, val, __ATOMIC_RELAXED);
  10609. }
  10610. __device__
  10611. inline
  10612. unsigned long long atomicAdd(
  10613. unsigned long long* address, unsigned long long val)
  10614. {
  10615. return __atomic_fetch_add(address, val, __ATOMIC_RELAXED);
  10616. }
// Floating-point atomicAdd: atomically adds val to *address and returns the
// prior value (relaxed ordering).
// When the build opts into unsafe FP atomics (__AMDGCN_UNSAFE_FP_ATOMICS__),
// the hardware fast-path helper unsafeAtomicAdd is used instead of the
// generic builtin. NOTE(review): the precise semantic differences of the
// unsafe path (e.g. denormal/NaN handling) are defined by unsafeAtomicAdd,
// which is not visible in this chunk.
__device__
inline
float atomicAdd(float* address, float val)
{
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
return unsafeAtomicAdd(address, val);
#else
return __atomic_fetch_add(address, val, __ATOMIC_RELAXED);
#endif
}
// Deprecated no-return float atomic add: adds val to *address via the OCKL
// device library without producing the old value. The DEPRECATED marker is
// suppressed under hipRTC (__HIPCC_RTC__), where the annotation macro is not
// available.
#if !defined(__HIPCC_RTC__)
DEPRECATED("use atomicAdd instead")
#endif // !defined(__HIPCC_RTC__)
__device__
inline
void atomicAddNoRet(float* address, float val)
{
__ockl_atomic_add_noret_f32(address, val);
}
// Double-precision atomicAdd: atomically adds val to *address and returns
// the prior value (relaxed ordering). Same compile-time switch as the float
// overload: unsafe builds use the unsafeAtomicAdd fast path.
__device__
inline
double atomicAdd(double* address, double val)
{
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
return unsafeAtomicAdd(address, val);
#else
return __atomic_fetch_add(address, val, __ATOMIC_RELAXED);
#endif
}
  10646. __device__
  10647. inline
  10648. int atomicSub(int* address, int val)
  10649. {
  10650. return __atomic_fetch_sub(address, val, __ATOMIC_RELAXED);
  10651. }
  10652. __device__
  10653. inline
  10654. unsigned int atomicSub(unsigned int* address, unsigned int val)
  10655. {
  10656. return __atomic_fetch_sub(address, val, __ATOMIC_RELAXED);
  10657. }
  10658. __device__
  10659. inline
  10660. int atomicExch(int* address, int val)
  10661. {
  10662. return __atomic_exchange_n(address, val, __ATOMIC_RELAXED);
  10663. }
  10664. __device__
  10665. inline
  10666. unsigned int atomicExch(unsigned int* address, unsigned int val)
  10667. {
  10668. return __atomic_exchange_n(address, val, __ATOMIC_RELAXED);
  10669. }
  10670. __device__
  10671. inline
  10672. unsigned long long atomicExch(unsigned long long* address, unsigned long long val)
  10673. {
  10674. return __atomic_exchange_n(address, val, __ATOMIC_RELAXED);
  10675. }
  10676. __device__
  10677. inline
  10678. float atomicExch(float* address, float val)
  10679. {
  10680. return __uint_as_float(__atomic_exchange_n(
  10681. reinterpret_cast<unsigned int*>(address),
  10682. __float_as_uint(val),
  10683. __ATOMIC_RELAXED));
  10684. }
  10685. __device__
  10686. inline
  10687. int atomicMin(int* address, int val)
  10688. {
  10689. return __atomic_fetch_min(address, val, __ATOMIC_RELAXED);
  10690. }
  10691. __device__
  10692. inline
  10693. unsigned int atomicMin(unsigned int* address, unsigned int val)
  10694. {
  10695. return __atomic_fetch_min(address, val, __ATOMIC_RELAXED);
  10696. }
// 64-bit atomicMin overloads, emulated with a CAS retry loop (no 64-bit
// fetch-min builtin is used here). Loop invariant: tmp holds the latest
// value observed at *address; the loop exits once val >= tmp, i.e. the
// stored value is already <= val. On each iteration the address is re-read
// first so a concurrent change restarts the comparison instead of issuing a
// doomed CAS. Returns the last observed (pre-update) value.
__device__
inline
unsigned long long atomicMin(
unsigned long long* address, unsigned long long val)
{
unsigned long long tmp{__atomic_load_n(address, __ATOMIC_RELAXED)};
while (val < tmp) {
const auto tmp1 = __atomic_load_n(address, __ATOMIC_RELAXED);
// Someone else updated the location: refresh and re-test before CASing.
if (tmp1 != tmp) { tmp = tmp1; continue; }
tmp = atomicCAS(address, tmp, val);
}
return tmp;
}
// Signed 64-bit variant of the same CAS loop.
// NOTE(review): no long long atomicCAS overload is visible in this fallback
// section (only int/unsigned int/unsigned long long); this call presumably
// resolves to an overload declared elsewhere — verify against the full file.
__device__ inline long long atomicMin(long long* address, long long val) {
long long tmp{__atomic_load_n(address, __ATOMIC_RELAXED)};
while (val < tmp) {
const auto tmp1 = __atomic_load_n(address, __ATOMIC_RELAXED);
if (tmp1 != tmp) {
tmp = tmp1;
continue;
}
tmp = atomicCAS(address, tmp, val);
}
return tmp;
}
  10722. __device__
  10723. inline
  10724. int atomicMax(int* address, int val)
  10725. {
  10726. return __atomic_fetch_max(address, val, __ATOMIC_RELAXED);
  10727. }
  10728. __device__
  10729. inline
  10730. unsigned int atomicMax(unsigned int* address, unsigned int val)
  10731. {
  10732. return __atomic_fetch_max(address, val, __ATOMIC_RELAXED);
  10733. }
// 64-bit atomicMax overloads, CAS-loop emulation mirroring the atomicMin
// pair above with the comparison reversed: loop until the stored value is
// already >= val, re-reading first so concurrent updates restart the test.
// Returns the last observed (pre-update) value.
__device__
inline
unsigned long long atomicMax(
unsigned long long* address, unsigned long long val)
{
unsigned long long tmp{__atomic_load_n(address, __ATOMIC_RELAXED)};
while (tmp < val) {
const auto tmp1 = __atomic_load_n(address, __ATOMIC_RELAXED);
// Concurrent writer changed the value: refresh and re-test before CASing.
if (tmp1 != tmp) { tmp = tmp1; continue; }
tmp = atomicCAS(address, tmp, val);
}
return tmp;
}
// Signed 64-bit variant.
// NOTE(review): as with atomicMin, no long long atomicCAS overload is
// visible in this fallback section — verify which overload this resolves to.
__device__ inline long long atomicMax(long long* address, long long val) {
long long tmp{__atomic_load_n(address, __ATOMIC_RELAXED)};
while (tmp < val) {
const auto tmp1 = __atomic_load_n(address, __ATOMIC_RELAXED);
if (tmp1 != tmp) {
tmp = tmp1;
continue;
}
tmp = atomicCAS(address, tmp, val);
}
return tmp;
}
// Wrapping increment/decrement atomics, lowered to the AMDGCN inc32/dec32
// builtins at agent ("agent") scope with relaxed ordering; each returns the
// pre-update value. These are the CUDA-style atomicInc/atomicDec, where val
// acts as the wrap bound — exact wrap semantics are defined by the builtins
// (see the Clang AMDGPU builtin documentation).
__device__
inline
unsigned int atomicInc(unsigned int* address, unsigned int val)
{
return __builtin_amdgcn_atomic_inc32(address, val, __ATOMIC_RELAXED, "agent");
}
__device__
inline
unsigned int atomicDec(unsigned int* address, unsigned int val)
{
return __builtin_amdgcn_atomic_dec32(address, val, __ATOMIC_RELAXED, "agent");
}
  10771. __device__
  10772. inline
  10773. int atomicAnd(int* address, int val)
  10774. {
  10775. return __atomic_fetch_and(address, val, __ATOMIC_RELAXED);
  10776. }
  10777. __device__
  10778. inline
  10779. unsigned int atomicAnd(unsigned int* address, unsigned int val)
  10780. {
  10781. return __atomic_fetch_and(address, val, __ATOMIC_RELAXED);
  10782. }
  10783. __device__
  10784. inline
  10785. unsigned long long atomicAnd(
  10786. unsigned long long* address, unsigned long long val)
  10787. {
  10788. return __atomic_fetch_and(address, val, __ATOMIC_RELAXED);
  10789. }
  10790. __device__
  10791. inline
  10792. int atomicOr(int* address, int val)
  10793. {
  10794. return __atomic_fetch_or(address, val, __ATOMIC_RELAXED);
  10795. }
  10796. __device__
  10797. inline
  10798. unsigned int atomicOr(unsigned int* address, unsigned int val)
  10799. {
  10800. return __atomic_fetch_or(address, val, __ATOMIC_RELAXED);
  10801. }
  10802. __device__
  10803. inline
  10804. unsigned long long atomicOr(
  10805. unsigned long long* address, unsigned long long val)
  10806. {
  10807. return __atomic_fetch_or(address, val, __ATOMIC_RELAXED);
  10808. }
  10809. __device__
  10810. inline
  10811. int atomicXor(int* address, int val)
  10812. {
  10813. return __atomic_fetch_xor(address, val, __ATOMIC_RELAXED);
  10814. }
  10815. __device__
  10816. inline
  10817. unsigned int atomicXor(unsigned int* address, unsigned int val)
  10818. {
  10819. return __atomic_fetch_xor(address, val, __ATOMIC_RELAXED);
  10820. }
  10821. __device__
  10822. inline
  10823. unsigned long long atomicXor(
  10824. unsigned long long* address, unsigned long long val)
  10825. {
  10826. return __atomic_fetch_xor(address, val, __ATOMIC_RELAXED);
  10827. }
  10828. #endif // __hip_atomic_compare_exchange_strong
  10829. /*
  10830. Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
  10831. Permission is hereby granted, free of charge, to any person obtaining a copy
  10832. of this software and associated documentation files (the "Software"), to deal
  10833. in the Software without restriction, including without limitation the rights
  10834. to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10835. copies of the Software, and to permit persons to whom the Software is
  10836. furnished to do so, subject to the following conditions:
  10837. The above copyright notice and this permission notice shall be included in
  10838. all copies or substantial portions of the Software.
  10839. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  10840. IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  10841. FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  10842. AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  10843. LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  10844. OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  10845. THE SOFTWARE.
  10846. */
  10847. #pragma once
  10848. #if !defined(__HIPCC_RTC__)
  10849. #include "host_defines.h"
  10850. #include "amd_hip_vector_types.h" // For Native_vec_
  10851. #endif
  10852. #if defined(__cplusplus)
  10853. extern "C" {
  10854. #endif
  10855. // DOT FUNCTIONS
  10856. #if defined(__clang__) && defined(__HIP__)
  10857. __device__
  10858. __attribute__((const))
  10859. int __ockl_sdot2(
  10860. HIP_vector_base<short, 2>::Native_vec_,
  10861. HIP_vector_base<short, 2>::Native_vec_,
  10862. int, bool);
  10863. __device__
  10864. __attribute__((const))
  10865. unsigned int __ockl_udot2(
  10866. HIP_vector_base<unsigned short, 2>::Native_vec_,
  10867. HIP_vector_base<unsigned short, 2>::Native_vec_,
  10868. unsigned int, bool);
  10869. __device__
  10870. __attribute__((const))
  10871. int __ockl_sdot4(
  10872. HIP_vector_base<char, 4>::Native_vec_,
  10873. HIP_vector_base<char, 4>::Native_vec_,
  10874. int, bool);
  10875. __device__
  10876. __attribute__((const))
  10877. unsigned int __ockl_udot4(
  10878. HIP_vector_base<unsigned char, 4>::Native_vec_,
  10879. HIP_vector_base<unsigned char, 4>::Native_vec_,
  10880. unsigned int, bool);
  10881. __device__
  10882. __attribute__((const))
  10883. int __ockl_sdot8(int, int, int, bool);
  10884. __device__
  10885. __attribute__((const))
  10886. unsigned int __ockl_udot8(unsigned int, unsigned int, unsigned int, bool);
  10887. #endif
  10888. #if !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
  10889. // BEGIN FLOAT
  10890. __device__
  10891. __attribute__((const))
  10892. float __ocml_acos_f32(float);
  10893. __device__
  10894. __attribute__((pure))
  10895. float __ocml_acosh_f32(float);
  10896. __device__
  10897. __attribute__((const))
  10898. float __ocml_asin_f32(float);
  10899. __device__
  10900. __attribute__((pure))
  10901. float __ocml_asinh_f32(float);
  10902. __device__
  10903. __attribute__((const))
  10904. float __ocml_atan2_f32(float, float);
  10905. __device__
  10906. __attribute__((const))
  10907. float __ocml_atan_f32(float);
  10908. __device__
  10909. __attribute__((pure))
  10910. float __ocml_atanh_f32(float);
  10911. __device__
  10912. __attribute__((pure))
  10913. float __ocml_cbrt_f32(float);
  10914. __device__
  10915. __attribute__((const))
  10916. float __ocml_ceil_f32(float);
  10917. __device__
  10918. __attribute__((const))
  10919. __device__
  10920. float __ocml_copysign_f32(float, float);
  10921. __device__
  10922. float __ocml_cos_f32(float);
  10923. __device__
  10924. float __ocml_native_cos_f32(float);
  10925. __device__
  10926. __attribute__((pure))
  10927. __device__
  10928. float __ocml_cosh_f32(float);
  10929. __device__
  10930. float __ocml_cospi_f32(float);
  10931. __device__
  10932. float __ocml_i0_f32(float);
  10933. __device__
  10934. float __ocml_i1_f32(float);
  10935. __device__
  10936. __attribute__((pure))
  10937. float __ocml_erfc_f32(float);
  10938. __device__
  10939. __attribute__((pure))
  10940. float __ocml_erfcinv_f32(float);
  10941. __device__
  10942. __attribute__((pure))
  10943. float __ocml_erfcx_f32(float);
  10944. __device__
  10945. __attribute__((pure))
  10946. float __ocml_erf_f32(float);
  10947. __device__
  10948. __attribute__((pure))
  10949. float __ocml_erfinv_f32(float);
  10950. __device__
  10951. __attribute__((pure))
  10952. float __ocml_exp10_f32(float);
  10953. __device__
  10954. __attribute__((pure))
  10955. float __ocml_native_exp10_f32(float);
  10956. __device__
  10957. __attribute__((pure))
  10958. float __ocml_exp2_f32(float);
  10959. __device__
  10960. __attribute__((pure))
  10961. float __ocml_exp_f32(float);
  10962. __device__
  10963. __attribute__((pure))
  10964. float __ocml_native_exp_f32(float);
  10965. __device__
  10966. __attribute__((pure))
  10967. float __ocml_expm1_f32(float);
  10968. __device__
  10969. __attribute__((const))
  10970. float __ocml_fabs_f32(float);
  10971. __device__
  10972. __attribute__((const))
  10973. float __ocml_fdim_f32(float, float);
  10974. __device__
  10975. __attribute__((const))
  10976. float __ocml_floor_f32(float);
  10977. __device__
  10978. __attribute__((const))
  10979. float __ocml_fma_f32(float, float, float);
  10980. __device__
  10981. __attribute__((const))
  10982. float __ocml_fmax_f32(float, float);
  10983. __device__
  10984. __attribute__((const))
  10985. float __ocml_fmin_f32(float, float);
  10986. __device__
  10987. __attribute__((const))
  10988. __device__
  10989. float __ocml_fmod_f32(float, float);
  10990. __device__
  10991. float __ocml_frexp_f32(float, __attribute__((address_space(5))) int*);
  10992. __device__
  10993. __attribute__((const))
  10994. float __ocml_hypot_f32(float, float);
  10995. __device__
  10996. __attribute__((const))
  10997. int __ocml_ilogb_f32(float);
  10998. __device__
  10999. __attribute__((const))
  11000. int __ocml_isfinite_f32(float);
  11001. __device__
  11002. __attribute__((const))
  11003. int __ocml_isinf_f32(float);
  11004. __device__
  11005. __attribute__((const))
  11006. int __ocml_isnan_f32(float);
  11007. __device__
  11008. float __ocml_j0_f32(float);
  11009. __device__
  11010. float __ocml_j1_f32(float);
  11011. __device__
  11012. __attribute__((const))
  11013. float __ocml_ldexp_f32(float, int);
  11014. __device__
  11015. float __ocml_lgamma_f32(float);
  11016. __device__
  11017. __attribute__((pure))
  11018. float __ocml_log10_f32(float);
  11019. __device__
  11020. __attribute__((pure))
  11021. float __ocml_native_log10_f32(float);
  11022. __device__
  11023. __attribute__((pure))
  11024. float __ocml_log1p_f32(float);
  11025. __device__
  11026. __attribute__((pure))
  11027. float __ocml_log2_f32(float);
  11028. __device__
  11029. __attribute__((pure))
  11030. float __ocml_native_log2_f32(float);
  11031. __device__
  11032. __attribute__((const))
  11033. float __ocml_logb_f32(float);
  11034. __device__
  11035. __attribute__((pure))
  11036. float __ocml_log_f32(float);
  11037. __device__
  11038. __attribute__((pure))
  11039. float __ocml_native_log_f32(float);
  11040. __device__
  11041. float __ocml_modf_f32(float, __attribute__((address_space(5))) float*);
  11042. __device__
  11043. __attribute__((const))
  11044. float __ocml_nearbyint_f32(float);
  11045. __device__
  11046. __attribute__((const))
  11047. float __ocml_nextafter_f32(float, float);
  11048. __device__
  11049. __attribute__((const))
  11050. float __ocml_len3_f32(float, float, float);
  11051. __device__
  11052. __attribute__((const))
  11053. float __ocml_len4_f32(float, float, float, float);
  11054. __device__
  11055. __attribute__((pure))
  11056. float __ocml_ncdf_f32(float);
  11057. __device__
  11058. __attribute__((pure))
  11059. float __ocml_ncdfinv_f32(float);
  11060. __device__
  11061. __attribute__((pure))
  11062. float __ocml_pow_f32(float, float);
  11063. __device__
  11064. __attribute__((pure))
  11065. float __ocml_pown_f32(float, int);
  11066. __device__
  11067. __attribute__((pure))
  11068. float __ocml_rcbrt_f32(float);
  11069. __device__
  11070. __attribute__((const))
  11071. float __ocml_remainder_f32(float, float);
  11072. __device__
  11073. float __ocml_remquo_f32(float, float, __attribute__((address_space(5))) int*);
  11074. __device__
  11075. __attribute__((const))
  11076. float __ocml_rhypot_f32(float, float);
  11077. __device__
  11078. __attribute__((const))
  11079. float __ocml_rint_f32(float);
  11080. __device__
  11081. __attribute__((const))
  11082. float __ocml_rlen3_f32(float, float, float);
  11083. __device__
  11084. __attribute__((const))
  11085. float __ocml_rlen4_f32(float, float, float, float);
  11086. __device__
  11087. __attribute__((const))
  11088. float __ocml_round_f32(float);
  11089. __device__
  11090. __attribute__((pure))
  11091. float __ocml_rsqrt_f32(float);
  11092. __device__
  11093. __attribute__((const))
  11094. float __ocml_scalb_f32(float, float);
  11095. __device__
  11096. __attribute__((const))
  11097. float __ocml_scalbn_f32(float, int);
  11098. __device__
  11099. __attribute__((const))
  11100. int __ocml_signbit_f32(float);
  11101. __device__
  11102. float __ocml_sincos_f32(float, __attribute__((address_space(5))) float*);
  11103. __device__
  11104. float __ocml_sincospi_f32(float, __attribute__((address_space(5))) float*);
  11105. __device__
  11106. float __ocml_sin_f32(float);
  11107. __device__
  11108. float __ocml_native_sin_f32(float);
  11109. __device__
  11110. __attribute__((pure))
  11111. float __ocml_sinh_f32(float);
  11112. __device__
  11113. float __ocml_sinpi_f32(float);
  11114. __device__
  11115. __attribute__((const))
  11116. float __ocml_sqrt_f32(float);
  11117. __device__
  11118. __attribute__((const))
  11119. float __ocml_native_sqrt_f32(float);
  11120. __device__
  11121. float __ocml_tan_f32(float);
  11122. __device__
  11123. __attribute__((pure))
  11124. float __ocml_tanh_f32(float);
  11125. __device__
  11126. float __ocml_tgamma_f32(float);
  11127. __device__
  11128. __attribute__((const))
  11129. float __ocml_trunc_f32(float);
  11130. __device__
  11131. float __ocml_y0_f32(float);
  11132. __device__
  11133. float __ocml_y1_f32(float);
  11134. // BEGIN INTRINSICS
  11135. __device__
  11136. __attribute__((const))
  11137. float __ocml_add_rte_f32(float, float);
  11138. __device__
  11139. __attribute__((const))
  11140. float __ocml_add_rtn_f32(float, float);
  11141. __device__
  11142. __attribute__((const))
  11143. float __ocml_add_rtp_f32(float, float);
  11144. __device__
  11145. __attribute__((const))
  11146. float __ocml_add_rtz_f32(float, float);
  11147. __device__
  11148. __attribute__((const))
  11149. float __ocml_sub_rte_f32(float, float);
  11150. __device__
  11151. __attribute__((const))
  11152. float __ocml_sub_rtn_f32(float, float);
  11153. __device__
  11154. __attribute__((const))
  11155. float __ocml_sub_rtp_f32(float, float);
  11156. __device__
  11157. __attribute__((const))
  11158. float __ocml_sub_rtz_f32(float, float);
  11159. __device__
  11160. __attribute__((const))
  11161. float __ocml_mul_rte_f32(float, float);
  11162. __device__
  11163. __attribute__((const))
  11164. float __ocml_mul_rtn_f32(float, float);
  11165. __device__
  11166. __attribute__((const))
  11167. float __ocml_mul_rtp_f32(float, float);
  11168. __device__
  11169. __attribute__((const))
  11170. float __ocml_mul_rtz_f32(float, float);
  11171. __device__
  11172. __attribute__((const))
  11173. float __ocml_div_rte_f32(float, float);
  11174. __device__
  11175. __attribute__((const))
  11176. float __ocml_div_rtn_f32(float, float);
  11177. __device__
  11178. __attribute__((const))
  11179. float __ocml_div_rtp_f32(float, float);
  11180. __device__
  11181. __attribute__((const))
  11182. float __ocml_div_rtz_f32(float, float);
  11183. __device__
  11184. __attribute__((const))
  11185. float __ocml_sqrt_rte_f32(float);
  11186. __device__
  11187. __attribute__((const))
  11188. float __ocml_sqrt_rtn_f32(float);
  11189. __device__
  11190. __attribute__((const))
  11191. float __ocml_sqrt_rtp_f32(float);
  11192. __device__
  11193. __attribute__((const))
  11194. float __ocml_sqrt_rtz_f32(float);
  11195. __device__
  11196. __attribute__((const))
  11197. float __ocml_fma_rte_f32(float, float, float);
  11198. __device__
  11199. __attribute__((const))
  11200. float __ocml_fma_rtn_f32(float, float, float);
  11201. __device__
  11202. __attribute__((const))
  11203. float __ocml_fma_rtp_f32(float, float, float);
  11204. __device__
  11205. __attribute__((const))
  11206. float __ocml_fma_rtz_f32(float, float, float);
  11207. // END INTRINSICS
  11208. // END FLOAT
  11209. // BEGIN DOUBLE
// Double-precision (f64) OCML device math declarations.
// __attribute__((const)) => result depends only on the argument values;
// __attribute__((pure))  => result may also read (but not write) global state.
// Entries with neither attribute take an output pointer and/or may touch
// floating-point state, so they cannot be declared const/pure.
__device__
__attribute__((const))
double __ocml_acos_f64(double);
__device__
__attribute__((pure))
double __ocml_acosh_f64(double);
__device__
__attribute__((const))
double __ocml_asin_f64(double);
__device__
__attribute__((pure))
double __ocml_asinh_f64(double);
__device__
__attribute__((const))
double __ocml_atan2_f64(double, double);
__device__
__attribute__((const))
double __ocml_atan_f64(double);
__device__
__attribute__((pure))
double __ocml_atanh_f64(double);
__device__
__attribute__((pure))
double __ocml_cbrt_f64(double);
__device__
__attribute__((const))
double __ocml_ceil_f64(double);
__device__
__attribute__((const))
double __ocml_copysign_f64(double, double);
__device__
double __ocml_cos_f64(double);
__device__
__attribute__((pure))
double __ocml_cosh_f64(double);
__device__
double __ocml_cospi_f64(double);
// i0/i1: modified Bessel functions of the first kind (cf. CUDA cyl_bessel_i0/i1).
__device__
double __ocml_i0_f64(double);
__device__
double __ocml_i1_f64(double);
__device__
__attribute__((pure))
double __ocml_erfc_f64(double);
__device__
__attribute__((pure))
double __ocml_erfcinv_f64(double);
__device__
__attribute__((pure))
double __ocml_erfcx_f64(double);
__device__
__attribute__((pure))
double __ocml_erf_f64(double);
__device__
__attribute__((pure))
double __ocml_erfinv_f64(double);
__device__
__attribute__((pure))
double __ocml_exp10_f64(double);
__device__
__attribute__((pure))
double __ocml_exp2_f64(double);
__device__
__attribute__((pure))
double __ocml_exp_f64(double);
__device__
__attribute__((pure))
double __ocml_expm1_f64(double);
__device__
__attribute__((const))
double __ocml_fabs_f64(double);
__device__
__attribute__((const))
double __ocml_fdim_f64(double, double);
__device__
__attribute__((const))
double __ocml_floor_f64(double);
__device__
__attribute__((const))
double __ocml_fma_f64(double, double, double);
__device__
__attribute__((const))
double __ocml_fmax_f64(double, double);
__device__
__attribute__((const))
double __ocml_fmin_f64(double, double);
__device__
__attribute__((const))
double __ocml_fmod_f64(double, double);
// Output pointers are address_space(5), the AMDGPU private (per-thread) space.
__device__
double __ocml_frexp_f64(double, __attribute__((address_space(5))) int*);
__device__
__attribute__((const))
double __ocml_hypot_f64(double, double);
__device__
__attribute__((const))
int __ocml_ilogb_f64(double);
// Classification helpers return an int flag (nonzero == true).
__device__
__attribute__((const))
int __ocml_isfinite_f64(double);
__device__
__attribute__((const))
int __ocml_isinf_f64(double);
__device__
__attribute__((const))
int __ocml_isnan_f64(double);
// j0/j1: Bessel functions of the first kind (cf. POSIX j0/j1).
__device__
double __ocml_j0_f64(double);
__device__
double __ocml_j1_f64(double);
__device__
__attribute__((const))
double __ocml_ldexp_f64(double, int);
__device__
double __ocml_lgamma_f64(double);
__device__
__attribute__((pure))
double __ocml_log10_f64(double);
__device__
__attribute__((pure))
double __ocml_log1p_f64(double);
__device__
__attribute__((pure))
double __ocml_log2_f64(double);
__device__
__attribute__((const))
double __ocml_logb_f64(double);
__device__
__attribute__((pure))
double __ocml_log_f64(double);
__device__
double __ocml_modf_f64(double, __attribute__((address_space(5))) double*);
__device__
__attribute__((const))
double __ocml_nearbyint_f64(double);
__device__
__attribute__((const))
double __ocml_nextafter_f64(double, double);
// len3/len4: 3- and 4-operand Euclidean norms (hypot generalizations).
__device__
__attribute__((const))
double __ocml_len3_f64(double, double, double);
__device__
__attribute__((const))
double __ocml_len4_f64(double, double, double, double);
// ncdf/ncdfinv: standard normal CDF and its inverse (cf. CUDA normcdf/normcdfinv).
__device__
__attribute__((pure))
double __ocml_ncdf_f64(double);
__device__
__attribute__((pure))
double __ocml_ncdfinv_f64(double);
__device__
__attribute__((pure))
double __ocml_pow_f64(double, double);
__device__
__attribute__((pure))
double __ocml_pown_f64(double, int);
__device__
__attribute__((pure))
double __ocml_rcbrt_f64(double);
__device__
__attribute__((const))
double __ocml_remainder_f64(double, double);
__device__
double __ocml_remquo_f64(
double, double, __attribute__((address_space(5))) int*);
// r-prefixed variants are reciprocal forms (e.g. rhypot -- presumably
// 1/hypot; matches CUDA rhypot/rnorm3d/rnorm4d naming).
__device__
__attribute__((const))
double __ocml_rhypot_f64(double, double);
__device__
__attribute__((const))
double __ocml_rint_f64(double);
__device__
__attribute__((const))
double __ocml_rlen3_f64(double, double, double);
__device__
__attribute__((const))
double __ocml_rlen4_f64(double, double, double, double);
__device__
__attribute__((const))
double __ocml_round_f64(double);
__device__
__attribute__((pure))
double __ocml_rsqrt_f64(double);
__device__
__attribute__((const))
double __ocml_scalb_f64(double, double);
__device__
__attribute__((const))
double __ocml_scalbn_f64(double, int);
__device__
__attribute__((const))
int __ocml_signbit_f64(double);
// sincos/sincospi: presumably return sin and store cos through the pointer,
// matching the libm sincos convention -- confirm against OCML sources.
__device__
double __ocml_sincos_f64(double, __attribute__((address_space(5))) double*);
__device__
double __ocml_sincospi_f64(double, __attribute__((address_space(5))) double*);
__device__
double __ocml_sin_f64(double);
__device__
__attribute__((pure))
double __ocml_sinh_f64(double);
__device__
double __ocml_sinpi_f64(double);
__device__
__attribute__((const))
double __ocml_sqrt_f64(double);
__device__
double __ocml_tan_f64(double);
__device__
__attribute__((pure))
double __ocml_tanh_f64(double);
__device__
double __ocml_tgamma_f64(double);
__device__
__attribute__((const))
double __ocml_trunc_f64(double);
// y0/y1: Bessel functions of the second kind (cf. POSIX y0/y1).
__device__
double __ocml_y0_f64(double);
__device__
double __ocml_y1_f64(double);
  11430. // BEGIN INTRINSICS
// Explicitly-rounded f64 arithmetic intrinsics. Suffix selects the IEEE-754
// rounding mode: _rte = to nearest even, _rtn = toward -infinity,
// _rtp = toward +infinity, _rtz = toward zero.
__device__
__attribute__((const))
double __ocml_add_rte_f64(double, double);
__device__
__attribute__((const))
double __ocml_add_rtn_f64(double, double);
__device__
__attribute__((const))
double __ocml_add_rtp_f64(double, double);
__device__
__attribute__((const))
double __ocml_add_rtz_f64(double, double);
__device__
__attribute__((const))
double __ocml_sub_rte_f64(double, double);
__device__
__attribute__((const))
double __ocml_sub_rtn_f64(double, double);
__device__
__attribute__((const))
double __ocml_sub_rtp_f64(double, double);
__device__
__attribute__((const))
double __ocml_sub_rtz_f64(double, double);
__device__
__attribute__((const))
double __ocml_mul_rte_f64(double, double);
__device__
__attribute__((const))
double __ocml_mul_rtn_f64(double, double);
__device__
__attribute__((const))
double __ocml_mul_rtp_f64(double, double);
__device__
__attribute__((const))
double __ocml_mul_rtz_f64(double, double);
__device__
__attribute__((const))
double __ocml_div_rte_f64(double, double);
__device__
__attribute__((const))
double __ocml_div_rtn_f64(double, double);
__device__
__attribute__((const))
double __ocml_div_rtp_f64(double, double);
__device__
__attribute__((const))
double __ocml_div_rtz_f64(double, double);
__device__
__attribute__((const))
double __ocml_sqrt_rte_f64(double);
__device__
__attribute__((const))
double __ocml_sqrt_rtn_f64(double);
__device__
__attribute__((const))
double __ocml_sqrt_rtp_f64(double);
__device__
__attribute__((const))
double __ocml_sqrt_rtz_f64(double);
__device__
__attribute__((const))
double __ocml_fma_rte_f64(double, double, double);
__device__
__attribute__((const))
double __ocml_fma_rtn_f64(double, double, double);
__device__
__attribute__((const))
double __ocml_fma_rtp_f64(double, double, double);
__device__
__attribute__((const))
double __ocml_fma_rtz_f64(double, double, double);
  11503. // END INTRINSICS
  11504. // END DOUBLE
  11505. #endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
  11506. #if defined(__cplusplus)
  11507. } // extern "C"
  11508. #endif
  11509. /*
  11510. Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
  11511. Permission is hereby granted, free of charge, to any person obtaining a copy
  11512. of this software and associated documentation files (the "Software"), to deal
  11513. in the Software without restriction, including without limitation the rights
  11514. to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  11515. copies of the Software, and to permit persons to whom the Software is
  11516. furnished to do so, subject to the following conditions:
  11517. The above copyright notice and this permission notice shall be included in
  11518. all copies or substantial portions of the Software.
  11519. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  11520. IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  11521. FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  11522. AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  11523. LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  11524. OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  11525. THE SOFTWARE.
  11526. */
  11527. #pragma once
  11528. // /*
  11529. // Half Math Functions
  11530. // */
  11531. #if !defined(__HIPCC_RTC__)
  11532. #include "host_defines.h"
  11533. #endif
  11534. #ifndef __CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
  11535. extern "C"
  11536. {
// Half-precision (_Float16) OCML math declarations, scalar and 2-wide vector.
// const/pure attributes have the same meaning as for the f64 declarations.
__device__ __attribute__((const)) _Float16 __ocml_ceil_f16(_Float16);
__device__ _Float16 __ocml_cos_f16(_Float16);
__device__ __attribute__((pure)) _Float16 __ocml_exp_f16(_Float16);
__device__ __attribute__((pure)) _Float16 __ocml_exp10_f16(_Float16);
__device__ __attribute__((pure)) _Float16 __ocml_exp2_f16(_Float16);
__device__ __attribute__((const)) _Float16 __ocml_floor_f16(_Float16);
__device__ __attribute__((const))
_Float16 __ocml_fma_f16(_Float16, _Float16, _Float16);
__device__ __attribute__((const)) _Float16 __ocml_fabs_f16(_Float16);
__device__ __attribute__((const)) int __ocml_isinf_f16(_Float16);
__device__ __attribute__((const)) int __ocml_isnan_f16(_Float16);
__device__ __attribute__((pure)) _Float16 __ocml_log_f16(_Float16);
__device__ __attribute__((pure)) _Float16 __ocml_log10_f16(_Float16);
__device__ __attribute__((pure)) _Float16 __ocml_log2_f16(_Float16);
__device__ __attribute__((pure)) _Float16 __ocml_pown_f16(_Float16, int);
__device__ __attribute__((const)) _Float16 __ocml_rint_f16(_Float16);
__device__ __attribute__((const)) _Float16 __ocml_rsqrt_f16(_Float16);
__device__ _Float16 __ocml_sin_f16(_Float16);
__device__ __attribute__((const)) _Float16 __ocml_sqrt_f16(_Float16);
__device__ __attribute__((const)) _Float16 __ocml_trunc_f16(_Float16);
__device__ __attribute__((const)) _Float16 __ocml_fmax_f16(_Float16, _Float16);
__device__ __attribute__((const)) _Float16 __ocml_fmin_f16(_Float16, _Float16);
// Two-lane clang extended vectors of half / short.
typedef _Float16 __2f16 __attribute__((ext_vector_type(2)));
typedef short __2i16 __attribute__((ext_vector_type(2)));
#if defined(__clang__) && defined(__HIP__)
// Half2 dot product accumulated into float c; NOTE(review): exact meaning of
// the bool flag 's' is defined by OCKL -- confirm against its sources.
__device__ __attribute__((const)) float __ockl_fdot2(__2f16 a, __2f16 b, float c, bool s);
#endif
__device__ __attribute__((const)) __2f16 __ocml_ceil_2f16(__2f16);
__device__ __attribute__((const)) __2f16 __ocml_fabs_2f16(__2f16);
__device__ __2f16 __ocml_cos_2f16(__2f16);
__device__ __attribute__((pure)) __2f16 __ocml_exp_2f16(__2f16);
__device__ __attribute__((pure)) __2f16 __ocml_exp10_2f16(__2f16);
__device__ __attribute__((pure)) __2f16 __ocml_exp2_2f16(__2f16);
__device__ __attribute__((const)) __2f16 __ocml_floor_2f16(__2f16);
__device__ __attribute__((const)) __2f16 __ocml_fma_2f16(__2f16, __2f16, __2f16);
__device__ __attribute__((const)) __2i16 __ocml_isinf_2f16(__2f16);
__device__ __attribute__((const)) __2i16 __ocml_isnan_2f16(__2f16);
__device__ __attribute__((pure)) __2f16 __ocml_log_2f16(__2f16);
__device__ __attribute__((pure)) __2f16 __ocml_log10_2f16(__2f16);
__device__ __attribute__((pure)) __2f16 __ocml_log2_2f16(__2f16);
__device__ __attribute__((const)) __2f16 __ocml_rint_2f16(__2f16);
__device__ __attribute__((const)) __2f16 __ocml_rsqrt_2f16(__2f16);
__device__ __2f16 __ocml_sin_2f16(__2f16);
__device__ __attribute__((const)) __2f16 __ocml_sqrt_2f16(__2f16);
__device__ __attribute__((const)) __2f16 __ocml_trunc_2f16(__2f16);
// float -> half conversions with explicit rounding:
// rtn = toward -inf, rtp = toward +inf, rtz = toward zero.
__device__ __attribute__((const)) _Float16 __ocml_cvtrtn_f16_f32(float);
__device__ __attribute__((const)) _Float16 __ocml_cvtrtp_f16_f32(float);
__device__ __attribute__((const)) _Float16 __ocml_cvtrtz_f16_f32(float);
  11585. }
  11586. #endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
// TODO: remove these after they get into the clang header __clang_hip_libdevice_declares.h
  11588. extern "C" {
// Intentional re-declarations of five f16 entry points from the block above;
// kept until they land in clang's libdevice header. Redeclaring an identical
// prototype is harmless in C++.
__device__ __attribute__((const)) _Float16 __ocml_fmax_f16(_Float16, _Float16);
__device__ __attribute__((const)) _Float16 __ocml_fmin_f16(_Float16, _Float16);
__device__ __attribute__((const)) _Float16 __ocml_cvtrtn_f16_f32(float);
__device__ __attribute__((const)) _Float16 __ocml_cvtrtp_f16_f32(float);
__device__ __attribute__((const)) _Float16 __ocml_cvtrtz_f16_f32(float);
  11594. }
  11595. /*
  11596. Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
  11597. Permission is hereby granted, free of charge, to any person obtaining a copy
  11598. of this software and associated documentation files (the "Software"), to deal
  11599. in the Software without restriction, including without limitation the rights
  11600. to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  11601. copies of the Software, and to permit persons to whom the Software is
  11602. furnished to do so, subject to the following conditions:
  11603. The above copyright notice and this permission notice shall be included in
  11604. all copies or substantial portions of the Software.
  11605. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  11606. IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  11607. FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  11608. AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  11609. LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  11610. OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  11611. THE SOFTWARE.
  11612. */
  11613. #pragma once
  11614. #ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_FP16_H
  11615. #define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_FP16_H
  11616. #if defined(__HIPCC_RTC__)
  11617. #define __HOST_DEVICE__ __device__
  11618. #else
  11619. #define __HOST_DEVICE__ __host__ __device__
  11620. #include <hip/amd_detail/amd_hip_common.h>
  11621. #include "hip/amd_detail/host_defines.h"
  11622. #include <assert.h>
  11623. #if defined(__cplusplus)
  11624. #include <algorithm>
  11625. #include <type_traits>
  11626. #include <utility>
  11627. #endif
  11628. #endif // !defined(__HIPCC_RTC__)
  11629. #if defined(__clang__) && defined(__HIP__)
// Two-element vector of _Float16 (clang ext_vector_type).
typedef _Float16 _Float16_2 __attribute__((ext_vector_type(2)));
// Raw bit-pattern view of a half: the same 16 bits of storage are accessible
// either as the arithmetic value (data) or as the raw encoding (x).
// The static_assert pins the layout assumption the union relies on.
struct __half_raw {
union {
static_assert(sizeof(_Float16) == sizeof(unsigned short), "");
_Float16 data;
unsigned short x;
};
};
// Raw view of a half2: element pair (x, y) overlaid on the two-lane vector
// (data).
struct __half2_raw {
union {
static_assert(sizeof(_Float16_2) == sizeof(unsigned short[2]), "");
struct {
__half_raw x;
__half_raw y;
};
_Float16_2 data;
};
};
  11648. #if defined(__cplusplus)
  11649. #if !defined(__HIPCC_RTC__)
  11650. #include "hip_fp16_math_fwd.h"
  11651. #include "amd_hip_vector_types.h"
  11652. #include "host_defines.h"
  11653. #include "amd_device_functions.h"
  11654. #include "amd_warp_functions.h"
  11655. #endif
// Make _Float16 satisfy std::is_floating_point so the SFINAE constraints in
// __half/__half2 accept it. NOTE(review): specializing standard type traits
// is formally undefined behavior per the C++ standard; this relies on it
// working in practice with HIP-supported toolchains.
namespace std
{
template<> struct is_floating_point<_Float16> : std::true_type {};
}
// Local shorthand for std::enable_if<...>::type (C++11-compatible).
template<bool cond, typename T = void>
using Enable_if_t = typename std::enable_if<cond, T>::type;
  11662. // BEGIN STRUCT __HALF
// Scalar IEEE binary16 wrapper. Conversions are gated by
// __HIP_NO_HALF_CONVERSIONS__ and arithmetic/comparison operators by
// __HIP_NO_HALF_OPERATORS__, mirroring CUDA's cuda_fp16.h knobs.
struct __half {
protected:
// Same 16 bits of storage viewed as the arithmetic value (data) or the raw
// encoding (__x).
union {
static_assert(sizeof(_Float16) == sizeof(unsigned short), "");
_Float16 data;
unsigned short __x;
};
public:
// CREATORS
__HOST_DEVICE__
__half() = default;
__HOST_DEVICE__
__half(const __half_raw& x) : data{x.data} {}
#if !defined(__HIP_NO_HALF_CONVERSIONS__)
// Implicit construction from _Float16 and from any floating-point type.
__HOST_DEVICE__
__half(decltype(data) x) : data{x} {}
template<
typename T,
Enable_if_t<std::is_floating_point<T>{}>* = nullptr>
__HOST_DEVICE__
__half(T x) : data{static_cast<_Float16>(x)} {}
#endif
__HOST_DEVICE__
__half(const __half&) = default;
__HOST_DEVICE__
__half(__half&&) = default;
__HOST_DEVICE__
~__half() = default;
// CREATORS - DEVICE ONLY
#if !defined(__HIP_NO_HALF_CONVERSIONS__)
// Implicit construction from integral types.
template<
typename T, Enable_if_t<std::is_integral<T>{}>* = nullptr>
__HOST_DEVICE__
__half(T x) : data{static_cast<_Float16>(x)} {}
#endif
// MANIPULATORS
__HOST_DEVICE__
__half& operator=(const __half&) = default;
__HOST_DEVICE__
__half& operator=(__half&&) = default;
__HOST_DEVICE__
__half& operator=(const __half_raw& x)
{
data = x.data;
return *this;
}
// volatile overloads allow assignment through volatile-qualified halves;
// note the ones below this first overload carry no __HOST_DEVICE__ marker.
__HOST_DEVICE__
volatile __half& operator=(const __half_raw& x) volatile
{
data = x.data;
return *this;
}
volatile __half& operator=(const volatile __half_raw& x) volatile
{
data = x.data;
return *this;
}
__half& operator=(__half_raw&& x)
{
data = x.data;
return *this;
}
volatile __half& operator=(__half_raw&& x) volatile
{
data = x.data;
return *this;
}
volatile __half& operator=(volatile __half_raw&& x) volatile
{
data = x.data;
return *this;
}
#if !defined(__HIP_NO_HALF_CONVERSIONS__)
// Narrowing assignment from any floating-point type.
template<
typename T,
Enable_if_t<std::is_floating_point<T>{}>* = nullptr>
__HOST_DEVICE__
__half& operator=(T x)
{
data = static_cast<_Float16>(x);
return *this;
}
#endif
// MANIPULATORS - DEVICE ONLY
#if !defined(__HIP_NO_HALF_CONVERSIONS__)
// Assignment from integral types (device-only, unlike the ctor above).
template<
typename T, Enable_if_t<std::is_integral<T>{}>* = nullptr>
__device__
__half& operator=(T x)
{
data = static_cast<_Float16>(x);
return *this;
}
#endif
#if !defined(__HIP_NO_HALF_OPERATORS__)
// Compound arithmetic in native _Float16; device-only.
__device__
__half& operator+=(const __half& x)
{
data += x.data;
return *this;
}
__device__
__half& operator-=(const __half& x)
{
data -= x.data;
return *this;
}
__device__
__half& operator*=(const __half& x)
{
data *= x.data;
return *this;
}
__device__
__half& operator/=(const __half& x)
{
data /= x.data;
return *this;
}
__device__
__half& operator++() { ++data; return *this; }
__device__
__half operator++(int)
{
__half tmp{*this};
++*this;
return tmp;
}
__device__
__half& operator--() { --data; return *this; }
__device__
__half operator--(int)
{
__half tmp{*this};
--*this;
return tmp;
}
#endif
// ACCESSORS
#if !defined(__HIP_NO_HALF_CONVERSIONS__)
// Implicit conversion to any floating-point type.
template<
typename T,
Enable_if_t<std::is_floating_point<T>{}>* = nullptr>
__HOST_DEVICE__
operator T() const { return data; }
#endif
__HOST_DEVICE__
operator __half_raw() const { return __half_raw{data}; }
__HOST_DEVICE__
operator __half_raw() const volatile
{
return __half_raw{data};
}
#if !defined(__HIP_NO_HALF_CONVERSIONS__)
// Implicit conversion to integral types (truncates like any FP->int cast).
template<
typename T, Enable_if_t<std::is_integral<T>{}>* = nullptr>
__HOST_DEVICE__
operator T() const { return data; }
#endif
#if !defined(__HIP_NO_HALF_OPERATORS__)
__device__
__half operator+() const { return *this; }
__device__
__half operator-() const
{
__half tmp{*this};
tmp.data = -tmp.data;
return tmp;
}
#endif
// FRIENDS
#if !defined(__HIP_NO_HALF_OPERATORS__)
// Binary arithmetic is expressed via the compound operators above.
friend
inline
__device__
__half operator+(const __half& x, const __half& y)
{
return __half{x} += y;
}
friend
inline
__device__
__half operator-(const __half& x, const __half& y)
{
return __half{x} -= y;
}
friend
inline
__device__
__half operator*(const __half& x, const __half& y)
{
return __half{x} *= y;
}
friend
inline
__device__
__half operator/(const __half& x, const __half& y)
{
return __half{x} /= y;
}
// ==, < use the underlying _Float16 ordered compare (false if either
// operand is NaN).
friend
inline
__device__
bool operator==(const __half& x, const __half& y)
{
return x.data == y.data;
}
friend
inline
__device__
bool operator!=(const __half& x, const __half& y)
{
return !(x == y);
}
friend
inline
__device__
bool operator<(const __half& x, const __half& y)
{
return x.data < y.data;
}
friend
inline
__device__
bool operator>(const __half& x, const __half& y)
{
return y.data < x.data;
}
// NOTE(review): <= and >= are derived by negating <, so with a NaN operand
// they return true -- unlike IEEE <=/>= (and CUDA __hle/__hge), which are
// false for NaN. Behavior kept as-is; callers may rely on it.
friend
inline
__device__
bool operator<=(const __half& x, const __half& y)
{
return !(y < x);
}
friend
inline
__device__
bool operator>=(const __half& x, const __half& y)
{
return !(x < y);
}
#endif // !defined(__HIP_NO_HALF_OPERATORS__)
};
  11907. // END STRUCT __HALF
  11908. // BEGIN STRUCT __HALF2
// Pair of halves stored as one two-lane _Float16 vector; arithmetic is
// lane-wise, comparisons reduce to a single bool (true only for both lanes).
struct __half2 {
public:
// Element pair (x, y) overlaid on the two-lane vector (data).
union {
static_assert(
sizeof(_Float16_2) == sizeof(unsigned short[2]), "");
struct {
__half x;
__half y;
};
_Float16_2 data;
};
// CREATORS
__HOST_DEVICE__
__half2() = default;
__HOST_DEVICE__
__half2(const __half2_raw& xx) : data{xx.data} {}
__HOST_DEVICE__
__half2(decltype(data) xx) : data{xx} {}
__HOST_DEVICE__
__half2(const __half& xx, const __half& yy)
:
data{static_cast<__half_raw>(xx).data,
static_cast<__half_raw>(yy).data}
{}
__HOST_DEVICE__
__half2(const __half2&) = default;
__HOST_DEVICE__
__half2(__half2&&) = default;
__HOST_DEVICE__
~__half2() = default;
// MANIPULATORS
__HOST_DEVICE__
__half2& operator=(const __half2&) = default;
__HOST_DEVICE__
__half2& operator=(__half2&&) = default;
__HOST_DEVICE__
__half2& operator=(const __half2_raw& xx)
{
data = xx.data;
return *this;
}
// MANIPULATORS - DEVICE ONLY
#if !defined(__HIP_NO_HALF_OPERATORS__)
// Lane-wise compound arithmetic on the vector type.
__device__
__half2& operator+=(const __half2& xx)
{
data += xx.data;
return *this;
}
__device__
__half2& operator-=(const __half2& xx)
{
data -= xx.data;
return *this;
}
__device__
__half2& operator*=(const __half2& xx)
{
data *= xx.data;
return *this;
}
__device__
__half2& operator/=(const __half2& xx)
{
data /= xx.data;
return *this;
}
// ++/-- add or subtract 1 from BOTH lanes.
__device__
__half2& operator++() { return *this += _Float16_2{1, 1}; }
__device__
__half2 operator++(int)
{
__half2 tmp{*this};
++*this;
return tmp;
}
__device__
__half2& operator--() { return *this -= _Float16_2{1, 1}; }
__device__
__half2 operator--(int)
{
__half2 tmp{*this};
--*this;
return tmp;
}
#endif
// ACCESSORS
__HOST_DEVICE__
operator decltype(data)() const { return data; }
__HOST_DEVICE__
operator __half2_raw() const {
__half2_raw r;
r.data = data;
return r;
}
// ACCESSORS - DEVICE ONLY
#if !defined(__HIP_NO_HALF_OPERATORS__)
__device__
__half2 operator+() const { return *this; }
__device__
__half2 operator-() const
{
__half2 tmp{*this};
tmp.data = -tmp.data;
return tmp;
}
#endif
// FRIENDS
#if !defined(__HIP_NO_HALF_OPERATORS__)
friend
inline
__device__
__half2 operator+(const __half2& xx, const __half2& yy)
{
return __half2{xx} += yy;
}
friend
inline
__device__
__half2 operator-(const __half2& xx, const __half2& yy)
{
return __half2{xx} -= yy;
}
friend
inline
__device__
__half2 operator*(const __half2& xx, const __half2& yy)
{
return __half2{xx} *= yy;
}
friend
inline
__device__
__half2 operator/(const __half2& xx, const __half2& yy)
{
return __half2{xx} /= yy;
}
// == / < are true only when the lane-wise compare holds in BOTH lanes.
// NOTE(review): !=, <=, >= are negations of that all-lanes reduction, so
// e.g. xx <= yy is true whenever (yy < xx) fails in at least one lane.
friend
inline
__device__
bool operator==(const __half2& xx, const __half2& yy)
{
auto r = xx.data == yy.data;
return r.x != 0 && r.y != 0;
}
friend
inline
__device__
bool operator!=(const __half2& xx, const __half2& yy)
{
return !(xx == yy);
}
friend
inline
__device__
bool operator<(const __half2& xx, const __half2& yy)
{
auto r = xx.data < yy.data;
return r.x != 0 && r.y != 0;
}
friend
inline
__device__
bool operator>(const __half2& xx, const __half2& yy)
{
return yy < xx;
}
friend
inline
__device__
bool operator<=(const __half2& xx, const __half2& yy)
{
return !(yy < xx);
}
friend
inline
__device__
bool operator>=(const __half2& xx, const __half2& yy)
{
return !(xx < yy);
}
#endif // !defined(__HIP_NO_HALF_OPERATORS__)
};
  12092. // END STRUCT __HALF2
  12093. namespace
  12094. {
// Construct a half2 from two halves (low, high).
inline
__HOST_DEVICE__
__half2 make_half2(__half x, __half y)
{
return __half2{x, y};
}
// Extract the low (first) lane.
inline
__HOST_DEVICE__
__half __low2half(__half2 x)
{
return __half{__half_raw{static_cast<__half2_raw>(x).data.x}};
}
// Extract the high (second) lane.
inline
__HOST_DEVICE__
__half __high2half(__half2 x)
{
return __half{__half_raw{static_cast<__half2_raw>(x).data.y}};
}
// Broadcast one half into both lanes.
inline
__HOST_DEVICE__
__half2 __half2half2(__half x)
{
return __half2{x, x};
}
// CUDA-compatible spelling of make_half2.
inline
__HOST_DEVICE__
__half2 __halves2half2(__half x, __half y)
{
return __half2{x, y};
}
// Duplicate the low lane into both lanes.
inline
__HOST_DEVICE__
__half2 __low2half2(__half2 x)
{
return __half2{
_Float16_2{
static_cast<__half2_raw>(x).data.x,
static_cast<__half2_raw>(x).data.x}};
}
// Duplicate the high lane into both lanes.
inline
__HOST_DEVICE__
__half2 __high2half2(__half2 x)
{
return __half2{
_Float16_2{
static_cast<__half2_raw>(x).data.y,
static_cast<__half2_raw>(x).data.y}};
}
// Combine the low lanes of x and y into (x.lo, y.lo).
inline
__HOST_DEVICE__
__half2 __lows2half2(__half2 x, __half2 y)
{
return __half2{
_Float16_2{
static_cast<__half2_raw>(x).data.x,
static_cast<__half2_raw>(y).data.x}};
}
// Combine the high lanes of x and y into (x.hi, y.hi).
inline
__HOST_DEVICE__
__half2 __highs2half2(__half2 x, __half2 y)
{
return __half2{
_Float16_2{
static_cast<__half2_raw>(x).data.y,
static_cast<__half2_raw>(x).data.y}};
}
// Swap the two lanes.
inline
__HOST_DEVICE__
__half2 __lowhigh2highlow(__half2 x)
{
return __half2{
_Float16_2{
static_cast<__half2_raw>(x).data.y,
static_cast<__half2_raw>(x).data.x}};
}
// Bitcasts
// Reinterpret the 16-bit encoding of a half as an integer (and back) via the
// __half_raw union -- a bit-pattern copy, not a numeric conversion.
inline
__device__
short __half_as_short(__half x)
{
return static_cast<__half_raw>(x).x;
}
inline
__device__
unsigned short __half_as_ushort(__half x)
{
return static_cast<__half_raw>(x).x;
}
// Reinterpret a 16-bit integer as the encoding of a half.
inline
__device__
__half __short_as_half(short x)
{
__half_raw r; r.x = x;
return r;
}
inline
__device__
__half __ushort_as_half(unsigned short x)
{
__half_raw r; r.x = x;
return r;
}
// float -> half | half2
// Default conversion: the compiler's float -> _Float16 cast, which rounds per
// the default FP environment (round to nearest even), i.e. _rn semantics.
inline
__HOST_DEVICE__
__half __float2half(float x)
{
return __half_raw{static_cast<_Float16>(x)};
}
inline
__HOST_DEVICE__
__half __float2half_rn(float x)
{
return __half_raw{static_cast<_Float16>(x)};
}
#if !defined(__HIPCC_RTC__)
// TODO: rounding behaviour is not correct for host functions.
// Host fallbacks: all three use the default cast, so _rz/_rd/_ru do NOT honor
// their advertised rounding mode on the host (see TODO above).
inline
__host__
__half __float2half_rz(float x)
{
return __half_raw{static_cast<_Float16>(x)};
}
inline
__host__
__half __float2half_rd(float x)
{
return __half_raw{static_cast<_Float16>(x)};
}
inline
__host__
__half __float2half_ru(float x)
{
return __half_raw{static_cast<_Float16>(x)};
}
#endif
// Device versions route through the OCML conversions that implement the
// requested rounding: rtz = toward zero, rtn = toward -inf, rtp = toward +inf.
inline
__device__
__half __float2half_rz(float x)
{
return __half_raw{__ocml_cvtrtz_f16_f32(x)};
}
inline
__device__
__half __float2half_rd(float x)
{
return __half_raw{__ocml_cvtrtn_f16_f32(x)};
}
inline
__device__
__half __float2half_ru(float x)
{
return __half_raw{__ocml_cvtrtp_f16_f32(x)};
}
// Broadcast one converted float into both lanes of a half2.
inline
__HOST_DEVICE__
__half2 __float2half2_rn(float x)
{
return __half2{
_Float16_2{
static_cast<_Float16>(x), static_cast<_Float16>(x)}};
}
// Convert two floats into the (low, high) lanes of a half2.
inline
__HOST_DEVICE__
__half2 __floats2half2_rn(float x, float y)
{
return __half2{_Float16_2{
static_cast<_Float16>(x), static_cast<_Float16>(y)}};
}
// float2 -> half2, lane for lane.
inline
__HOST_DEVICE__
__half2 __float22half2_rn(float2 x)
{
return __floats2half2_rn(x.x, x.y);
}
// half | half2 -> float
// Widening conversions are exact (every _Float16 value is representable
// as float), so no rounding-mode variants are needed here.
inline
__HOST_DEVICE__
float __half2float(__half x)
{
return static_cast<__half_raw>(x).data;
}
// Extract and widen the low lane of a __half2.
inline
__HOST_DEVICE__
float __low2float(__half2 x)
{
return static_cast<__half2_raw>(x).data.x;
}
// Extract and widen the high lane of a __half2.
inline
__HOST_DEVICE__
float __high2float(__half2 x)
{
return static_cast<__half2_raw>(x).data.y;
}
// Widen both lanes into a float2.
inline
__HOST_DEVICE__
float2 __half22float2(__half2 x)
{
return make_float2(
static_cast<__half2_raw>(x).data.x,
static_cast<__half2_raw>(x).data.y);
}
// half -> int
// NOTE(review): every half->integer variant below uses a plain C++
// conversion, which truncates toward zero — so only the _rz flavour matches
// its rounding suffix exactly; _rn/_rd/_ru are approximations. Likewise the
// integer->half variants all use the default conversion regardless of
// suffix. Confirm against dedicated OCML converters if exact rounding
// modes are required.
inline
__device__
int __half2int_rn(__half x)
{
return static_cast<__half_raw>(x).data;
}
inline
__device__
int __half2int_rz(__half x)
{
return static_cast<__half_raw>(x).data;
}
inline
__device__
int __half2int_rd(__half x)
{
return static_cast<__half_raw>(x).data;
}
inline
__device__
int __half2int_ru(__half x)
{
return static_cast<__half_raw>(x).data;
}
// int -> half
inline
__device__
__half __int2half_rn(int x)
{
return __half_raw{static_cast<_Float16>(x)};
}
inline
__device__
__half __int2half_rz(int x)
{
return __half_raw{static_cast<_Float16>(x)};
}
inline
__device__
__half __int2half_rd(int x)
{
return __half_raw{static_cast<_Float16>(x)};
}
inline
__device__
__half __int2half_ru(int x)
{
return __half_raw{static_cast<_Float16>(x)};
}
// half -> short
inline
__device__
short __half2short_rn(__half x)
{
return static_cast<__half_raw>(x).data;
}
inline
__device__
short __half2short_rz(__half x)
{
return static_cast<__half_raw>(x).data;
}
inline
__device__
short __half2short_rd(__half x)
{
return static_cast<__half_raw>(x).data;
}
inline
__device__
short __half2short_ru(__half x)
{
return static_cast<__half_raw>(x).data;
}
// short -> half
inline
__device__
__half __short2half_rn(short x)
{
return __half_raw{static_cast<_Float16>(x)};
}
inline
__device__
__half __short2half_rz(short x)
{
return __half_raw{static_cast<_Float16>(x)};
}
inline
__device__
__half __short2half_rd(short x)
{
return __half_raw{static_cast<_Float16>(x)};
}
inline
__device__
__half __short2half_ru(short x)
{
return __half_raw{static_cast<_Float16>(x)};
}
// half -> long long
inline
__device__
long long __half2ll_rn(__half x)
{
return static_cast<__half_raw>(x).data;
}
inline
__device__
long long __half2ll_rz(__half x)
{
return static_cast<__half_raw>(x).data;
}
inline
__device__
long long __half2ll_rd(__half x)
{
return static_cast<__half_raw>(x).data;
}
inline
__device__
long long __half2ll_ru(__half x)
{
return static_cast<__half_raw>(x).data;
}
// long long -> half
inline
__device__
__half __ll2half_rn(long long x)
{
return __half_raw{static_cast<_Float16>(x)};
}
inline
__device__
__half __ll2half_rz(long long x)
{
return __half_raw{static_cast<_Float16>(x)};
}
inline
__device__
__half __ll2half_rd(long long x)
{
return __half_raw{static_cast<_Float16>(x)};
}
inline
__device__
__half __ll2half_ru(long long x)
{
return __half_raw{static_cast<_Float16>(x)};
}
// half -> unsigned int
inline
__device__
unsigned int __half2uint_rn(__half x)
{
return static_cast<__half_raw>(x).data;
}
inline
__device__
unsigned int __half2uint_rz(__half x)
{
return static_cast<__half_raw>(x).data;
}
inline
__device__
unsigned int __half2uint_rd(__half x)
{
return static_cast<__half_raw>(x).data;
}
inline
__device__
unsigned int __half2uint_ru(__half x)
{
return static_cast<__half_raw>(x).data;
}
// unsigned int -> half
inline
__device__
__half __uint2half_rn(unsigned int x)
{
return __half_raw{static_cast<_Float16>(x)};
}
inline
__device__
__half __uint2half_rz(unsigned int x)
{
return __half_raw{static_cast<_Float16>(x)};
}
inline
__device__
__half __uint2half_rd(unsigned int x)
{
return __half_raw{static_cast<_Float16>(x)};
}
inline
__device__
__half __uint2half_ru(unsigned int x)
{
return __half_raw{static_cast<_Float16>(x)};
}
// half -> unsigned short
inline
__device__
unsigned short __half2ushort_rn(__half x)
{
return static_cast<__half_raw>(x).data;
}
inline
__device__
unsigned short __half2ushort_rz(__half x)
{
return static_cast<__half_raw>(x).data;
}
inline
__device__
unsigned short __half2ushort_rd(__half x)
{
return static_cast<__half_raw>(x).data;
}
inline
__device__
unsigned short __half2ushort_ru(__half x)
{
return static_cast<__half_raw>(x).data;
}
// unsigned short -> half
inline
__device__
__half __ushort2half_rn(unsigned short x)
{
return __half_raw{static_cast<_Float16>(x)};
}
inline
__device__
__half __ushort2half_rz(unsigned short x)
{
return __half_raw{static_cast<_Float16>(x)};
}
inline
__device__
__half __ushort2half_rd(unsigned short x)
{
return __half_raw{static_cast<_Float16>(x)};
}
inline
__device__
__half __ushort2half_ru(unsigned short x)
{
return __half_raw{static_cast<_Float16>(x)};
}
// half -> unsigned long long
inline
__device__
unsigned long long __half2ull_rn(__half x)
{
return static_cast<__half_raw>(x).data;
}
inline
__device__
unsigned long long __half2ull_rz(__half x)
{
return static_cast<__half_raw>(x).data;
}
inline
__device__
unsigned long long __half2ull_rd(__half x)
{
return static_cast<__half_raw>(x).data;
}
inline
__device__
unsigned long long __half2ull_ru(__half x)
{
return static_cast<__half_raw>(x).data;
}
// unsigned long long -> half
inline
__device__
__half __ull2half_rn(unsigned long long x)
{
return __half_raw{static_cast<_Float16>(x)};
}
inline
__device__
__half __ull2half_rz(unsigned long long x)
{
return __half_raw{static_cast<_Float16>(x)};
}
inline
__device__
__half __ull2half_rd(unsigned long long x)
{
return __half_raw{static_cast<_Float16>(x)};
}
inline
__device__
__half __ull2half_ru(unsigned long long x)
{
return __half_raw{static_cast<_Float16>(x)};
}
// Load primitives
// The CUDA cache-hint load intrinsics (__ldg/__ldcg/__ldca/__ldcs) are all
// implemented here as plain dereferences — no cache-control distinction is
// made on this platform.
inline
__device__
__half __ldg(const __half* ptr) { return *ptr; }
inline
__device__
__half __ldcg(const __half* ptr) { return *ptr; }
inline
__device__
__half __ldca(const __half* ptr) { return *ptr; }
inline
__device__
__half __ldcs(const __half* ptr) { return *ptr; }
inline
__HOST_DEVICE__
__half2 __ldg(const __half2* ptr) { return *ptr; }
inline
__HOST_DEVICE__
__half2 __ldcg(const __half2* ptr) { return *ptr; }
inline
__HOST_DEVICE__
__half2 __ldca(const __half2* ptr) { return *ptr; }
inline
__HOST_DEVICE__
__half2 __ldcs(const __half2* ptr) { return *ptr; }
// Relations
// Ordered scalar comparisons: these use the built-in _Float16 comparison
// operators, so any NaN operand makes the result false (IEEE ordered
// semantics).
inline
__device__
bool __heq(__half x, __half y)
{
return static_cast<__half_raw>(x).data ==
static_cast<__half_raw>(y).data;
}
inline
__device__
bool __hne(__half x, __half y)
{
return static_cast<__half_raw>(x).data !=
static_cast<__half_raw>(y).data;
}
inline
__device__
bool __hle(__half x, __half y)
{
return static_cast<__half_raw>(x).data <=
static_cast<__half_raw>(y).data;
}
inline
__device__
bool __hge(__half x, __half y)
{
return static_cast<__half_raw>(x).data >=
static_cast<__half_raw>(y).data;
}
inline
__device__
bool __hlt(__half x, __half y)
{
return static_cast<__half_raw>(x).data <
static_cast<__half_raw>(y).data;
}
inline
__device__
bool __hgt(__half x, __half y)
{
return static_cast<__half_raw>(x).data >
static_cast<__half_raw>(y).data;
}
  12665. inline __device__
  12666. bool __hequ(__half x, __half y) {
  12667. return !(static_cast<__half_raw>(x).data < static_cast<__half_raw>(y).data) &&
  12668. !(static_cast<__half_raw>(x).data > static_cast<__half_raw>(y).data);
  12669. }
// Unordered comparisons: each is the negation of the opposite ordered
// comparison, so a NaN operand makes the result true (IEEE unordered
// semantics).
inline __device__
bool __hneu(__half x, __half y) {
return !(static_cast<__half_raw>(x).data == static_cast<__half_raw>(y).data);
}
inline __device__
bool __hleu(__half x, __half y) {
return !(static_cast<__half_raw>(x).data > static_cast<__half_raw>(y).data);
}
inline
__device__
bool __hgeu(__half x, __half y) {
return !(static_cast<__half_raw>(x).data < static_cast<__half_raw>(y).data);
}
inline
__device__
bool __hltu(__half x, __half y) {
return !(static_cast<__half_raw>(x).data >= static_cast<__half_raw>(y).data);
}
inline
__device__
bool __hgtu(__half x, __half y) {
return !(static_cast<__half_raw>(x).data <= static_cast<__half_raw>(y).data);
}
// Packed (per-lane) comparisons. The vector comparison yields 0 / -1 per
// lane; negating gives 0 / 1, which __builtin_convertvector turns into
// 0.0 / 1.0 in each _Float16 lane of the result.
inline
__HOST_DEVICE__
__half2 __heq2(__half2 x, __half2 y)
{
auto r = static_cast<__half2_raw>(x).data ==
static_cast<__half2_raw>(y).data;
return __builtin_convertvector(-r, _Float16_2);
}
inline
__HOST_DEVICE__
__half2 __hne2(__half2 x, __half2 y)
{
auto r = static_cast<__half2_raw>(x).data !=
static_cast<__half2_raw>(y).data;
return __builtin_convertvector(-r, _Float16_2);
}
inline
__HOST_DEVICE__
__half2 __hle2(__half2 x, __half2 y)
{
auto r = static_cast<__half2_raw>(x).data <=
static_cast<__half2_raw>(y).data;
return __builtin_convertvector(-r, _Float16_2);
}
inline
__HOST_DEVICE__
__half2 __hge2(__half2 x, __half2 y)
{
auto r = static_cast<__half2_raw>(x).data >=
static_cast<__half2_raw>(y).data;
return __builtin_convertvector(-r, _Float16_2);
}
inline
__HOST_DEVICE__
__half2 __hlt2(__half2 x, __half2 y)
{
auto r = static_cast<__half2_raw>(x).data <
static_cast<__half2_raw>(y).data;
return __builtin_convertvector(-r, _Float16_2);
}
inline
__HOST_DEVICE__
__half2 __hgt2(__half2 x, __half2 y)
{
auto r = static_cast<__half2_raw>(x).data >
static_cast<__half2_raw>(y).data;
return __builtin_convertvector(-r, _Float16_2);
}
// Unordered packed variants: negated opposite comparisons, so NaN lanes
// compare true (lane result 1.0).
inline __HOST_DEVICE__
__half2 __hequ2(__half2 x, __half2 y) {
auto r = !(static_cast<__half2_raw>(x).data < static_cast<__half2_raw>(y).data) &&
!(static_cast<__half2_raw>(x).data > static_cast<__half2_raw>(y).data);
return __builtin_convertvector(-r, _Float16_2);
}
inline
__HOST_DEVICE__
__half2 __hneu2(__half2 x, __half2 y) {
auto r = !(static_cast<__half2_raw>(x).data == static_cast<__half2_raw>(y).data);
return __builtin_convertvector(-r, _Float16_2);
}
inline
__HOST_DEVICE__
__half2 __hleu2(__half2 x, __half2 y) {
auto r = !(static_cast<__half2_raw>(x).data > static_cast<__half2_raw>(y).data);
return __builtin_convertvector(-r, _Float16_2);
}
inline
__HOST_DEVICE__
__half2 __hgeu2(__half2 x, __half2 y) {
auto r = !(static_cast<__half2_raw>(x).data < static_cast<__half2_raw>(y).data);
return __builtin_convertvector(-r, _Float16_2);
}
inline
__HOST_DEVICE__
__half2 __hltu2(__half2 x, __half2 y) {
auto r = !(static_cast<__half2_raw>(x).data >= static_cast<__half2_raw>(y).data);
return __builtin_convertvector(-r, _Float16_2);
}
inline
__HOST_DEVICE__
__half2 __hgtu2(__half2 x, __half2 y) {
auto r = !(static_cast<__half2_raw>(x).data <= static_cast<__half2_raw>(y).data);
return __builtin_convertvector(-r, _Float16_2);
}
// Boolean reductions over packed comparisons: true only when BOTH lanes
// of the corresponding per-lane comparison are true (non-zero).
inline
__HOST_DEVICE__
bool __hbeq2(__half2 x, __half2 y)
{
auto r = static_cast<__half2_raw>(__heq2(x, y));
return r.data.x != 0 && r.data.y != 0;
}
inline
__HOST_DEVICE__
bool __hbne2(__half2 x, __half2 y)
{
auto r = static_cast<__half2_raw>(__hne2(x, y));
return r.data.x != 0 && r.data.y != 0;
}
inline
__HOST_DEVICE__
bool __hble2(__half2 x, __half2 y)
{
auto r = static_cast<__half2_raw>(__hle2(x, y));
return r.data.x != 0 && r.data.y != 0;
}
inline
__HOST_DEVICE__
bool __hbge2(__half2 x, __half2 y)
{
auto r = static_cast<__half2_raw>(__hge2(x, y));
return r.data.x != 0 && r.data.y != 0;
}
inline
__HOST_DEVICE__
bool __hblt2(__half2 x, __half2 y)
{
auto r = static_cast<__half2_raw>(__hlt2(x, y));
return r.data.x != 0 && r.data.y != 0;
}
inline
__HOST_DEVICE__
bool __hbgt2(__half2 x, __half2 y)
{
auto r = static_cast<__half2_raw>(__hgt2(x, y));
return r.data.x != 0 && r.data.y != 0;
}
// NOTE(review): the unordered reductions below simply alias the ordered
// ones, so NaN lanes yield false here even though the per-lane *u2
// comparisons would treat them as true — confirm this is the intended
// semantics for this platform.
inline
__HOST_DEVICE__
bool __hbequ2(__half2 x, __half2 y) { return __hbeq2(x, y); }
inline
__HOST_DEVICE__
bool __hbneu2(__half2 x, __half2 y) { return __hbne2(x, y); }
inline
__HOST_DEVICE__
bool __hbleu2(__half2 x, __half2 y) { return __hble2(x, y); }
inline
__HOST_DEVICE__
bool __hbgeu2(__half2 x, __half2 y) { return __hbge2(x, y); }
inline
__HOST_DEVICE__
bool __hbltu2(__half2 x, __half2 y) { return __hblt2(x, y); }
inline
__HOST_DEVICE__
bool __hbgtu2(__half2 x, __half2 y) { return __hbgt2(x, y); }
// Half-precision maximum via OCML fmax (IEEE fmax: a single NaN operand
// is ignored in favour of the non-NaN operand).
inline
__device__
__half __hmax(const __half x, const __half y) {
return __half_raw{__ocml_fmax_f16(static_cast<__half_raw>(x).data,
static_cast<__half_raw>(y).data)};
}
  12843. inline
  12844. __device__
  12845. __half __hmax_nan(const __half x, const __half y) {
  12846. if(__ocml_isnan_f16(static_cast<__half_raw>(x).data)) {
  12847. return x;
  12848. } else if (__ocml_isnan_f16(static_cast<__half_raw>(y).data)) {
  12849. return y;
  12850. }
  12851. return __hmax(x, y);
  12852. }
// Half-precision minimum via OCML fmin (a single NaN operand is ignored
// in favour of the non-NaN operand).
inline
__device__
__half __hmin(const __half x, const __half y) {
return __half_raw{__ocml_fmin_f16(static_cast<__half_raw>(x).data,
static_cast<__half_raw>(y).data)};
}
// NaN-propagating minimum: returns the first NaN operand unchanged,
// otherwise defers to __hmin.
inline
__device__
__half __hmin_nan(const __half x, const __half y) {
if(__ocml_isnan_f16(static_cast<__half_raw>(x).data)) {
return x;
} else if (__ocml_isnan_f16(static_cast<__half_raw>(y).data)) {
return y;
}
return __hmin(x, y);
}
// Arithmetic
// Clamp to [0, 1]; used by the *_sat variants. If x is NaN both __hlt
// tests are false, so the NaN is returned unchanged.
inline
__device__
__half __clamp_01(__half x)
{
auto r = static_cast<__half_raw>(x);
if (__hlt(x, __half_raw{0})) return __half_raw{0};
if (__hlt(__half_raw{1}, x)) return __half_raw{1};
return r;
}
inline
__device__
__half __hadd(__half x, __half y)
{
return __half_raw{
static_cast<__half_raw>(x).data +
static_cast<__half_raw>(y).data};
}
inline
__device__
__half __habs(__half x)
{
return __half_raw{
__ocml_fabs_f16(static_cast<__half_raw>(x).data)};
}
inline
__device__
__half __hsub(__half x, __half y)
{
return __half_raw{
static_cast<__half_raw>(x).data -
static_cast<__half_raw>(y).data};
}
inline
__device__
__half __hmul(__half x, __half y)
{
return __half_raw{
static_cast<__half_raw>(x).data *
static_cast<__half_raw>(y).data};
}
// Saturating variants: compute, then clamp the result into [0, 1].
inline
__device__
__half __hadd_sat(__half x, __half y)
{
return __clamp_01(__hadd(x, y));
}
inline
__device__
__half __hsub_sat(__half x, __half y)
{
return __clamp_01(__hsub(x, y));
}
inline
__device__
__half __hmul_sat(__half x, __half y)
{
return __clamp_01(__hmul(x, y));
}
// Fused multiply-add (single rounding) via OCML.
inline
__device__
__half __hfma(__half x, __half y, __half z)
{
return __half_raw{__ocml_fma_f16(
static_cast<__half_raw>(x).data,
static_cast<__half_raw>(y).data,
static_cast<__half_raw>(z).data)};
}
inline
__device__
__half __hfma_sat(__half x, __half y, __half z)
{
return __clamp_01(__hfma(x, y, z));
}
inline
__device__
__half __hdiv(__half x, __half y)
{
return __half_raw{
static_cast<__half_raw>(x).data /
static_cast<__half_raw>(y).data};
}
// Packed arithmetic: element-wise operations on both _Float16 lanes via
// the vector operators.
inline
__HOST_DEVICE__
__half2 __hadd2(__half2 x, __half2 y)
{
return __half2{
static_cast<__half2_raw>(x).data +
static_cast<__half2_raw>(y).data};
}
inline
__HOST_DEVICE__
__half2 __habs2(__half2 x)
{
return __half2{
__ocml_fabs_2f16(static_cast<__half2_raw>(x).data)};
}
inline
__HOST_DEVICE__
__half2 __hsub2(__half2 x, __half2 y)
{
return __half2{
static_cast<__half2_raw>(x).data -
static_cast<__half2_raw>(y).data};
}
inline
__HOST_DEVICE__
__half2 __hmul2(__half2 x, __half2 y)
{
return __half2{
static_cast<__half2_raw>(x).data *
static_cast<__half2_raw>(y).data};
}
// Saturating packed variants: compute, then clamp each lane to [0, 1].
inline
__HOST_DEVICE__
__half2 __hadd2_sat(__half2 x, __half2 y)
{
auto r = static_cast<__half2_raw>(__hadd2(x, y));
return __half2{
__clamp_01(__half_raw{r.data.x}),
__clamp_01(__half_raw{r.data.y})};
}
inline
__HOST_DEVICE__
__half2 __hsub2_sat(__half2 x, __half2 y)
{
auto r = static_cast<__half2_raw>(__hsub2(x, y));
return __half2{
__clamp_01(__half_raw{r.data.x}),
__clamp_01(__half_raw{r.data.y})};
}
inline
__HOST_DEVICE__
__half2 __hmul2_sat(__half2 x, __half2 y)
{
auto r = static_cast<__half2_raw>(__hmul2(x, y));
return __half2{
__clamp_01(__half_raw{r.data.x}),
__clamp_01(__half_raw{r.data.y})};
}
// Packed fused multiply-add via OCML (single rounding per lane).
inline
__HOST_DEVICE__
__half2 __hfma2(__half2 x, __half2 y, __half2 z)
{
return __half2{__ocml_fma_2f16(x, y, z)};
}
inline
__HOST_DEVICE__
__half2 __hfma2_sat(__half2 x, __half2 y, __half2 z)
{
auto r = static_cast<__half2_raw>(__hfma2(x, y, z));
return __half2{
__clamp_01(__half_raw{r.data.x}),
__clamp_01(__half_raw{r.data.y})};
}
inline
__HOST_DEVICE__
__half2 __h2div(__half2 x, __half2 y)
{
return __half2{
static_cast<__half2_raw>(x).data /
static_cast<__half2_raw>(y).data};
}
// Math functions
// Mixed-precision dot product: dot(a, b) + c accumulated in float via the
// OCKL fdot2 builtin; `saturate` is forwarded to the builtin.
#if defined(__clang__) && defined(__HIP__)
inline
__device__
float amd_mixed_dot(__half2 a, __half2 b, float c, bool saturate) {
return __ockl_fdot2(static_cast<__half2_raw>(a).data,
static_cast<__half2_raw>(b).data,
c, saturate);
}
#endif
// Scalar half-precision math: thin wrappers over the corresponding OCML
// f16 routines (unwrap to _Float16, call, rewrap as __half).
inline
__device__
__half htrunc(__half x)
{
return __half_raw{
__ocml_trunc_f16(static_cast<__half_raw>(x).data)};
}
inline
__device__
__half hceil(__half x)
{
return __half_raw{
__ocml_ceil_f16(static_cast<__half_raw>(x).data)};
}
inline
__device__
__half hfloor(__half x)
{
return __half_raw{
__ocml_floor_f16(static_cast<__half_raw>(x).data)};
}
inline
__device__
__half hrint(__half x)
{
return __half_raw{
__ocml_rint_f16(static_cast<__half_raw>(x).data)};
}
inline
__device__
__half hsin(__half x)
{
return __half_raw{
__ocml_sin_f16(static_cast<__half_raw>(x).data)};
}
inline
__device__
__half hcos(__half x)
{
return __half_raw{
__ocml_cos_f16(static_cast<__half_raw>(x).data)};
}
inline
__device__
__half hexp(__half x)
{
return __half_raw{
__ocml_exp_f16(static_cast<__half_raw>(x).data)};
}
inline
__device__
__half hexp2(__half x)
{
return __half_raw{
__ocml_exp2_f16(static_cast<__half_raw>(x).data)};
}
inline
__device__
__half hexp10(__half x)
{
return __half_raw{
__ocml_exp10_f16(static_cast<__half_raw>(x).data)};
}
inline
__device__
__half hlog2(__half x)
{
return __half_raw{
__ocml_log2_f16(static_cast<__half_raw>(x).data)};
}
inline
__device__
__half hlog(__half x)
{
return __half_raw{
__ocml_log_f16(static_cast<__half_raw>(x).data)};
}
inline
__device__
__half hlog10(__half x)
{
return __half_raw{
__ocml_log10_f16(static_cast<__half_raw>(x).data)};
}
// Reciprocal computed as a plain division (not an OCML call).
inline
__device__
__half hrcp(__half x)
{
return __half_raw{
static_cast<_Float16>(1.0f) /static_cast<__half_raw>(x).data};
}
inline
__device__
__half hrsqrt(__half x)
{
return __half_raw{
__ocml_rsqrt_f16(static_cast<__half_raw>(x).data)};
}
inline
__device__
__half hsqrt(__half x)
{
return __half_raw{
__ocml_sqrt_f16(static_cast<__half_raw>(x).data)};
}
// Classification predicates via OCML.
inline
__device__
bool __hisinf(__half x)
{
return __ocml_isinf_f16(static_cast<__half_raw>(x).data);
}
inline
__device__
bool __hisnan(__half x)
{
return __ocml_isnan_f16(static_cast<__half_raw>(x).data);
}
// Negation via the built-in unary minus on _Float16.
inline
__device__
__half __hneg(__half x)
{
return __half_raw{-static_cast<__half_raw>(x).data};
}
// Packed half-precision math: wrappers over the 2f16 OCML routines, which
// operate on both lanes at once.
inline
__HOST_DEVICE__
__half2 h2trunc(__half2 x)
{
return __half2{__ocml_trunc_2f16(x)};
}
inline
__HOST_DEVICE__
__half2 h2ceil(__half2 x)
{
return __half2{__ocml_ceil_2f16(x)};
}
inline
__HOST_DEVICE__
__half2 h2floor(__half2 x)
{
return __half2{__ocml_floor_2f16(x)};
}
inline
__HOST_DEVICE__
__half2 h2rint(__half2 x)
{
return __half2{__ocml_rint_2f16(x)};
}
inline
__HOST_DEVICE__
__half2 h2sin(__half2 x)
{
return __half2{__ocml_sin_2f16(x)};
}
inline
__HOST_DEVICE__
__half2 h2cos(__half2 x)
{
return __half2{__ocml_cos_2f16(x)};
}
inline
__HOST_DEVICE__
__half2 h2exp(__half2 x)
{
return __half2{__ocml_exp_2f16(x)};
}
inline
__HOST_DEVICE__
__half2 h2exp2(__half2 x)
{
return __half2{__ocml_exp2_2f16(x)};
}
inline
__HOST_DEVICE__
__half2 h2exp10(__half2 x)
{
return __half2{__ocml_exp10_2f16(x)};
}
inline
__HOST_DEVICE__
__half2 h2log2(__half2 x)
{
return __half2{__ocml_log2_2f16(x)};
}
inline
__HOST_DEVICE__
__half2 h2log(__half2 x) { return __ocml_log_2f16(x); }
inline
__HOST_DEVICE__
__half2 h2log10(__half2 x) { return __ocml_log10_2f16(x); }
// Per-lane reciprocal via vector division of {1, 1} by x.
inline
__HOST_DEVICE__
__half2 h2rcp(__half2 x) {
return _Float16_2{
_Float16_2{static_cast<_Float16>(1.0f), static_cast<_Float16>(1.0f)} / x.data};
}
inline
__HOST_DEVICE__
__half2 h2rsqrt(__half2 x) { return __ocml_rsqrt_2f16(x); }
inline
__HOST_DEVICE__
__half2 h2sqrt(__half2 x) { return __ocml_sqrt_2f16(x); }
// Per-lane classification: the integer predicate result (0 or non-zero
// per lane) is converted to a _Float16 value in each lane.
inline
__HOST_DEVICE__
__half2 __hisinf2(__half2 x)
{
auto r = __ocml_isinf_2f16(x);
return __half2{_Float16_2{
static_cast<_Float16>(r.x), static_cast<_Float16>(r.y)}};
}
inline
__HOST_DEVICE__
__half2 __hisnan2(__half2 x)
{
auto r = __ocml_isnan_2f16(x);
return __half2{_Float16_2{
static_cast<_Float16>(r.x), static_cast<_Float16>(r.y)}};
}
// Per-lane negation via vector unary minus.
inline
__HOST_DEVICE__
__half2 __hneg2(__half2 x)
{
return __half2{-static_cast<__half2_raw>(x).data};
}
  13265. } // Anonymous namespace.
  13266. #if !defined(HIP_NO_HALF)
  13267. using half = __half;
  13268. using half2 = __half2;
  13269. #endif
// Warp-shuffle overloads for __half/__half2: the value is routed through
// the existing int shuffle by bit-punning via a union.
// NOTE(review): for the 2-byte __half case the upper bytes of tmp.i are
// uninitialized when shuffled; only the low half-word is meaningful —
// confirm this matches the int shuffle's expectations.
__device__
inline
__half __shfl(__half var, int src_lane, int width = warpSize) {
union { int i; __half h; } tmp; tmp.h = var;
tmp.i = __shfl(tmp.i, src_lane, width);
return tmp.h;
}
__device__
inline
__half2 __shfl(__half2 var, int src_lane, int width = warpSize) {
union { int i; __half2 h; } tmp; tmp.h = var;
tmp.i = __shfl(tmp.i, src_lane, width);
return tmp.h;
}
__device__
inline
__half __shfl_up(__half var, unsigned int lane_delta, int width = warpSize) {
union { int i; __half h; } tmp; tmp.h = var;
tmp.i = __shfl_up(tmp.i, lane_delta, width);
return tmp.h;
}
__device__
inline
__half2 __shfl_up(__half2 var, unsigned int lane_delta, int width = warpSize) {
union { int i; __half2 h; } tmp; tmp.h = var;
tmp.i = __shfl_up(tmp.i, lane_delta, width);
return tmp.h;
}
__device__
inline
__half __shfl_down(__half var, unsigned int lane_delta, int width = warpSize) {
union { int i; __half h; } tmp; tmp.h = var;
tmp.i = __shfl_down(tmp.i, lane_delta, width);
return tmp.h;
}
__device__
inline
__half2 __shfl_down(__half2 var, unsigned int lane_delta, int width = warpSize) {
union { int i; __half2 h; } tmp; tmp.h = var;
tmp.i = __shfl_down(tmp.i, lane_delta, width);
return tmp.h;
}
__device__
inline
__half __shfl_xor(__half var, int lane_mask, int width = warpSize) {
union { int i; __half h; } tmp; tmp.h = var;
tmp.i = __shfl_xor(tmp.i, lane_mask, width);
return tmp.h;
}
__device__
inline
__half2 __shfl_xor(__half2 var, int lane_mask, int width = warpSize) {
union { int i; __half2 h; } tmp; tmp.h = var;
tmp.i = __shfl_xor(tmp.i, lane_mask, width);
return tmp.h;
}
  13326. #endif // defined(__cplusplus)
  13327. #elif defined(__GNUC__)
  13328. #if !defined(__HIPCC_RTC__)
  13329. #include "hip_fp16_gcc.h"
  13330. #endif
  13331. #endif // !defined(__clang__) && defined(__GNUC__)
  13332. #endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_FP16_H
  13333. /*
  13334. Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
  13335. Permission is hereby granted, free of charge, to any person obtaining a copy
  13336. of this software and associated documentation files (the "Software"), to deal
  13337. in the Software without restriction, including without limitation the rights
  13338. to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  13339. copies of the Software, and to permit persons to whom the Software is
  13340. furnished to do so, subject to the following conditions:
  13341. The above copyright notice and this permission notice shall be included in
  13342. all copies or substantial portions of the Software.
  13343. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  13344. IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  13345. FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  13346. AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  13347. LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  13348. OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  13349. THE SOFTWARE.
  13350. */
  13351. #pragma once
  13352. #if !defined(__HIPCC_RTC__)
  13353. #include "hip_fp16_math_fwd.h"
  13354. #include "amd_hip_vector_types.h"
  13355. #include "math_fwd.h"
  13356. #include <hip/amd_detail/host_defines.h>
  13357. #include <algorithm>
  13358. // assert.h is only for the host version of assert.
  13359. // The device version of assert is implemented in hip/amd_detail/hip_runtime.h.
  13360. // Users should include hip_runtime.h for the device version of assert.
  13361. #if !__HIP_DEVICE_COMPILE__
  13362. #include <assert.h>
  13363. #endif
  13364. #include <limits.h>
  13365. #include <limits>
  13366. #include <stdint.h>
  13367. #endif // !defined(__HIPCC_RTC__)
  13368. #if _LIBCPP_VERSION && __HIP__
  13369. namespace std {
// Specialize libc++'s internal __numeric_type trait for _Float16 so the
// library treats it as a numeric type whose promoted type is _Float16 itself.
// NOTE(review): __numeric_type appears to drive libc++'s mixed-argument
// promotion machinery for <cmath>/<complex>; being an internal trait, confirm
// against the libc++ version actually in use.
template <>
struct __numeric_type<_Float16>
{
// Overload-resolution probe used by the trait's SFINAE machinery.
static _Float16 __test(_Float16);
// _Float16 promotes to itself (no widening to float/double here).
typedef _Float16 type;
static const bool value = true;
};
  13377. }
  13378. #endif // _LIBCPP_VERSION
  13379. #pragma push_macro("__DEVICE__")
  13380. #pragma push_macro("__RETURN_TYPE")
  13381. #define __DEVICE__ static __device__
  13382. #define __RETURN_TYPE bool
  13383. // DOT FUNCTIONS
  13384. #if __HIP_CLANG_ONLY__
  13385. __DEVICE__
  13386. inline
  13387. int amd_mixed_dot(short2 a, short2 b, int c, bool saturate) {
  13388. return __ockl_sdot2(a.data, b.data, c, saturate);
  13389. }
  13390. __DEVICE__
  13391. inline
  13392. uint amd_mixed_dot(ushort2 a, ushort2 b, uint c, bool saturate) {
  13393. return __ockl_udot2(a.data, b.data, c, saturate);
  13394. }
  13395. __DEVICE__
  13396. inline
  13397. int amd_mixed_dot(char4 a, char4 b, int c, bool saturate) {
  13398. return __ockl_sdot4(a.data, b.data, c, saturate);
  13399. }
  13400. __DEVICE__
  13401. inline
  13402. uint amd_mixed_dot(uchar4 a, uchar4 b, uint c, bool saturate) {
  13403. return __ockl_udot4(a.data, b.data, c, saturate);
  13404. }
  13405. __DEVICE__
  13406. inline
  13407. int amd_mixed_dot(int a, int b, int c, bool saturate) {
  13408. return __ockl_sdot8(a, b, c, saturate);
  13409. }
  13410. __DEVICE__
  13411. inline
  13412. uint amd_mixed_dot(uint a, uint b, uint c, bool saturate) {
  13413. return __ockl_udot8(a, b, c, saturate);
  13414. }
  13415. #endif
  13416. #pragma pop_macro("__DEVICE__")
  13417. #pragma pop_macro("__RETURN_TYPE")
  13418. // For backward compatibility.
  13419. // There are HIP applications e.g. TensorFlow, expecting __HIP_ARCH_* macros
  13420. // defined after including math_functions.h.
  13421. #if !defined(__HIPCC_RTC__)
  13422. #include <hip/amd_detail/amd_hip_runtime.h>
  13423. #endif