  1. /* pnggccrd.c - mixed C/assembler version of utilities to read a PNG file
  2. *
  3. * For Intel x86 CPU (Pentium-MMX or later) and GNU C compiler.
  4. *
  5. * See http://www.intel.com/drg/pentiumII/appnotes/916/916.htm
  6. * and http://www.intel.com/drg/pentiumII/appnotes/923/923.htm
  7. * for Intel's performance analysis of the MMX vs. non-MMX code.
  8. *
  9. * Last changed in libpng 1.2.15 January 5, 2007
  10. * For conditions of distribution and use, see copyright notice in png.h
  11. * Copyright (c) 1998-2007 Glenn Randers-Pehrson
  12. * Copyright (c) 1998, Intel Corporation
  13. *
  14. * Based on MSVC code contributed by Nirav Chhatrapati, Intel Corp., 1998.
  15. * Interface to libpng contributed by Gilles Vollant, 1999.
  16. * GNU C port by Greg Roelofs, 1999-2001.
  17. *
  18. * Lines 2350-4300 converted in place with intel2gas 1.3.1:
  19. *
  20. * intel2gas -mdI pnggccrd.c.partially-msvc -o pnggccrd.c
  21. *
  22. * and then cleaned up by hand. See http://hermes.terminal.at/intel2gas/ .
  23. *
  24. * NOTE: A sufficiently recent version of GNU as (or as.exe under DOS/Windows)
  25. * is required to assemble the newer MMX instructions such as movq.
  26. * For djgpp, see
  27. *
  28. * ftp://ftp.simtel.net/pub/simtelnet/gnu/djgpp/v2gnu/bnu281b.zip
  29. *
  30. * (or a later version in the same directory). For Linux, check your
  31. * distribution's web site(s) or try these links:
  32. *
  33. * http://rufus.w3.org/linux/RPM/binutils.html
  34. * http://www.debian.org/Packages/stable/devel/binutils.html
  35. * ftp://ftp.slackware.com/pub/linux/slackware/slackware/slakware/d1/
  36. * binutils.tgz
  37. *
  38. * For other platforms, see the main GNU site:
  39. *
  40. * ftp://ftp.gnu.org/pub/gnu/binutils/
  41. *
  42. * Version 2.5.2l.15 is definitely too old...
  43. */
  44. /*
  45. * TEMPORARY PORTING NOTES AND CHANGELOG (mostly by Greg Roelofs)
  46. * =====================================
  47. *
  48. * 19991006:
  49. * - fixed sign error in post-MMX cleanup code (16- & 32-bit cases)
  50. *
  51. * 19991007:
  52. * - additional optimizations (possible or definite):
  53. * x [DONE] write MMX code for 64-bit case (pixel_bytes == 8) [not tested]
  54. * - write MMX code for 48-bit case (pixel_bytes == 6)
  55. * - figure out what's up with 24-bit case (pixel_bytes == 3):
  56. * why subtract 8 from width_mmx in the pass 4/5 case?
  57. * (only width_mmx case) (near line 1606)
  58. * x [DONE] replace pixel_bytes within each block with the true
  59. * constant value (or are compilers smart enough to do that?)
  60. * - rewrite all MMX interlacing code so it's aligned with
  61. * the *beginning* of the row buffer, not the end. This
  62. * would not only allow one to eliminate half of the memory
  63. * writes for odd passes (that is, pass == odd), it may also
  64. * eliminate some unaligned-data-access exceptions (assuming
  65. * there's a penalty for not aligning 64-bit accesses on
  66. * 64-bit boundaries). The only catch is that the "leftover"
  67. * pixel(s) at the end of the row would have to be saved,
  68. * but there are enough unused MMX registers in every case,
  69. * so this is not a problem. A further benefit is that the
  70. * post-MMX cleanup code (C code) in at least some of the
  71. * cases could be done within the assembler block.
  72. * x [DONE] the "v3 v2 v1 v0 v7 v6 v5 v4" comments are confusing,
  73. * inconsistent, and don't match the MMX Programmer's Reference
  74. * Manual conventions anyway. They should be changed to
  75. * "b7 b6 b5 b4 b3 b2 b1 b0," where b0 indicates the byte that
  76. * was lowest in memory (e.g., corresponding to a left pixel)
  77. * and b7 is the byte that was highest (e.g., a right pixel).
  78. *
  79. * 19991016:
  80. * - Brennan's Guide notwithstanding, gcc under Linux does *not*
  81. * want globals prefixed by underscores when referencing them--
  82. * i.e., if the variable is const4, then refer to it as const4,
  83. * not _const4. This seems to be a djgpp-specific requirement.
  84. * Also, such variables apparently *must* be declared outside
  85. * of functions; neither static nor automatic variables work if
  86. * defined within the scope of a single function, but both
  87. * static and truly global (multi-module) variables work fine.
  88. *
  89. * 19991023:
  90. * - fixed png_combine_row() non-MMX replication bug (odd passes only?)
  91. * - switched from string-concatenation-with-macros to cleaner method of
  92. * renaming global variables for djgpp--i.e., always use prefixes in
  93. * inlined assembler code (== strings) and conditionally rename the
  94. * variables, not the other way around. Hence _const4, _mask8_0, etc.
  95. *
  96. * 19991024:
  97. * - fixed mmxsupport()/png_do_read_interlace() first-row bug
  98. * This one was severely weird: even though mmxsupport() doesn't touch
  99. * ebx (where "row" pointer was stored), it nevertheless managed to zero
  100. * the register (even in static/non-fPIC code--see below), which in turn
  101. * caused png_do_read_interlace() to return prematurely on the first row of
  102. * interlaced images (i.e., without expanding the interlaced pixels).
  103. * Inspection of the generated assembly code didn't turn up any clues,
  104. * although it did point at a minor optimization (i.e., get rid of
  105. * mmx_supported_local variable and just use eax). Possibly the CPUID
  106. * instruction is more destructive than it looks? (Not yet checked.)
  107. * - "info gcc" was next to useless, so compared fPIC and non-fPIC assembly
  108. * listings... Apparently register spillage has to do with ebx, since
  109. * it's used to index the global offset table. Commenting it out of the
  110. * input-reg lists in png_combine_row() eliminated compiler barfage, so
  111. * ifdef'd with __PIC__ macro: if defined, use a global for unmask
  112. *
  113. * 19991107:
  114. * - verified CPUID clobberage: 12-char string constant ("GenuineIntel",
  115. * "AuthenticAMD", etc.) placed in ebx:ecx:edx. Still need to polish.
  116. *
  117. * 19991120:
  118. * - made "diff" variable (now "_dif") global to simplify conversion of
  119. * filtering routines (running out of regs, sigh). "diff" is still used
  120. * in interlacing routines, however.
  121. * - fixed up both versions of mmxsupport() (ORIG_THAT_USED_TO_CLOBBER_EBX
  122. * macro determines which is used); original not yet tested.
  123. *
  124. * 20000213:
  125. * - when compiling with gcc, be sure to use -fomit-frame-pointer
  126. *
  127. * 20000319:
  128. * - fixed a register-name typo in png_do_read_interlace(), default (MMX) case,
  129. * pass == 4 or 5, that caused visible corruption of interlaced images
  130. *
  131. * 20000623:
  132. * - Various problems were reported with gcc 2.95.2 in the Cygwin environment,
  133. * many of the form "forbidden register 0 (ax) was spilled for class AREG."
  134. * This is explained at http://gcc.gnu.org/fom_serv/cache/23.html, and
  135. * Chuck Wilson supplied a patch involving dummy output registers. See
  136. * http://sourceforge.net/bugs/?func=detailbug&bug_id=108741&group_id=5624
  137. * for the original (anonymous) SourceForge bug report.
  138. *
  139. * 20000706:
  140. * - Chuck Wilson passed along these remaining gcc 2.95.2 errors:
  141. * pnggccrd.c: In function `png_combine_row':
  142. * pnggccrd.c:525: more than 10 operands in `asm'
  143. * pnggccrd.c:669: more than 10 operands in `asm'
  144. * pnggccrd.c:828: more than 10 operands in `asm'
  145. * pnggccrd.c:994: more than 10 operands in `asm'
  146. * pnggccrd.c:1177: more than 10 operands in `asm'
  147. * They are all the same problem and can be worked around by using the
  148. * global _unmask variable unconditionally, not just in the -fPIC case.
  149. * Reportedly earlier versions of gcc also have the problem with more than
  150. * 10 operands; they just don't report it. Much strangeness ensues, etc.
  151. *
  152. * 20000729:
  153. * - enabled png_read_filter_row_mmx_up() (shortest remaining unconverted
  154. * MMX routine); began converting png_read_filter_row_mmx_sub()
  155. * - to finish remaining sections:
  156. * - clean up indentation and comments
  157. * - preload local variables
  158. * - add output and input regs (order of former determines numerical
  159. * mapping of latter)
  160. * - avoid all usage of ebx (including bx, bh, bl) register [20000823]
  161. * - remove "$" from addressing of Shift and Mask variables [20000823]
  162. *
  163. * 20000731:
  164. * - global union vars causing segfaults in png_read_filter_row_mmx_sub()?
  165. *
  166. * 20000822:
  167. * - ARGH, stupid png_read_filter_row_mmx_sub() segfault only happens with
  168. * shared-library (-fPIC) version! Code works just fine as part of static
  169. * library. Damn damn damn damn damn, should have tested that sooner.
  170. * ebx is getting clobbered again (explicitly this time); need to save it
  171. * on stack or rewrite asm code to avoid using it altogether. Blargh!
  172. *
  173. * 20000823:
  174. * - first section was trickiest; all remaining sections have ebx -> edx now.
  175. * (-fPIC works again.) Also added missing underscores to various Shift*
  176. * and *Mask* globals and got rid of leading "$" signs.
  177. *
  178. * 20000826:
  179. * - added visual separators to help navigate microscopic printed copies
  180. * (http://pobox.com/~newt/code/gpr-latest.zip, mode 10); started working
  181. * on png_read_filter_row_mmx_avg()
  182. *
  183. * 20000828:
  184. * - finished png_read_filter_row_mmx_avg(): only Paeth left! (930 lines...)
  185. * What the hell, did png_read_filter_row_mmx_paeth(), too. Comments not
  186. * cleaned up/shortened in either routine, but functionality is complete
  187. * and seems to be working fine.
  188. *
  189. * 20000829:
  190. * - ahhh, figured out last(?) bit of gcc/gas asm-fu: if register is listed
  191. * as an input reg (with dummy output variables, etc.), then it *cannot*
  192. * also appear in the clobber list or gcc 2.95.2 will barf. The solution
  193. * is simple enough...
  194. *
  195. * 20000914:
  196. * - bug in png_read_filter_row_mmx_avg(): 16-bit grayscale not handled
  197. * correctly (but 48-bit RGB just fine)
  198. *
  199. * 20000916:
  200. * - fixed bug in png_read_filter_row_mmx_avg(), bpp == 2 case; three errors:
  201. * - "_ShiftBpp.use = 24;" should have been "_ShiftBpp.use = 16;"
  202. * - "_ShiftRem.use = 40;" should have been "_ShiftRem.use = 48;"
  203. * - "psllq _ShiftRem, %%mm2" should have been "psrlq _ShiftRem, %%mm2"
  204. *
  205. * 20010101:
  206. * - added new png_init_mmx_flags() function (here only because it needs to
  207. * call mmxsupport(), which should probably become global png_mmxsupport());
  208. * modified other MMX routines to run conditionally (png_ptr->asm_flags)
  209. *
  210. * 20010103:
  211. * - renamed mmxsupport() to png_mmx_support(), with auto-set of mmx_supported,
  212. * and made it public; moved png_init_mmx_flags() to png.c as internal func
  213. *
  214. * 20010104:
  215. * - removed dependency on png_read_filter_row_c() (C code already duplicated
  216. * within MMX version of png_read_filter_row()) so no longer necessary to
  217. * compile it into pngrutil.o
  218. *
  219. * 20010310:
  220. * - fixed buffer-overrun bug in png_combine_row() C code (non-MMX)
  221. *
  222. * 20020304:
  223. * - eliminated incorrect use of width_mmx in pixel_bytes == 8 case
  224. *
  225. * 20040724:
  226. * - more tinkering with clobber list at lines 4529 and 5033, to get
  227. * it to compile on gcc-3.4.
  228. *
  229. * STILL TO DO:
  230. * - test png_do_read_interlace() 64-bit case (pixel_bytes == 8)
  231. * - write MMX code for 48-bit case (pixel_bytes == 6)
  232. * - figure out what's up with 24-bit case (pixel_bytes == 3):
  233. * why subtract 8 from width_mmx in the pass 4/5 case?
  234. * (only width_mmx case) (near line 1606)
  235. * - rewrite all MMX interlacing code so it's aligned with beginning
  236. * of the row buffer, not the end (see 19991007 for details)
  237. * x pick one version of mmxsupport() and get rid of the other
  238. * - add error messages to any remaining bogus default cases
  239. * - enable pixel_depth == 8 cases in png_read_filter_row()? (test speed)
  240. * x add support for runtime enable/disable/query of various MMX routines
  241. */
  242. #define PNG_INTERNAL
  243. #include "png.h"
  244. #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_USE_PNGGCCRD)
  245. int PNGAPI png_mmx_support(void);
  246. #ifdef PNG_USE_LOCAL_ARRAYS
  247. const static int FARDATA png_pass_start[7] = {0, 4, 0, 2, 0, 1, 0};
  248. const static int FARDATA png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
  249. const static int FARDATA png_pass_width[7] = {8, 4, 4, 2, 2, 1, 1};
  250. #endif
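/* Illustrative sketch (not part of libpng): the three Adam7 tables above
 * give, for pass p (0-6), the first column touched in every 8-pixel
 * block (png_pass_start), the distance to the next touched column
 * (png_pass_inc), and how many columns each touched pixel is replicated
 * across for progressive display (png_pass_width).  The hypothetical
 * helper below just walks one 8-pixel block; printf needs <stdio.h>.
 */
#if 0
static void show_pass_columns(int pass)   /* pass = 0..6 */
{
   int col;
   for (col = png_pass_start[pass]; col < 8; col += png_pass_inc[pass])
      printf("pass %d: column %d, replicated over %d column(s)\n",
             pass, col, png_pass_width[pass]);
}
#endif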
  251. #if defined(PNG_MMX_CODE_SUPPORTED)
  252. /* djgpp, Win32, Cygwin, and OS2 add their own underscores to global variables,
  253. * so define them without: */
  254. #if defined(__DJGPP__) || defined(WIN32) || defined(__CYGWIN__) || \
  255. defined(__OS2__)
  256. # define _mmx_supported mmx_supported
  257. # define _const4 const4
  258. # define _const6 const6
  259. # define _mask8_0 mask8_0
  260. # define _mask16_1 mask16_1
  261. # define _mask16_0 mask16_0
  262. # define _mask24_2 mask24_2
  263. # define _mask24_1 mask24_1
  264. # define _mask24_0 mask24_0
  265. # define _mask32_3 mask32_3
  266. # define _mask32_2 mask32_2
  267. # define _mask32_1 mask32_1
  268. # define _mask32_0 mask32_0
  269. # define _mask48_5 mask48_5
  270. # define _mask48_4 mask48_4
  271. # define _mask48_3 mask48_3
  272. # define _mask48_2 mask48_2
  273. # define _mask48_1 mask48_1
  274. # define _mask48_0 mask48_0
  275. # define _LBCarryMask LBCarryMask
  276. # define _HBClearMask HBClearMask
  277. # define _ActiveMask ActiveMask
  278. # define _ActiveMask2 ActiveMask2
  279. # define _ActiveMaskEnd ActiveMaskEnd
  280. # define _ShiftBpp ShiftBpp
  281. # define _ShiftRem ShiftRem
  282. #ifdef PNG_THREAD_UNSAFE_OK
  283. # define _unmask unmask
  284. # define _FullLength FullLength
  285. # define _MMXLength MMXLength
  286. # define _dif dif
  287. # define _patemp patemp
  288. # define _pbtemp pbtemp
  289. # define _pctemp pctemp
  290. #endif
  291. #endif
  292. /* These constants are used in the inlined MMX assembly code.
  293. Ignore gcc's "At top level: defined but not used" warnings. */
  294. /* GRR 20000706: originally _unmask was needed only when compiling with -fPIC,
  295. * since that case uses the %ebx register for indexing the Global Offset Table
  296. * and there were no other registers available. But gcc 2.95 and later emit
  297. * "more than 10 operands in `asm'" errors when %ebx is used to preload unmask
  298. * in the non-PIC case, so we'll just use the global unconditionally now.
  299. */
  300. #ifdef PNG_THREAD_UNSAFE_OK
  301. static int _unmask;
  302. #endif
  303. const static unsigned long long _mask8_0 = 0x0102040810204080LL;
  304. const static unsigned long long _mask16_1 = 0x0101020204040808LL;
  305. const static unsigned long long _mask16_0 = 0x1010202040408080LL;
  306. const static unsigned long long _mask24_2 = 0x0101010202020404LL;
  307. const static unsigned long long _mask24_1 = 0x0408080810101020LL;
  308. const static unsigned long long _mask24_0 = 0x2020404040808080LL;
  309. const static unsigned long long _mask32_3 = 0x0101010102020202LL;
  310. const static unsigned long long _mask32_2 = 0x0404040408080808LL;
  311. const static unsigned long long _mask32_1 = 0x1010101020202020LL;
  312. const static unsigned long long _mask32_0 = 0x4040404080808080LL;
  313. const static unsigned long long _mask48_5 = 0x0101010101010202LL;
  314. const static unsigned long long _mask48_4 = 0x0202020204040404LL;
  315. const static unsigned long long _mask48_3 = 0x0404080808080808LL;
  316. const static unsigned long long _mask48_2 = 0x1010101010102020LL;
  317. const static unsigned long long _mask48_1 = 0x2020202040404040LL;
  318. const static unsigned long long _mask48_0 = 0x4040808080808080LL;
  319. const static unsigned long long _const4 = 0x0000000000FFFFFFLL;
  320. //const static unsigned long long _const5 = 0x000000FFFFFF0000LL; // NOT USED
  321. const static unsigned long long _const6 = 0x00000000000000FFLL;
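/* Illustrative note (not in the original): the _maskN_i constants above
 * can be derived mechanically.  At bpp bytes per pixel, byte j (in
 * memory order) of quadword i belongs to pixel p = (8*i + j) / bpp and
 * holds that pixel's mask bit, 0x80 >> p.  A hypothetical generator,
 * assuming a little-endian x86 as this file already does:
 */
#if 0
static unsigned long long make_mask(int bpp, int quad)
{
   unsigned long long m = 0;
   int j;
   for (j = 0; j < 8; j++)   /* byte j of quadword 'quad' */
      m |= (unsigned long long)(0x80 >> ((8*quad + j) / bpp)) << (8*j);
   return m;   /* e.g. make_mask(1,0) == _mask8_0,
                       make_mask(3,1) == _mask24_1 */
}
#endif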
  322. // These are used in the row-filter routines and should/would be local
  323. // variables if not for gcc addressing limitations.
  324. // WARNING: Their presence probably defeats the thread safety of libpng.
  325. #ifdef PNG_THREAD_UNSAFE_OK
  326. static png_uint_32 _FullLength;
  327. static png_uint_32 _MMXLength;
  328. static int _dif;
  329. static int _patemp; // temp variables for Paeth routine
  330. static int _pbtemp;
  331. static int _pctemp;
  332. #endif
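/* Reference each otherwise-unused global exactly once so gcc does not
 * complain that they are defined but not used (see note above). */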
  333. void /* PRIVATE */
  334. png_squelch_warnings(void)
  335. {
  336. #ifdef PNG_THREAD_UNSAFE_OK
  337. _dif = _dif;
  338. _patemp = _patemp;
  339. _pbtemp = _pbtemp;
  340. _pctemp = _pctemp;
  341. _MMXLength = _MMXLength;
  342. #endif
  343. _const4 = _const4;
  344. _const6 = _const6;
  345. _mask8_0 = _mask8_0;
  346. _mask16_1 = _mask16_1;
  347. _mask16_0 = _mask16_0;
  348. _mask24_2 = _mask24_2;
  349. _mask24_1 = _mask24_1;
  350. _mask24_0 = _mask24_0;
  351. _mask32_3 = _mask32_3;
  352. _mask32_2 = _mask32_2;
  353. _mask32_1 = _mask32_1;
  354. _mask32_0 = _mask32_0;
  355. _mask48_5 = _mask48_5;
  356. _mask48_4 = _mask48_4;
  357. _mask48_3 = _mask48_3;
  358. _mask48_2 = _mask48_2;
  359. _mask48_1 = _mask48_1;
  360. _mask48_0 = _mask48_0;
  361. }
  362. #endif /* PNG_MMX_CODE_SUPPORTED */
  363. static int _mmx_supported = 2;
  364. /*===========================================================================*/
  365. /* */
  366. /* P N G _ C O M B I N E _ R O W */
  367. /* */
  368. /*===========================================================================*/
  369. #if defined(PNG_HAVE_MMX_COMBINE_ROW)
  370. #define BPP2 2
  371. #define BPP3 3 /* bytes per pixel (a.k.a. pixel_bytes) */
  372. #define BPP4 4
  373. #define BPP6 6 /* (defined only to help avoid cut-and-paste errors) */
  374. #define BPP8 8
  375. /* Combines the row recently read in with the previous row.
  376. This routine takes care of alpha and transparency if requested.
  377. This routine also handles the two methods of progressive display
  378. of interlaced images, depending on the mask value.
  379. The mask value describes which pixels are to be combined with
  380. the row. The pattern always repeats every 8 pixels, so just 8
  381. bits are needed. A one indicates the pixel is to be combined; a
  382. zero indicates the pixel is to be skipped. This is in addition
  383. to any alpha or transparency value associated with the pixel.
  384. If you want all pixels to be combined, pass 0xff (255) in mask. */
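/* Example: the second Adam7 pass (png_ptr->pass == 1) touches only
   column 4 of every 8-pixel block, so its mask is 0x80 >> 4 == 0x08;
   the final pass touches every pixel and uses 0xff. */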
  385. /* Use this routine for the x86 platform - it uses a faster MMX routine
  386. if the machine supports MMX. */
  387. void /* PRIVATE */
  388. png_combine_row(png_structp png_ptr, png_bytep row, int mask)
  389. {
  390. png_debug(1, "in png_combine_row (pnggccrd.c)\n");
  391. #if defined(PNG_MMX_CODE_SUPPORTED)
  392. if (_mmx_supported == 2) {
  393. #if !defined(PNG_1_0_X)
  394. /* this should have happened in png_init_mmx_flags() already */
  395. png_warning(png_ptr, "asm_flags may not have been initialized");
  396. #endif
  397. png_mmx_support();
  398. }
  399. #endif
  400. if (mask == 0xff)
  401. {
  402. png_debug(2,"mask == 0xff: doing single png_memcpy()\n");
  403. png_memcpy(row, png_ptr->row_buf + 1,
  404. (png_size_t)PNG_ROWBYTES(png_ptr->row_info.pixel_depth,png_ptr->width));
  405. }
  406. else /* (png_combine_row() is never called with mask == 0) */
  407. {
  408. switch (png_ptr->row_info.pixel_depth)
  409. {
  410. case 1: /* png_ptr->row_info.pixel_depth */
  411. {
  412. png_bytep sp;
  413. png_bytep dp;
  414. int s_inc, s_start, s_end;
  415. int m;
  416. int shift;
  417. png_uint_32 i;
  418. sp = png_ptr->row_buf + 1;
  419. dp = row;
  420. m = 0x80;
  421. #if defined(PNG_READ_PACKSWAP_SUPPORTED)
  422. if (png_ptr->transformations & PNG_PACKSWAP)
  423. {
  424. s_start = 0;
  425. s_end = 7;
  426. s_inc = 1;
  427. }
  428. else
  429. #endif
  430. {
  431. s_start = 7;
  432. s_end = 0;
  433. s_inc = -1;
  434. }
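/* (With normal packing the leftmost pixel of each byte occupies the
   high-order bit, per the PNG spec, so the shift walks down from 7;
   PNG_PACKSWAP reverses the walk.) */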
  435. shift = s_start;
  436. for (i = 0; i < png_ptr->width; i++)
  437. {
  438. if (m & mask)
  439. {
  440. int value;
  441. value = (*sp >> shift) & 0x1;
  442. *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
  443. *dp |= (png_byte)(value << shift);
  444. }
  445. if (shift == s_end)
  446. {
  447. shift = s_start;
  448. sp++;
  449. dp++;
  450. }
  451. else
  452. shift += s_inc;
  453. if (m == 1)
  454. m = 0x80;
  455. else
  456. m >>= 1;
  457. }
  458. break;
  459. }
  460. case 2: /* png_ptr->row_info.pixel_depth */
  461. {
  462. png_bytep sp;
  463. png_bytep dp;
  464. int s_start, s_end, s_inc;
  465. int m;
  466. int shift;
  467. png_uint_32 i;
  468. int value;
  469. sp = png_ptr->row_buf + 1;
  470. dp = row;
  471. m = 0x80;
  472. #if defined(PNG_READ_PACKSWAP_SUPPORTED)
  473. if (png_ptr->transformations & PNG_PACKSWAP)
  474. {
  475. s_start = 0;
  476. s_end = 6;
  477. s_inc = 2;
  478. }
  479. else
  480. #endif
  481. {
  482. s_start = 6;
  483. s_end = 0;
  484. s_inc = -2;
  485. }
  486. shift = s_start;
  487. for (i = 0; i < png_ptr->width; i++)
  488. {
  489. if (m & mask)
  490. {
  491. value = (*sp >> shift) & 0x3;
  492. *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
  493. *dp |= (png_byte)(value << shift);
  494. }
  495. if (shift == s_end)
  496. {
  497. shift = s_start;
  498. sp++;
  499. dp++;
  500. }
  501. else
  502. shift += s_inc;
  503. if (m == 1)
  504. m = 0x80;
  505. else
  506. m >>= 1;
  507. }
  508. break;
  509. }
  510. case 4: /* png_ptr->row_info.pixel_depth */
  511. {
  512. png_bytep sp;
  513. png_bytep dp;
  514. int s_start, s_end, s_inc;
  515. int m;
  516. int shift;
  517. png_uint_32 i;
  518. int value;
  519. sp = png_ptr->row_buf + 1;
  520. dp = row;
  521. m = 0x80;
  522. #if defined(PNG_READ_PACKSWAP_SUPPORTED)
  523. if (png_ptr->transformations & PNG_PACKSWAP)
  524. {
  525. s_start = 0;
  526. s_end = 4;
  527. s_inc = 4;
  528. }
  529. else
  530. #endif
  531. {
  532. s_start = 4;
  533. s_end = 0;
  534. s_inc = -4;
  535. }
  536. shift = s_start;
  537. for (i = 0; i < png_ptr->width; i++)
  538. {
  539. if (m & mask)
  540. {
  541. value = (*sp >> shift) & 0xf;
  542. *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
  543. *dp |= (png_byte)(value << shift);
  544. }
  545. if (shift == s_end)
  546. {
  547. shift = s_start;
  548. sp++;
  549. dp++;
  550. }
  551. else
  552. shift += s_inc;
  553. if (m == 1)
  554. m = 0x80;
  555. else
  556. m >>= 1;
  557. }
  558. break;
  559. }
  560. case 8: /* png_ptr->row_info.pixel_depth */
  561. {
  562. png_bytep srcptr;
  563. png_bytep dstptr;
  564. #if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
  565. #if !defined(PNG_1_0_X)
  566. if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
  567. /* && _mmx_supported */ )
  568. #else
  569. if (_mmx_supported)
  570. #endif
  571. {
  572. png_uint_32 len;
  573. int diff;
  574. int dummy_value_a; // fix 'forbidden register spilled' error
  575. int dummy_value_d;
  576. int dummy_value_c;
  577. int dummy_value_S;
  578. int dummy_value_D;
  579. _unmask = ~mask; // global variable for -fPIC version
  580. srcptr = png_ptr->row_buf + 1;
  581. dstptr = row;
  582. len = png_ptr->width &~7; // reduce to multiple of 8
  583. diff = (int) (png_ptr->width & 7); // amount lost
  584. __asm__ __volatile__ (
  585. "movd _unmask, %%mm7 \n\t" // load bit pattern
  586. "psubb %%mm6, %%mm6 \n\t" // zero mm6
  587. "punpcklbw %%mm7, %%mm7 \n\t"
  588. "punpcklwd %%mm7, %%mm7 \n\t"
  589. "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
  590. "movq _mask8_0, %%mm0 \n\t"
  591. "pand %%mm7, %%mm0 \n\t" // nonzero if keep byte
  592. "pcmpeqb %%mm6, %%mm0 \n\t" // zeros->1s, v versa
  593. // preload "movl len, %%ecx \n\t" // load length of line
  594. // preload "movl srcptr, %%esi \n\t" // load source
  595. // preload "movl dstptr, %%edi \n\t" // load dest
  596. "cmpl $0, %%ecx \n\t" // len == 0 ?
  597. "je mainloop8end \n\t"
  598. "mainloop8: \n\t"
  599. "movq (%%esi), %%mm4 \n\t" // *srcptr
  600. "pand %%mm0, %%mm4 \n\t"
  601. "movq %%mm0, %%mm6 \n\t"
  602. "pandn (%%edi), %%mm6 \n\t" // *dstptr
  603. "por %%mm6, %%mm4 \n\t"
  604. "movq %%mm4, (%%edi) \n\t"
  605. "addl $8, %%esi \n\t" // inc by 8 bytes processed
  606. "addl $8, %%edi \n\t"
  607. "subl $8, %%ecx \n\t" // dec by 8 pixels processed
  608. "ja mainloop8 \n\t"
  609. "mainloop8end: \n\t"
  610. // preload "movl diff, %%ecx \n\t" // (diff is in eax)
  611. "movl %%eax, %%ecx \n\t"
  612. "cmpl $0, %%ecx \n\t"
  613. "jz end8 \n\t"
  614. // preload "movl mask, %%edx \n\t"
  615. "sall $24, %%edx \n\t" // make low byte, high byte
  616. "secondloop8: \n\t"
  617. "sall %%edx \n\t" // move high bit to CF
  618. "jnc skip8 \n\t" // if CF = 0
  619. "movb (%%esi), %%al \n\t"
  620. "movb %%al, (%%edi) \n\t"
  621. "skip8: \n\t"
  622. "incl %%esi \n\t"
  623. "incl %%edi \n\t"
  624. "decl %%ecx \n\t"
  625. "jnz secondloop8 \n\t"
  626. "end8: \n\t"
  627. "EMMS \n\t" // DONE
  628. : "=a" (dummy_value_a), // output regs (dummy)
  629. "=d" (dummy_value_d),
  630. "=c" (dummy_value_c),
  631. "=S" (dummy_value_S),
  632. "=D" (dummy_value_D)
  633. : "3" (srcptr), // esi // input regs
  634. "4" (dstptr), // edi
  635. "0" (diff), // eax
  636. // was (unmask) "b" RESERVED // ebx // Global Offset Table idx
  637. "2" (len), // ecx
  638. "1" (mask) // edx
  639. #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
  640. : "%mm0", "%mm4", "%mm6", "%mm7" // clobber list
  641. #endif
  642. );
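// The asm block above is equivalent to this C sketch (illustrative
// only; 'bytemask' is hypothetical):
//
//    png_byte bytemask[8];
//    png_uint_32 j;
//    int k;
//    for (k = 0; k < 8; k++)              // the pcmpeqb result
//       bytemask[k] = (png_byte)((mask & (0x80 >> k)) ? 0xff : 0);
//    for (j = 0; j < png_ptr->width; j++) // mainloop8 + secondloop8
//       if (bytemask[j & 7])
//          dstptr[j] = srcptr[j];
//
// i.e. dst = (src & M) | (dst & ~M) with an all-ones/all-zeros byte
// mask M, eight bytes (== eight pixels at this depth) per movq.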
  643. }
644. else /* MMX not supported - use modified C routine */
  645. #endif /* PNG_MMX_CODE_SUPPORTED */
  646. {
  647. register png_uint_32 i;
  648. png_uint_32 initial_val = png_pass_start[png_ptr->pass];
  649. /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
  650. register int stride = png_pass_inc[png_ptr->pass];
  651. /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
  652. register int rep_bytes = png_pass_width[png_ptr->pass];
  653. /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
  654. png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
  655. int diff = (int) (png_ptr->width & 7); /* amount lost */
  656. register png_uint_32 final_val = len; /* GRR bugfix */
  657. srcptr = png_ptr->row_buf + 1 + initial_val;
  658. dstptr = row + initial_val;
  659. for (i = initial_val; i < final_val; i += stride)
  660. {
  661. png_memcpy(dstptr, srcptr, rep_bytes);
  662. srcptr += stride;
  663. dstptr += stride;
  664. }
  665. if (diff) /* number of leftover pixels: 3 for pngtest */
  666. {
  667. final_val+=diff /* *BPP1 */ ;
  668. for (; i < final_val; i += stride)
  669. {
  670. if (rep_bytes > (int)(final_val-i))
  671. rep_bytes = (int)(final_val-i);
  672. png_memcpy(dstptr, srcptr, rep_bytes);
  673. srcptr += stride;
  674. dstptr += stride;
  675. }
  676. }
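/* Worked example (sketch): in pass 2 (png_ptr->pass == 2: start 0,
   stride 4, rep_bytes 4) the loop copies a 4-byte block at offsets
   0, 4, 8, ...; the if (diff) tail above clamps rep_bytes so the
   final copy never runs past the end of an odd-width row. */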
  677. } /* end of else (_mmx_supported) */
  678. break;
  679. } /* end 8 bpp */
  680. case 16: /* png_ptr->row_info.pixel_depth */
  681. {
  682. png_bytep srcptr;
  683. png_bytep dstptr;
  684. #if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
  685. #if !defined(PNG_1_0_X)
  686. if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
  687. /* && _mmx_supported */ )
  688. #else
  689. if (_mmx_supported)
  690. #endif
  691. {
  692. png_uint_32 len;
  693. int diff;
  694. int dummy_value_a; // fix 'forbidden register spilled' error
  695. int dummy_value_d;
  696. int dummy_value_c;
  697. int dummy_value_S;
  698. int dummy_value_D;
  699. _unmask = ~mask; // global variable for -fPIC version
  700. srcptr = png_ptr->row_buf + 1;
  701. dstptr = row;
  702. len = png_ptr->width &~7; // reduce to multiple of 8
  703. diff = (int) (png_ptr->width & 7); // amount lost //
  704. __asm__ __volatile__ (
  705. "movd _unmask, %%mm7 \n\t" // load bit pattern
  706. "psubb %%mm6, %%mm6 \n\t" // zero mm6
  707. "punpcklbw %%mm7, %%mm7 \n\t"
  708. "punpcklwd %%mm7, %%mm7 \n\t"
  709. "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
  710. "movq _mask16_0, %%mm0 \n\t"
  711. "movq _mask16_1, %%mm1 \n\t"
  712. "pand %%mm7, %%mm0 \n\t"
  713. "pand %%mm7, %%mm1 \n\t"
  714. "pcmpeqb %%mm6, %%mm0 \n\t"
  715. "pcmpeqb %%mm6, %%mm1 \n\t"
  716. // preload "movl len, %%ecx \n\t" // load length of line
  717. // preload "movl srcptr, %%esi \n\t" // load source
  718. // preload "movl dstptr, %%edi \n\t" // load dest
  719. "cmpl $0, %%ecx \n\t"
  720. "jz mainloop16end \n\t"
  721. "mainloop16: \n\t"
  722. "movq (%%esi), %%mm4 \n\t"
  723. "pand %%mm0, %%mm4 \n\t"
  724. "movq %%mm0, %%mm6 \n\t"
  725. "movq (%%edi), %%mm7 \n\t"
  726. "pandn %%mm7, %%mm6 \n\t"
  727. "por %%mm6, %%mm4 \n\t"
  728. "movq %%mm4, (%%edi) \n\t"
  729. "movq 8(%%esi), %%mm5 \n\t"
  730. "pand %%mm1, %%mm5 \n\t"
  731. "movq %%mm1, %%mm7 \n\t"
  732. "movq 8(%%edi), %%mm6 \n\t"
  733. "pandn %%mm6, %%mm7 \n\t"
  734. "por %%mm7, %%mm5 \n\t"
  735. "movq %%mm5, 8(%%edi) \n\t"
  736. "addl $16, %%esi \n\t" // inc by 16 bytes processed
  737. "addl $16, %%edi \n\t"
  738. "subl $8, %%ecx \n\t" // dec by 8 pixels processed
  739. "ja mainloop16 \n\t"
  740. "mainloop16end: \n\t"
  741. // preload "movl diff, %%ecx \n\t" // (diff is in eax)
  742. "movl %%eax, %%ecx \n\t"
  743. "cmpl $0, %%ecx \n\t"
  744. "jz end16 \n\t"
  745. // preload "movl mask, %%edx \n\t"
  746. "sall $24, %%edx \n\t" // make low byte, high byte
  747. "secondloop16: \n\t"
  748. "sall %%edx \n\t" // move high bit to CF
  749. "jnc skip16 \n\t" // if CF = 0
  750. "movw (%%esi), %%ax \n\t"
  751. "movw %%ax, (%%edi) \n\t"
  752. "skip16: \n\t"
  753. "addl $2, %%esi \n\t"
  754. "addl $2, %%edi \n\t"
  755. "decl %%ecx \n\t"
  756. "jnz secondloop16 \n\t"
  757. "end16: \n\t"
  758. "EMMS \n\t" // DONE
  759. : "=a" (dummy_value_a), // output regs (dummy)
  760. "=c" (dummy_value_c),
  761. "=d" (dummy_value_d),
  762. "=S" (dummy_value_S),
  763. "=D" (dummy_value_D)
  764. : "0" (diff), // eax // input regs
  765. // was (unmask) " " RESERVED // ebx // Global Offset Table idx
  766. "1" (len), // ecx
  767. "2" (mask), // edx
  768. "3" (srcptr), // esi
  769. "4" (dstptr) // edi
  770. #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
  771. : "%mm0", "%mm1", "%mm4" // clobber list
  772. , "%mm5", "%mm6", "%mm7"
  773. #endif
  774. );
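// Same technique as the 8-bpp block above, but at two bytes per pixel
// eight pixels span 16 bytes, so each mainloop16 iteration applies two
// mask quadwords (_mask16_0, then _mask16_1).  The 24- and 32-bpp
// blocks below extend this to three and four quadwords per iteration.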
  775. }
776. else /* MMX not supported - use modified C routine */
  777. #endif /* PNG_MMX_CODE_SUPPORTED */
  778. {
  779. register png_uint_32 i;
  780. png_uint_32 initial_val = BPP2 * png_pass_start[png_ptr->pass];
  781. /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
  782. register int stride = BPP2 * png_pass_inc[png_ptr->pass];
  783. /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
  784. register int rep_bytes = BPP2 * png_pass_width[png_ptr->pass];
  785. /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
  786. png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
  787. int diff = (int) (png_ptr->width & 7); /* amount lost */
  788. register png_uint_32 final_val = BPP2 * len; /* GRR bugfix */
  789. srcptr = png_ptr->row_buf + 1 + initial_val;
  790. dstptr = row + initial_val;
  791. for (i = initial_val; i < final_val; i += stride)
  792. {
  793. png_memcpy(dstptr, srcptr, rep_bytes);
  794. srcptr += stride;
  795. dstptr += stride;
  796. }
  797. if (diff) /* number of leftover pixels: 3 for pngtest */
  798. {
  799. final_val+=diff*BPP2;
  800. for (; i < final_val; i += stride)
  801. {
  802. if (rep_bytes > (int)(final_val-i))
  803. rep_bytes = (int)(final_val-i);
  804. png_memcpy(dstptr, srcptr, rep_bytes);
  805. srcptr += stride;
  806. dstptr += stride;
  807. }
  808. }
  809. } /* end of else (_mmx_supported) */
  810. break;
  811. } /* end 16 bpp */
  812. case 24: /* png_ptr->row_info.pixel_depth */
  813. {
  814. png_bytep srcptr;
  815. png_bytep dstptr;
  816. #if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
  817. #if !defined(PNG_1_0_X)
  818. if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
  819. /* && _mmx_supported */ )
  820. #else
  821. if (_mmx_supported)
  822. #endif
  823. {
  824. png_uint_32 len;
  825. int diff;
  826. int dummy_value_a; // fix 'forbidden register spilled' error
  827. int dummy_value_d;
  828. int dummy_value_c;
  829. int dummy_value_S;
  830. int dummy_value_D;
  831. _unmask = ~mask; // global variable for -fPIC version
  832. srcptr = png_ptr->row_buf + 1;
  833. dstptr = row;
  834. len = png_ptr->width &~7; // reduce to multiple of 8
  835. diff = (int) (png_ptr->width & 7); // amount lost //
  836. __asm__ __volatile__ (
  837. "movd _unmask, %%mm7 \n\t" // load bit pattern
  838. "psubb %%mm6, %%mm6 \n\t" // zero mm6
  839. "punpcklbw %%mm7, %%mm7 \n\t"
  840. "punpcklwd %%mm7, %%mm7 \n\t"
  841. "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
  842. "movq _mask24_0, %%mm0 \n\t"
  843. "movq _mask24_1, %%mm1 \n\t"
  844. "movq _mask24_2, %%mm2 \n\t"
  845. "pand %%mm7, %%mm0 \n\t"
  846. "pand %%mm7, %%mm1 \n\t"
  847. "pand %%mm7, %%mm2 \n\t"
  848. "pcmpeqb %%mm6, %%mm0 \n\t"
  849. "pcmpeqb %%mm6, %%mm1 \n\t"
  850. "pcmpeqb %%mm6, %%mm2 \n\t"
  851. // preload "movl len, %%ecx \n\t" // load length of line
  852. // preload "movl srcptr, %%esi \n\t" // load source
  853. // preload "movl dstptr, %%edi \n\t" // load dest
  854. "cmpl $0, %%ecx \n\t"
  855. "jz mainloop24end \n\t"
  856. "mainloop24: \n\t"
  857. "movq (%%esi), %%mm4 \n\t"
  858. "pand %%mm0, %%mm4 \n\t"
  859. "movq %%mm0, %%mm6 \n\t"
  860. "movq (%%edi), %%mm7 \n\t"
  861. "pandn %%mm7, %%mm6 \n\t"
  862. "por %%mm6, %%mm4 \n\t"
  863. "movq %%mm4, (%%edi) \n\t"
  864. "movq 8(%%esi), %%mm5 \n\t"
  865. "pand %%mm1, %%mm5 \n\t"
  866. "movq %%mm1, %%mm7 \n\t"
  867. "movq 8(%%edi), %%mm6 \n\t"
  868. "pandn %%mm6, %%mm7 \n\t"
  869. "por %%mm7, %%mm5 \n\t"
  870. "movq %%mm5, 8(%%edi) \n\t"
  871. "movq 16(%%esi), %%mm6 \n\t"
  872. "pand %%mm2, %%mm6 \n\t"
  873. "movq %%mm2, %%mm4 \n\t"
  874. "movq 16(%%edi), %%mm7 \n\t"
  875. "pandn %%mm7, %%mm4 \n\t"
  876. "por %%mm4, %%mm6 \n\t"
  877. "movq %%mm6, 16(%%edi) \n\t"
  878. "addl $24, %%esi \n\t" // inc by 24 bytes processed
  879. "addl $24, %%edi \n\t"
  880. "subl $8, %%ecx \n\t" // dec by 8 pixels processed
  881. "ja mainloop24 \n\t"
  882. "mainloop24end: \n\t"
  883. // preload "movl diff, %%ecx \n\t" // (diff is in eax)
  884. "movl %%eax, %%ecx \n\t"
  885. "cmpl $0, %%ecx \n\t"
  886. "jz end24 \n\t"
  887. // preload "movl mask, %%edx \n\t"
  888. "sall $24, %%edx \n\t" // make low byte, high byte
               "secondloop24: \n\t"
                  "sall %%edx \n\t" // move high bit to CF
                  "jnc skip24 \n\t" // if CF = 0
                  "movw (%%esi), %%ax \n\t"
                  "movw %%ax, (%%edi) \n\t"
                  "xorl %%eax, %%eax \n\t"
                  "movb 2(%%esi), %%al \n\t"
                  "movb %%al, 2(%%edi) \n\t"

               "skip24: \n\t"
                  "addl $3, %%esi \n\t"
                  "addl $3, %%edi \n\t"
                  "decl %%ecx \n\t"
                  "jnz secondloop24 \n\t"

               "end24: \n\t"
                  "EMMS \n\t" // DONE

                  : "=a" (dummy_value_a),  // output regs (dummy)
                    "=d" (dummy_value_d),
                    "=c" (dummy_value_c),
                    "=S" (dummy_value_S),
                    "=D" (dummy_value_D)

                  : "3" (srcptr),  // esi // input regs
                    "4" (dstptr),  // edi
                    "0" (diff),    // eax
// was (unmask)     "b" RESERVED   // ebx // Global Offset Table idx
                    "2" (len),     // ecx
                    "1" (mask)     // edx

#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
                  : "%mm0", "%mm1", "%mm2"  // clobber list
                  , "%mm4", "%mm5", "%mm6", "%mm7"
#endif
               );
            }
            else /* mmx _not supported - Use modified C routine */
#endif /* PNG_MMX_CODE_SUPPORTED */
            {
               register png_uint_32 i;
               png_uint_32 initial_val = BPP3 * png_pass_start[png_ptr->pass];
                 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
               register int stride = BPP3 * png_pass_inc[png_ptr->pass];
                 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
               register int rep_bytes = BPP3 * png_pass_width[png_ptr->pass];
                 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
               png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
               int diff = (int) (png_ptr->width & 7); /* amount lost */
               register png_uint_32 final_val = BPP3 * len;   /* GRR bugfix */

               srcptr = png_ptr->row_buf + 1 + initial_val;
               dstptr = row + initial_val;

               for (i = initial_val; i < final_val; i += stride)
               {
                  png_memcpy(dstptr, srcptr, rep_bytes);
                  srcptr += stride;
                  dstptr += stride;
               }
               if (diff)  /* number of leftover pixels: 3 for pngtest */
               {
                  final_val += diff*BPP3;
                  for (; i < final_val; i += stride)
                  {
                     if (rep_bytes > (int)(final_val-i))
                        rep_bytes = (int)(final_val-i);
                     png_memcpy(dstptr, srcptr, rep_bytes);
                     srcptr += stride;
                     dstptr += stride;
                  }
               }
            } /* end of else (_mmx_supported) */

            break;
         } /* end 24 bpp */
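
         /* Illustrative scalar model of the masked copy that the MMX blocks
          * in these cases implement (not compiled; the real non-MMX path is
          * the "modified C routine" in each else branch): for each group of
          * eight pixels, one bit of 'mask' selects the source row (shown
          * MSB-first, matching the scalar tail loops):
          *
          *    for (p = 0; p < 8; p++)
          *       if (mask & (0x80 >> p))
          *          png_memcpy(dstptr + p*BPP, srcptr + p*BPP, BPP);
          *
          * The pcmpeqb-generated 0x00/0xFF byte masks perform this select
          * branch-free:  dst = (src & sel) | (dst & ~sel).
          */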
         case 32: /* png_ptr->row_info.pixel_depth */
         {
            png_bytep srcptr;
            png_bytep dstptr;

#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
                /* && _mmx_supported */ )
#else
            if (_mmx_supported)
#endif
            {
               png_uint_32 len;
               int diff;
               int dummy_value_a;   // fix 'forbidden register spilled' error
               int dummy_value_d;
               int dummy_value_c;
               int dummy_value_S;
               int dummy_value_D;

               _unmask = ~mask;     // global variable for -fPIC version
               srcptr = png_ptr->row_buf + 1;
               dstptr = row;
               len = png_ptr->width &~7;           // reduce to multiple of 8
               diff = (int) (png_ptr->width & 7);  // amount lost //

               __asm__ __volatile__ (
                  "movd _unmask, %%mm7 \n\t" // load bit pattern
                  "psubb %%mm6, %%mm6 \n\t" // zero mm6
                  "punpcklbw %%mm7, %%mm7 \n\t"
                  "punpcklwd %%mm7, %%mm7 \n\t"
                  "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks

                  "movq _mask32_0, %%mm0 \n\t"
                  "movq _mask32_1, %%mm1 \n\t"
                  "movq _mask32_2, %%mm2 \n\t"
                  "movq _mask32_3, %%mm3 \n\t"

                  "pand %%mm7, %%mm0 \n\t"
                  "pand %%mm7, %%mm1 \n\t"
                  "pand %%mm7, %%mm2 \n\t"
                  "pand %%mm7, %%mm3 \n\t"

                  "pcmpeqb %%mm6, %%mm0 \n\t"
                  "pcmpeqb %%mm6, %%mm1 \n\t"
                  "pcmpeqb %%mm6, %%mm2 \n\t"
                  "pcmpeqb %%mm6, %%mm3 \n\t"

// preload        "movl len, %%ecx \n\t" // load length of line
// preload        "movl srcptr, %%esi \n\t" // load source
// preload        "movl dstptr, %%edi \n\t" // load dest

                  "cmpl $0, %%ecx \n\t" // lcr
                  "jz mainloop32end \n\t"

               "mainloop32: \n\t"
                  "movq (%%esi), %%mm4 \n\t"
                  "pand %%mm0, %%mm4 \n\t"
                  "movq %%mm0, %%mm6 \n\t"
                  "movq (%%edi), %%mm7 \n\t"
                  "pandn %%mm7, %%mm6 \n\t"
                  "por %%mm6, %%mm4 \n\t"
                  "movq %%mm4, (%%edi) \n\t"

                  "movq 8(%%esi), %%mm5 \n\t"
                  "pand %%mm1, %%mm5 \n\t"
                  "movq %%mm1, %%mm7 \n\t"
                  "movq 8(%%edi), %%mm6 \n\t"
                  "pandn %%mm6, %%mm7 \n\t"
                  "por %%mm7, %%mm5 \n\t"
                  "movq %%mm5, 8(%%edi) \n\t"

                  "movq 16(%%esi), %%mm6 \n\t"
                  "pand %%mm2, %%mm6 \n\t"
                  "movq %%mm2, %%mm4 \n\t"
                  "movq 16(%%edi), %%mm7 \n\t"
                  "pandn %%mm7, %%mm4 \n\t"
                  "por %%mm4, %%mm6 \n\t"
                  "movq %%mm6, 16(%%edi) \n\t"

                  "movq 24(%%esi), %%mm7 \n\t"
                  "pand %%mm3, %%mm7 \n\t"
                  "movq %%mm3, %%mm5 \n\t"
                  "movq 24(%%edi), %%mm4 \n\t"
                  "pandn %%mm4, %%mm5 \n\t"
                  "por %%mm5, %%mm7 \n\t"
                  "movq %%mm7, 24(%%edi) \n\t"

                  "addl $32, %%esi \n\t" // inc by 32 bytes processed
                  "addl $32, %%edi \n\t"
                  "subl $8, %%ecx \n\t" // dec by 8 pixels processed
                  "ja mainloop32 \n\t"

               "mainloop32end: \n\t"
// preload        "movl diff, %%ecx \n\t" // (diff is in eax)
                  "movl %%eax, %%ecx \n\t"
                  "cmpl $0, %%ecx \n\t"
                  "jz end32 \n\t"
// preload        "movl mask, %%edx \n\t"
                  "sall $24, %%edx \n\t" // low byte => high byte

               "secondloop32: \n\t"
                  "sall %%edx \n\t" // move high bit to CF
                  "jnc skip32 \n\t" // if CF = 0
                  "movl (%%esi), %%eax \n\t"
                  "movl %%eax, (%%edi) \n\t"

               "skip32: \n\t"
                  "addl $4, %%esi \n\t"
                  "addl $4, %%edi \n\t"
                  "decl %%ecx \n\t"
                  "jnz secondloop32 \n\t"

               "end32: \n\t"
                  "EMMS \n\t" // DONE

                  : "=a" (dummy_value_a),  // output regs (dummy)
                    "=d" (dummy_value_d),
                    "=c" (dummy_value_c),
                    "=S" (dummy_value_S),
                    "=D" (dummy_value_D)

                  : "3" (srcptr),  // esi // input regs
                    "4" (dstptr),  // edi
                    "0" (diff),    // eax
// was (unmask)     "b" RESERVED   // ebx // Global Offset Table idx
                    "2" (len),     // ecx
                    "1" (mask)     // edx

#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
                  : "%mm0", "%mm1", "%mm2", "%mm3"  // clobber list
                  , "%mm4", "%mm5", "%mm6", "%mm7"
#endif
               );
            }
            else /* mmx _not supported - Use modified C routine */
#endif /* PNG_MMX_CODE_SUPPORTED */
            {
               register png_uint_32 i;
               png_uint_32 initial_val = BPP4 * png_pass_start[png_ptr->pass];
                 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
               register int stride = BPP4 * png_pass_inc[png_ptr->pass];
                 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
               register int rep_bytes = BPP4 * png_pass_width[png_ptr->pass];
                 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
               png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
               int diff = (int) (png_ptr->width & 7); /* amount lost */
               register png_uint_32 final_val = BPP4 * len;   /* GRR bugfix */

               srcptr = png_ptr->row_buf + 1 + initial_val;
               dstptr = row + initial_val;

               for (i = initial_val; i < final_val; i += stride)
               {
                  png_memcpy(dstptr, srcptr, rep_bytes);
                  srcptr += stride;
                  dstptr += stride;
               }
               if (diff)  /* number of leftover pixels: 3 for pngtest */
               {
                  final_val += diff*BPP4;
                  for (; i < final_val; i += stride)
                  {
                     if (rep_bytes > (int)(final_val-i))
                        rep_bytes = (int)(final_val-i);
                     png_memcpy(dstptr, srcptr, rep_bytes);
                     srcptr += stride;
                     dstptr += stride;
                  }
               }
            } /* end of else (_mmx_supported) */

            break;
         } /* end 32 bpp */

         case 48: /* png_ptr->row_info.pixel_depth */
         {
            png_bytep srcptr;
            png_bytep dstptr;

#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
                /* && _mmx_supported */ )
#else
            if (_mmx_supported)
#endif
            {
               png_uint_32 len;
               int diff;
               int dummy_value_a;   // fix 'forbidden register spilled' error
               int dummy_value_d;
               int dummy_value_c;
               int dummy_value_S;
               int dummy_value_D;

               _unmask = ~mask;     // global variable for -fPIC version
               srcptr = png_ptr->row_buf + 1;
               dstptr = row;
               len = png_ptr->width &~7;           // reduce to multiple of 8
               diff = (int) (png_ptr->width & 7);  // amount lost //

               __asm__ __volatile__ (
                  "movd _unmask, %%mm7 \n\t" // load bit pattern
                  "psubb %%mm6, %%mm6 \n\t" // zero mm6
                  "punpcklbw %%mm7, %%mm7 \n\t"
                  "punpcklwd %%mm7, %%mm7 \n\t"
                  "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks

                  "movq _mask48_0, %%mm0 \n\t"
                  "movq _mask48_1, %%mm1 \n\t"
                  "movq _mask48_2, %%mm2 \n\t"
                  "movq _mask48_3, %%mm3 \n\t"
                  "movq _mask48_4, %%mm4 \n\t"
                  "movq _mask48_5, %%mm5 \n\t"

                  "pand %%mm7, %%mm0 \n\t"
                  "pand %%mm7, %%mm1 \n\t"
                  "pand %%mm7, %%mm2 \n\t"
                  "pand %%mm7, %%mm3 \n\t"
                  "pand %%mm7, %%mm4 \n\t"
                  "pand %%mm7, %%mm5 \n\t"

                  "pcmpeqb %%mm6, %%mm0 \n\t"
                  "pcmpeqb %%mm6, %%mm1 \n\t"
                  "pcmpeqb %%mm6, %%mm2 \n\t"
                  "pcmpeqb %%mm6, %%mm3 \n\t"
                  "pcmpeqb %%mm6, %%mm4 \n\t"
                  "pcmpeqb %%mm6, %%mm5 \n\t"

// preload        "movl len, %%ecx \n\t" // load length of line
// preload        "movl srcptr, %%esi \n\t" // load source
// preload        "movl dstptr, %%edi \n\t" // load dest

                  "cmpl $0, %%ecx \n\t"
                  "jz mainloop48end \n\t"

               "mainloop48: \n\t"
                  "movq (%%esi), %%mm7 \n\t"
                  "pand %%mm0, %%mm7 \n\t"
                  "movq %%mm0, %%mm6 \n\t"
                  "pandn (%%edi), %%mm6 \n\t"
                  "por %%mm6, %%mm7 \n\t"
                  "movq %%mm7, (%%edi) \n\t"

                  "movq 8(%%esi), %%mm6 \n\t"
                  "pand %%mm1, %%mm6 \n\t"
                  "movq %%mm1, %%mm7 \n\t"
                  "pandn 8(%%edi), %%mm7 \n\t"
                  "por %%mm7, %%mm6 \n\t"
                  "movq %%mm6, 8(%%edi) \n\t"

                  "movq 16(%%esi), %%mm6 \n\t"
                  "pand %%mm2, %%mm6 \n\t"
                  "movq %%mm2, %%mm7 \n\t"
                  "pandn 16(%%edi), %%mm7 \n\t"
                  "por %%mm7, %%mm6 \n\t"
                  "movq %%mm6, 16(%%edi) \n\t"

                  "movq 24(%%esi), %%mm7 \n\t"
                  "pand %%mm3, %%mm7 \n\t"
                  "movq %%mm3, %%mm6 \n\t"
                  "pandn 24(%%edi), %%mm6 \n\t"
                  "por %%mm6, %%mm7 \n\t"
                  "movq %%mm7, 24(%%edi) \n\t"

                  "movq 32(%%esi), %%mm6 \n\t"
                  "pand %%mm4, %%mm6 \n\t"
                  "movq %%mm4, %%mm7 \n\t"
                  "pandn 32(%%edi), %%mm7 \n\t"
                  "por %%mm7, %%mm6 \n\t"
                  "movq %%mm6, 32(%%edi) \n\t"

                  "movq 40(%%esi), %%mm7 \n\t"
                  "pand %%mm5, %%mm7 \n\t"
                  "movq %%mm5, %%mm6 \n\t"
                  "pandn 40(%%edi), %%mm6 \n\t"
                  "por %%mm6, %%mm7 \n\t"
                  "movq %%mm7, 40(%%edi) \n\t"

                  "addl $48, %%esi \n\t" // inc by 48 bytes processed
                  "addl $48, %%edi \n\t"
                  "subl $8, %%ecx \n\t" // dec by 8 pixels processed
                  "ja mainloop48 \n\t"

               "mainloop48end: \n\t"
// preload        "movl diff, %%ecx \n\t" // (diff is in eax)
                  "movl %%eax, %%ecx \n\t"
                  "cmpl $0, %%ecx \n\t"
                  "jz end48 \n\t"
// preload        "movl mask, %%edx \n\t"
                  "sall $24, %%edx \n\t" // make low byte the high byte
               "secondloop48: \n\t"
                  "sall %%edx \n\t" // move high bit to CF
                  "jnc skip48 \n\t" // if CF = 0
                  "movl (%%esi), %%eax \n\t"
                  "movl %%eax, (%%edi) \n\t"
                  "movw 4(%%esi), %%ax \n\t" // copy all 6 bytes of the
                  "movw %%ax, 4(%%edi) \n\t" //  16-bit-RGB pixel, not just 4

               "skip48: \n\t"
                  "addl $6, %%esi \n\t" // advance one full 6-byte pixel (was
                  "addl $6, %%edi \n\t" //  $4, which mis-stepped 48-bpp rows)
                  "decl %%ecx \n\t"
                  "jnz secondloop48 \n\t"

               "end48: \n\t"
                  "EMMS \n\t" // DONE

                  : "=a" (dummy_value_a),  // output regs (dummy)
                    "=d" (dummy_value_d),
                    "=c" (dummy_value_c),
                    "=S" (dummy_value_S),
                    "=D" (dummy_value_D)

                  : "3" (srcptr),  // esi // input regs
                    "4" (dstptr),  // edi
                    "0" (diff),    // eax
// was (unmask)     "b" RESERVED   // ebx // Global Offset Table idx
                    "2" (len),     // ecx
                    "1" (mask)     // edx

#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
                  : "%mm0", "%mm1", "%mm2", "%mm3"  // clobber list
                  , "%mm4", "%mm5", "%mm6", "%mm7"
#endif
               );
            }
            else /* mmx _not supported - Use modified C routine */
#endif /* PNG_MMX_CODE_SUPPORTED */
            {
               register png_uint_32 i;
               png_uint_32 initial_val = BPP6 * png_pass_start[png_ptr->pass];
                 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
               register int stride = BPP6 * png_pass_inc[png_ptr->pass];
                 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
               register int rep_bytes = BPP6 * png_pass_width[png_ptr->pass];
                 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
               png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
               int diff = (int) (png_ptr->width & 7); /* amount lost */
               register png_uint_32 final_val = BPP6 * len;   /* GRR bugfix */

               srcptr = png_ptr->row_buf + 1 + initial_val;
               dstptr = row + initial_val;

               for (i = initial_val; i < final_val; i += stride)
               {
                  png_memcpy(dstptr, srcptr, rep_bytes);
                  srcptr += stride;
                  dstptr += stride;
               }
               if (diff)  /* number of leftover pixels: 3 for pngtest */
               {
                  final_val += diff*BPP6;
                  for (; i < final_val; i += stride)
                  {
                     if (rep_bytes > (int)(final_val-i))
                        rep_bytes = (int)(final_val-i);
                     png_memcpy(dstptr, srcptr, rep_bytes);
                     srcptr += stride;
                     dstptr += stride;
                  }
               }
            } /* end of else (_mmx_supported) */

            break;
         } /* end 48 bpp */

         case 64: /* png_ptr->row_info.pixel_depth */
         {
            png_bytep srcptr;
            png_bytep dstptr;
            register png_uint_32 i;
            png_uint_32 initial_val = BPP8 * png_pass_start[png_ptr->pass];
              /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
            register int stride = BPP8 * png_pass_inc[png_ptr->pass];
              /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
            register int rep_bytes = BPP8 * png_pass_width[png_ptr->pass];
              /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
            png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
            int diff = (int) (png_ptr->width & 7); /* amount lost */
            register png_uint_32 final_val = BPP8 * len;   /* GRR bugfix */

            srcptr = png_ptr->row_buf + 1 + initial_val;
            dstptr = row + initial_val;

            for (i = initial_val; i < final_val; i += stride)
            {
               png_memcpy(dstptr, srcptr, rep_bytes);
               srcptr += stride;
               dstptr += stride;
            }
            if (diff)  /* number of leftover pixels: 3 for pngtest */
            {
               final_val += diff*BPP8;
               for (; i < final_val; i += stride)
               {
                  if (rep_bytes > (int)(final_val-i))
                     rep_bytes = (int)(final_val-i);
                  png_memcpy(dstptr, srcptr, rep_bytes);
                  srcptr += stride;
                  dstptr += stride;
               }
            }

            break;
         } /* end 64 bpp */

         default: /* png_ptr->row_info.pixel_depth != 1,2,4,8,16,24,32,48,64 */
         {
            /* this should never happen */
            png_warning(png_ptr, "Invalid row_info.pixel_depth in pnggccrd");
            break;
         }
      } /* end switch (png_ptr->row_info.pixel_depth) */

   } /* end if (non-trivial mask) */

} /* end png_combine_row() */

#endif /* PNG_HAVE_MMX_COMBINE_ROW */




/*===========================================================================*/
/*                                                                           */
/*                 P N G _ D O _ R E A D _ I N T E R L A C E                 */
/*                                                                           */
/*===========================================================================*/

#if defined(PNG_READ_INTERLACING_SUPPORTED)
#if defined(PNG_HAVE_MMX_READ_INTERLACE)

/* png_do_read_interlace() is called after any 16-bit to 8-bit conversion
 * has taken place.  [GRR: what other steps come before and/or after?]
 */
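
#if 0
/* Illustrative scalar model of the expansion performed below for pixels of
 * 8 bits or more (a hypothetical helper, never compiled): the row is
 * expanded in place, right to left, replicating each pixel
 * png_pass_inc[pass] times.  The MMX paths are unrolled, register-blocked
 * versions of this loop.
 */
static void
png_expand_row_sketch(png_bytep row, png_uint_32 width,
                      png_size_t pixel_bytes, int pass)
{
   png_bytep sptr = row + (width - 1) * pixel_bytes;
   png_bytep dp = row + (width * png_pass_inc[pass] - 1) * pixel_bytes;
   png_uint_32 i;

   for (i = width; i; i--)
   {
      png_byte v[8];   /* copy via a temp, as the generic loops below do */
      int j;

      png_memcpy(v, sptr, pixel_bytes);
      for (j = 0; j < png_pass_inc[pass]; j++)
      {
         png_memcpy(dp, v, pixel_bytes);
         dp -= pixel_bytes;
      }
      sptr -= pixel_bytes;
   }
}
#endif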
void /* PRIVATE */
png_do_read_interlace(png_structp png_ptr)
{
   png_row_infop row_info = &(png_ptr->row_info);
   png_bytep row = png_ptr->row_buf + 1;
   int pass = png_ptr->pass;
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
   png_uint_32 transformations = png_ptr->transformations;
#endif

   png_debug(1, "in png_do_read_interlace (pnggccrd.c)\n");

#if defined(PNG_MMX_CODE_SUPPORTED)
   if (_mmx_supported == 2) {
#if !defined(PNG_1_0_X)
       /* this should have happened in png_init_mmx_flags() already */
       png_warning(png_ptr, "asm_flags may not have been initialized");
#endif
       png_mmx_support();
   }
#endif

   if (row != NULL && row_info != NULL)
   {
      png_uint_32 final_width;

      final_width = row_info->width * png_pass_inc[pass];

      switch (row_info->pixel_depth)
      {
         case 1:
         {
            png_bytep sp, dp;
            int sshift, dshift;
            int s_start, s_end, s_inc;
            png_byte v;
            png_uint_32 i;
            int j;

            sp = row + (png_size_t)((row_info->width - 1) >> 3);
            dp = row + (png_size_t)((final_width - 1) >> 3);
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (transformations & PNG_PACKSWAP)
            {
               sshift = (int)((row_info->width + 7) & 7);
               dshift = (int)((final_width + 7) & 7);
               s_start = 7;
               s_end = 0;
               s_inc = -1;
            }
            else
#endif
            {
               sshift = 7 - (int)((row_info->width + 7) & 7);
               dshift = 7 - (int)((final_width + 7) & 7);
               s_start = 0;
               s_end = 7;
               s_inc = 1;
            }

            for (i = row_info->width; i; i--)
            {
               v = (png_byte)((*sp >> sshift) & 0x1);
               for (j = 0; j < png_pass_inc[pass]; j++)
               {
                  *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
                  *dp |= (png_byte)(v << dshift);
                  if (dshift == s_end)
                  {
                     dshift = s_start;
                     dp--;
                  }
                  else
                     dshift += s_inc;
               }
               if (sshift == s_end)
               {
                  sshift = s_start;
                  sp--;
               }
               else
                  sshift += s_inc;
            }
            break;
         }

         case 2:
         {
            png_bytep sp, dp;
            int sshift, dshift;
            int s_start, s_end, s_inc;
            png_uint_32 i;

            sp = row + (png_size_t)((row_info->width - 1) >> 2);
            dp = row + (png_size_t)((final_width - 1) >> 2);
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (transformations & PNG_PACKSWAP)
            {
               sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
               dshift = (png_size_t)(((final_width + 3) & 3) << 1);
               s_start = 6;
               s_end = 0;
               s_inc = -2;
            }
            else
#endif
            {
               sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
               dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
               s_start = 0;
               s_end = 6;
               s_inc = 2;
            }

            for (i = row_info->width; i; i--)
            {
               png_byte v;
               int j;

               v = (png_byte)((*sp >> sshift) & 0x3);
               for (j = 0; j < png_pass_inc[pass]; j++)
               {
                  *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
                  *dp |= (png_byte)(v << dshift);
                  if (dshift == s_end)
                  {
                     dshift = s_start;
                     dp--;
                  }
                  else
                     dshift += s_inc;
               }
               if (sshift == s_end)
               {
                  sshift = s_start;
                  sp--;
               }
               else
                  sshift += s_inc;
            }
            break;
         }

         case 4:
         {
            png_bytep sp, dp;
            int sshift, dshift;
            int s_start, s_end, s_inc;
            png_uint_32 i;

            sp = row + (png_size_t)((row_info->width - 1) >> 1);
            dp = row + (png_size_t)((final_width - 1) >> 1);
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (transformations & PNG_PACKSWAP)
            {
               sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
               dshift = (png_size_t)(((final_width + 1) & 1) << 2);
               s_start = 4;
               s_end = 0;
               s_inc = -4;
            }
            else
#endif
            {
               sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
               dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
               s_start = 0;
               s_end = 4;
               s_inc = 4;
            }

            for (i = row_info->width; i; i--)
            {
               png_byte v;
               int j;

               v = (png_byte)((*sp >> sshift) & 0xf);
               for (j = 0; j < png_pass_inc[pass]; j++)
               {
                  *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
                  *dp |= (png_byte)(v << dshift);
                  if (dshift == s_end)
                  {
                     dshift = s_start;
                     dp--;
                  }
                  else
                     dshift += s_inc;
               }
               if (sshift == s_end)
               {
                  sshift = s_start;
                  sp--;
               }
               else
                  sshift += s_inc;
            }
            break;
         }

         /*====================================================================*/

         default: /* 8-bit or larger (this is where the routine is modified) */
         {
#if 0
//          static unsigned long long _const4 = 0x0000000000FFFFFFLL;  no good
//          static unsigned long long const4 = 0x0000000000FFFFFFLL;   no good
//          unsigned long long _const4 = 0x0000000000FFFFFFLL;  no good
//          unsigned long long const4 = 0x0000000000FFFFFFLL;   no good
#endif
            png_bytep sptr, dp;
            png_uint_32 i;
            png_size_t pixel_bytes;
            int width = (int)row_info->width;

            pixel_bytes = (row_info->pixel_depth >> 3);

            /* point sptr at the last pixel in the pre-expanded row: */
            sptr = row + (width - 1) * pixel_bytes;

            /* point dp at the last pixel position in the expanded row: */
            dp = row + (final_width - 1) * pixel_bytes;

            /* New code by Nirav Chhatrapati - Intel Corporation */

#if defined(PNG_MMX_CODE_SUPPORTED)
#if !defined(PNG_1_0_X)
            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
                /* && _mmx_supported */ )
#else
            if (_mmx_supported)
#endif
            {
               //--------------------------------------------------------------
               if (pixel_bytes == 3)
               {
                  if (((pass == 0) || (pass == 1)) && width)
                  {
                     int dummy_value_c;   // fix 'forbidden register spilled'
                     int dummy_value_S;
                     int dummy_value_D;
                     int dummy_value_a;
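
                     // Note (illustrative): each iteration below reads one
                     // 3-byte pixel and, via the shift/OR shuffle, writes it
                     // eight times (24 bytes), walking both pointers right
                     // to left (passes 0 and 1 replicate each pixel 8x).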
                     __asm__ __volatile__ (
                        "subl $21, %%edi \n\t"
                                 // (png_pass_inc[pass] - 1)*pixel_bytes

                     ".loop3_pass0: \n\t"
                        "movd (%%esi), %%mm0 \n\t" // x x x x x 2 1 0
                        "pand (%3), %%mm0 \n\t" // z z z z z 2 1 0
                        "movq %%mm0, %%mm1 \n\t" // z z z z z 2 1 0
                        "psllq $16, %%mm0 \n\t" // z z z 2 1 0 z z
                        "movq %%mm0, %%mm2 \n\t" // z z z 2 1 0 z z
                        "psllq $24, %%mm0 \n\t" // 2 1 0 z z z z z
                        "psrlq $8, %%mm1 \n\t" // z z z z z z 2 1
                        "por %%mm2, %%mm0 \n\t" // 2 1 0 2 1 0 z z
                        "por %%mm1, %%mm0 \n\t" // 2 1 0 2 1 0 2 1
                        "movq %%mm0, %%mm3 \n\t" // 2 1 0 2 1 0 2 1
                        "psllq $16, %%mm0 \n\t" // 0 2 1 0 2 1 z z
                        "movq %%mm3, %%mm4 \n\t" // 2 1 0 2 1 0 2 1
                        "punpckhdq %%mm0, %%mm3 \n\t" // 0 2 1 0 2 1 0 2
                        "movq %%mm4, 16(%%edi) \n\t"
                        "psrlq $32, %%mm0 \n\t" // z z z z 0 2 1 0
                        "movq %%mm3, 8(%%edi) \n\t"
                        "punpckldq %%mm4, %%mm0 \n\t" // 1 0 2 1 0 2 1 0
                        "subl $3, %%esi \n\t"
                        "movq %%mm0, (%%edi) \n\t"
                        "subl $24, %%edi \n\t"
                        "decl %%ecx \n\t"
                        "jnz .loop3_pass0 \n\t"
                        "EMMS \n\t" // DONE

                        : "=c" (dummy_value_c),  // output regs (dummy)
                          "=S" (dummy_value_S),
                          "=D" (dummy_value_D),
                          "=a" (dummy_value_a)

                        : "1" (sptr),      // esi // input regs
                          "2" (dp),        // edi
                          "0" (width),     // ecx
                          "3" (&_const4)   // %1(?) (0x0000000000FFFFFFLL)

#if 0 /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
                        : "%mm0", "%mm1", "%mm2"  // clobber list
                        , "%mm3", "%mm4"
#endif
                     );
                  }
                  else if (((pass == 2) || (pass == 3)) && width)
                  {
                     int dummy_value_c;   // fix 'forbidden register spilled'
                     int dummy_value_S;
                     int dummy_value_D;
                     int dummy_value_a;

                     __asm__ __volatile__ (
                        "subl $9, %%edi \n\t"
                                 // (png_pass_inc[pass] - 1)*pixel_bytes

                     ".loop3_pass2: \n\t"
                        "movd (%%esi), %%mm0 \n\t" // x x x x x 2 1 0
                        "pand (%3), %%mm0 \n\t" // z z z z z 2 1 0
                        "movq %%mm0, %%mm1 \n\t" // z z z z z 2 1 0
                        "psllq $16, %%mm0 \n\t" // z z z 2 1 0 z z
                        "movq %%mm0, %%mm2 \n\t" // z z z 2 1 0 z z
                        "psllq $24, %%mm0 \n\t" // 2 1 0 z z z z z
                        "psrlq $8, %%mm1 \n\t" // z z z z z z 2 1
                        "por %%mm2, %%mm0 \n\t" // 2 1 0 2 1 0 z z
                        "por %%mm1, %%mm0 \n\t" // 2 1 0 2 1 0 2 1
                        "movq %%mm0, 4(%%edi) \n\t"
                        "psrlq $16, %%mm0 \n\t" // z z 2 1 0 2 1 0
                        "subl $3, %%esi \n\t"
                        "movd %%mm0, (%%edi) \n\t"
                        "subl $12, %%edi \n\t"
                        "decl %%ecx \n\t"
                        "jnz .loop3_pass2 \n\t"
                        "EMMS \n\t" // DONE

                        : "=c" (dummy_value_c),  // output regs (dummy)
                          "=S" (dummy_value_S),
                          "=D" (dummy_value_D),
                          "=a" (dummy_value_a)

                        : "1" (sptr),      // esi // input regs
                          "2" (dp),        // edi
                          "0" (width),     // ecx
                          "3" (&_const4)   // (0x0000000000FFFFFFLL)

#if 0 /* %mm0, ..., %mm2 not supported by gcc 2.7.2.3 or egcs 1.1 */
                        : "%mm0", "%mm1", "%mm2"  // clobber list
#endif
                     );
                  }
                  else if (width) /* && ((pass == 4) || (pass == 5)) */
                  {
                     int width_mmx = ((width >> 1) << 1) - 8;   // GRR: huh?
                     if (width_mmx < 0)
                        width_mmx = 0;
                     width -= width_mmx;   // 8 or 9 pix, 24 or 27 bytes
                     if (width_mmx)
                     {
                        // png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
                        // sptr points at last pixel in pre-expanded row
                        // dp points at last pixel position in expanded row
                        int dummy_value_c;  // fix 'forbidden register spilled'
                        int dummy_value_S;
                        int dummy_value_D;
                        int dummy_value_a;
                        int dummy_value_d;

                        __asm__ __volatile__ (
                           "subl $3, %%esi \n\t"
                           "subl $9, %%edi \n\t"
                                    // (png_pass_inc[pass] + 1)*pixel_bytes

                        ".loop3_pass4: \n\t"
                           "movq (%%esi), %%mm0 \n\t" // x x 5 4 3 2 1 0
                           "movq %%mm0, %%mm1 \n\t" // x x 5 4 3 2 1 0
                           "movq %%mm0, %%mm2 \n\t" // x x 5 4 3 2 1 0
                           "psllq $24, %%mm0 \n\t" // 4 3 2 1 0 z z z
                           "pand (%3), %%mm1 \n\t" // z z z z z 2 1 0
                           "psrlq $24, %%mm2 \n\t" // z z z x x 5 4 3
                           "por %%mm1, %%mm0 \n\t" // 4 3 2 1 0 2 1 0
                           "movq %%mm2, %%mm3 \n\t" // z z z x x 5 4 3
                           "psllq $8, %%mm2 \n\t" // z z x x 5 4 3 z
                           "movq %%mm0, (%%edi) \n\t"
                           "psrlq $16, %%mm3 \n\t" // z z z z z x x 5
                           "pand (%4), %%mm3 \n\t" // z z z z z z z 5
                           "por %%mm3, %%mm2 \n\t" // z z x x 5 4 3 5
                           "subl $6, %%esi \n\t"
                           "movd %%mm2, 8(%%edi) \n\t"
                           "subl $12, %%edi \n\t"
                           "subl $2, %%ecx \n\t"
                           "jnz .loop3_pass4 \n\t"
                           "EMMS \n\t" // DONE

                           : "=c" (dummy_value_c),  // output regs (dummy)
                             "=S" (dummy_value_S),
                             "=D" (dummy_value_D),
                             "=a" (dummy_value_a),
                             "=d" (dummy_value_d)

                           : "1" (sptr),      // esi // input regs
                             "2" (dp),        // edi
                             "0" (width_mmx), // ecx
                             "3" (&_const4),  // 0x0000000000FFFFFFLL
                             "4" (&_const6)   // 0x00000000000000FFLL

#if 0 /* %mm0, ..., %mm3 not supported by gcc 2.7.2.3 or egcs 1.1 */
                           : "%mm0", "%mm1"  // clobber list
                           , "%mm2", "%mm3"
#endif
                        );
                     }

                     sptr -= width_mmx*3;
                     dp -= width_mmx*6;
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;

                        png_memcpy(v, sptr, 3);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           png_memcpy(dp, v, 3);
                           dp -= 3;
                        }
                        sptr -= 3;
                     }
                  }
               } /* end of pixel_bytes == 3 */

               //--------------------------------------------------------------
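               // Note (illustrative): the 1-byte cases below use the
               // punpcklbw/punpcklwd/punpckldq ladder, each step doubling
               // the replication, to produce the 8x (pass 0/1), 4x
               // (pass 2/3), or 2x (pass 4/5) pixel expansion in registers.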
               else if (pixel_bytes == 1)
               {
                  if (((pass == 0) || (pass == 1)) && width)
                  {
                     int width_mmx = ((width >> 2) << 2);
                     width -= width_mmx;   // 0-3 pixels => 0-3 bytes
                     if (width_mmx)
                     {
                        int dummy_value_c;  // fix 'forbidden register spilled'
                        int dummy_value_S;
                        int dummy_value_D;

                        __asm__ __volatile__ (
                           "subl $3, %%esi \n\t"
                           "subl $31, %%edi \n\t"

                        ".loop1_pass0: \n\t"
                           "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
                           "movq %%mm0, %%mm1 \n\t" // x x x x 3 2 1 0
                           "punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0
                           "movq %%mm0, %%mm2 \n\t" // 3 3 2 2 1 1 0 0
                           "punpcklwd %%mm0, %%mm0 \n\t" // 1 1 1 1 0 0 0 0
                           "movq %%mm0, %%mm3 \n\t" // 1 1 1 1 0 0 0 0
                           "punpckldq %%mm0, %%mm0 \n\t" // 0 0 0 0 0 0 0 0
                           "punpckhdq %%mm3, %%mm3 \n\t" // 1 1 1 1 1 1 1 1
                           "movq %%mm0, (%%edi) \n\t"
                           "punpckhwd %%mm2, %%mm2 \n\t" // 3 3 3 3 2 2 2 2
                           "movq %%mm3, 8(%%edi) \n\t"
                           "movq %%mm2, %%mm4 \n\t" // 3 3 3 3 2 2 2 2
                           "punpckldq %%mm2, %%mm2 \n\t" // 2 2 2 2 2 2 2 2
                           "punpckhdq %%mm4, %%mm4 \n\t" // 3 3 3 3 3 3 3 3
                           "movq %%mm2, 16(%%edi) \n\t"
                           "subl $4, %%esi \n\t"
                           "movq %%mm4, 24(%%edi) \n\t"
                           "subl $32, %%edi \n\t"
                           "subl $4, %%ecx \n\t"
                           "jnz .loop1_pass0 \n\t"
                           "EMMS \n\t" // DONE

                           : "=c" (dummy_value_c),  // output regs (dummy)
                             "=S" (dummy_value_S),
                             "=D" (dummy_value_D)

                           : "1" (sptr),      // esi // input regs
                             "2" (dp),        // edi
                             "0" (width_mmx)  // ecx

#if 0 /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
                           : "%mm0", "%mm1", "%mm2"  // clobber list
                           , "%mm3", "%mm4"
#endif
                        );
                     }

                     sptr -= width_mmx;
                     dp -= width_mmx*8;
                     for (i = width; i; i--)
                     {
                        int j;

                       /* I simplified this part in version 1.0.4e
                        * here and in several other instances where
                        * pixel_bytes == 1 -- GR-P
                        *
                        * Original code:
                        *
                        * png_byte v[8];
                        * png_memcpy(v, sptr, pixel_bytes);
                        * for (j = 0; j < png_pass_inc[pass]; j++)
                        * {
                        *    png_memcpy(dp, v, pixel_bytes);
                        *    dp -= pixel_bytes;
                        * }
                        * sptr -= pixel_bytes;
                        *
                        * Replacement code is in the next three lines:
                        */

                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           *dp-- = *sptr;
                        }
                        --sptr;
                     }
                  }
                  else if (((pass == 2) || (pass == 3)) && width)
                  {
                     int width_mmx = ((width >> 2) << 2);
                     width -= width_mmx;   // 0-3 pixels => 0-3 bytes
                     if (width_mmx)
                     {
                        int dummy_value_c;  // fix 'forbidden register spilled'
                        int dummy_value_S;
                        int dummy_value_D;

                        __asm__ __volatile__ (
                           "subl $3, %%esi \n\t"
                           "subl $15, %%edi \n\t"

                        ".loop1_pass2: \n\t"
                           "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
                           "punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0
                           "movq %%mm0, %%mm1 \n\t" // 3 3 2 2 1 1 0 0
                           "punpcklwd %%mm0, %%mm0 \n\t" // 1 1 1 1 0 0 0 0
                           "punpckhwd %%mm1, %%mm1 \n\t" // 3 3 3 3 2 2 2 2
                           "movq %%mm0, (%%edi) \n\t"
                           "subl $4, %%esi \n\t"
                           "movq %%mm1, 8(%%edi) \n\t"
                           "subl $16, %%edi \n\t"
                           "subl $4, %%ecx \n\t"
                           "jnz .loop1_pass2 \n\t"
                           "EMMS \n\t" // DONE

                           : "=c" (dummy_value_c),  // output regs (dummy)
                             "=S" (dummy_value_S),
                             "=D" (dummy_value_D)

                           : "1" (sptr),      // esi // input regs
                             "2" (dp),        // edi
                             "0" (width_mmx)  // ecx

#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
                           : "%mm0", "%mm1"  // clobber list
#endif
                        );
                     }

                     sptr -= width_mmx;
                     dp -= width_mmx*4;
                     for (i = width; i; i--)
                     {
                        int j;

                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           *dp-- = *sptr;
                        }
                        --sptr;
                     }
                  }
                  else if (width) /* && ((pass == 4) || (pass == 5)) */
                  {
                     int width_mmx = ((width >> 3) << 3);
                     width -= width_mmx;   // 0-7 pixels => 0-7 bytes
                     if (width_mmx)
                     {
                        int dummy_value_c;  // fix 'forbidden register spilled'
                        int dummy_value_S;
                        int dummy_value_D;

                        __asm__ __volatile__ (
                           "subl $7, %%esi \n\t"
                           "subl $15, %%edi \n\t"

                        ".loop1_pass4: \n\t"
                           "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
                           "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
                           "punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0
                           "punpckhbw %%mm1, %%mm1 \n\t" // 7 7 6 6 5 5 4 4
                           "movq %%mm1, 8(%%edi) \n\t"
                           "subl $8, %%esi \n\t"
                           "movq %%mm0, (%%edi) \n\t"
                           "subl $16, %%edi \n\t"
                           "subl $8, %%ecx \n\t"
                           "jnz .loop1_pass4 \n\t"
                           "EMMS \n\t" // DONE
                           : "=c" (dummy_value_c),  // output regs (dummy)
                             "=S" (dummy_value_S),
                             "=D" (dummy_value_D)

                           : "1" (sptr),      // esi // input regs
                             "2" (dp),        // edi
                             "0" (width_mmx)  // ecx

#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
                           : "%mm0", "%mm1"  // clobber list
#endif
                        );
                     }

                     sptr -= width_mmx;
                     dp -= width_mmx*2;
                     for (i = width; i; i--)
                     {
                        int j;

                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           *dp-- = *sptr;
                        }
                        --sptr;
                     }
                  }
               } /* end of pixel_bytes == 1 */

               //--------------------------------------------------------------
               else if (pixel_bytes == 2)
               {
                  if (((pass == 0) || (pass == 1)) && width)
                  {
                     int width_mmx = ((width >> 1) << 1);
                     width -= width_mmx;   // 0,1 pixels => 0,2 bytes
                     if (width_mmx)
                     {
                        int dummy_value_c;  // fix 'forbidden register spilled'
                        int dummy_value_S;
                        int dummy_value_D;

                        __asm__ __volatile__ (
                           "subl $2, %%esi \n\t"
                           "subl $30, %%edi \n\t"

                        ".loop2_pass0: \n\t"
                           "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
                           "punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0
                           "movq %%mm0, %%mm1 \n\t" // 3 2 3 2 1 0 1 0
                           "punpckldq %%mm0, %%mm0 \n\t" // 1 0 1 0 1 0 1 0
                           "punpckhdq %%mm1, %%mm1 \n\t" // 3 2 3 2 3 2 3 2
                           "movq %%mm0, (%%edi) \n\t"
                           "movq %%mm0, 8(%%edi) \n\t"
                           "movq %%mm1, 16(%%edi) \n\t"
                           "subl $4, %%esi \n\t"
                           "movq %%mm1, 24(%%edi) \n\t"
                           "subl $32, %%edi \n\t"
                           "subl $2, %%ecx \n\t"
                           "jnz .loop2_pass0 \n\t"
                           "EMMS \n\t" // DONE

                           : "=c" (dummy_value_c),  // output regs (dummy)
                             "=S" (dummy_value_S),
                             "=D" (dummy_value_D)

                           : "1" (sptr),      // esi // input regs
                             "2" (dp),        // edi
                             "0" (width_mmx)  // ecx

#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
                           : "%mm0", "%mm1"  // clobber list
#endif
                        );
                     }

                     sptr -= (width_mmx*2 - 2);  // sign fixed
                     dp -= (width_mmx*16 - 2);   // sign fixed
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;

                        sptr -= 2;
                        png_memcpy(v, sptr, 2);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           dp -= 2;
                           png_memcpy(dp, v, 2);
                        }
                     }
                  }
                  else if (((pass == 2) || (pass == 3)) && width)
                  {
                     int width_mmx = ((width >> 1) << 1);
                     width -= width_mmx;   // 0,1 pixels => 0,2 bytes
                     if (width_mmx)
                     {
                        int dummy_value_c;  // fix 'forbidden register spilled'
                        int dummy_value_S;
                        int dummy_value_D;

                        __asm__ __volatile__ (
                           "subl $2, %%esi \n\t"
                           "subl $14, %%edi \n\t"

                        ".loop2_pass2: \n\t"
                           "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
                           "punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0
                           "movq %%mm0, %%mm1 \n\t" // 3 2 3 2 1 0 1 0
                           "punpckldq %%mm0, %%mm0 \n\t" // 1 0 1 0 1 0 1 0
                           "punpckhdq %%mm1, %%mm1 \n\t" // 3 2 3 2 3 2 3 2
                           "movq %%mm0, (%%edi) \n\t"
                           "subl $4, %%esi \n\t"
                           "movq %%mm1, 8(%%edi) \n\t"
                           "subl $16, %%edi \n\t"
                           "subl $2, %%ecx \n\t"
                           "jnz .loop2_pass2 \n\t"
                           "EMMS \n\t" // DONE

                           : "=c" (dummy_value_c),  // output regs (dummy)
                             "=S" (dummy_value_S),
                             "=D" (dummy_value_D)

                           : "1" (sptr),      // esi // input regs
                             "2" (dp),        // edi
                             "0" (width_mmx)  // ecx

#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
                           : "%mm0", "%mm1"  // clobber list
#endif
                        );
                     }

                     sptr -= (width_mmx*2 - 2);  // sign fixed
                     dp -= (width_mmx*8 - 2);    // sign fixed
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;

                        sptr -= 2;
                        png_memcpy(v, sptr, 2);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           dp -= 2;
                           png_memcpy(dp, v, 2);
                        }
                     }
                  }
                  else if (width) // pass == 4 or 5
                  {
                     int width_mmx = ((width >> 1) << 1);
                     width -= width_mmx;   // 0,1 pixels => 0,2 bytes
                     if (width_mmx)
                     {
                        int dummy_value_c;  // fix 'forbidden register spilled'
                        int dummy_value_S;
                        int dummy_value_D;

                        __asm__ __volatile__ (
                           "subl $2, %%esi \n\t"
                           "subl $6, %%edi \n\t"

                        ".loop2_pass4: \n\t"
                           "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
                           "punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0
                           "subl $4, %%esi \n\t"
                           "movq %%mm0, (%%edi) \n\t"
                           "subl $8, %%edi \n\t"
                           "subl $2, %%ecx \n\t"
                           "jnz .loop2_pass4 \n\t"
                           "EMMS \n\t" // DONE

                           : "=c" (dummy_value_c),  // output regs (dummy)
                             "=S" (dummy_value_S),
                             "=D" (dummy_value_D)

                           : "1" (sptr),      // esi // input regs
                             "2" (dp),        // edi
                             "0" (width_mmx)  // ecx

#if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
                           : "%mm0"  // clobber list
#endif
                        );
                     }

                     sptr -= (width_mmx*2 - 2);  // sign fixed
                     dp -= (width_mmx*4 - 2);    // sign fixed
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;

                        sptr -= 2;
                        png_memcpy(v, sptr, 2);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           dp -= 2;
                           png_memcpy(dp, v, 2);
                        }
                     }
                  }
               } /* end of pixel_bytes == 2 */

               //--------------------------------------------------------------
               else if (pixel_bytes == 4)
               {
                  if (((pass == 0) || (pass == 1)) && width)
                  {
                     int width_mmx = ((width >> 1) << 1);
                     width -= width_mmx;   // 0,1 pixels => 0,4 bytes
                     if (width_mmx)
                     {
                        int dummy_value_c;  // fix 'forbidden register spilled'
                        int dummy_value_S;
                        int dummy_value_D;

                        __asm__ __volatile__ (
                           "subl $4, %%esi \n\t"
                           "subl $60, %%edi \n\t"

                        ".loop4_pass0: \n\t"
                           "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
                           "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
                           "punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
                           "punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
                           "movq %%mm0, (%%edi) \n\t"
                           "movq %%mm0, 8(%%edi) \n\t"
                           "movq %%mm0, 16(%%edi) \n\t"
                           "movq %%mm0, 24(%%edi) \n\t"
                           "movq %%mm1, 32(%%edi) \n\t"
                           "movq %%mm1, 40(%%edi) \n\t"
                           "movq %%mm1, 48(%%edi) \n\t"
                           "subl $8, %%esi \n\t"
                           "movq %%mm1, 56(%%edi) \n\t"
                           "subl $64, %%edi \n\t"
                           "subl $2, %%ecx \n\t"
                           "jnz .loop4_pass0 \n\t"
                           "EMMS \n\t" // DONE

                           : "=c" (dummy_value_c),  // output regs (dummy)
                             "=S" (dummy_value_S),
                             "=D" (dummy_value_D)

                           : "1" (sptr),      // esi // input regs
                             "2" (dp),        // edi
                             "0" (width_mmx)  // ecx

#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
                           : "%mm0", "%mm1"  // clobber list
#endif
                        );
                     }

                     sptr -= (width_mmx*4 - 4);  // sign fixed
                     dp -= (width_mmx*32 - 4);   // sign fixed
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;

                        sptr -= 4;
                        png_memcpy(v, sptr, 4);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           dp -= 4;
                           png_memcpy(dp, v, 4);
                        }
                     }
                  }
                  else if (((pass == 2) || (pass == 3)) && width)
                  {
                     int width_mmx = ((width >> 1) << 1);
                     width -= width_mmx;   // 0,1 pixels => 0,4 bytes
                     if (width_mmx)
                     {
                        int dummy_value_c;  // fix 'forbidden register spilled'
                        int dummy_value_S;
                        int dummy_value_D;

                        __asm__ __volatile__ (
                           "subl $4, %%esi \n\t"
                           "subl $28, %%edi \n\t"

                        ".loop4_pass2: \n\t"
                           "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
                           "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
                           "punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
                           "punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
                           "movq %%mm0, (%%edi) \n\t"
                           "movq %%mm0, 8(%%edi) \n\t"
                           "movq %%mm1, 16(%%edi) \n\t"
                           "movq %%mm1, 24(%%edi) \n\t"
                           "subl $8, %%esi \n\t"
                           "subl $32, %%edi \n\t"
                           "subl $2, %%ecx \n\t"
                           "jnz .loop4_pass2 \n\t"
                           "EMMS \n\t" // DONE

                           : "=c" (dummy_value_c),  // output regs (dummy)
                             "=S" (dummy_value_S),
                             "=D" (dummy_value_D)

                           : "1" (sptr),      // esi // input regs
                             "2" (dp),        // edi
                             "0" (width_mmx)  // ecx

#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
                           : "%mm0", "%mm1"  // clobber list
#endif
                        );
                     }

                     sptr -= (width_mmx*4 - 4);  // sign fixed
                     dp -= (width_mmx*16 - 4);   // sign fixed
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;

                        sptr -= 4;
                        png_memcpy(v, sptr, 4);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           dp -= 4;
                           png_memcpy(dp, v, 4);
                        }
                     }
                  }
                  else if (width) // pass == 4 or 5
                  {
                     int width_mmx = ((width >> 1) << 1);
                     width -= width_mmx;   // 0,1 pixels => 0,4 bytes
                     if (width_mmx)
                     {
                        int dummy_value_c;  // fix 'forbidden register spilled'
                        int dummy_value_S;
                        int dummy_value_D;

                        __asm__ __volatile__ (
                           "subl $4, %%esi \n\t"
                           "subl $12, %%edi \n\t"

                        ".loop4_pass4: \n\t"
                           "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
                           "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
                           "punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
                           "punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
                           "movq %%mm0, (%%edi) \n\t"
                           "subl $8, %%esi \n\t"
                           "movq %%mm1, 8(%%edi) \n\t"
                           "subl $16, %%edi \n\t"
                           "subl $2, %%ecx \n\t"
                           "jnz .loop4_pass4 \n\t"
                           "EMMS \n\t" // DONE

                           : "=c" (dummy_value_c),  // output regs (dummy)
                             "=S" (dummy_value_S),
                             "=D" (dummy_value_D)

                           : "1" (sptr),      // esi // input regs
                             "2" (dp),        // edi
                             "0" (width_mmx)  // ecx

#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
                           : "%mm0", "%mm1"  // clobber list
#endif
                        );
                     }

                     sptr -= (width_mmx*4 - 4);  // sign fixed
                     dp -= (width_mmx*8 - 4);    // sign fixed
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;

                        sptr -= 4;
                        png_memcpy(v, sptr, 4);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           dp -= 4;
                           png_memcpy(dp, v, 4);
                        }
                     }
                  }
               } /* end of pixel_bytes == 4 */

               //--------------------------------------------------------------
               else if (pixel_bytes == 8)
               {
// GRR TEST: should work, but needs testing (special 64-bit version of rpng2?)
                  // GRR NOTE: no need to combine passes here!
                  if (((pass == 0) || (pass == 1)) && width)
                  {
                     int dummy_value_c;  // fix 'forbidden register spilled'
                     int dummy_value_S;
                     int dummy_value_D;

                     // source is 8-byte RRGGBBAA
                     // dest is 64-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA ...
                     __asm__ __volatile__ (
                        "subl $56, %%edi \n\t" // start of last block

                     ".loop8_pass0: \n\t"
                        "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
                        "movq %%mm0, (%%edi) \n\t"
                        "movq %%mm0, 8(%%edi) \n\t"
                        "movq %%mm0, 16(%%edi) \n\t"
                        "movq %%mm0, 24(%%edi) \n\t"
                        "movq %%mm0, 32(%%edi) \n\t"
                        "movq %%mm0, 40(%%edi) \n\t"
                        "movq %%mm0, 48(%%edi) \n\t"
                        "subl $8, %%esi \n\t"
                        "movq %%mm0, 56(%%edi) \n\t"
                        "subl $64, %%edi \n\t"
                        "decl %%ecx \n\t"
                        "jnz .loop8_pass0 \n\t"
                        "EMMS \n\t" // DONE

                        : "=c" (dummy_value_c),  // output regs (dummy)
                          "=S" (dummy_value_S),
                          "=D" (dummy_value_D)

                        : "1" (sptr),  // esi // input regs
                          "2" (dp),    // edi
                          "0" (width)  // ecx

#if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
                        : "%mm0"  // clobber list
#endif
                     );
                  }
                  else if (((pass == 2) || (pass == 3)) && width)
                  {
                     // source is 8-byte RRGGBBAA
                     // dest is 32-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA
                     // (recall that expansion is _in place_: sptr and dp
                     //  both point at locations within same row buffer)
                     {
                        int dummy_value_c;  // fix 'forbidden register spilled'
                        int dummy_value_S;
                        int dummy_value_D;

                        __asm__ __volatile__ (
                           "subl $24, %%edi \n\t" // start of last block

                        ".loop8_pass2: \n\t"
                           "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
                           "movq %%mm0, (%%edi) \n\t"
                           "movq %%mm0, 8(%%edi) \n\t"
                           "movq %%mm0, 16(%%edi) \n\t"
                           "subl $8, %%esi \n\t"
                           "movq %%mm0, 24(%%edi) \n\t"
                           "subl $32, %%edi \n\t"
                           "decl %%ecx \n\t"
                           "jnz .loop8_pass2 \n\t"
                           "EMMS \n\t" // DONE

                           : "=c" (dummy_value_c),  // output regs (dummy)
                             "=S" (dummy_value_S),
                             "=D" (dummy_value_D)

                           : "1" (sptr),  // esi // input regs
                             "2" (dp),    // edi
                             "0" (width)  // ecx

#if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
                           : "%mm0"  // clobber list
#endif
                        );
                     }
                  }
                  else if (width) // pass == 4 or 5
                  {
                     // source is 8-byte RRGGBBAA
                     // dest is 16-byte RRGGBBAA RRGGBBAA
                     {
                        int dummy_value_c;  // fix 'forbidden register spilled'
                        int dummy_value_S;
                        int dummy_value_D;

                        __asm__ __volatile__ (
                           "subl $8, %%edi \n\t" // start of last block

                        ".loop8_pass4: \n\t"
                           "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
                           "movq %%mm0, (%%edi) \n\t"
                           "subl $8, %%esi \n\t"
                           "movq %%mm0, 8(%%edi) \n\t"
                           "subl $16, %%edi \n\t"
                           "decl %%ecx \n\t"
                           "jnz .loop8_pass4 \n\t"
                           "EMMS \n\t" // DONE

                           : "=c" (dummy_value_c),  // output regs (dummy)
                             "=S" (dummy_value_S),
                             "=D" (dummy_value_D)

                           : "1" (sptr),  // esi // input regs
                             "2" (dp),    // edi
                             "0" (width)  // ecx

#if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
                           : "%mm0"  // clobber list
#endif
                        );
                     }
                  }
               } /* end of pixel_bytes == 8 */

               //--------------------------------------------------------------
               else if (pixel_bytes == 6)
               {
                  for (i = width; i; i--)
                  {
                     png_byte v[8];
                     int j;

                     png_memcpy(v, sptr, 6);
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
                        png_memcpy(dp, v, 6);
                        dp -= 6;
                     }
                     sptr -= 6;
                  }
               } /* end of pixel_bytes == 6 */

               //--------------------------------------------------------------
               else
               {
                  for (i = width; i; i--)
                  {
                     png_byte v[8];
                     int j;

                     png_memcpy(v, sptr, pixel_bytes);
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
                        png_memcpy(dp, v, pixel_bytes);
                        dp -= pixel_bytes;
                     }
                     sptr -= pixel_bytes;
                  }
               }
            } // end of _mmx_supported ========================================

            else /* MMX not supported: use modified C code - takes advantage
                  *   of inlining of png_memcpy for a constant */
                 /* GRR 19991007: does it? or should pixel_bytes in each
                  *   block be replaced with immediate value (e.g., 1)? */
                 /* GRR 19991017: replaced with constants in each case */
#endif /* PNG_MMX_CODE_SUPPORTED */
            {
               if (pixel_bytes == 1)
               {
                  for (i = width; i; i--)
                  {
                     int j;
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
                        *dp-- = *sptr;
                     }
                     --sptr;
                  }
               }
               else if (pixel_bytes == 3)
               {
                  for (i = width; i; i--)
                  {
                     png_byte v[8];
                     int j;
                     png_memcpy(v, sptr, 3);
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
                        png_memcpy(dp, v, 3);
                        dp -= 3;
                     }
                     sptr -= 3;
                  }
               }
               else if (pixel_bytes == 2)
               {
                  for (i = width; i; i--)
                  {
                     png_byte v[8];
                     int j;
                     png_memcpy(v, sptr, 2);
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
                        png_memcpy(dp, v, 2);
                        dp -= 2;
                     }
                     sptr -= 2;
                  }
               }
               else if (pixel_bytes == 4)
               {
                  for (i = width; i; i--)
                  {
                     png_byte v[8];
                     int j;
                     png_memcpy(v, sptr, 4);
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
#ifdef PNG_DEBUG
                        if (dp < row || dp+3 > row+png_ptr->row_buf_size)
                        {
                           printf("dp out of bounds: row=%p, dp=%p, rp=%p\n",
                              (void *)row, (void *)dp,
                              (void *)(row + png_ptr->row_buf_size));
                           printf("row_buf=%lu\n",
                              (unsigned long)png_ptr->row_buf_size);
                        }
#endif
                        png_memcpy(dp, v, 4);
                        dp -= 4;
                     }
                     sptr -= 4;
                  }
               }
               else if (pixel_bytes == 6)
               {
                  for (i = width; i; i--)
                  {
                     png_byte v[8];
                     int j;
                     png_memcpy(v, sptr, 6);
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
                        png_memcpy(dp, v, 6);
                        dp -= 6;
                     }
                     sptr -= 6;
                  }
               }
               else if (pixel_bytes == 8)
               {
                  for (i = width; i; i--)
                  {
                     png_byte v[8];
                     int j;
                     png_memcpy(v, sptr, 8);
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
                        png_memcpy(dp, v, 8);
                        dp -= 8;
                     }
                     sptr -= 8;
                  }
               }
               else /* GRR: should never be reached */
               {
                  for (i = width; i; i--)
                  {
                     png_byte v[8];
                     int j;
                     png_memcpy(v, sptr, pixel_bytes);
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
                        png_memcpy(dp, v, pixel_bytes);
                        dp -= pixel_bytes;
                     }
                     sptr -= pixel_bytes;
                  }
               }
            } /* end if (MMX not supported) */
            break;
         }
      } /* end switch (row_info->pixel_depth) */

      row_info->width = final_width;
      row_info->rowbytes = PNG_ROWBYTES(row_info->pixel_depth,final_width);
   }

} /* end png_do_read_interlace() */

#endif /* PNG_HAVE_MMX_READ_INTERLACE */
#endif /* PNG_READ_INTERLACING_SUPPORTED */



#if defined(PNG_HAVE_MMX_READ_FILTER_ROW)
#if defined(PNG_MMX_CODE_SUPPORTED)

// These variables are utilized in the functions below.  They are declared
// globally here to ensure alignment on 8-byte boundaries.

union uAll {
   long long use;
   double align;
} _LBCarryMask = {0x0101010101010101LL},
  _HBClearMask = {0x7f7f7f7f7f7f7f7fLL},
  _ActiveMask, _ActiveMask2, _ActiveMaskEnd, _ShiftBpp, _ShiftRem;

#ifdef PNG_THREAD_UNSAFE_OK

//===========================================================================//
//                                                                           //
//           P N G _ R E A D _ F I L T E R _ R O W _ M M X _ A V G           //
//                                                                           //
//===========================================================================//

// Optimized code for PNG Average filter decoder
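
#if 0
/* Scalar reference for the Average defilter (an illustrative sketch only;
 * libpng's real non-MMX fallback lives in pngrutil.c):
 *
 *    Raw(x) = Avg(x) + floor((Raw(x-bpp) + Prior(x)) / 2)
 *
 * where Raw(x-bpp) is taken as 0 for the first bpp bytes of the row.
 */
static void
png_read_filter_row_avg_sketch(png_row_infop row_info, png_bytep row,
   png_bytep prev_row)
{
   int bpp = (row_info->pixel_depth + 7) >> 3;   /* bytes per pixel */
   png_uint_32 i;
   png_uint_32 istop = row_info->rowbytes;

   for (i = 0; i < (png_uint_32)bpp; i++)        /* no Raw(x-bpp) yet */
      row[i] = (png_byte)(row[i] + (prev_row[i] >> 1));

   for (; i < istop; i++)                        /* general case */
      row[i] = (png_byte)(row[i] + ((row[i-bpp] + prev_row[i]) >> 1));
}
#endif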
static void /* PRIVATE */
png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
                            png_bytep prev_row)
{
   int bpp;
   int dummy_value_c;   // fix 'forbidden register 2 (cx) was spilled' error
   int dummy_value_S;
   int dummy_value_D;

   bpp = (row_info->pixel_depth + 7) >> 3;  // get # bytes per pixel
   _FullLength = row_info->rowbytes;        // # of bytes to filter

   __asm__ __volatile__ (
      // initialize address pointers and offset
#ifdef __PIC__
      "pushl %%ebx \n\t" // save index to Global Offset Table
#endif
//pre "movl row, %%edi \n\t" // edi: Avg(x)
      "xorl %%ebx, %%ebx \n\t" // ebx: x
      "movl %%edi, %%edx \n\t"
//pre "movl prev_row, %%esi \n\t" // esi: Prior(x)
//pre "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
      "subl %%ecx, %%edx \n\t" // edx: Raw(x-bpp)
      "xorl %%eax,%%eax \n\t"

      // Compute the Raw value for the first bpp bytes
      //    Raw(x) = Avg(x) + (Prior(x)/2)
   "avg_rlp: \n\t"
      "movb (%%esi,%%ebx,),%%al \n\t" // load al with Prior(x)
      "incl %%ebx \n\t"
      "shrb %%al \n\t" // divide by 2
      "addb -1(%%edi,%%ebx,),%%al \n\t" // add Avg(x); -1 to offset inc ebx
//pre "cmpl bpp, %%ebx \n\t" // (bpp is preloaded into ecx)
      "cmpl %%ecx, %%ebx \n\t"
      "movb %%al,-1(%%edi,%%ebx,) \n\t" // write Raw(x); -1 to offset inc ebx
      "jb avg_rlp \n\t" // mov does not affect flags

      // get # of bytes to alignment
      "movl %%edi, _dif \n\t" // take start of row
      "addl %%ebx, _dif \n\t" // add bpp
      "addl $0xf, _dif \n\t" // add 7+8 to incr past alignment bdry
      "andl $0xfffffff8, _dif \n\t" // mask to alignment boundary
      "subl %%edi, _dif \n\t" // subtract from start => value ebx at
      "jz avg_go \n\t" //  alignment

      // fix alignment
      // Compute the Raw value for the bytes up to the alignment boundary
      //    Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
      "xorl %%ecx, %%ecx \n\t"

   "avg_lp1: \n\t"
      "xorl %%eax, %%eax \n\t"
      "movb (%%esi,%%ebx,), %%cl \n\t" // load cl with Prior(x)
      "movb (%%edx,%%ebx,), %%al \n\t" // load al with Raw(x-bpp)
      "addw %%cx, %%ax \n\t"
      "incl %%ebx \n\t"
      "shrw %%ax \n\t" // divide by 2
      "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
      "cmpl _dif, %%ebx \n\t" // check if at alignment boundary
      "movb %%al, -1(%%edi,%%ebx,) \n\t" // write Raw(x); -1 to offset inc ebx
      "jb avg_lp1 \n\t" // repeat until at alignment boundary

   "avg_go: \n\t"
      "movl _FullLength, %%eax \n\t"
      "movl %%eax, %%ecx \n\t"
      "subl %%ebx, %%eax \n\t" // subtract alignment fix
      "andl $0x00000007, %%eax \n\t" // calc bytes over mult of 8
      "subl %%eax, %%ecx \n\t" // drop over bytes from original length
      "movl %%ecx, _MMXLength \n\t"
#ifdef __PIC__
      "popl %%ebx \n\t" // restore index to Global Offset Table
#endif

      : "=c" (dummy_value_c),  // output regs (dummy)
        "=S" (dummy_value_S),
        "=D" (dummy_value_D)

      : "0" (bpp),       // ecx // input regs
        "1" (prev_row),  // esi
        "2" (row)        // edi

      : "%eax", "%edx"   // clobber list
#ifndef __PIC__
      , "%ebx"
#endif

      // GRR: INCLUDE "memory" as clobbered? (_dif, _MMXLength)
      // (seems to work fine without...)
   );
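
   // Note (illustrative): the block above splits the row into three
   // phases: a byte-at-a-time prologue covering the first bpp bytes and
   // the run up to the next 8-byte-aligned offset (stored in _dif), an
   // aligned MMX body running from _dif up to _MMXLength (the bpp-specific
   // cases below), and a byte-at-a-time epilogue for the last 0-7 bytes.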
  2557. // now do the math for the rest of the row
  2558. switch (bpp)
  2559. {
  2560. case 3:
  2561. {
  2562. _ActiveMask.use = 0x0000000000ffffffLL;
  2563. _ShiftBpp.use = 24; // == 3 * 8
  2564. _ShiftRem.use = 40; // == 64 - 24
         __asm__ __volatile__ (
            // re-init address pointers and offset
            "movq _ActiveMask, %%mm7 \n\t"
            "movl _dif, %%ecx \n\t" // ecx: x = offset to alignment boundary
            "movq _LBCarryMask, %%mm5 \n\t"
            // preload "movl row, %%edi \n\t" // edi: Avg(x)
            "movq _HBClearMask, %%mm4 \n\t"
            // preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
            // prime the pump: load the first Raw(x-bpp) data set
            "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes (correct pos. in loop below)
            "avg_3lp: \n\t"
            "movq (%%edi,%%ecx,), %%mm0 \n\t" // load mm0 with Avg(x)
            "movq %%mm5, %%mm3 \n\t"
            "psrlq _ShiftRem, %%mm2 \n\t" // correct position Raw(x-bpp) data
            "movq (%%esi,%%ecx,), %%mm1 \n\t" // load mm1 with Prior(x)
            "movq %%mm7, %%mm6 \n\t"
            "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
            "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
            "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each byte
            "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for each byte
            // add 1st active group (Raw(x-bpp)/2) to average with LBCarry
            "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting LBCarrys
            "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte where both lsb's were == 1 (only valid for active group)
            "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
            "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each byte
            "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
            "pand %%mm6, %%mm2 \n\t" // leave only Active Group 1 bytes to add to Avg
            "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg for each Active byte
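            // (note, not original: the byte-wise average relies on the
            // identity (a+b)>>1 == (a>>1) + (b>>1) + (a & b & 1); the
            // (a & b & 1) term is the "LBCarry" that _LBCarryMask extracts,
            // since paddb has no room for the 9-bit intermediate sum)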
            // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
            "psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover bytes 3-5
            "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
            "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
            "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting LBCarrys
            "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte where both lsb's were == 1 (only valid for active group)
            "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
            "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each byte
            "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
            "pand %%mm6, %%mm2 \n\t" // leave only Active Group 2 bytes to add to Avg
            "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg for each Active byte
            // add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
            "psllq _ShiftBpp, %%mm6 \n\t" // shift mm6 mask to cover last two bytes
            "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
            "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
            // Data only needs to be shifted once here to get the correct x-bpp offset.
            "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting LBCarrys
            "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte where both lsb's were == 1 (only valid for active group)
            "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
            "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each byte
            "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
            "pand %%mm6, %%mm2 \n\t" // leave only Active Group 3 bytes to add to Avg
            "addl $8, %%ecx \n\t"
            "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg for each Active byte
            // now ready to write back to memory
            "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
            // move updated Raw(x) to use as Raw(x-bpp) for next loop
            "cmpl _MMXLength, %%ecx \n\t"
            "movq %%mm0, %%mm2 \n\t" // mov updated Raw(x) to mm2
            "jb avg_3lp \n\t"
            : "=S" (dummy_value_S), // output regs (dummy)
              "=D" (dummy_value_D)
            : "0" (prev_row), // esi // input regs
              "1" (row) // edi
            : "%ecx" // clobber list
#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
            , "%mm0", "%mm1", "%mm2", "%mm3"
            , "%mm4", "%mm5", "%mm6", "%mm7"
#endif
         );
      }
      break; // end 3 bpp
      case 6:
      case 4:
      //case 7: // who wrote this? PNG doesn't support 5 or 7 bytes/pixel
      //case 5: // GRR BOGUS
      {
         _ActiveMask.use = 0xffffffffffffffffLL; // use shift below to clear appropriate inactive bytes
         _ShiftBpp.use = bpp << 3;
         _ShiftRem.use = 64 - _ShiftBpp.use;
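         // (illustrative example: bpp = 4 gives _ShiftBpp = 32 and
         // _ShiftRem = 32, so the shifted mask mm7 below covers bytes 0-3
         // (group 1) and its shifted copy mm6 covers bytes 4-7 (group 2))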
         __asm__ __volatile__ (
            "movq _HBClearMask, %%mm4 \n\t"
            // re-init address pointers and offset
            "movl _dif, %%ecx \n\t" // ecx: x = offset to alignment boundary
            // load _ActiveMask and clear all bytes except for 1st active group
            "movq _ActiveMask, %%mm7 \n\t"
            // preload "movl row, %%edi \n\t" // edi: Avg(x)
            "psrlq _ShiftRem, %%mm7 \n\t"
            // preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
            "movq %%mm7, %%mm6 \n\t"
            "movq _LBCarryMask, %%mm5 \n\t"
            "psllq _ShiftBpp, %%mm6 \n\t" // create mask for 2nd active group
            // prime the pump: load the first Raw(x-bpp) data set
            "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes (we correct pos. in loop below)
            "avg_4lp: \n\t"
            "movq (%%edi,%%ecx,), %%mm0 \n\t"
            "psrlq _ShiftRem, %%mm2 \n\t" // shift data to pos. correctly
            "movq (%%esi,%%ecx,), %%mm1 \n\t"
            // add (Prev_row/2) to average
            "movq %%mm5, %%mm3 \n\t"
            "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
            "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
            "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each byte
            "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for each byte
            // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
            "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting LBCarrys
            "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte where both lsb's were == 1 (only valid for active group)
            "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
            "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each byte
            "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
            "pand %%mm7, %%mm2 \n\t" // leave only Active Group 1 bytes to add to Avg
            "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg for each Active byte
            // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
            "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
            "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
            "addl $8, %%ecx \n\t"
            "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting LBCarrys
            "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte where both lsb's were == 1 (only valid for active group)
            "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
            "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each byte
            "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
            "pand %%mm6, %%mm2 \n\t" // leave only Active Group 2 bytes to add to Avg
            "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg for each Active byte
            "cmpl _MMXLength, %%ecx \n\t"
            // now ready to write back to memory
            "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
            // prep Raw(x-bpp) for next loop
            "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
            "jb avg_4lp \n\t"
            : "=S" (dummy_value_S), // output regs (dummy)
              "=D" (dummy_value_D)
            : "0" (prev_row), // esi // input regs
              "1" (row) // edi
            : "%ecx" // clobber list
#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
            , "%mm0", "%mm1", "%mm2", "%mm3"
            , "%mm4", "%mm5", "%mm6", "%mm7"
#endif
         );
      }
      break; // end 4,6 bpp
      case 2:
      {
         _ActiveMask.use = 0x000000000000ffffLL;
         _ShiftBpp.use = 16; // == 2 * 8
         _ShiftRem.use = 48; // == 64 - 16
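         // (illustrative note: 8 bytes are exactly four 2-byte pixels here,
         // so the loop below walks four active groups of two bytes, shifting
         // the mask and the freshly computed Raws up by _ShiftBpp each time)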
         __asm__ __volatile__ (
            // load _ActiveMask
            "movq _ActiveMask, %%mm7 \n\t"
            // re-init address pointers and offset
            "movl _dif, %%ecx \n\t" // ecx: x = offset to alignment boundary
            "movq _LBCarryMask, %%mm5 \n\t"
            // preload "movl row, %%edi \n\t" // edi: Avg(x)
            "movq _HBClearMask, %%mm4 \n\t"
            // preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
            // prime the pump: load the first Raw(x-bpp) data set
            "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes (we correct pos. in loop below)
            "avg_2lp: \n\t"
            "movq (%%edi,%%ecx,), %%mm0 \n\t"
            "psrlq _ShiftRem, %%mm2 \n\t" // shift data to pos. correctly
            "movq (%%esi,%%ecx,), %%mm1 \n\t" // (GRR BUGFIX: was psllq)
            // add (Prev_row/2) to average
            "movq %%mm5, %%mm3 \n\t"
            "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
            "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
            "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each byte
            "movq %%mm7, %%mm6 \n\t"
            "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for each byte
            // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
            "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting LBCarrys
            "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte where both lsb's were == 1 (only valid for active group)
            "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
            "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each byte
            "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
            "pand %%mm6, %%mm2 \n\t" // leave only Active Group 1 bytes to add to Avg
            "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg for each Active byte
            // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
            "psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover bytes 2 & 3
            "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
            "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
            "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting LBCarrys
            "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte where both lsb's were == 1 (only valid for active group)
            "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
            "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each byte
            "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
            "pand %%mm6, %%mm2 \n\t" // leave only Active Group 2 bytes to add to Avg
            "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg for each Active byte
            // add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
            "psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover bytes 4 & 5
            "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
            "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
            "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting LBCarrys
            "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte where both lsb's were == 1 (only valid for active group)
            "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
            "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each byte
            "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
            "pand %%mm6, %%mm2 \n\t" // leave only Active Group 3 bytes to add to Avg
            "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg for each Active byte
            // add 4th active group (Raw(x-bpp)/2) to average with _LBCarry
            "psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover bytes 6 & 7
            "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
            "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
            "addl $8, %%ecx \n\t"
            "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting LBCarrys
            "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte where both lsb's were == 1 (only valid for active group)
            "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
            "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each byte
            "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
            "pand %%mm6, %%mm2 \n\t" // leave only Active Group 4 bytes to add to Avg
            "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg for each Active byte
            "cmpl _MMXLength, %%ecx \n\t"
            // now ready to write back to memory
            "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
            // prep Raw(x-bpp) for next loop
            "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
            "jb avg_2lp \n\t"
            : "=S" (dummy_value_S), // output regs (dummy)
              "=D" (dummy_value_D)
            : "0" (prev_row), // esi // input regs
              "1" (row) // edi
            : "%ecx" // clobber list
#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
            , "%mm0", "%mm1", "%mm2", "%mm3"
            , "%mm4", "%mm5", "%mm6", "%mm7"
#endif
         );
      }
      break; // end 2 bpp
      case 1:
      {
         __asm__ __volatile__ (
            // re-init address pointers and offset
#ifdef __PIC__
            "pushl %%ebx \n\t" // save Global Offset Table index
#endif
            "movl _dif, %%ebx \n\t" // ebx: x = offset to alignment boundary
            // preload "movl row, %%edi \n\t" // edi: Avg(x)
            "cmpl _FullLength, %%ebx \n\t" // test if offset at end of array
            "jnb avg_1end \n\t"
            // do Avg decode for remaining bytes
            // preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
            "movl %%edi, %%edx \n\t"
            // preload "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
            "subl %%ecx, %%edx \n\t" // edx: Raw(x-bpp)
            "xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx in loop below
            "avg_1lp: \n\t"
            // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
            "xorl %%eax, %%eax \n\t"
            "movb (%%esi,%%ebx,), %%cl \n\t" // load cl with Prior(x)
            "movb (%%edx,%%ebx,), %%al \n\t" // load al with Raw(x-bpp)
            "addw %%cx, %%ax \n\t"
            "incl %%ebx \n\t"
            "shrw %%ax \n\t" // divide by 2
            "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
            "cmpl _FullLength, %%ebx \n\t" // check if at end of array
            "movb %%al, -1(%%edi,%%ebx,) \n\t" // write back Raw(x); mov does not affect flags; -1 to offset inc ebx
            "jb avg_1lp \n\t"
            "avg_1end: \n\t"
#ifdef __PIC__
            "popl %%ebx \n\t" // Global Offset Table index
#endif
            : "=c" (dummy_value_c), // output regs (dummy)
              "=S" (dummy_value_S),
              "=D" (dummy_value_D)
            : "0" (bpp), // ecx // input regs
              "1" (prev_row), // esi
              "2" (row) // edi
            : "%eax", "%edx" // clobber list
#ifndef __PIC__
            , "%ebx"
#endif
         );
      }
      return; // end 1 bpp
      case 8:
      {
         __asm__ __volatile__ (
            // re-init address pointers and offset
            "movl _dif, %%ecx \n\t" // ecx: x == offset to alignment boundary
            "movq _LBCarryMask, %%mm5 \n\t"
            // preload "movl row, %%edi \n\t" // edi: Avg(x)
            "movq _HBClearMask, %%mm4 \n\t"
            // preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
            // prime the pump: load the first Raw(x-bpp) data set
            "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
            // (NO NEED to correct pos. in loop below)
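            // (note, not original: at 8 bpp each 8-byte quadword is exactly
            // one pixel back, so Raw(x-bpp) is simply the previous aligned
            // quadword -- no shifting or active-group masking is needed)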
            "avg_8lp: \n\t"
            "movq (%%edi,%%ecx,), %%mm0 \n\t"
            "movq %%mm5, %%mm3 \n\t"
            "movq (%%esi,%%ecx,), %%mm1 \n\t"
            "addl $8, %%ecx \n\t"
            "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
            "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
            "pand %%mm2, %%mm3 \n\t" // get LBCarrys for each byte where both lsb's were == 1
            "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
            "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7, each byte
            "paddb %%mm3, %%mm0 \n\t" // add LBCarrys to Avg, each byte
            "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7, each byte
            "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg, each
            "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) to Avg for each
            "cmpl _MMXLength, %%ecx \n\t"
            "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
            "movq %%mm0, %%mm2 \n\t" // reuse as Raw(x-bpp)
            "jb avg_8lp \n\t"
            : "=S" (dummy_value_S), // output regs (dummy)
              "=D" (dummy_value_D)
            : "0" (prev_row), // esi // input regs
              "1" (row) // edi
            : "%ecx" // clobber list
#if 0 /* %mm0, ..., %mm5 not supported by gcc 2.7.2.3 or egcs 1.1 */
            , "%mm0", "%mm1", "%mm2"
            , "%mm3", "%mm4", "%mm5"
#endif
         );
      }
      break; // end 8 bpp
      default: // bpp greater than 8 (!= 1,2,3,4,[5],6,[7],8)
      {
#ifdef PNG_DEBUG
         // GRR: PRINT ERROR HERE: SHOULD NEVER BE REACHED
         png_debug(1,
            "Internal logic error in pnggccrd (png_read_filter_row_mmx_avg())\n");
#endif
#if 0
         __asm__ __volatile__ (
            "movq _LBCarryMask, %%mm5 \n\t"
            // re-init address pointers and offset
            "movl _dif, %%ebx \n\t" // ebx: x = offset to alignment boundary
            "movl row, %%edi \n\t" // edi: Avg(x)
            "movq _HBClearMask, %%mm4 \n\t"
            "movl %%edi, %%edx \n\t"
            "movl prev_row, %%esi \n\t" // esi: Prior(x)
            "subl bpp, %%edx \n\t" // edx: Raw(x-bpp)
            "avg_Alp: \n\t"
            "movq (%%edi,%%ebx,), %%mm0 \n\t"
            "movq %%mm5, %%mm3 \n\t"
            "movq (%%esi,%%ebx,), %%mm1 \n\t"
            "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
            "movq (%%edx,%%ebx,), %%mm2 \n\t"
            "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
            "pand %%mm2, %%mm3 \n\t" // get LBCarrys for each byte where both lsb's were == 1
            "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
            "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each byte
            "paddb %%mm3, %%mm0 \n\t" // add LBCarrys to Avg for each byte
            "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each byte
            "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for each byte
            "addl $8, %%ebx \n\t"
            "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) to Avg for each byte
            "cmpl _MMXLength, %%ebx \n\t"
            "movq %%mm0, -8(%%edi,%%ebx,) \n\t"
            "jb avg_Alp \n\t"
            : // FIXASM: output regs/vars go here, e.g.: "=m" (memory_var)
            : // FIXASM: input regs, e.g.: "c" (count), "S" (src), "D" (dest)
            : "%ebx", "%edx", "%edi", "%esi" // CHECKASM: clobber list
         );
#endif /* 0 - NEVER REACHED */
      }
      break;
   } // end switch (bpp)
   __asm__ __volatile__ (
      // MMX acceleration complete; now do clean-up
      // check if any remaining bytes left to decode
#ifdef __PIC__
      "pushl %%ebx \n\t" // save index to Global Offset Table
#endif
      "movl _MMXLength, %%ebx \n\t" // ebx: x == offset bytes after MMX
      //pre "movl row, %%edi \n\t" // edi: Avg(x)
      "cmpl _FullLength, %%ebx \n\t" // test if offset at end of array
      "jnb avg_end \n\t"
      // do Avg decode for remaining bytes
      //pre "movl prev_row, %%esi \n\t" // esi: Prior(x)
      "movl %%edi, %%edx \n\t"
      //pre "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
      "subl %%ecx, %%edx \n\t" // edx: Raw(x-bpp)
      "xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx below
      "avg_lp2: \n\t"
      // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
      "xorl %%eax, %%eax \n\t"
      "movb (%%esi,%%ebx,), %%cl \n\t" // load cl with Prior(x)
      "movb (%%edx,%%ebx,), %%al \n\t" // load al with Raw(x-bpp)
      "addw %%cx, %%ax \n\t"
      "incl %%ebx \n\t"
      "shrw %%ax \n\t" // divide by 2
      "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
      "cmpl _FullLength, %%ebx \n\t" // check if at end of array
      "movb %%al, -1(%%edi,%%ebx,) \n\t" // write back Raw(x) [mov does not
      "jb avg_lp2 \n\t" // affect flags; -1 to offset inc ebx]
      "avg_end: \n\t"
      "EMMS \n\t" // end MMX; prep for poss. FP instrs.
#ifdef __PIC__
      "popl %%ebx \n\t" // restore index to Global Offset Table
#endif
      : "=c" (dummy_value_c), // output regs (dummy)
        "=S" (dummy_value_S),
        "=D" (dummy_value_D)
      : "0" (bpp), // ecx // input regs
        "1" (prev_row), // esi
        "2" (row) // edi
      : "%eax", "%edx" // clobber list
#ifndef __PIC__
      , "%ebx"
#endif
   );
} /* end png_read_filter_row_mmx_avg() */
#endif

#ifdef PNG_THREAD_UNSAFE_OK
//===========================================================================//
//                                                                           //
//         P N G _ R E A D _ F I L T E R _ R O W _ M M X _ P A E T H         //
//                                                                           //
//===========================================================================//
// Optimized code for PNG Paeth filter decoder
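// Reference-only sketch (not original code): the standard Paeth predictor
// from the PNG spec, which the assembly below evaluates branch-by-branch in
// the scalar paths and mask-by-mask in the MMX paths (name is illustrative).
#if 0
static int paeth_predictor(int a, int b, int c) // a=left, b=above, c=up-left
{
   int p  = a + b - c;              // initial estimate
   int pa = (p > a)? p - a : a - p; // distance of the estimate to each
   int pb = (p > b)? p - b : b - p; //   neighbor, i.e. abs(p - n)
   int pc = (p > c)? p - c : c - p;
   if (pa <= pb && pa <= pc)        // nearest neighbor wins;
      return a;                     //   ties break in the order a, b, c
   else if (pb <= pc)
      return b;
   else
      return c;
}
#endif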
static void /* PRIVATE */
png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
                              png_bytep prev_row)
{
   int bpp;
   int dummy_value_c; // fix 'forbidden register 2 (cx) was spilled' error
   int dummy_value_S;
   int dummy_value_D;
   bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
   _FullLength = row_info->rowbytes; // # of bytes to filter
   __asm__ __volatile__ (
#ifdef __PIC__
      "pushl %%ebx \n\t" // save index to Global Offset Table
#endif
      "xorl %%ebx, %%ebx \n\t" // ebx: x offset
      //pre "movl row, %%edi \n\t"
      "xorl %%edx, %%edx \n\t" // edx: x-bpp offset
      //pre "movl prev_row, %%esi \n\t"
      "xorl %%eax, %%eax \n\t"
      // Compute the Raw value for the first bpp bytes
      // Note: the formula works out to be always
      //       Paeth(x) = Raw(x) + Prior(x) where x < bpp
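      // (reasoning, not original comment: for x < bpp both a = Raw(x-bpp)
      // and c = Prior(x-bpp) are defined as 0, so p = b and the predictor
      // always reduces to b = Prior(x))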
      "paeth_rlp: \n\t"
      "movb (%%edi,%%ebx,), %%al \n\t"
      "addb (%%esi,%%ebx,), %%al \n\t"
      "incl %%ebx \n\t"
      //pre "cmpl bpp, %%ebx \n\t" // (bpp is preloaded into ecx)
      "cmpl %%ecx, %%ebx \n\t"
      "movb %%al, -1(%%edi,%%ebx,) \n\t"
      "jb paeth_rlp \n\t"
      // get # of bytes to alignment
      "movl %%edi, _dif \n\t" // take start of row
      "addl %%ebx, _dif \n\t" // add bpp
      "xorl %%ecx, %%ecx \n\t"
      "addl $0xf, _dif \n\t" // add 7 + 8 to incr past alignment boundary
      "andl $0xfffffff8, _dif \n\t" // mask to alignment boundary
      "subl %%edi, _dif \n\t" // subtract from start ==> value ebx at alignment
      "jz paeth_go \n\t"
      // fix alignment
      "paeth_lp1: \n\t"
      "xorl %%eax, %%eax \n\t"
      // pav = p - a = (a + b - c) - a = b - c
      "movb (%%esi,%%ebx,), %%al \n\t" // load Prior(x) into al
      "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
      "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
      "movl %%eax, _patemp \n\t" // Save pav for later use
      "xorl %%eax, %%eax \n\t"
      // pbv = p - b = (a + b - c) - b = a - c
      "movb (%%edi,%%edx,), %%al \n\t" // load Raw(x-bpp) into al
      "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
      "movl %%eax, %%ecx \n\t"
      // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
      "addl _patemp, %%eax \n\t" // pcv = pav + pbv
      // pc = abs(pcv)
      "testl $0x80000000, %%eax \n\t"
      "jz paeth_pca \n\t"
      "negl %%eax \n\t" // reverse sign of neg values
      "paeth_pca: \n\t"
      "movl %%eax, _pctemp \n\t" // save pc for later use
      // pb = abs(pbv)
      "testl $0x80000000, %%ecx \n\t"
      "jz paeth_pba \n\t"
      "negl %%ecx \n\t" // reverse sign of neg values
      "paeth_pba: \n\t"
      "movl %%ecx, _pbtemp \n\t" // save pb for later use
      // pa = abs(pav)
      "movl _patemp, %%eax \n\t"
      "testl $0x80000000, %%eax \n\t"
      "jz paeth_paa \n\t"
      "negl %%eax \n\t" // reverse sign of neg values
      "paeth_paa: \n\t"
      "movl %%eax, _patemp \n\t" // save pa for later use
      // test if pa <= pb
      "cmpl %%ecx, %%eax \n\t"
      "jna paeth_abb \n\t"
      // pa > pb; now test if pb <= pc
      "cmpl _pctemp, %%ecx \n\t"
      "jna paeth_bbc \n\t"
      // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
      "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
      "jmp paeth_paeth \n\t"
      "paeth_bbc: \n\t"
      // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
      "movb (%%esi,%%ebx,), %%cl \n\t" // load Prior(x) into cl
      "jmp paeth_paeth \n\t"
      "paeth_abb: \n\t"
      // pa <= pb; now test if pa <= pc
      "cmpl _pctemp, %%eax \n\t"
      "jna paeth_abc \n\t"
      // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
      "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
      "jmp paeth_paeth \n\t"
      "paeth_abc: \n\t"
      // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
      "movb (%%edi,%%edx,), %%cl \n\t" // load Raw(x-bpp) into cl
      "paeth_paeth: \n\t"
      "incl %%ebx \n\t"
      "incl %%edx \n\t"
      // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
      "addb %%cl, -1(%%edi,%%ebx,) \n\t"
      "cmpl _dif, %%ebx \n\t"
      "jb paeth_lp1 \n\t"
      "paeth_go: \n\t"
      "movl _FullLength, %%ecx \n\t"
      "movl %%ecx, %%eax \n\t"
      "subl %%ebx, %%eax \n\t" // subtract alignment fix
      "andl $0x00000007, %%eax \n\t" // calc bytes over mult of 8
      "subl %%eax, %%ecx \n\t" // drop over bytes from original length
      "movl %%ecx, _MMXLength \n\t"
#ifdef __PIC__
      "popl %%ebx \n\t" // restore index to Global Offset Table
#endif
      : "=c" (dummy_value_c), // output regs (dummy)
        "=S" (dummy_value_S),
        "=D" (dummy_value_D)
      : "0" (bpp), // ecx // input regs
        "1" (prev_row), // esi
        "2" (row) // edi
      : "%eax", "%edx" // clobber list
#ifndef __PIC__
      , "%ebx"
#endif
   );
   // now do the math for the rest of the row
   switch (bpp)
   {
      case 3:
      {
         _ActiveMask.use = 0x0000000000ffffffLL;
         _ActiveMaskEnd.use = 0xffff000000000000LL;
         _ShiftBpp.use = 24; // == bpp(3) * 8
         _ShiftRem.use = 40; // == 64 - 24
         __asm__ __volatile__ (
            "movl _dif, %%ecx \n\t"
            // preload "movl row, %%edi \n\t"
            // preload "movl prev_row, %%esi \n\t"
            "pxor %%mm0, %%mm0 \n\t"
            // prime the pump: load the first Raw(x-bpp) data set
            "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
            "paeth_3lp: \n\t"
            "psrlq _ShiftRem, %%mm1 \n\t" // shift last 3 bytes to 1st 3 bytes
            "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
            "punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
            "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // prep c=Prior(x-bpp) bytes
            "punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
            "psrlq _ShiftRem, %%mm3 \n\t" // shift last 3 bytes to 1st 3 bytes
            // pav = p - a = (a + b - c) - a = b - c
            "movq %%mm2, %%mm4 \n\t"
            "punpcklbw %%mm0, %%mm3 \n\t" // unpack Low bytes of c
            // pbv = p - b = (a + b - c) - b = a - c
            "movq %%mm1, %%mm5 \n\t"
            "psubw %%mm3, %%mm4 \n\t"
            "pxor %%mm7, %%mm7 \n\t"
            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
            "movq %%mm4, %%mm6 \n\t"
            "psubw %%mm3, %%mm5 \n\t"
            // pa = abs(p-a) = abs(pav)
            // pb = abs(p-b) = abs(pbv)
            // pc = abs(p-c) = abs(pcv)
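            // (note, not original: abs() below uses a mask trick -- pcmpgtw
            // builds m = 0xffff for each negative word, pand reduces it to
            // m = v or 0, and subtracting m twice gives v - 2v = -v for the
            // negative words while leaving the others untouched)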
            "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
            "paddw %%mm5, %%mm6 \n\t"
            "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
            "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
            "psubw %%mm0, %%mm4 \n\t"
            "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
            "psubw %%mm0, %%mm4 \n\t"
            "psubw %%mm7, %%mm5 \n\t"
            "pxor %%mm0, %%mm0 \n\t"
            "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
            "pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
            "psubw %%mm7, %%mm5 \n\t"
            "psubw %%mm0, %%mm6 \n\t"
            // test pa <= pb
            "movq %%mm4, %%mm7 \n\t"
            "psubw %%mm0, %%mm6 \n\t"
            "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
            "movq %%mm7, %%mm0 \n\t"
            // use mm7 mask to merge pa & pb
            "pand %%mm7, %%mm5 \n\t"
            // use mm0 mask copy to merge a & b
            "pand %%mm0, %%mm2 \n\t"
            "pandn %%mm4, %%mm7 \n\t"
            "pandn %%mm1, %%mm0 \n\t"
            "paddw %%mm5, %%mm7 \n\t"
            "paddw %%mm2, %%mm0 \n\t"
            // test ((pa <= pb)? pa:pb) <= pc
            "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
            "pxor %%mm1, %%mm1 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "pandn %%mm0, %%mm7 \n\t"
            "paddw %%mm3, %%mm7 \n\t"
            "pxor %%mm0, %%mm0 \n\t"
            "packuswb %%mm1, %%mm7 \n\t"
            "movq (%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
            "pand _ActiveMask, %%mm7 \n\t"
            "movq %%mm3, %%mm2 \n\t" // load b=Prior(x) step 1
            "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
            "punpcklbw %%mm0, %%mm3 \n\t" // unpack Low bytes of c
            "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
            "movq %%mm7, %%mm1 \n\t" // now mm1 will be used as Raw(x-bpp)
            // now do Paeth for 2nd set of bytes (3-5)
            "psrlq _ShiftBpp, %%mm2 \n\t" // load b=Prior(x) step 2
            "punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
            "pxor %%mm7, %%mm7 \n\t"
            "punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
            // pbv = p - b = (a + b - c) - b = a - c
            "movq %%mm1, %%mm5 \n\t"
            // pav = p - a = (a + b - c) - a = b - c
            "movq %%mm2, %%mm4 \n\t"
            "psubw %%mm3, %%mm5 \n\t"
            "psubw %%mm3, %%mm4 \n\t"
            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv = pbv + pav
            "movq %%mm5, %%mm6 \n\t"
            "paddw %%mm4, %%mm6 \n\t"
            // pa = abs(p-a) = abs(pav)
            // pb = abs(p-b) = abs(pbv)
            // pc = abs(p-c) = abs(pcv)
            "pcmpgtw %%mm5, %%mm0 \n\t" // create mask pbv bytes < 0
            "pcmpgtw %%mm4, %%mm7 \n\t" // create mask pav bytes < 0
            "pand %%mm5, %%mm0 \n\t" // only pbv bytes < 0 in mm0
            "pand %%mm4, %%mm7 \n\t" // only pav bytes < 0 in mm7
            "psubw %%mm0, %%mm5 \n\t"
            "psubw %%mm7, %%mm4 \n\t"
            "psubw %%mm0, %%mm5 \n\t"
            "psubw %%mm7, %%mm4 \n\t"
            "pxor %%mm0, %%mm0 \n\t"
            "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
            "pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
            "psubw %%mm0, %%mm6 \n\t"
            // test pa <= pb
            "movq %%mm4, %%mm7 \n\t"
            "psubw %%mm0, %%mm6 \n\t"
            "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
            "movq %%mm7, %%mm0 \n\t"
            // use mm7 mask to merge pa & pb
            "pand %%mm7, %%mm5 \n\t"
            // use mm0 mask copy to merge a & b
            "pand %%mm0, %%mm2 \n\t"
            "pandn %%mm4, %%mm7 \n\t"
            "pandn %%mm1, %%mm0 \n\t"
            "paddw %%mm5, %%mm7 \n\t"
            "paddw %%mm2, %%mm0 \n\t"
            // test ((pa <= pb)? pa:pb) <= pc
            "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
            "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
            "pand %%mm7, %%mm3 \n\t"
            "pandn %%mm0, %%mm7 \n\t"
            "pxor %%mm1, %%mm1 \n\t"
            "paddw %%mm3, %%mm7 \n\t"
            "pxor %%mm0, %%mm0 \n\t"
            "packuswb %%mm1, %%mm7 \n\t"
            "movq %%mm2, %%mm3 \n\t" // load c=Prior(x-bpp) step 1
            "pand _ActiveMask, %%mm7 \n\t"
            "punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
            "psllq _ShiftBpp, %%mm7 \n\t" // shift bytes to 2nd group of 3 bytes
            // pav = p - a = (a + b - c) - a = b - c
            "movq %%mm2, %%mm4 \n\t"
            "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
            "psllq _ShiftBpp, %%mm3 \n\t" // load c=Prior(x-bpp) step 2
            "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
            "movq %%mm7, %%mm1 \n\t"
            "punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
            "psllq _ShiftBpp, %%mm1 \n\t" // shift bytes
            // now mm1 will be used as Raw(x-bpp)
            // now do Paeth for 3rd, and final, set of bytes (6-7)
            "pxor %%mm7, %%mm7 \n\t"
            "punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
            "psubw %%mm3, %%mm4 \n\t"
            // pbv = p - b = (a + b - c) - b = a - c
            "movq %%mm1, %%mm5 \n\t"
            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
            "movq %%mm4, %%mm6 \n\t"
            "psubw %%mm3, %%mm5 \n\t"
            "pxor %%mm0, %%mm0 \n\t"
            "paddw %%mm5, %%mm6 \n\t"
            // pa = abs(p-a) = abs(pav)
            // pb = abs(p-b) = abs(pbv)
            // pc = abs(p-c) = abs(pcv)
            "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
            "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
            "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
            "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
            "psubw %%mm0, %%mm4 \n\t"
            "psubw %%mm7, %%mm5 \n\t"
            "psubw %%mm0, %%mm4 \n\t"
            "psubw %%mm7, %%mm5 \n\t"
            "pxor %%mm0, %%mm0 \n\t"
            "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
            "pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
            "psubw %%mm0, %%mm6 \n\t"
            // test pa <= pb
            "movq %%mm4, %%mm7 \n\t"
            "psubw %%mm0, %%mm6 \n\t"
            "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
            "movq %%mm7, %%mm0 \n\t"
            // use mm0 mask copy to merge a & b
            "pand %%mm0, %%mm2 \n\t"
            // use mm7 mask to merge pa & pb
            "pand %%mm7, %%mm5 \n\t"
            "pandn %%mm1, %%mm0 \n\t"
            "pandn %%mm4, %%mm7 \n\t"
            "paddw %%mm2, %%mm0 \n\t"
            "paddw %%mm5, %%mm7 \n\t"
            // test ((pa <= pb)? pa:pb) <= pc
            "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
            "pand %%mm7, %%mm3 \n\t"
            "pandn %%mm0, %%mm7 \n\t"
            "paddw %%mm3, %%mm7 \n\t"
            "pxor %%mm1, %%mm1 \n\t"
            "packuswb %%mm7, %%mm1 \n\t"
            // step ecx to next set of 8 bytes and repeat loop til done
            "addl $8, %%ecx \n\t"
            "pand _ActiveMaskEnd, %%mm1 \n\t"
            "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
            "cmpl _MMXLength, %%ecx \n\t"
            "pxor %%mm0, %%mm0 \n\t" // pxor does not affect flags
            "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
            // mm1 will be used as Raw(x-bpp) next loop
            // mm3 ready to be used as Prior(x-bpp) next loop
            "jb paeth_3lp \n\t"
            : "=S" (dummy_value_S), // output regs (dummy)
              "=D" (dummy_value_D)
            : "0" (prev_row), // esi // input regs
              "1" (row) // edi
            : "%ecx" // clobber list
#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
            , "%mm0", "%mm1", "%mm2", "%mm3"
            , "%mm4", "%mm5", "%mm6", "%mm7"
#endif
         );
      }
      break; // end 3 bpp
      case 6:
      //case 7: // GRR BOGUS
      //case 5: // GRR BOGUS
      {
         _ActiveMask.use = 0x00000000ffffffffLL;
         _ActiveMask2.use = 0xffffffff00000000LL;
         _ShiftBpp.use = bpp << 3; // == bpp * 8
         _ShiftRem.use = 64 - _ShiftBpp.use;
         __asm__ __volatile__ (
            "movl _dif, %%ecx \n\t"
            // preload "movl row, %%edi \n\t"
            // preload "movl prev_row, %%esi \n\t"
            // prime the pump: load the first Raw(x-bpp) data set
            "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
            "pxor %%mm0, %%mm0 \n\t"
            "paeth_6lp: \n\t"
            // must shift to position Raw(x-bpp) data
            "psrlq _ShiftRem, %%mm1 \n\t"
            // do first set of 4 bytes
            "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
            "punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
            "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
            "punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
            // must shift to position Prior(x-bpp) data
            "psrlq _ShiftRem, %%mm3 \n\t"
            // pav = p - a = (a + b - c) - a = b - c
            "movq %%mm2, %%mm4 \n\t"
            "punpcklbw %%mm0, %%mm3 \n\t" // unpack Low bytes of c
            // pbv = p - b = (a + b - c) - b = a - c
            "movq %%mm1, %%mm5 \n\t"
            "psubw %%mm3, %%mm4 \n\t"
            "pxor %%mm7, %%mm7 \n\t"
            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
            "movq %%mm4, %%mm6 \n\t"
            "psubw %%mm3, %%mm5 \n\t"
            // pa = abs(p-a) = abs(pav)
            // pb = abs(p-b) = abs(pbv)
            // pc = abs(p-c) = abs(pcv)
            "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
            "paddw %%mm5, %%mm6 \n\t"
            "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
            "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
            "psubw %%mm0, %%mm4 \n\t"
            "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
            "psubw %%mm0, %%mm4 \n\t"
            "psubw %%mm7, %%mm5 \n\t"
            "pxor %%mm0, %%mm0 \n\t"
            "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
            "pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
            "psubw %%mm7, %%mm5 \n\t"
            "psubw %%mm0, %%mm6 \n\t"
            // test pa <= pb
            "movq %%mm4, %%mm7 \n\t"
            "psubw %%mm0, %%mm6 \n\t"
            "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
            "movq %%mm7, %%mm0 \n\t"
            // use mm7 mask to merge pa & pb
            "pand %%mm7, %%mm5 \n\t"
            // use mm0 mask copy to merge a & b
            "pand %%mm0, %%mm2 \n\t"
            "pandn %%mm4, %%mm7 \n\t"
            "pandn %%mm1, %%mm0 \n\t"
            "paddw %%mm5, %%mm7 \n\t"
            "paddw %%mm2, %%mm0 \n\t"
            // test ((pa <= pb)? pa:pb) <= pc
            "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
            "pxor %%mm1, %%mm1 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "pandn %%mm0, %%mm7 \n\t"
            "paddw %%mm3, %%mm7 \n\t"
            "pxor %%mm0, %%mm0 \n\t"
            "packuswb %%mm1, %%mm7 \n\t"
            "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
            "pand _ActiveMask, %%mm7 \n\t"
            "psrlq _ShiftRem, %%mm3 \n\t"
            "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x) step 1
            "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor and Raw(x)
            "movq %%mm2, %%mm6 \n\t"
            "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
            "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
            "psllq _ShiftBpp, %%mm6 \n\t"
            "movq %%mm7, %%mm5 \n\t"
            "psrlq _ShiftRem, %%mm1 \n\t"
            "por %%mm6, %%mm3 \n\t"
            "psllq _ShiftBpp, %%mm5 \n\t"
            "punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
            "por %%mm5, %%mm1 \n\t"
            // do second set of 4 bytes
            "punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
            "punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
            // pav = p - a = (a + b - c) - a = b - c
            "movq %%mm2, %%mm4 \n\t"
            // pbv = p - b = (a + b - c) - b = a - c
            "movq %%mm1, %%mm5 \n\t"
            "psubw %%mm3, %%mm4 \n\t"
            "pxor %%mm7, %%mm7 \n\t"
            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
            "movq %%mm4, %%mm6 \n\t"
            "psubw %%mm3, %%mm5 \n\t"
            // pa = abs(p-a) = abs(pav)
            // pb = abs(p-b) = abs(pbv)
            // pc = abs(p-c) = abs(pcv)
            "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
            "paddw %%mm5, %%mm6 \n\t"
            "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
            "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
            "psubw %%mm0, %%mm4 \n\t"
            "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
            "psubw %%mm0, %%mm4 \n\t"
            "psubw %%mm7, %%mm5 \n\t"
            "pxor %%mm0, %%mm0 \n\t"
            "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
            "pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
            "psubw %%mm7, %%mm5 \n\t"
            "psubw %%mm0, %%mm6 \n\t"
            // test pa <= pb
            "movq %%mm4, %%mm7 \n\t"
            "psubw %%mm0, %%mm6 \n\t"
            "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
            "movq %%mm7, %%mm0 \n\t"
            // use mm7 mask to merge pa & pb
            "pand %%mm7, %%mm5 \n\t"
            // use mm0 mask copy to merge a & b
            "pand %%mm0, %%mm2 \n\t"
            "pandn %%mm4, %%mm7 \n\t"
            "pandn %%mm1, %%mm0 \n\t"
            "paddw %%mm5, %%mm7 \n\t"
            "paddw %%mm2, %%mm0 \n\t"
            // test ((pa <= pb)? pa:pb) <= pc
            "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
            "pxor %%mm1, %%mm1 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "pandn %%mm0, %%mm7 \n\t"
            "pxor %%mm1, %%mm1 \n\t"
            "paddw %%mm3, %%mm7 \n\t"
            "pxor %%mm0, %%mm0 \n\t"
            // step ecx to next set of 8 bytes and repeat loop til done
            "addl $8, %%ecx \n\t"
            "packuswb %%mm7, %%mm1 \n\t"
            "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
            "cmpl _MMXLength, %%ecx \n\t"
            "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
            // mm1 will be used as Raw(x-bpp) next loop
            "jb paeth_6lp \n\t"
            : "=S" (dummy_value_S), // output regs (dummy)
              "=D" (dummy_value_D)
            : "0" (prev_row), // esi // input regs
              "1" (row) // edi
            : "%ecx" // clobber list
#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
            , "%mm0", "%mm1", "%mm2", "%mm3"
            , "%mm4", "%mm5", "%mm6", "%mm7"
#endif
         );
      }
      break; // end 6 bpp
      case 4:
      {
         _ActiveMask.use = 0x00000000ffffffffLL;
         __asm__ __volatile__ (
            "movl _dif, %%ecx \n\t"
            // preload "movl row, %%edi \n\t"
            // preload "movl prev_row, %%esi \n\t"
            "pxor %%mm0, %%mm0 \n\t"
            // prime the pump: load the first Raw(x-bpp) data set
            "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // only time should need to read a=Raw(x-bpp) bytes
            "paeth_4lp: \n\t"
            // do first set of 4 bytes
            "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
            "punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
            "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
            "punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
            // pav = p - a = (a + b - c) - a = b - c
            "movq %%mm2, %%mm4 \n\t"
            "punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
            // pbv = p - b = (a + b - c) - b = a - c
            "movq %%mm1, %%mm5 \n\t"
            "psubw %%mm3, %%mm4 \n\t"
            "pxor %%mm7, %%mm7 \n\t"
            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
            "movq %%mm4, %%mm6 \n\t"
            "psubw %%mm3, %%mm5 \n\t"
            // pa = abs(p-a) = abs(pav)
            // pb = abs(p-b) = abs(pbv)
            // pc = abs(p-c) = abs(pcv)
            "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
            "paddw %%mm5, %%mm6 \n\t"
            "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
            "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
            "psubw %%mm0, %%mm4 \n\t"
            "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
            "psubw %%mm0, %%mm4 \n\t"
            "psubw %%mm7, %%mm5 \n\t"
            "pxor %%mm0, %%mm0 \n\t"
            "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
            "pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
            "psubw %%mm7, %%mm5 \n\t"
            "psubw %%mm0, %%mm6 \n\t"
            // test pa <= pb
            "movq %%mm4, %%mm7 \n\t"
            "psubw %%mm0, %%mm6 \n\t"
            "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
            "movq %%mm7, %%mm0 \n\t"
            // use mm7 mask to merge pa & pb
            "pand %%mm7, %%mm5 \n\t"
            // use mm0 mask copy to merge a & b
            "pand %%mm0, %%mm2 \n\t"
            "pandn %%mm4, %%mm7 \n\t"
            "pandn %%mm1, %%mm0 \n\t"
            "paddw %%mm5, %%mm7 \n\t"
            "paddw %%mm2, %%mm0 \n\t"
            // test ((pa <= pb)? pa:pb) <= pc
            "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
            "pxor %%mm1, %%mm1 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "pandn %%mm0, %%mm7 \n\t"
            "paddw %%mm3, %%mm7 \n\t"
            "pxor %%mm0, %%mm0 \n\t"
            "packuswb %%mm1, %%mm7 \n\t"
            "movq (%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
            "pand _ActiveMask, %%mm7 \n\t"
            "movq %%mm3, %%mm2 \n\t" // load b=Prior(x) step 1
            "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
            "punpcklbw %%mm0, %%mm3 \n\t" // unpack Low bytes of c
            "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
            "movq %%mm7, %%mm1 \n\t" // now mm1 will be used as Raw(x-bpp)
            // do second set of 4 bytes
            "punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
            "punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
            // pav = p - a = (a + b - c) - a = b - c
            "movq %%mm2, %%mm4 \n\t"
            // pbv = p - b = (a + b - c) - b = a - c
            "movq %%mm1, %%mm5 \n\t"
            "psubw %%mm3, %%mm4 \n\t"
            "pxor %%mm7, %%mm7 \n\t"
            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
            "movq %%mm4, %%mm6 \n\t"
            "psubw %%mm3, %%mm5 \n\t"
            // pa = abs(p-a) = abs(pav)
            // pb = abs(p-b) = abs(pbv)
            // pc = abs(p-c) = abs(pcv)
            "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
            "paddw %%mm5, %%mm6 \n\t"
            "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
            "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
            "psubw %%mm0, %%mm4 \n\t"
            "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
            "psubw %%mm0, %%mm4 \n\t"
            "psubw %%mm7, %%mm5 \n\t"
            "pxor %%mm0, %%mm0 \n\t"
            "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
            "pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
            "psubw %%mm7, %%mm5 \n\t"
            "psubw %%mm0, %%mm6 \n\t"
            // test pa <= pb
            "movq %%mm4, %%mm7 \n\t"
            "psubw %%mm0, %%mm6 \n\t"
            "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
            "movq %%mm7, %%mm0 \n\t"
            // use mm7 mask to merge pa & pb
            "pand %%mm7, %%mm5 \n\t"
            // use mm0 mask copy to merge a & b
            "pand %%mm0, %%mm2 \n\t"
            "pandn %%mm4, %%mm7 \n\t"
            "pandn %%mm1, %%mm0 \n\t"
            "paddw %%mm5, %%mm7 \n\t"
            "paddw %%mm2, %%mm0 \n\t"
            // test ((pa <= pb)? pa:pb) <= pc
            "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
            "pxor %%mm1, %%mm1 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "pandn %%mm0, %%mm7 \n\t"
            "pxor %%mm1, %%mm1 \n\t"
            "paddw %%mm3, %%mm7 \n\t"
            "pxor %%mm0, %%mm0 \n\t"
            // step ecx to next set of 8 bytes and repeat loop til done
            "addl $8, %%ecx \n\t"
            "packuswb %%mm7, %%mm1 \n\t"
            "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add predictor with Raw(x)
            "cmpl _MMXLength, %%ecx \n\t"
            "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
            // mm1 will be used as Raw(x-bpp) next loop
            "jb paeth_4lp \n\t"
            : "=S" (dummy_value_S), // output regs (dummy)
              "=D" (dummy_value_D)
            : "0" (prev_row), // esi // input regs
              "1" (row) // edi
            : "%ecx" // clobber list
#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
            , "%mm0", "%mm1", "%mm2", "%mm3"
            , "%mm4", "%mm5", "%mm6", "%mm7"
#endif
         );
      }
      break; // end 4 bpp
      case 8: // bpp == 8
      {
         _ActiveMask.use = 0x00000000ffffffffLL;
         __asm__ __volatile__ (
            "movl _dif, %%ecx \n\t"
            // preload "movl row, %%edi \n\t"
            // preload "movl prev_row, %%esi \n\t"
            "pxor %%mm0, %%mm0 \n\t"
            // prime the pump: load the first Raw(x-bpp) data set
            "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // only time should need to read a=Raw(x-bpp) bytes
            "paeth_8lp: \n\t"
            // do first set of 4 bytes
            "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
            "punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
            "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
            "punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
            // pav = p - a = (a + b - c) - a = b - c
            "movq %%mm2, %%mm4 \n\t"
            "punpcklbw %%mm0, %%mm3 \n\t" // unpack Low bytes of c
            // pbv = p - b = (a + b - c) - b = a - c
            "movq %%mm1, %%mm5 \n\t"
            "psubw %%mm3, %%mm4 \n\t"
            "pxor %%mm7, %%mm7 \n\t"
            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
            "movq %%mm4, %%mm6 \n\t"
            "psubw %%mm3, %%mm5 \n\t"
            // pa = abs(p-a) = abs(pav)
            // pb = abs(p-b) = abs(pbv)
            // pc = abs(p-c) = abs(pcv)
            "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
            "paddw %%mm5, %%mm6 \n\t"
            "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
            "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
            "psubw %%mm0, %%mm4 \n\t"
            "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
            "psubw %%mm0, %%mm4 \n\t"
            "psubw %%mm7, %%mm5 \n\t"
            "pxor %%mm0, %%mm0 \n\t"
            "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
            "pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
            "psubw %%mm7, %%mm5 \n\t"
            "psubw %%mm0, %%mm6 \n\t"
            // test pa <= pb
            "movq %%mm4, %%mm7 \n\t"
            "psubw %%mm0, %%mm6 \n\t"
            "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
            "movq %%mm7, %%mm0 \n\t"
            // use mm7 mask to merge pa & pb
            "pand %%mm7, %%mm5 \n\t"
            // use mm0 mask copy to merge a & b
            "pand %%mm0, %%mm2 \n\t"
            "pandn %%mm4, %%mm7 \n\t"
            "pandn %%mm1, %%mm0 \n\t"
            "paddw %%mm5, %%mm7 \n\t"
            "paddw %%mm2, %%mm0 \n\t"
            // test ((pa <= pb)? pa:pb) <= pc
            "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
            "pxor %%mm1, %%mm1 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "pandn %%mm0, %%mm7 \n\t"
            "paddw %%mm3, %%mm7 \n\t"
            "pxor %%mm0, %%mm0 \n\t"
            "packuswb %%mm1, %%mm7 \n\t"
            "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
            "pand _ActiveMask, %%mm7 \n\t"
            "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
            "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
            "punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
            "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
            "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // read a=Raw(x-bpp) bytes
            // do second set of 4 bytes
            "punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
            "punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
            // pav = p - a = (a + b - c) - a = b - c
            "movq %%mm2, %%mm4 \n\t"
            // pbv = p - b = (a + b - c) - b = a - c
            "movq %%mm1, %%mm5 \n\t"
            "psubw %%mm3, %%mm4 \n\t"
            "pxor %%mm7, %%mm7 \n\t"
            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
            "movq %%mm4, %%mm6 \n\t"
            "psubw %%mm3, %%mm5 \n\t"
            // pa = abs(p-a) = abs(pav)
            // pb = abs(p-b) = abs(pbv)
            // pc = abs(p-c) = abs(pcv)
            "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
            "paddw %%mm5, %%mm6 \n\t"
            "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
            "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
            "psubw %%mm0, %%mm4 \n\t"
            "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
            "psubw %%mm0, %%mm4 \n\t"
            "psubw %%mm7, %%mm5 \n\t"
            "pxor %%mm0, %%mm0 \n\t"
            "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
            "pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
            "psubw %%mm7, %%mm5 \n\t"
            "psubw %%mm0, %%mm6 \n\t"
            // test pa <= pb
            "movq %%mm4, %%mm7 \n\t"
            "psubw %%mm0, %%mm6 \n\t"
            "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
            "movq %%mm7, %%mm0 \n\t"
            // use mm7 mask to merge pa & pb
            "pand %%mm7, %%mm5 \n\t"
            // use mm0 mask copy to merge a & b
            "pand %%mm0, %%mm2 \n\t"
            "pandn %%mm4, %%mm7 \n\t"
            "pandn %%mm1, %%mm0 \n\t"
            "paddw %%mm5, %%mm7 \n\t"
            "paddw %%mm2, %%mm0 \n\t"
            // test ((pa <= pb)? pa:pb) <= pc
            "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
            "pxor %%mm1, %%mm1 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "pandn %%mm0, %%mm7 \n\t"
            "pxor %%mm1, %%mm1 \n\t"
            "paddw %%mm3, %%mm7 \n\t"
            "pxor %%mm0, %%mm0 \n\t"
            // step ecx to next set of 8 bytes and repeat loop til done
            "addl $8, %%ecx \n\t"
            "packuswb %%mm7, %%mm1 \n\t"
            "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
            "cmpl _MMXLength, %%ecx \n\t"
            "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
            // mm1 will be used as Raw(x-bpp) next loop
            "jb paeth_8lp \n\t"
            : "=S" (dummy_value_S), // output regs (dummy)
              "=D" (dummy_value_D)
            : "0" (prev_row), // esi // input regs
              "1" (row) // edi
            : "%ecx" // clobber list
#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
            , "%mm0", "%mm1", "%mm2", "%mm3"
            , "%mm4", "%mm5", "%mm6", "%mm7"
#endif
         );
      }
      break; // end 8 bpp
      case 1: // bpp = 1
      case 2: // bpp = 2
      default: // bpp > 8
      {
         __asm__ __volatile__ (
#ifdef __PIC__
            "pushl %%ebx \n\t" // save Global Offset Table index
#endif
            "movl _dif, %%ebx \n\t"
            "cmpl _FullLength, %%ebx \n\t"
            "jnb paeth_dend \n\t"
// preload  "movl row, %%edi \n\t"
// preload  "movl prev_row, %%esi \n\t"
            // do Paeth decode for remaining bytes
            "movl %%ebx, %%edx \n\t"
// preload  "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
            "subl %%ecx, %%edx \n\t" // edx = ebx - bpp
            "xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx

            "paeth_dlp: \n\t"
            "xorl %%eax, %%eax \n\t"
            // pav = p - a = (a + b - c) - a = b - c
            "movb (%%esi,%%ebx,), %%al \n\t" // load Prior(x) into al
            "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
            "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
            "movl %%eax, _patemp \n\t" // Save pav for later use
            "xorl %%eax, %%eax \n\t"
            // pbv = p - b = (a + b - c) - b = a - c
            "movb (%%edi,%%edx,), %%al \n\t" // load Raw(x-bpp) into al
            "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
            "movl %%eax, %%ecx \n\t"
            // pcv = p - c = (a + b - c) - c = (a - c) + (b - c) = pav + pbv
            "addl _patemp, %%eax \n\t" // pcv = pav + pbv
            // pc = abs(pcv)
            "testl $0x80000000, %%eax \n\t"
            "jz paeth_dpca \n\t"
            "negl %%eax \n\t" // reverse sign of neg values

            "paeth_dpca: \n\t"
            "movl %%eax, _pctemp \n\t" // save pc for later use
            // pb = abs(pbv)
            "testl $0x80000000, %%ecx \n\t"
            "jz paeth_dpba \n\t"
            "negl %%ecx \n\t" // reverse sign of neg values

            "paeth_dpba: \n\t"
            "movl %%ecx, _pbtemp \n\t" // save pb for later use
            // pa = abs(pav)
            "movl _patemp, %%eax \n\t"
            "testl $0x80000000, %%eax \n\t"
            "jz paeth_dpaa \n\t"
            "negl %%eax \n\t" // reverse sign of neg values

            "paeth_dpaa: \n\t"
            "movl %%eax, _patemp \n\t" // save pa for later use
            // test if pa <= pb
            "cmpl %%ecx, %%eax \n\t"
            "jna paeth_dabb \n\t"
            // pa > pb; now test if pb <= pc
            "cmpl _pctemp, %%ecx \n\t"
            "jna paeth_dbbc \n\t"
            // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
            "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
            "jmp paeth_dpaeth \n\t"

            "paeth_dbbc: \n\t"
            // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
            "movb (%%esi,%%ebx,), %%cl \n\t" // load Prior(x) into cl
            "jmp paeth_dpaeth \n\t"

            "paeth_dabb: \n\t"
            // pa <= pb; now test if pa <= pc
            "cmpl _pctemp, %%eax \n\t"
            "jna paeth_dabc \n\t"
            // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
            "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
            "jmp paeth_dpaeth \n\t"

            "paeth_dabc: \n\t"
            // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
            "movb (%%edi,%%edx,), %%cl \n\t" // load Raw(x-bpp) into cl

            "paeth_dpaeth: \n\t"
            "incl %%ebx \n\t"
            "incl %%edx \n\t"
            // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
            "addb %%cl, -1(%%edi,%%ebx,) \n\t"
            "cmpl _FullLength, %%ebx \n\t"
            "jb paeth_dlp \n\t"

            "paeth_dend: \n\t"
#ifdef __PIC__
            "popl %%ebx \n\t" // index to Global Offset Table
#endif
            : "=c" (dummy_value_c), // output regs (dummy)
              "=S" (dummy_value_S),
              "=D" (dummy_value_D)

            : "0" (bpp), // ecx // input regs
              "1" (prev_row), // esi
              "2" (row) // edi

            : "%eax", "%edx" // clobber list
#ifndef __PIC__
            , "%ebx"
#endif
         );
      }
      return; // No need to go further with this one
   } // end switch (bpp)
   __asm__ __volatile__ (
      // MMX acceleration complete; now do clean-up
      // check if any remaining bytes left to decode
#ifdef __PIC__
      "pushl %%ebx \n\t" // save index to Global Offset Table
#endif
      "movl _MMXLength, %%ebx \n\t"
      "cmpl _FullLength, %%ebx \n\t"
      "jnb paeth_end \n\t"
//pre "movl row, %%edi \n\t"
//pre "movl prev_row, %%esi \n\t"
      // do Paeth decode for remaining bytes
      "movl %%ebx, %%edx \n\t"
//pre "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
      "subl %%ecx, %%edx \n\t" // edx = ebx - bpp
      "xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx below

      "paeth_lp2: \n\t"
      "xorl %%eax, %%eax \n\t"
      // pav = p - a = (a + b - c) - a = b - c
      "movb (%%esi,%%ebx,), %%al \n\t" // load Prior(x) into al
      "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
      "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
      "movl %%eax, _patemp \n\t" // Save pav for later use
      "xorl %%eax, %%eax \n\t"
      // pbv = p - b = (a + b - c) - b = a - c
      "movb (%%edi,%%edx,), %%al \n\t" // load Raw(x-bpp) into al
      "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
      "movl %%eax, %%ecx \n\t"
      // pcv = p - c = (a + b - c) - c = (a - c) + (b - c) = pav + pbv
      "addl _patemp, %%eax \n\t" // pcv = pav + pbv
      // pc = abs(pcv)
      "testl $0x80000000, %%eax \n\t"
      "jz paeth_pca2 \n\t"
      "negl %%eax \n\t" // reverse sign of neg values

      "paeth_pca2: \n\t"
      "movl %%eax, _pctemp \n\t" // save pc for later use
      // pb = abs(pbv)
      "testl $0x80000000, %%ecx \n\t"
      "jz paeth_pba2 \n\t"
      "negl %%ecx \n\t" // reverse sign of neg values

      "paeth_pba2: \n\t"
      "movl %%ecx, _pbtemp \n\t" // save pb for later use
      // pa = abs(pav)
      "movl _patemp, %%eax \n\t"
      "testl $0x80000000, %%eax \n\t"
      "jz paeth_paa2 \n\t"
      "negl %%eax \n\t" // reverse sign of neg values

      "paeth_paa2: \n\t"
      "movl %%eax, _patemp \n\t" // save pa for later use
      // test if pa <= pb
      "cmpl %%ecx, %%eax \n\t"
      "jna paeth_abb2 \n\t"
      // pa > pb; now test if pb <= pc
      "cmpl _pctemp, %%ecx \n\t"
      "jna paeth_bbc2 \n\t"
      // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
      "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
      "jmp paeth_paeth2 \n\t"

      "paeth_bbc2: \n\t"
      // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
      "movb (%%esi,%%ebx,), %%cl \n\t" // load Prior(x) into cl
      "jmp paeth_paeth2 \n\t"

      "paeth_abb2: \n\t"
      // pa <= pb; now test if pa <= pc
      "cmpl _pctemp, %%eax \n\t"
      "jna paeth_abc2 \n\t"
      // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
      "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
      "jmp paeth_paeth2 \n\t"

      "paeth_abc2: \n\t"
      // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
      "movb (%%edi,%%edx,), %%cl \n\t" // load Raw(x-bpp) into cl

      "paeth_paeth2: \n\t"
      "incl %%ebx \n\t"
      "incl %%edx \n\t"
      // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
      "addb %%cl, -1(%%edi,%%ebx,) \n\t"
      "cmpl _FullLength, %%ebx \n\t"
      "jb paeth_lp2 \n\t"

      "paeth_end: \n\t"
      "EMMS \n\t" // end MMX; prep for poss. FP instrs.
#ifdef __PIC__
      "popl %%ebx \n\t" // restore index to Global Offset Table
#endif
      : "=c" (dummy_value_c), // output regs (dummy)
        "=S" (dummy_value_S),
        "=D" (dummy_value_D)

      : "0" (bpp), // ecx // input regs
        "1" (prev_row), // esi
        "2" (row) // edi

      : "%eax", "%edx" // clobber list (no input regs!)
#ifndef __PIC__
      , "%ebx"
#endif
   );
} /* end png_read_filter_row_mmx_paeth() */
#endif /* PNG_THREAD_UNSAFE_OK */
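
// For reference, a minimal C sketch of the per-byte Paeth decode that the
// asm above implements; compiled out via #if 0, and the helper name
// paeth_predictor_c is illustrative, not part of libpng's API.
#if 0
static png_byte
paeth_predictor_c(png_byte a, png_byte b, png_byte c)
{
   /* a = Raw(x-bpp), b = Prior(x), c = Prior(x-bpp) */
   int p  = (int)b - (int)c;                 /* pav = p - a = b - c */
   int q  = (int)a - (int)c;                 /* pbv = p - b = a - c */
   int pa = (p < 0) ? -p : p;                /* pa = abs(pav)       */
   int pb = (q < 0) ? -q : q;                /* pb = abs(pbv)       */
   int pc = (p + q < 0) ? -(p + q) : p + q;  /* pc = abs(pav + pbv) */

   if (pa <= pb && pa <= pc)
      return a;    /* the "paeth_dabc" path: add Raw(x-bpp)   */
   else if (pb <= pc)
      return b;    /* the "paeth_dbbc" path: add Prior(x)     */
   else
      return c;    /* the fall-through path: add Prior(x-bpp) */
}
/* Raw(x) = (Raw(x) + paeth_predictor_c(a, b, c)) mod 256 -- the
 * "addb %%cl, -1(%%edi,%%ebx,)" step.  The MMX cases reach the same
 * selection without branches: pcmpgtw builds all-ones/all-zeros word
 * masks from the pa/pb/pc comparisons, and pand/pandn/paddw merge the
 * candidate predictors under those masks. */
#endif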
#ifdef PNG_THREAD_UNSAFE_OK
//===========================================================================//
//                                                                           //
//           P N G _ R E A D _ F I L T E R _ R O W _ M M X _ S U B           //
//                                                                           //
//===========================================================================//

// Optimized code for PNG Sub filter decoder
static void /* PRIVATE */
png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
{
   int bpp;
   int dummy_value_a;
   int dummy_value_D;

   bpp = (row_info->pixel_depth + 7) >> 3; // calc number of bytes per pixel
   _FullLength = row_info->rowbytes - bpp; // number of bytes to filter

   __asm__ __volatile__ (
//pre "movl row, %%edi \n\t"
      "movl %%edi, %%esi \n\t" // lp = row
//pre "movl bpp, %%eax \n\t"
      "addl %%eax, %%edi \n\t" // rp = row + bpp
//irr "xorl %%eax, %%eax \n\t"
      // get # of bytes to alignment
      "movl %%edi, _dif \n\t" // take start of row
      "addl $0xf, _dif \n\t" // add 7 + 8 to incr past
                             //  alignment boundary
      "xorl %%ecx, %%ecx \n\t"
      "andl $0xfffffff8, _dif \n\t" // mask to alignment boundary
      "subl %%edi, _dif \n\t" // subtract from start ==> value
      "jz sub_go \n\t"        //  ecx at alignment

      "sub_lp1: \n\t" // fix alignment
      "movb (%%esi,%%ecx,), %%al \n\t"
      "addb %%al, (%%edi,%%ecx,) \n\t"
      "incl %%ecx \n\t"
      "cmpl _dif, %%ecx \n\t"
      "jb sub_lp1 \n\t"

      "sub_go: \n\t"
      "movl _FullLength, %%eax \n\t"
      "movl %%eax, %%edx \n\t"
      "subl %%ecx, %%edx \n\t" // subtract alignment fix
      "andl $0x00000007, %%edx \n\t" // calc bytes over mult of 8
      "subl %%edx, %%eax \n\t" // drop over bytes from length
      "movl %%eax, _MMXLength \n\t"

      : "=a" (dummy_value_a), // 0 // output regs (dummy)
        "=D" (dummy_value_D) // 1

      : "0" (bpp), // eax // input regs
        "1" (row) // edi

      : "%esi", "%ecx", "%edx" // clobber list
#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
      , "%mm0", "%mm1", "%mm2", "%mm3"
      , "%mm4", "%mm5", "%mm6", "%mm7"
#endif
   );
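
   // In C terms, the setup block above computes the two globals used by the
   // MMX loops below -- a sketch only (the helper name sub_setup_c is
   // illustrative, not part of libpng):
#if 0
   static void
   sub_setup_c(png_bytep row, int bpp)
   {
      png_bytep rp = row + bpp;  /* first byte to be filtered */

      /* (rp + 15) rounded down to a multiple of 8, minus rp: 8..15 bytes,
       * so the priming "movq -8(%%edi,%%edx,)" loads stay inside the row
       * and the quadword loops start on an 8-byte boundary */
      _dif = (png_uint_32)((((size_t)rp + 0xf) & ~(size_t)7) - (size_t)rp);

      /* trim the tail so the MMX loops stop on a multiple of 8 bytes */
      _MMXLength = _FullLength - ((_FullLength - _dif) & 7);

      /* bytes [0, _dif) are handled singly in sub_lp1 above; bytes
       * [_dif, _MMXLength) by the MMX cases below; and the remainder
       * [_MMXLength, _FullLength) by sub_lp2 in the final cleanup block */
   }
#endif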
   // now do the math for the rest of the row
   switch (bpp)
   {
      case 3:
      {
         _ActiveMask.use = 0x0000ffffff000000LL;
         _ShiftBpp.use = 24; // == 3 * 8
         _ShiftRem.use = 40; // == 64 - 24

         __asm__ __volatile__ (
// preload  "movl row, %%edi \n\t"
            "movq _ActiveMask, %%mm7 \n\t" // load _ActiveMask for 2nd
                                           //  active byte group
            "movl %%edi, %%esi \n\t" // lp = row
// preload  "movl bpp, %%eax \n\t"
            "addl %%eax, %%edi \n\t" // rp = row + bpp
            "movq %%mm7, %%mm6 \n\t"
            "movl _dif, %%edx \n\t"
            "psllq _ShiftBpp, %%mm6 \n\t" // move mask in mm6 to cover
                                          //  3rd active byte group
            // prime the pump: load the first Raw(x-bpp) data set
            "movq -8(%%edi,%%edx,), %%mm1 \n\t"

            "sub_3lp: \n\t" // shift data for adding first
            "psrlq _ShiftRem, %%mm1 \n\t" // bpp bytes (no need for mask;
                                          //  shift clears inactive bytes)
            // add 1st active group
            "movq (%%edi,%%edx,), %%mm0 \n\t"
            "paddb %%mm1, %%mm0 \n\t"
            // add 2nd active group
            "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
            "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
            "pand %%mm7, %%mm1 \n\t" // mask to use 2nd active group
            "paddb %%mm1, %%mm0 \n\t"
            // add 3rd active group
            "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
            "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
            "pand %%mm6, %%mm1 \n\t" // mask to use 3rd active group
            "addl $8, %%edx \n\t"
            "paddb %%mm1, %%mm0 \n\t"
            "cmpl _MMXLength, %%edx \n\t"
            "movq %%mm0, -8(%%edi,%%edx,) \n\t" // write updated Raws to array
            "movq %%mm0, %%mm1 \n\t" // prep 1st add at top of loop
            "jb sub_3lp \n\t"

            : "=a" (dummy_value_a), // 0 // output regs (dummy)
              "=D" (dummy_value_D) // 1

            : "0" (bpp), // eax // input regs
              "1" (row) // edi

            : "%edx", "%esi" // clobber list
#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
            , "%mm0", "%mm1", "%mm6", "%mm7"
#endif
         );
      }
      break;
      case 1:
      {
         __asm__ __volatile__ (
            "movl _dif, %%edx \n\t"
// preload  "movl row, %%edi \n\t"
            "cmpl _FullLength, %%edx \n\t"
            "jnb sub_1end \n\t"
            "movl %%edi, %%esi \n\t" // lp = row
// preload  "movl bpp, %%eax \n\t"
            "addl %%eax, %%edi \n\t" // rp = row + bpp
            "xorl %%eax, %%eax \n\t" // zero eax only AFTER rp is set, so the
                                     //  preloaded bpp isn't lost (al is the
                                     //  byte temp in the loop below)
            "sub_1lp: \n\t"
            "movb (%%esi,%%edx,), %%al \n\t"
            "addb %%al, (%%edi,%%edx,) \n\t"
            "incl %%edx \n\t"
            "cmpl _FullLength, %%edx \n\t"
            "jb sub_1lp \n\t"

            "sub_1end: \n\t"

            : "=a" (dummy_value_a), // 0 // output regs (dummy)
              "=D" (dummy_value_D) // 1

            : "0" (bpp), // eax // input regs
              "1" (row) // edi

            : "%edx", "%esi" // clobber list
         );
      }
      return;
      case 6:
      case 4:
      //case 7: // GRR BOGUS
      //case 5: // GRR BOGUS
      {
         _ShiftBpp.use = bpp << 3;
         _ShiftRem.use = 64 - _ShiftBpp.use;

         __asm__ __volatile__ (
// preload  "movl row, %%edi \n\t"
            "movl _dif, %%edx \n\t"
            "movl %%edi, %%esi \n\t" // lp = row
// preload  "movl bpp, %%eax \n\t"
            "addl %%eax, %%edi \n\t" // rp = row + bpp

            // prime the pump: load the first Raw(x-bpp) data set
            "movq -8(%%edi,%%edx,), %%mm1 \n\t"

            "sub_4lp: \n\t" // shift data for adding first
            "psrlq _ShiftRem, %%mm1 \n\t" // bpp bytes (no need for mask;
                                          //  shift clears inactive bytes)
            "movq (%%edi,%%edx,), %%mm0 \n\t"
            "paddb %%mm1, %%mm0 \n\t"
            // add 2nd active group
            "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
            "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
            "addl $8, %%edx \n\t"
            "paddb %%mm1, %%mm0 \n\t"
            "cmpl _MMXLength, %%edx \n\t"
            "movq %%mm0, -8(%%edi,%%edx,) \n\t"
            "movq %%mm0, %%mm1 \n\t" // prep 1st add at top of loop
            "jb sub_4lp \n\t"

            : "=a" (dummy_value_a), // 0 // output regs (dummy)
              "=D" (dummy_value_D) // 1

            : "0" (bpp), // eax // input regs
              "1" (row) // edi

            : "%edx", "%esi" // clobber list
#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
            , "%mm0", "%mm1"
#endif
         );
      }
      break;

      case 2:
      {
         _ActiveMask.use = 0x00000000ffff0000LL;
         _ShiftBpp.use = 16; // == 2 * 8
         _ShiftRem.use = 48; // == 64 - 16

         __asm__ __volatile__ (
            "movq _ActiveMask, %%mm7 \n\t" // load _ActiveMask for 2nd
                                           //  active byte group
            "movl _dif, %%edx \n\t"
            "movq %%mm7, %%mm6 \n\t"
// preload  "movl row, %%edi \n\t"
            "psllq _ShiftBpp, %%mm6 \n\t" // move mask in mm6 to cover
                                          //  3rd active byte group
            "movl %%edi, %%esi \n\t" // lp = row
            "movq %%mm6, %%mm5 \n\t"
// preload  "movl bpp, %%eax \n\t"
            "addl %%eax, %%edi \n\t" // rp = row + bpp
            "psllq _ShiftBpp, %%mm5 \n\t" // move mask in mm5 to cover
                                          //  4th active byte group
            // prime the pump: load the first Raw(x-bpp) data set
            "movq -8(%%edi,%%edx,), %%mm1 \n\t"

            "sub_2lp: \n\t" // shift data for adding first
            "psrlq _ShiftRem, %%mm1 \n\t" // bpp bytes (no need for mask;
                                          //  shift clears inactive bytes)
            // add 1st active group
            "movq (%%edi,%%edx,), %%mm0 \n\t"
            "paddb %%mm1, %%mm0 \n\t"
            // add 2nd active group
            "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
            "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
            "pand %%mm7, %%mm1 \n\t" // mask to use 2nd active group
            "paddb %%mm1, %%mm0 \n\t"
            // add 3rd active group
            "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
            "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
            "pand %%mm6, %%mm1 \n\t" // mask to use 3rd active group
            "paddb %%mm1, %%mm0 \n\t"
            // add 4th active group
            "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
            "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
            "pand %%mm5, %%mm1 \n\t" // mask to use 4th active group
            "addl $8, %%edx \n\t"
            "paddb %%mm1, %%mm0 \n\t"
            "cmpl _MMXLength, %%edx \n\t"
            "movq %%mm0, -8(%%edi,%%edx,) \n\t" // write updated Raws to array
            "movq %%mm0, %%mm1 \n\t" // prep 1st add at top of loop
            "jb sub_2lp \n\t"

            : "=a" (dummy_value_a), // 0 // output regs (dummy)
              "=D" (dummy_value_D) // 1

            : "0" (bpp), // eax // input regs
              "1" (row) // edi

            : "%edx", "%esi" // clobber list
#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
            , "%mm0", "%mm1", "%mm5", "%mm6", "%mm7"
#endif
         );
      }
      break;

      case 8:
      {
         __asm__ __volatile__ (
// preload  "movl row, %%edi \n\t"
            "movl _dif, %%edx \n\t"
            "movl %%edi, %%esi \n\t" // lp = row
// preload  "movl bpp, %%eax \n\t"
            "addl %%eax, %%edi \n\t" // rp = row + bpp
            "movl _MMXLength, %%ecx \n\t"

            // prime the pump: load the first Raw(x-bpp) data set
            "movq -8(%%edi,%%edx,), %%mm7 \n\t"
            "andl $0x0000003f, %%ecx \n\t" // calc bytes over mult of 64

            "sub_8lp: \n\t"
            "movq (%%edi,%%edx,), %%mm0 \n\t" // load Sub(x) for 1st 8 bytes
            "paddb %%mm7, %%mm0 \n\t"
            "movq 8(%%edi,%%edx,), %%mm1 \n\t" // load Sub(x) for 2nd 8 bytes
            "movq %%mm0, (%%edi,%%edx,) \n\t" // write Raw(x) for 1st 8 bytes

            // Now mm0 will be used as Raw(x-bpp) for the 2nd group of 8 bytes.
            // This will be repeated for each group of 8 bytes with the 8th
            // group being used as the Raw(x-bpp) for the 1st group of the
            // next loop.

            "paddb %%mm0, %%mm1 \n\t"
            "movq 16(%%edi,%%edx,), %%mm2 \n\t" // load Sub(x) for 3rd 8 bytes
            "movq %%mm1, 8(%%edi,%%edx,) \n\t" // write Raw(x) for 2nd 8 bytes
            "paddb %%mm1, %%mm2 \n\t"
            "movq 24(%%edi,%%edx,), %%mm3 \n\t" // load Sub(x) for 4th 8 bytes
            "movq %%mm2, 16(%%edi,%%edx,) \n\t" // write Raw(x) for 3rd 8 bytes
            "paddb %%mm2, %%mm3 \n\t"
            "movq 32(%%edi,%%edx,), %%mm4 \n\t" // load Sub(x) for 5th 8 bytes
            "movq %%mm3, 24(%%edi,%%edx,) \n\t" // write Raw(x) for 4th 8 bytes
            "paddb %%mm3, %%mm4 \n\t"
            "movq 40(%%edi,%%edx,), %%mm5 \n\t" // load Sub(x) for 6th 8 bytes
            "movq %%mm4, 32(%%edi,%%edx,) \n\t" // write Raw(x) for 5th 8 bytes
            "paddb %%mm4, %%mm5 \n\t"
            "movq 48(%%edi,%%edx,), %%mm6 \n\t" // load Sub(x) for 7th 8 bytes
            "movq %%mm5, 40(%%edi,%%edx,) \n\t" // write Raw(x) for 6th 8 bytes
            "paddb %%mm5, %%mm6 \n\t"
            "movq 56(%%edi,%%edx,), %%mm7 \n\t" // load Sub(x) for 8th 8 bytes
            "movq %%mm6, 48(%%edi,%%edx,) \n\t" // write Raw(x) for 7th 8 bytes
            "addl $64, %%edx \n\t"
            "paddb %%mm6, %%mm7 \n\t"
            "cmpl %%ecx, %%edx \n\t"
            "movq %%mm7, -8(%%edi,%%edx,) \n\t" // write Raw(x) for 8th 8 bytes
            "jb sub_8lp \n\t"

            "cmpl _MMXLength, %%edx \n\t"
            "jnb sub_8lt8 \n\t"

            "sub_8lpA: \n\t"
            "movq (%%edi,%%edx,), %%mm0 \n\t"
            "addl $8, %%edx \n\t"
            "paddb %%mm7, %%mm0 \n\t"
            "cmpl _MMXLength, %%edx \n\t"
            "movq %%mm0, -8(%%edi,%%edx,) \n\t" // -8 to offset early addl edx
  4332. "movq %%mm0, %%mm7 \n\t" // move calculated Raw(x) data
  4333. // to mm1 to be new Raw(x-bpp)
  4334. // for next loop
  4335. "jb sub_8lpA \n\t"
  4336. "sub_8lt8: \n\t"
  4337. : "=a" (dummy_value_a), // 0 // output regs (dummy)
  4338. "=D" (dummy_value_D) // 1
  4339. : "0" (bpp), // eax // input regs
  4340. "1" (row) // edi
  4341. : "%ecx", "%edx", "%esi" // clobber list
  4342. #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
  4343. , "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"
  4344. #endif
  4345. );
  4346. }
  4347. break;
  4348. default: // bpp greater than 8 bytes GRR BOGUS
  4349. {
  4350. __asm__ __volatile__ (
  4351. "movl _dif, %%edx \n\t"
  4352. // preload "movl row, %%edi \n\t"
  4353. "movl %%edi, %%esi \n\t" // lp = row
  4354. // preload "movl bpp, %%eax \n\t"
  4355. "addl %%eax, %%edi \n\t" // rp = row + bpp
  4356. "sub_Alp: \n\t"
  4357. "movq (%%edi,%%edx,), %%mm0 \n\t"
  4358. "movq (%%esi,%%edx,), %%mm1 \n\t"
  4359. "addl $8, %%edx \n\t"
  4360. "paddb %%mm1, %%mm0 \n\t"
  4361. "cmpl _MMXLength, %%edx \n\t"
  4362. "movq %%mm0, -8(%%edi,%%edx,) \n\t" // mov does not affect flags;
  4363. // -8 to offset addl edx
  4364. "jb sub_Alp \n\t"
  4365. : "=a" (dummy_value_a), // 0 // output regs (dummy)
  4366. "=D" (dummy_value_D) // 1
  4367. : "0" (bpp), // eax // input regs
  4368. "1" (row) // edi
  4369. : "%edx", "%esi" // clobber list
  4370. #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
  4371. , "%mm0", "%mm1"
  4372. #endif
  4373. );
  4374. }
  4375. break;
  4376. } // end switch (bpp)
   __asm__ __volatile__ (
      "movl _MMXLength, %%edx \n\t"
//pre "movl row, %%edi \n\t"
      "cmpl _FullLength, %%edx \n\t"
      "jnb sub_end \n\t"
      "movl %%edi, %%esi \n\t" // lp = row
//pre "movl bpp, %%eax \n\t"
      "addl %%eax, %%edi \n\t" // rp = row + bpp
      "xorl %%eax, %%eax \n\t"

      "sub_lp2: \n\t"
      "movb (%%esi,%%edx,), %%al \n\t"
      "addb %%al, (%%edi,%%edx,) \n\t"
      "incl %%edx \n\t"
      "cmpl _FullLength, %%edx \n\t"
      "jb sub_lp2 \n\t"

      "sub_end: \n\t"
      "EMMS \n\t" // end MMX instructions

      : "=a" (dummy_value_a), // 0 // output regs (dummy)
        "=D" (dummy_value_D) // 1

      : "0" (bpp), // eax // input regs
        "1" (row) // edi

      : "%edx", "%esi" // clobber list
   );
} // end of png_read_filter_row_mmx_sub()
#endif /* PNG_THREAD_UNSAFE_OK */
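
// A minimal C sketch of the Sub filter that png_read_filter_row_mmx_sub()
// accelerates; it mirrors the x86 fallback in png_read_filter_row() below
// (the function name sub_filter_c is illustrative only):
#if 0
static void
sub_filter_c(png_bytep row, png_uint_32 rowbytes, png_uint_32 bpp)
{
   png_uint_32 i;
   png_bytep rp = row + bpp;  /* Sub(x), to be replaced by Raw(x) */
   png_bytep lp = row;        /* Raw(x-bpp), already decoded      */

   /* Raw(x) = (Sub(x) + Raw(x-bpp)) mod 256; byte arithmetic wraps, so
    * the mod is free.  The serial dependency on Raw(x-bpp) is why the
    * MMX cases above must propagate sums within a quadword via
    * psllq/pand instead of adding eight independent bytes at once. */
   for (i = bpp; i < rowbytes; i++)
   {
      *rp = (png_byte)(*rp + *lp++);
      rp++;
   }
}
#endif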
//===========================================================================//
//                                                                           //
//            P N G _ R E A D _ F I L T E R _ R O W _ M M X _ U P            //
//                                                                           //
//===========================================================================//

// Optimized code for PNG Up filter decoder
static void /* PRIVATE */
png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
                           png_bytep prev_row)
{
   png_uint_32 len;
   int dummy_value_d; // fix 'forbidden register 3 (dx) was spilled' error
   int dummy_value_S;
   int dummy_value_D;

   len = row_info->rowbytes; // number of bytes to filter

   __asm__ __volatile__ (
//pre "movl row, %%edi \n\t"
      // get # of bytes to alignment
#ifdef __PIC__
      "pushl %%ebx \n\t"
#endif
      "movl %%edi, %%ecx \n\t"
      "xorl %%ebx, %%ebx \n\t"
      "addl $0x7, %%ecx \n\t"
      "xorl %%eax, %%eax \n\t"
      "andl $0xfffffff8, %%ecx \n\t"
//pre "movl prev_row, %%esi \n\t"
      "subl %%edi, %%ecx \n\t"
      "jz up_go \n\t"

      "up_lp1: \n\t" // fix alignment
      "movb (%%edi,%%ebx,), %%al \n\t"
      "addb (%%esi,%%ebx,), %%al \n\t"
      "incl %%ebx \n\t"
      "cmpl %%ecx, %%ebx \n\t"
      "movb %%al, -1(%%edi,%%ebx,) \n\t" // mov does not affect flags; -1 to
      "jb up_lp1 \n\t"                   //  offset incl ebx

      "up_go: \n\t"
//pre "movl len, %%edx \n\t"
      "movl %%edx, %%ecx \n\t"
      "subl %%ebx, %%edx \n\t" // subtract alignment fix
      "andl $0x0000003f, %%edx \n\t" // calc bytes over mult of 64
      "subl %%edx, %%ecx \n\t" // drop over bytes from length

      // unrolled loop - use all MMX registers and interleave to reduce
      // number of branch instructions (loops) and reduce partial stalls
      "up_loop: \n\t"
      "movq (%%esi,%%ebx,), %%mm1 \n\t"
      "movq (%%edi,%%ebx,), %%mm0 \n\t"
      "movq 8(%%esi,%%ebx,), %%mm3 \n\t"
      "paddb %%mm1, %%mm0 \n\t"
      "movq 8(%%edi,%%ebx,), %%mm2 \n\t"
      "movq %%mm0, (%%edi,%%ebx,) \n\t"
      "paddb %%mm3, %%mm2 \n\t"
      "movq 16(%%esi,%%ebx,), %%mm5 \n\t"
      "movq %%mm2, 8(%%edi,%%ebx,) \n\t"
      "movq 16(%%edi,%%ebx,), %%mm4 \n\t"
      "movq 24(%%esi,%%ebx,), %%mm7 \n\t"
      "paddb %%mm5, %%mm4 \n\t"
      "movq 24(%%edi,%%ebx,), %%mm6 \n\t"
      "movq %%mm4, 16(%%edi,%%ebx,) \n\t"
      "paddb %%mm7, %%mm6 \n\t"
      "movq 32(%%esi,%%ebx,), %%mm1 \n\t"
      "movq %%mm6, 24(%%edi,%%ebx,) \n\t"
      "movq 32(%%edi,%%ebx,), %%mm0 \n\t"
      "movq 40(%%esi,%%ebx,), %%mm3 \n\t"
      "paddb %%mm1, %%mm0 \n\t"
      "movq 40(%%edi,%%ebx,), %%mm2 \n\t"
      "movq %%mm0, 32(%%edi,%%ebx,) \n\t"
      "paddb %%mm3, %%mm2 \n\t"
      "movq 48(%%esi,%%ebx,), %%mm5 \n\t"
      "movq %%mm2, 40(%%edi,%%ebx,) \n\t"
      "movq 48(%%edi,%%ebx,), %%mm4 \n\t"
      "movq 56(%%esi,%%ebx,), %%mm7 \n\t"
      "paddb %%mm5, %%mm4 \n\t"
      "movq 56(%%edi,%%ebx,), %%mm6 \n\t"
      "movq %%mm4, 48(%%edi,%%ebx,) \n\t"
      "addl $64, %%ebx \n\t"
      "paddb %%mm7, %%mm6 \n\t"
      "cmpl %%ecx, %%ebx \n\t"
      "movq %%mm6, -8(%%edi,%%ebx,) \n\t" // (+56)movq does not affect flags;
      "jb up_loop \n\t"                   //  -8 to offset addl ebx

      "cmpl $0, %%edx \n\t" // test for bytes over mult of 64
      "jz up_end \n\t"

      "cmpl $8, %%edx \n\t" // test for less than 8 bytes
      "jb up_lt8 \n\t" // [added by lcreeve at netins.net]

      "addl %%edx, %%ecx \n\t"
      "andl $0x00000007, %%edx \n\t" // calc bytes over mult of 8
      "subl %%edx, %%ecx \n\t" // drop over bytes from length
      "jz up_lt8 \n\t"

      "up_lpA: \n\t" // use MMX regs to update 8 bytes sim.
      "movq (%%esi,%%ebx,), %%mm1 \n\t"
      "movq (%%edi,%%ebx,), %%mm0 \n\t"
      "addl $8, %%ebx \n\t"
      "paddb %%mm1, %%mm0 \n\t"
      "cmpl %%ecx, %%ebx \n\t"
      "movq %%mm0, -8(%%edi,%%ebx,) \n\t" // movq does not affect flags; -8 to
      "jb up_lpA \n\t"                    //  offset add ebx
      "cmpl $0, %%edx \n\t" // test for bytes over mult of 8
      "jz up_end \n\t"

      "up_lt8: \n\t"
      "xorl %%eax, %%eax \n\t"
      "addl %%edx, %%ecx \n\t" // move over byte count into counter

      "up_lp2: \n\t" // use x86 regs for remaining bytes
      "movb (%%edi,%%ebx,), %%al \n\t"
      "addb (%%esi,%%ebx,), %%al \n\t"
      "incl %%ebx \n\t"
      "cmpl %%ecx, %%ebx \n\t"
      "movb %%al, -1(%%edi,%%ebx,) \n\t" // mov does not affect flags; -1 to
      "jb up_lp2 \n\t"                   //  offset inc ebx

      "up_end: \n\t"
      "EMMS \n\t" // conversion of filtered row complete
#ifdef __PIC__
      "popl %%ebx \n\t"
#endif
      : "=d" (dummy_value_d), // 0 // output regs (dummy)
        "=S" (dummy_value_S), // 1
        "=D" (dummy_value_D) // 2

      : "0" (len), // edx // input regs
        "1" (prev_row), // esi
        "2" (row) // edi

      : "%eax", "%ecx" // clobber list (no input regs!)
#ifndef __PIC__
      , "%ebx"
#endif
#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
      , "%mm0", "%mm1", "%mm2", "%mm3"
      , "%mm4", "%mm5", "%mm6", "%mm7"
#endif
   );
} // end of png_read_filter_row_mmx_up()
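
// A minimal C sketch of the Up filter handled above (the function name
// up_filter_c is illustrative only).  Up has no intra-row dependency,
// which is what lets the asm blast through 64 bytes per iteration with
// all eight MMX registers in flight:
#if 0
static void
up_filter_c(png_bytep row, png_bytep prev_row, png_uint_32 rowbytes)
{
   png_uint_32 i;

   /* Raw(x) = (Up(x) + Prior(x)) mod 256 */
   for (i = 0; i < rowbytes; i++)
      row[i] = (png_byte)(row[i] + prev_row[i]);
}
#endif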
#endif /* PNG_MMX_CODE_SUPPORTED */


/*===========================================================================*/
/*                                                                           */
/*                   P N G _ R E A D _ F I L T E R _ R O W                   */
/*                                                                           */
/*===========================================================================*/

/* Optimized png_read_filter_row routines */

void /* PRIVATE */
png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
   row, png_bytep prev_row, int filter)
{
#ifdef PNG_DEBUG
   char filnm[10];
#endif

#if defined(PNG_MMX_CODE_SUPPORTED)
/* GRR:  these are superseded by png_ptr->asm_flags: */
#define UseMMX_sub    1   // GRR:  converted 20000730
#define UseMMX_up     1   // GRR:  converted 20000729
#define UseMMX_avg    1   // GRR:  converted 20000828 (+ 16-bit bugfix 20000916)
#define UseMMX_paeth  1   // GRR:  converted 20000828

   if (_mmx_supported == 2) {
       /* this should have happened in png_init_mmx_flags() already */
#if !defined(PNG_1_0_X)
       png_warning(png_ptr, "asm_flags may not have been initialized");
#endif
       png_mmx_support();
   }
#endif /* PNG_MMX_CODE_SUPPORTED */

#ifdef PNG_DEBUG
   png_debug(1, "in png_read_filter_row (pnggccrd.c)\n");
   switch (filter)
   {
      case 0: sprintf(filnm, "none");
         break;

      case 1: sprintf(filnm, "sub-%s",
#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
        (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB)? "MMX" :
#endif
#endif
        "x86");
         break;

      case 2: sprintf(filnm, "up-%s",
#ifdef PNG_MMX_CODE_SUPPORTED
#if !defined(PNG_1_0_X)
        (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP)? "MMX" :
#endif
#endif
        "x86");
         break;

      case 3: sprintf(filnm, "avg-%s",
#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
        (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG)? "MMX" :
#endif
#endif
        "x86");
         break;

      case 4: sprintf(filnm, "Paeth-%s",
#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
        (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH)? "MMX":
#endif
#endif
        "x86");
         break;

      default: sprintf(filnm, "unknw");
         break;
   }
   png_debug2(0, "row_number=%5ld, %5s, ", png_ptr->row_number, filnm);
   png_debug1(0, "row=0x%08lx, ", (unsigned long)row);
   png_debug2(0, "pixdepth=%2d, bytes=%d, ", (int)row_info->pixel_depth,
      (int)((row_info->pixel_depth + 7) >> 3));
   png_debug1(0, "rowbytes=%8ld\n", row_info->rowbytes);
#endif /* PNG_DEBUG */

   switch (filter)
   {
      case PNG_FILTER_VALUE_NONE:
         break;

      case PNG_FILTER_VALUE_SUB:
#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
#else
         if (_mmx_supported)
#endif
         {
            png_read_filter_row_mmx_sub(row_info, row);
         }
         else
#endif /* PNG_MMX_CODE_SUPPORTED */
         {
            png_uint_32 i;
            png_uint_32 istop = row_info->rowbytes;
            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
            png_bytep rp = row + bpp;
            png_bytep lp = row;

            for (i = bpp; i < istop; i++)
            {
               *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
               rp++;
            }
         } /* end !UseMMX_sub */
         break;

      case PNG_FILTER_VALUE_UP:
#if defined(PNG_MMX_CODE_SUPPORTED)
#if !defined(PNG_1_0_X)
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
#else
         if (_mmx_supported)
#endif
         {
            png_read_filter_row_mmx_up(row_info, row, prev_row);
         }
         else
#endif /* PNG_MMX_CODE_SUPPORTED */
         {
            png_uint_32 i;
            png_uint_32 istop = row_info->rowbytes;
            png_bytep rp = row;
            png_bytep pp = prev_row;

            for (i = 0; i < istop; ++i)
            {
               *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
               rp++;
            }
         } /* end !UseMMX_up */
         break;

      case PNG_FILTER_VALUE_AVG:
#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
#else
         if (_mmx_supported)
#endif
         {
            png_read_filter_row_mmx_avg(row_info, row, prev_row);
         }
         else
#endif /* PNG_MMX_CODE_SUPPORTED */
         {
            png_uint_32 i;
            png_bytep rp = row;
            png_bytep pp = prev_row;
            png_bytep lp = row;
            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
            png_uint_32 istop = row_info->rowbytes - bpp;

            for (i = 0; i < bpp; i++)
            {
               *rp = (png_byte)(((int)(*rp) +
                  ((int)(*pp++) >> 1)) & 0xff);
               rp++;
            }

            for (i = 0; i < istop; i++)
            {
               *rp = (png_byte)(((int)(*rp) +
                  ((int)(*pp++ + *lp++) >> 1)) & 0xff);
               rp++;
            }
         } /* end !UseMMX_avg */
         break;

      case PNG_FILTER_VALUE_PAETH:
#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
#else
         if (_mmx_supported)
#endif
         {
            png_read_filter_row_mmx_paeth(row_info, row, prev_row);
         }
         else
#endif /* PNG_MMX_CODE_SUPPORTED */
         {
            png_uint_32 i;
            png_bytep rp = row;
            png_bytep pp = prev_row;
            png_bytep lp = row;
            png_bytep cp = prev_row;
            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
            png_uint_32 istop = row_info->rowbytes - bpp;

            for (i = 0; i < bpp; i++)
            {
               *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
               rp++;
            }

            for (i = 0; i < istop; i++) /* use leftover rp,pp */
            {
               int a, b, c, pa, pb, pc, p;

               a = *lp++;
               b = *pp++;
               c = *cp++;

               p = b - c;
               pc = a - c;

#ifdef PNG_USE_ABS
               pa = abs(p);
               pb = abs(pc);
               pc = abs(p + pc);
#else
               pa = p < 0 ? -p : p;
               pb = pc < 0 ? -pc : pc;
               pc = (p + pc) < 0 ? -(p + pc) : p + pc;
#endif

               /*
                  if (pa <= pb && pa <= pc)
                     p = a;
                  else if (pb <= pc)
                     p = b;
                  else
                     p = c;
                */

               p = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c;

               *rp = (png_byte)(((int)(*rp) + p) & 0xff);
               rp++;
            }
         } /* end !UseMMX_paeth */
         break;

      default:
         png_warning(png_ptr, "Ignoring bad row-filter type");
         *row = 0;
         break;
   }
}
#endif /* PNG_HAVE_MMX_READ_FILTER_ROW */


/*===========================================================================*/
/*                                                                           */
/*                      P N G _ M M X _ S U P P O R T                        */
/*                                                                           */
/*===========================================================================*/

/* GRR NOTES:  (1) the following code assumes 386 or better (pushfl/popfl)
 *             (2) all instructions compile with gcc 2.7.2.3 and later
 *             (3) the function is moved down here to prevent gcc from
 *                 inlining it in multiple places and then barfing be-
 *                 cause the ".NOT_SUPPORTED" label is multiply defined
 *                 [is there a way to signal that a *single* function should
 *                  not be inlined?  is there a way to modify the label for
 *                  each inlined instance, e.g., by appending _1, _2, etc.?
 *                  maybe if don't use leading "." in label name?  (nope...sigh)]
 */
int PNGAPI
png_mmx_support(void)
{
#if defined(PNG_MMX_CODE_SUPPORTED)
   int result;

   __asm__ __volatile__ (
      "pushl %%ebx \n\t" // ebx gets clobbered by CPUID instruction
      "pushl %%ecx \n\t" // so does ecx...
      "pushl %%edx \n\t" // ...and edx (but ecx & edx safe on Linux)
//    ".byte 0x66 \n\t"  // convert 16-bit pushf to 32-bit pushfd
//    "pushf \n\t"       // 16-bit pushf
      "pushfl \n\t" // save Eflag to stack
      "popl %%eax \n\t" // get Eflag from stack into eax
      "movl %%eax, %%ecx \n\t" // make another copy of Eflag in ecx
      "xorl $0x200000, %%eax \n\t" // toggle ID bit in Eflag (i.e., bit 21)
      "pushl %%eax \n\t" // save modified Eflag back to stack
//    ".byte 0x66 \n\t"  // convert 16-bit popf to 32-bit popfd
//    "popf \n\t"        // 16-bit popf
      "popfl \n\t" // restore modified value to Eflag reg
      "pushfl \n\t" // save Eflag to stack
      "popl %%eax \n\t" // get Eflag from stack
      "pushl %%ecx \n\t" // save original Eflag to stack
      "popfl \n\t" // restore original Eflag
      "xorl %%ecx, %%eax \n\t" // compare new Eflag with original Eflag
      "jz 0f \n\t" // if same, CPUID instr. is not supported

      "xorl %%eax, %%eax \n\t" // set eax to zero
//    ".byte 0x0f, 0xa2 \n\t"  // CPUID instruction (two-byte opcode)
      "cpuid \n\t" // get the CPU identification info
      "cmpl $1, %%eax \n\t" // make sure eax returned a non-zero value
      "jl 0f \n\t" // if eax is zero, MMX is not supported

      "xorl %%eax, %%eax \n\t" // set eax to zero and...
      "incl %%eax \n\t" // ...increment eax to 1.  This pair is
                        //  faster than the instruction "mov eax, 1"
      "cpuid \n\t" // get the CPU identification info again
      "andl $0x800000, %%edx \n\t" // mask out all bits but MMX bit (23)
      "cmpl $0, %%edx \n\t" // 0 = MMX not supported
      "jz 0f \n\t" // if zero, MMX is not supported

      "movl $1, %%eax \n\t" // set return value to 1
      "jmp 1f \n\t" // DONE: have MMX support

      "0: \n\t" // .NOT_SUPPORTED: target label for jump instructions
      "movl $0, %%eax \n\t" // set return value to 0
      "1: \n\t" // .RETURN: target label for jump instructions
      "popl %%edx \n\t" // restore edx
      "popl %%ecx \n\t" // restore ecx
      "popl %%ebx \n\t" // restore ebx
//    "ret \n\t"        // DONE: no MMX support
                        //  (fall through to standard C "ret")

      : "=a" (result) // output list

      : // any variables used on input (none)

      // no clobber list
//    , "%ebx", "%ecx", "%edx" // GRR: we handle these manually
//    , "memory" // if write to a variable gcc thought was in a reg
//    , "cc" // "condition codes" (flag bits)
   );
   _mmx_supported = result;
#else
   _mmx_supported = 0;
#endif /* PNG_MMX_CODE_SUPPORTED */

   return _mmx_supported;
}
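
/* For comparison, the same MMX probe can be written with GCC's <cpuid.h>
 * on compilers far newer than the ones this file targets -- a sketch only,
 * not a drop-in for the asm above.  __get_cpuid() performs the CPUID-
 * availability check (the EFLAGS ID-bit dance) internally; leaf 1 returns
 * the feature flags, and MMX is bit 23 of edx, the same 0x800000 mask
 * tested above.
 */
#if 0
#include <cpuid.h>

static int
mmx_supported_via_cpuid_h(void)
{
   unsigned int eax, ebx, ecx, edx;

   if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))  /* no CPUID or no leaf 1 */
      return 0;
   return (edx & 0x800000) != 0;                 /* bit 23 = MMX */
}
#endif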
#endif /* PNG_USE_PNGGCCRD */