/* pngvcrd.c - mixed C/assembler version of utilities to read a PNG file
 *
 * For Intel x86 CPU and Microsoft Visual C++ compiler
 *
 * Last changed in libpng 1.2.6 - August 15, 2004
 * For conditions of distribution and use, see copyright notice in png.h
 * Copyright (c) 1998-2004 Glenn Randers-Pehrson
 * Copyright (c) 1998, Intel Corporation
 *
 * Contributed by Nirav Chhatrapati, Intel Corporation, 1998
 * Interface to libpng contributed by Gilles Vollant, 1999
 *
 * In png_do_read_interlace() in libpng versions 1.0.3a through 1.0.4d,
 * a sign error in the post-MMX cleanup code for each pixel_depth resulted
 * in bad pixels at the beginning of some rows of some images, and also
 * (due to out-of-range memory reads and writes) caused heap corruption
 * when compiled with MSVC 6.0.  The error was fixed in version 1.0.4e.
 *
 * [png_read_filter_row_mmx_avg() bpp == 2 bugfix, GRR 20000916]
 *
 * [runtime MMX configuration, GRR 20010102]
 */
#define PNG_INTERNAL
#include "png.h"

#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_USE_PNGVCRD)

static int mmx_supported = 2;

int PNGAPI
png_mmx_support(void)
{
    int mmx_supported_local = 0;
    _asm {
        push ebx           //CPUID will trash these
        push ecx
        push edx

        pushfd             //Save Eflag to stack
        pop eax            //Get Eflag from stack into eax
        mov ecx, eax       //Make another copy of Eflag in ecx
        xor eax, 0x200000  //Toggle ID bit in Eflag [i.e. bit(21)]
        push eax           //Save modified Eflag back to stack
        popfd              //Restore modified value back to Eflag reg
        pushfd             //Save Eflag to stack
        pop eax            //Get Eflag from stack
        push ecx           //Save original Eflag to stack
        popfd              //Restore original Eflag
        xor eax, ecx       //Compare the new Eflag with the original Eflag
        jz NOT_SUPPORTED   //If the same, the CPUID instruction is not
                           //supported; skip the following instructions
                           //and jump to the NOT_SUPPORTED label

        xor eax, eax       //Set eax to zero

        _asm _emit 0x0f    //CPUID instruction (two-byte opcode)
        _asm _emit 0xa2

        cmp eax, 1         //Make sure CPUID supports function 1
        jl NOT_SUPPORTED   //If not, the feature flags cannot be queried

        xor eax, eax       //Set eax to zero ...
        inc eax            //... and increment it to 1.  This pair is
                           //faster than the instruction "mov eax, 1"

        _asm _emit 0x0f    //CPUID instruction
        _asm _emit 0xa2

        and edx, 0x00800000  //Mask out all bits but the MMX bit (23)
        cmp edx, 0           //Zero means MMX is not supported
        jz NOT_SUPPORTED     //Jump if not supported

        mov mmx_supported_local, 1  //Set return value to 1

NOT_SUPPORTED:
        mov eax, mmx_supported_local  //Move return value to eax
        pop edx            //CPUID trashed these
        pop ecx
        pop ebx
    }

    //mmx_supported_local=0; // test code to force "MMX not supported"
    //printf("MMX : %u (1=MMX supported)\n",mmx_supported_local);

    mmx_supported = mmx_supported_local;
    return mmx_supported_local;
}
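
/* Illustrative note (not from the original sources): the probe above is the
 * classic EFLAGS/CPUID sequence.  On compilers that provide the __cpuid()
 * intrinsic it could be reduced to the following sketch (assuming a CPU on
 * which CPUID itself exists, which is what the EFLAGS bit-21 toggle tests):
 *
 *    #include <intrin.h>
 *    int regs[4];                        // EAX, EBX, ECX, EDX
 *    __cpuid(regs, 1);                   // function 1: feature flags
 *    int has_mmx = (regs[3] >> 23) & 1;  // EDX bit 23 = MMX
 */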
/* Combines the row recently read in with the previous row.
   This routine takes care of alpha and transparency if requested.
   This routine also handles the two methods of progressive display
   of interlaced images, depending on the mask value.
   The mask value describes which pixels are to be combined with
   the row.  The pattern always repeats every 8 pixels, so just 8
   bits are needed.  A one indicates the pixel is to be combined; a
   zero indicates the pixel is to be skipped.  This is in addition
   to any alpha or transparency value associated with the pixel.
   If you want all pixels to be combined, pass 0xff (255) in mask. */

/* Use this routine for the x86 platform - it uses the faster MMX routine
   if the machine supports MMX. */
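
/* For example, mask == 0x55 (binary 01010101) would combine pixels 1, 3, 5
   and 7 of each group of 8, counting from the leftmost pixel, which
   corresponds to mask bit 0x80. */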
void /* PRIVATE */
png_combine_row(png_structp png_ptr, png_bytep row, int mask)
{
#ifdef PNG_USE_LOCAL_ARRAYS
   const int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
#endif

   png_debug(1,"in png_combine_row_asm\n");

   if (mmx_supported == 2) {
#if !defined(PNG_1_0_X)
       /* this should have happened in png_init_mmx_flags() already */
       png_warning(png_ptr, "asm_flags may not have been initialized");
#endif
       png_mmx_support();
   }

   if (mask == 0xff)
   {
      png_memcpy(row, png_ptr->row_buf + 1,
         (png_size_t)PNG_ROWBYTES(png_ptr->row_info.pixel_depth,
         png_ptr->width));
   }
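
   /* PNG_ROWBYTES(depth, width) evaluates to the number of bytes needed to
      hold "width" pixels at "depth" bits per pixel, i.e. width*depth bits
      rounded up to a whole number of bytes. */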
   /* GRR: add "else if (mask == 0)" case?
    * or does png_combine_row() not even get called in that case? */
   else
   {
      switch (png_ptr->row_info.pixel_depth)
      {
         case 1:
         {
            png_bytep sp;
            png_bytep dp;
            int s_inc, s_start, s_end;
            int m;
            int shift;
            png_uint_32 i;

            sp = png_ptr->row_buf + 1;
            dp = row;
            m = 0x80;
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (png_ptr->transformations & PNG_PACKSWAP)
            {
               s_start = 0;
               s_end = 7;
               s_inc = 1;
            }
            else
#endif
            {
               s_start = 7;
               s_end = 0;
               s_inc = -1;
            }

            shift = s_start;

            for (i = 0; i < png_ptr->width; i++)
            {
               if (m & mask)
               {
                  int value;

                  value = (*sp >> shift) & 0x1;
                  *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
                  *dp |= (png_byte)(value << shift);
               }

               if (shift == s_end)
               {
                  shift = s_start;
                  sp++;
                  dp++;
               }
               else
                  shift += s_inc;

               if (m == 1)
                  m = 0x80;
               else
                  m >>= 1;
            }
            break;
         }
         case 2:
         {
            png_bytep sp;
            png_bytep dp;
            int s_start, s_end, s_inc;
            int m;
            int shift;
            png_uint_32 i;
            int value;

            sp = png_ptr->row_buf + 1;
            dp = row;
            m = 0x80;
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (png_ptr->transformations & PNG_PACKSWAP)
            {
               s_start = 0;
               s_end = 6;
               s_inc = 2;
            }
            else
#endif
            {
               s_start = 6;
               s_end = 0;
               s_inc = -2;
            }

            shift = s_start;

            for (i = 0; i < png_ptr->width; i++)
            {
               if (m & mask)
               {
                  value = (*sp >> shift) & 0x3;
                  *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
                  *dp |= (png_byte)(value << shift);
               }

               if (shift == s_end)
               {
                  shift = s_start;
                  sp++;
                  dp++;
               }
               else
                  shift += s_inc;

               if (m == 1)
                  m = 0x80;
               else
                  m >>= 1;
            }
            break;
         }
         case 4:
         {
            png_bytep sp;
            png_bytep dp;
            int s_start, s_end, s_inc;
            int m;
            int shift;
            png_uint_32 i;
            int value;

            sp = png_ptr->row_buf + 1;
            dp = row;
            m = 0x80;
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (png_ptr->transformations & PNG_PACKSWAP)
            {
               s_start = 0;
               s_end = 4;
               s_inc = 4;
            }
            else
#endif
            {
               s_start = 4;
               s_end = 0;
               s_inc = -4;
            }

            shift = s_start;

            for (i = 0; i < png_ptr->width; i++)
            {
               if (m & mask)
               {
                  value = (*sp >> shift) & 0xf;
                  *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
                  *dp |= (png_byte)(value << shift);
               }

               if (shift == s_end)
               {
                  shift = s_start;
                  sp++;
                  dp++;
               }
               else
                  shift += s_inc;

               if (m == 1)
                  m = 0x80;
               else
                  m >>= 1;
            }
            break;
         }
         case 8:
         {
            png_bytep srcptr;
            png_bytep dstptr;
            png_uint_32 len;
            int m;
            int diff, unmask;

            __int64 mask0 = 0x0102040810204080;

#if !defined(PNG_1_0_X)
            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
                /* && mmx_supported */ )
#else
            if (mmx_supported)
#endif
            {
               srcptr = png_ptr->row_buf + 1;
               dstptr = row;
               m = 0x80;
               unmask = ~mask;
               len = png_ptr->width &~7;   //reduce to multiple of 8
               diff = png_ptr->width & 7;  //amount lost

               _asm
               {
                  movd mm7, unmask    //load bit pattern
                  psubb mm6, mm6      //zero mm6
                  punpcklbw mm7, mm7
                  punpcklwd mm7, mm7
                  punpckldq mm7, mm7  //fill register with 8 masks

                  movq mm0, mask0
                  pand mm0, mm7       //nonzero where byte is to be skipped
                  pcmpeqb mm0, mm6    //0xff where byte is kept, 0x00 where skipped

                  mov ecx, len        //load length of line (pixels)
                  mov esi, srcptr     //load source
                  mov ebx, dstptr     //load dest
                  cmp ecx, 0
                  je mainloop8end

mainloop8:
                  movq mm4, [esi]
                  pand mm4, mm0
                  movq mm6, mm0
                  pandn mm6, [ebx]
                  por mm4, mm6
                  movq [ebx], mm4

                  add esi, 8          //inc by 8 bytes processed
                  add ebx, 8
                  sub ecx, 8          //dec by 8 pixels processed

                  ja mainloop8

mainloop8end:
                  mov ecx, diff
                  cmp ecx, 0
                  jz end8

                  mov edx, mask
                  sal edx, 24         //make low byte the high byte

secondloop8:
                  sal edx, 1          //move high bit to CF
                  jnc skip8           //if CF = 0
                  mov al, [esi]
                  mov [ebx], al

skip8:
                  inc esi
                  inc ebx

                  dec ecx
                  jnz secondloop8

end8:
                  emms
               }
            }
            else /* mmx not supported - use modified C routine */
            {
               register unsigned int incr1, initial_val, final_val;
               png_size_t pixel_bytes;
               png_uint_32 i;
               register int disp = png_pass_inc[png_ptr->pass];
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};

               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
                  pixel_bytes;
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
               final_val = png_ptr->width*pixel_bytes;
               incr1 = (disp)*pixel_bytes;
               for (i = initial_val; i < final_val; i += incr1)
               {
                  png_memcpy(dstptr, srcptr, pixel_bytes);
                  srcptr += incr1;
                  dstptr += incr1;
               }
            } /* end of else */

            break;
         } // end 8 bpp
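
         /* Every MMX case in this switch implements the same byte-select
          * identity; in portable C (illustrative only):
          *
          *    dst[i] = (png_byte)((src[i] & sel[i]) | (dst[i] & ~sel[i]));
          *
          * where sel[i] is 0xff for bytes of pixels whose mask bit is set
          * and 0x00 otherwise: pand/pandn select the bytes, por merges
          * them, eight bytes per iteration.
          */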
         case 16:
         {
            png_bytep srcptr;
            png_bytep dstptr;
            png_uint_32 len;
            int unmask, diff;

            __int64 mask1 = 0x0101020204040808,
                    mask0 = 0x1010202040408080;

#if !defined(PNG_1_0_X)
            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
                /* && mmx_supported */ )
#else
            if (mmx_supported)
#endif
            {
               srcptr = png_ptr->row_buf + 1;
               dstptr = row;
               unmask = ~mask;
               len = (png_ptr->width)&~7;
               diff = (png_ptr->width)&7;

               _asm
               {
                  movd mm7, unmask    //load bit pattern
                  psubb mm6, mm6      //zero mm6
                  punpcklbw mm7, mm7
                  punpcklwd mm7, mm7
                  punpckldq mm7, mm7  //fill register with 8 masks

                  movq mm0, mask0
                  movq mm1, mask1

                  pand mm0, mm7
                  pand mm1, mm7

                  pcmpeqb mm0, mm6
                  pcmpeqb mm1, mm6

                  mov ecx, len        //load length of line
                  mov esi, srcptr     //load source
                  mov ebx, dstptr     //load dest
                  cmp ecx, 0
                  jz mainloop16end

mainloop16:
                  movq mm4, [esi]
                  pand mm4, mm0
                  movq mm6, mm0
                  movq mm7, [ebx]
                  pandn mm6, mm7
                  por mm4, mm6
                  movq [ebx], mm4

                  movq mm5, [esi+8]
                  pand mm5, mm1
                  movq mm7, mm1
                  movq mm6, [ebx+8]
                  pandn mm7, mm6
                  por mm5, mm7
                  movq [ebx+8], mm5

                  add esi, 16         //inc by 16 bytes processed
                  add ebx, 16
                  sub ecx, 8          //dec by 8 pixels processed

                  ja mainloop16

mainloop16end:
                  mov ecx, diff
                  cmp ecx, 0
                  jz end16

                  mov edx, mask
                  sal edx, 24         //make low byte the high byte

secondloop16:
                  sal edx, 1          //move high bit to CF
                  jnc skip16          //if CF = 0
                  mov ax, [esi]
                  mov [ebx], ax

skip16:
                  add esi, 2
                  add ebx, 2

                  dec ecx
                  jnz secondloop16

end16:
                  emms
               }
            }
            else /* mmx not supported - use modified C routine */
            {
               register unsigned int incr1, initial_val, final_val;
               png_size_t pixel_bytes;
               png_uint_32 i;
               register int disp = png_pass_inc[png_ptr->pass];
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};

               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
                  pixel_bytes;
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
               final_val = png_ptr->width*pixel_bytes;
               incr1 = (disp)*pixel_bytes;
               for (i = initial_val; i < final_val; i += incr1)
               {
                  png_memcpy(dstptr, srcptr, pixel_bytes);
                  srcptr += incr1;
                  dstptr += incr1;
               }
            } /* end of else */

            break;
         } // end 16 bpp
         case 24:
         {
            png_bytep srcptr;
            png_bytep dstptr;
            png_uint_32 len;
            int unmask, diff;

            __int64 mask2 = 0x0101010202020404,  //24bpp
                    mask1 = 0x0408080810101020,
                    mask0 = 0x2020404040808080;

            srcptr = png_ptr->row_buf + 1;
            dstptr = row;

            unmask = ~mask;
            len = (png_ptr->width)&~7;
            diff = (png_ptr->width)&7;

#if !defined(PNG_1_0_X)
            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
                /* && mmx_supported */ )
#else
            if (mmx_supported)
#endif
            {
               _asm
               {
                  movd mm7, unmask    //load bit pattern
                  psubb mm6, mm6      //zero mm6
                  punpcklbw mm7, mm7
                  punpcklwd mm7, mm7
                  punpckldq mm7, mm7  //fill register with 8 masks

                  movq mm0, mask0
                  movq mm1, mask1
                  movq mm2, mask2

                  pand mm0, mm7
                  pand mm1, mm7
                  pand mm2, mm7

                  pcmpeqb mm0, mm6
                  pcmpeqb mm1, mm6
                  pcmpeqb mm2, mm6

                  mov ecx, len        //load length of line
                  mov esi, srcptr     //load source
                  mov ebx, dstptr     //load dest
                  cmp ecx, 0
                  jz mainloop24end

mainloop24:
                  movq mm4, [esi]
                  pand mm4, mm0
                  movq mm6, mm0
                  movq mm7, [ebx]
                  pandn mm6, mm7
                  por mm4, mm6
                  movq [ebx], mm4

                  movq mm5, [esi+8]
                  pand mm5, mm1
                  movq mm7, mm1
                  movq mm6, [ebx+8]
                  pandn mm7, mm6
                  por mm5, mm7
                  movq [ebx+8], mm5

                  movq mm6, [esi+16]
                  pand mm6, mm2
                  movq mm4, mm2
                  movq mm7, [ebx+16]
                  pandn mm4, mm7
                  por mm6, mm4
                  movq [ebx+16], mm6

                  add esi, 24         //inc by 24 bytes processed
                  add ebx, 24
                  sub ecx, 8          //dec by 8 pixels processed

                  ja mainloop24

mainloop24end:
                  mov ecx, diff
                  cmp ecx, 0
                  jz end24

                  mov edx, mask
                  sal edx, 24         //make low byte the high byte

secondloop24:
                  sal edx, 1          //move high bit to CF
                  jnc skip24          //if CF = 0
                  mov ax, [esi]
                  mov [ebx], ax
                  xor eax, eax
                  mov al, [esi+2]
                  mov [ebx+2], al

skip24:
                  add esi, 3
                  add ebx, 3

                  dec ecx
                  jnz secondloop24

end24:
                  emms
               }
            }
            else /* mmx not supported - use modified C routine */
            {
               register unsigned int incr1, initial_val, final_val;
               png_size_t pixel_bytes;
               png_uint_32 i;
               register int disp = png_pass_inc[png_ptr->pass];
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};

               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
                  pixel_bytes;
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
               final_val = png_ptr->width*pixel_bytes;
               incr1 = (disp)*pixel_bytes;
               for (i = initial_val; i < final_val; i += incr1)
               {
                  png_memcpy(dstptr, srcptr, pixel_bytes);
                  srcptr += incr1;
                  dstptr += incr1;
               }
            } /* end of else */

            break;
         } // end 24 bpp
         case 32:
         {
            png_bytep srcptr;
            png_bytep dstptr;
            png_uint_32 len;
            int unmask, diff;

            __int64 mask3 = 0x0101010102020202,  //32bpp
                    mask2 = 0x0404040408080808,
                    mask1 = 0x1010101020202020,
                    mask0 = 0x4040404080808080;

            srcptr = png_ptr->row_buf + 1;
            dstptr = row;

            unmask = ~mask;
            len = (png_ptr->width)&~7;
            diff = (png_ptr->width)&7;

#if !defined(PNG_1_0_X)
            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
                /* && mmx_supported */ )
#else
            if (mmx_supported)
#endif
            {
               _asm
               {
                  movd mm7, unmask    //load bit pattern
                  psubb mm6, mm6      //zero mm6
                  punpcklbw mm7, mm7
                  punpcklwd mm7, mm7
                  punpckldq mm7, mm7  //fill register with 8 masks

                  movq mm0, mask0
                  movq mm1, mask1
                  movq mm2, mask2
                  movq mm3, mask3

                  pand mm0, mm7
                  pand mm1, mm7
                  pand mm2, mm7
                  pand mm3, mm7

                  pcmpeqb mm0, mm6
                  pcmpeqb mm1, mm6
                  pcmpeqb mm2, mm6
                  pcmpeqb mm3, mm6

                  mov ecx, len        //load length of line
                  mov esi, srcptr     //load source
                  mov ebx, dstptr     //load dest
                  cmp ecx, 0
                  jz mainloop32end

mainloop32:
                  movq mm4, [esi]
                  pand mm4, mm0
                  movq mm6, mm0
                  movq mm7, [ebx]
                  pandn mm6, mm7
                  por mm4, mm6
                  movq [ebx], mm4

                  movq mm5, [esi+8]
                  pand mm5, mm1
                  movq mm7, mm1
                  movq mm6, [ebx+8]
                  pandn mm7, mm6
                  por mm5, mm7
                  movq [ebx+8], mm5

                  movq mm6, [esi+16]
                  pand mm6, mm2
                  movq mm4, mm2
                  movq mm7, [ebx+16]
                  pandn mm4, mm7
                  por mm6, mm4
                  movq [ebx+16], mm6

                  movq mm7, [esi+24]
                  pand mm7, mm3
                  movq mm5, mm3
                  movq mm4, [ebx+24]
                  pandn mm5, mm4
                  por mm7, mm5
                  movq [ebx+24], mm7

                  add esi, 32         //inc by 32 bytes processed
                  add ebx, 32
                  sub ecx, 8          //dec by 8 pixels processed

                  ja mainloop32

mainloop32end:
                  mov ecx, diff
                  cmp ecx, 0
                  jz end32

                  mov edx, mask
                  sal edx, 24         //make low byte the high byte

secondloop32:
                  sal edx, 1          //move high bit to CF
                  jnc skip32          //if CF = 0
                  mov eax, [esi]
                  mov [ebx], eax

skip32:
                  add esi, 4
                  add ebx, 4

                  dec ecx
                  jnz secondloop32

end32:
                  emms
               }
            }
            else /* mmx not supported - use modified C routine */
            {
               register unsigned int incr1, initial_val, final_val;
               png_size_t pixel_bytes;
               png_uint_32 i;
               register int disp = png_pass_inc[png_ptr->pass];
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};

               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
                  pixel_bytes;
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
               final_val = png_ptr->width*pixel_bytes;
               incr1 = (disp)*pixel_bytes;
               for (i = initial_val; i < final_val; i += incr1)
               {
                  png_memcpy(dstptr, srcptr, pixel_bytes);
                  srcptr += incr1;
                  dstptr += incr1;
               }
            } /* end of else */

            break;
         } // end 32 bpp
         case 48:
         {
            png_bytep srcptr;
            png_bytep dstptr;
            png_uint_32 len;
            int unmask, diff;

            __int64 mask5 = 0x0101010101010202,
                    mask4 = 0x0202020204040404,
                    mask3 = 0x0404080808080808,
                    mask2 = 0x1010101010102020,
                    mask1 = 0x2020202040404040,
                    mask0 = 0x4040808080808080;

#if !defined(PNG_1_0_X)
            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
                /* && mmx_supported */ )
#else
            if (mmx_supported)
#endif
            {
               srcptr = png_ptr->row_buf + 1;
               dstptr = row;
               unmask = ~mask;
               len = (png_ptr->width)&~7;
               diff = (png_ptr->width)&7;

               _asm
               {
                  movd mm7, unmask    //load bit pattern
                  psubb mm6, mm6      //zero mm6
                  punpcklbw mm7, mm7
                  punpcklwd mm7, mm7
                  punpckldq mm7, mm7  //fill register with 8 masks

                  movq mm0, mask0
                  movq mm1, mask1
                  movq mm2, mask2
                  movq mm3, mask3
                  movq mm4, mask4
                  movq mm5, mask5

                  pand mm0, mm7
                  pand mm1, mm7
                  pand mm2, mm7
                  pand mm3, mm7
                  pand mm4, mm7
                  pand mm5, mm7

                  pcmpeqb mm0, mm6
                  pcmpeqb mm1, mm6
                  pcmpeqb mm2, mm6
                  pcmpeqb mm3, mm6
                  pcmpeqb mm4, mm6
                  pcmpeqb mm5, mm6

                  mov ecx, len        //load length of line
                  mov esi, srcptr     //load source
                  mov ebx, dstptr     //load dest
                  cmp ecx, 0
                  jz mainloop48end

mainloop48:
                  movq mm7, [esi]
                  pand mm7, mm0
                  movq mm6, mm0
                  pandn mm6, [ebx]
                  por mm7, mm6
                  movq [ebx], mm7

                  movq mm6, [esi+8]
                  pand mm6, mm1
                  movq mm7, mm1
                  pandn mm7, [ebx+8]
                  por mm6, mm7
                  movq [ebx+8], mm6

                  movq mm6, [esi+16]
                  pand mm6, mm2
                  movq mm7, mm2
                  pandn mm7, [ebx+16]
                  por mm6, mm7
                  movq [ebx+16], mm6

                  movq mm7, [esi+24]
                  pand mm7, mm3
                  movq mm6, mm3
                  pandn mm6, [ebx+24]
                  por mm7, mm6
                  movq [ebx+24], mm7

                  movq mm6, [esi+32]
                  pand mm6, mm4
                  movq mm7, mm4
                  pandn mm7, [ebx+32]
                  por mm6, mm7
                  movq [ebx+32], mm6

                  movq mm7, [esi+40]
                  pand mm7, mm5
                  movq mm6, mm5
                  pandn mm6, [ebx+40]
                  por mm7, mm6
                  movq [ebx+40], mm7

                  add esi, 48         //inc by 48 bytes processed
                  add ebx, 48
                  sub ecx, 8          //dec by 8 pixels processed

                  ja mainloop48

mainloop48end:
                  mov ecx, diff
                  cmp ecx, 0
                  jz end48

                  mov edx, mask
                  sal edx, 24         //make low byte the high byte
secondloop48:
                  sal edx, 1          //move high bit to CF
                  jnc skip48          //if CF = 0
                  mov eax, [esi]
                  mov [ebx], eax      //copy 4 of the 6 bytes of the pixel
                  mov ax, [esi+4]
                  mov [ebx+4], ax     //...and the remaining 2

skip48:
                  add esi, 6          //6 bytes per 48-bpp pixel
                  add ebx, 6

                  dec ecx
                  jnz secondloop48
end48:
                  emms
               }
            }
            else /* mmx not supported - use modified C routine */
            {
               register unsigned int incr1, initial_val, final_val;
               png_size_t pixel_bytes;
               png_uint_32 i;
               register int disp = png_pass_inc[png_ptr->pass];
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};

               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
                  pixel_bytes;
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
               final_val = png_ptr->width*pixel_bytes;
               incr1 = (disp)*pixel_bytes;
               for (i = initial_val; i < final_val; i += incr1)
               {
                  png_memcpy(dstptr, srcptr, pixel_bytes);
                  srcptr += incr1;
                  dstptr += incr1;
               }
            } /* end of else */

            break;
         } // end 48 bpp
         default:
         {
            png_bytep sptr;
            png_bytep dp;
            png_size_t pixel_bytes;
            int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
            unsigned int i;
            register int disp = png_pass_inc[png_ptr->pass];  // get the offset
            register unsigned int incr1, initial_val, final_val;

            pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
            sptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
               pixel_bytes;
            dp = row + offset_table[png_ptr->pass]*pixel_bytes;
            initial_val = offset_table[png_ptr->pass]*pixel_bytes;
            final_val = png_ptr->width*pixel_bytes;
            incr1 = (disp)*pixel_bytes;
            for (i = initial_val; i < final_val; i += incr1)
            {
               png_memcpy(dp, sptr, pixel_bytes);
               sptr += incr1;
               dp += incr1;
            }
            break;
         }
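
         /* Example of the arithmetic used by the C fallbacks above: in
          * interlace pass 1 the new pixels occupy columns 4, 12, 20, ...,
          * so initial_val is 4*pixel_bytes (offset_table[1]) and incr1 is
          * 8*pixel_bytes (png_pass_inc[1]); one pixel is copied per
          * iteration.
          */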
      } /* end switch (png_ptr->row_info.pixel_depth) */
   } /* end if (non-trivial mask) */

} /* end png_combine_row() */
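
/* Illustrative reference (not part of libpng): a plain C equivalent of the
 * byte-multiple cases of png_combine_row() above, with hypothetical names,
 * kept only to document the behaviour the MMX loops are expected to match.
 */
#if 0
static void
combine_row_c_sketch(png_bytep dst, png_bytep src, png_uint_32 width,
   png_size_t pixel_bytes, int mask)
{
   png_uint_32 i;
   int m = 0x80;                        /* bit 0x80 = leftmost pixel of 8 */

   for (i = 0; i < width; i++)
   {
      if (m & mask)                     /* take this pixel from the new row */
         png_memcpy(dst, src, pixel_bytes);
      src += pixel_bytes;
      dst += pixel_bytes;
      m = (m == 1) ? 0x80 : (m >> 1);   /* mask pattern repeats every 8 */
   }
}
#endif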
#if defined(PNG_READ_INTERLACING_SUPPORTED)

void /* PRIVATE */
png_do_read_interlace(png_structp png_ptr)
{
   png_row_infop row_info = &(png_ptr->row_info);
   png_bytep row = png_ptr->row_buf + 1;
   int pass = png_ptr->pass;
   png_uint_32 transformations = png_ptr->transformations;
#ifdef PNG_USE_LOCAL_ARRAYS
   const int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
#endif

   png_debug(1,"in png_do_read_interlace\n");

   if (mmx_supported == 2) {
#if !defined(PNG_1_0_X)
       /* this should have happened in png_init_mmx_flags() already */
       png_warning(png_ptr, "asm_flags may not have been initialized");
#endif
       png_mmx_support();
   }

   if (row != NULL && row_info != NULL)
   {
      png_uint_32 final_width;

      final_width = row_info->width * png_pass_inc[pass];

      switch (row_info->pixel_depth)
      {
         case 1:
         {
            png_bytep sp, dp;
            int sshift, dshift;
            int s_start, s_end, s_inc;
            png_byte v;
            png_uint_32 i;
            int j;

            sp = row + (png_size_t)((row_info->width - 1) >> 3);
            dp = row + (png_size_t)((final_width - 1) >> 3);
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (transformations & PNG_PACKSWAP)
            {
               sshift = (int)((row_info->width + 7) & 7);
               dshift = (int)((final_width + 7) & 7);
               s_start = 7;
               s_end = 0;
               s_inc = -1;
            }
            else
#endif
            {
               sshift = 7 - (int)((row_info->width + 7) & 7);
               dshift = 7 - (int)((final_width + 7) & 7);
               s_start = 0;
               s_end = 7;
               s_inc = 1;
            }

            for (i = row_info->width; i; i--)
            {
               v = (png_byte)((*sp >> sshift) & 0x1);
               for (j = 0; j < png_pass_inc[pass]; j++)
               {
                  *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
                  *dp |= (png_byte)(v << dshift);
                  if (dshift == s_end)
                  {
                     dshift = s_start;
                     dp--;
                  }
                  else
                     dshift += s_inc;
               }
               if (sshift == s_end)
               {
                  sshift = s_start;
                  sp--;
               }
               else
                  sshift += s_inc;
            }
            break;
         }
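
         /* Note: the expansion above runs right to left because the source
          * and destination pixels share the same buffer; widening the row
          * from its far end first guarantees that no source pixel is
          * overwritten before it has been read.  The other depths below do
          * the same.
          */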
         case 2:
         {
            png_bytep sp, dp;
            int sshift, dshift;
            int s_start, s_end, s_inc;
            png_uint_32 i;

            sp = row + (png_size_t)((row_info->width - 1) >> 2);
            dp = row + (png_size_t)((final_width - 1) >> 2);
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (transformations & PNG_PACKSWAP)
            {
               sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
               dshift = (png_size_t)(((final_width + 3) & 3) << 1);
               s_start = 6;
               s_end = 0;
               s_inc = -2;
            }
            else
#endif
            {
               sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
               dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
               s_start = 0;
               s_end = 6;
               s_inc = 2;
            }

            for (i = row_info->width; i; i--)
            {
               png_byte v;
               int j;

               v = (png_byte)((*sp >> sshift) & 0x3);
               for (j = 0; j < png_pass_inc[pass]; j++)
               {
                  *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
                  *dp |= (png_byte)(v << dshift);
                  if (dshift == s_end)
                  {
                     dshift = s_start;
                     dp--;
                  }
                  else
                     dshift += s_inc;
               }
               if (sshift == s_end)
               {
                  sshift = s_start;
                  sp--;
               }
               else
                  sshift += s_inc;
            }
            break;
         }
         case 4:
         {
            png_bytep sp, dp;
            int sshift, dshift;
            int s_start, s_end, s_inc;
            png_uint_32 i;

            sp = row + (png_size_t)((row_info->width - 1) >> 1);
            dp = row + (png_size_t)((final_width - 1) >> 1);
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (transformations & PNG_PACKSWAP)
            {
               sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
               dshift = (png_size_t)(((final_width + 1) & 1) << 2);
               s_start = 4;
               s_end = 0;
               s_inc = -4;
            }
            else
#endif
            {
               sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
               dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
               s_start = 0;
               s_end = 4;
               s_inc = 4;
            }

            for (i = row_info->width; i; i--)
            {
               png_byte v;
               int j;

               v = (png_byte)((*sp >> sshift) & 0xf);
               for (j = 0; j < png_pass_inc[pass]; j++)
               {
                  *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
                  *dp |= (png_byte)(v << dshift);
                  if (dshift == s_end)
                  {
                     dshift = s_start;
                     dp--;
                  }
                  else
                     dshift += s_inc;
               }
               if (sshift == s_end)
               {
                  sshift = s_start;
                  sp--;
               }
               else
                  sshift += s_inc;
            }
            break;
         }
         default: // This is the place where the routine is modified
         {
            __int64 const4 = 0x0000000000FFFFFF;
            // __int64 const5 = 0x000000FFFFFF0000;  // unused...
            __int64 const6 = 0x00000000000000FF;
            png_bytep sptr, dp;
            png_uint_32 i;
            png_size_t pixel_bytes;
            int width = row_info->width;

            pixel_bytes = (row_info->pixel_depth >> 3);

            sptr = row + (width - 1) * pixel_bytes;
            dp = row + (final_width - 1) * pixel_bytes;

            // New code by Nirav Chhatrapati - Intel Corporation
            // sign fix by GRR
            // NOTE: there is NO MMX code for 48-bit and 64-bit images

            // use MMX routine if machine supports it
#if !defined(PNG_1_0_X)
            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
                /* && mmx_supported */ )
#else
            if (mmx_supported)
#endif
            {
               if (pixel_bytes == 3)
               {
                  if (((pass == 0) || (pass == 1)) && width)
                  {
                     _asm
                     {
                        mov esi, sptr
                        mov edi, dp
                        mov ecx, width
                        sub edi, 21   // (png_pass_inc[pass] - 1)*pixel_bytes
loop_pass0:
                        movd mm0, [esi]      ; X X X X X v2 v1 v0
                        pand mm0, const4     ; 0 0 0 0 0 v2 v1 v0
                        movq mm1, mm0        ; 0 0 0 0 0 v2 v1 v0
                        psllq mm0, 16        ; 0 0 0 v2 v1 v0 0 0
                        movq mm2, mm0        ; 0 0 0 v2 v1 v0 0 0
                        psllq mm0, 24        ; v2 v1 v0 0 0 0 0 0
                        psrlq mm1, 8         ; 0 0 0 0 0 0 v2 v1
                        por mm0, mm2         ; v2 v1 v0 v2 v1 v0 0 0
                        por mm0, mm1         ; v2 v1 v0 v2 v1 v0 v2 v1
                        movq mm3, mm0        ; v2 v1 v0 v2 v1 v0 v2 v1
                        psllq mm0, 16        ; v0 v2 v1 v0 v2 v1 0 0
                        movq mm4, mm3        ; v2 v1 v0 v2 v1 v0 v2 v1
                        punpckhdq mm3, mm0   ; v0 v2 v1 v0 v2 v1 v0 v2
                        movq [edi+16], mm4
                        psrlq mm0, 32        ; 0 0 0 0 v0 v2 v1 v0
                        movq [edi+8], mm3
                        punpckldq mm0, mm4   ; v1 v0 v2 v1 v0 v2 v1 v0
                        sub esi, 3
                        movq [edi], mm0
                        sub edi, 24
                        //sub esi, 3
                        dec ecx
                        jnz loop_pass0
                        EMMS
                     }
                  }
                  else if (((pass == 2) || (pass == 3)) && width)
                  {
                     _asm
                     {
                        mov esi, sptr
                        mov edi, dp
                        mov ecx, width
                        sub edi, 9    // (png_pass_inc[pass] - 1)*pixel_bytes
loop_pass2:
                        movd mm0, [esi]      ; X X X X X v2 v1 v0
                        pand mm0, const4     ; 0 0 0 0 0 v2 v1 v0
                        movq mm1, mm0        ; 0 0 0 0 0 v2 v1 v0
                        psllq mm0, 16        ; 0 0 0 v2 v1 v0 0 0
                        movq mm2, mm0        ; 0 0 0 v2 v1 v0 0 0
                        psllq mm0, 24        ; v2 v1 v0 0 0 0 0 0
                        psrlq mm1, 8         ; 0 0 0 0 0 0 v2 v1
                        por mm0, mm2         ; v2 v1 v0 v2 v1 v0 0 0
                        por mm0, mm1         ; v2 v1 v0 v2 v1 v0 v2 v1
                        movq [edi+4], mm0    ; move to memory
                        psrlq mm0, 16        ; 0 0 v2 v1 v0 v2 v1 v0
                        movd [edi], mm0      ; move to memory
                        sub esi, 3
                        sub edi, 12
                        dec ecx
                        jnz loop_pass2
                        EMMS
                     }
                  }
                  else if (width) /* && ((pass == 4) || (pass == 5)) */
                  {
                     int width_mmx = ((width >> 1) << 1) - 8;

                     if (width_mmx < 0)
                        width_mmx = 0;
                     width -= width_mmx;   // 8 or 9 pix, 24 or 27 bytes
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub esi, 3
                           sub edi, 9
loop_pass4:
                           movq mm0, [esi]      ; X X v2 v1 v0 v5 v4 v3
                           movq mm7, mm0        ; X X v2 v1 v0 v5 v4 v3
                           movq mm6, mm0        ; X X v2 v1 v0 v5 v4 v3
                           psllq mm0, 24        ; v1 v0 v5 v4 v3 0 0 0
                           pand mm7, const4     ; 0 0 0 0 0 v5 v4 v3
                           psrlq mm6, 24        ; 0 0 0 X X v2 v1 v0
                           por mm0, mm7         ; v1 v0 v5 v4 v3 v5 v4 v3
                           movq mm5, mm6        ; 0 0 0 X X v2 v1 v0
                           psllq mm6, 8         ; 0 0 X X v2 v1 v0 0
                           movq [edi], mm0      ; move quad to memory
                           psrlq mm5, 16        ; 0 0 0 0 0 X X v2
                           pand mm5, const6     ; 0 0 0 0 0 0 0 v2
                           por mm6, mm5         ; 0 0 X X v2 v1 v0 v2
                           movd [edi+8], mm6    ; move double to memory
                           sub esi, 6
                           sub edi, 12
                           sub ecx, 2
                           jnz loop_pass4
                           EMMS
                        }
                     }

                     sptr -= width_mmx*3;
                     dp -= width_mmx*6;
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;

                        png_memcpy(v, sptr, 3);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           png_memcpy(dp, v, 3);
                           dp -= 3;
                        }
                        sptr -= 3;
                     }
                  }
               } /* end of pixel_bytes == 3 */
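
               /* The MMX branches above are hand-scheduled forms of the
                * operation performed by the cleanup loops: replicate each
                * source pixel png_pass_inc[pass] times, walking back to
                * front.  Only the register shuffling differs from pass to
                * pass.
                */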
               else if (pixel_bytes == 1)
               {
                  if (((pass == 0) || (pass == 1)) && width)
                  {
                     int width_mmx = ((width >> 2) << 2);

                     width -= width_mmx;
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub edi, 31
                           sub esi, 3
loop1_pass0:
                           movd mm0, [esi]      ; X X X X v0 v1 v2 v3
                           movq mm1, mm0        ; X X X X v0 v1 v2 v3
                           punpcklbw mm0, mm0   ; v0 v0 v1 v1 v2 v2 v3 v3
                           movq mm2, mm0        ; v0 v0 v1 v1 v2 v2 v3 v3
                           punpcklwd mm0, mm0   ; v2 v2 v2 v2 v3 v3 v3 v3
                           movq mm3, mm0        ; v2 v2 v2 v2 v3 v3 v3 v3
                           punpckldq mm0, mm0   ; v3 v3 v3 v3 v3 v3 v3 v3
                           punpckhdq mm3, mm3   ; v2 v2 v2 v2 v2 v2 v2 v2
                           movq [edi], mm0      ; move to memory v3
                           punpckhwd mm2, mm2   ; v0 v0 v0 v0 v1 v1 v1 v1
                           movq [edi+8], mm3    ; move to memory v2
                           movq mm4, mm2        ; v0 v0 v0 v0 v1 v1 v1 v1
                           punpckldq mm2, mm2   ; v1 v1 v1 v1 v1 v1 v1 v1
                           punpckhdq mm4, mm4   ; v0 v0 v0 v0 v0 v0 v0 v0
                           movq [edi+16], mm2   ; move to memory v1
                           movq [edi+24], mm4   ; move to memory v0
                           sub esi, 4
                           sub edi, 32
                           sub ecx, 4
                           jnz loop1_pass0
                           EMMS
                        }
                     }

                     sptr -= width_mmx;
                     dp -= width_mmx*8;
                     for (i = width; i; i--)
                     {
                        int j;

                        /* I simplified this part in version 1.0.4e
                         * here and in several other instances where
                         * pixel_bytes == 1 -- GR-P
                         *
                         * Original code:
                         *
                         * png_byte v[8];
                         * png_memcpy(v, sptr, pixel_bytes);
                         * for (j = 0; j < png_pass_inc[pass]; j++)
                         * {
                         *    png_memcpy(dp, v, pixel_bytes);
                         *    dp -= pixel_bytes;
                         * }
                         * sptr -= pixel_bytes;
                         *
                         * Replacement code is in the next three lines:
                         */
                        for (j = 0; j < png_pass_inc[pass]; j++)
                           *dp-- = *sptr;
                        sptr--;
                     }
                  }
                  else if (((pass == 2) || (pass == 3)) && width)
                  {
                     int width_mmx = ((width >> 2) << 2);

                     width -= width_mmx;
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub edi, 15
                           sub esi, 3
loop1_pass2:
                           movd mm0, [esi]      ; X X X X v0 v1 v2 v3
                           punpcklbw mm0, mm0   ; v0 v0 v1 v1 v2 v2 v3 v3
                           movq mm1, mm0        ; v0 v0 v1 v1 v2 v2 v3 v3
                           punpcklwd mm0, mm0   ; v2 v2 v2 v2 v3 v3 v3 v3
                           punpckhwd mm1, mm1   ; v0 v0 v0 v0 v1 v1 v1 v1
                           movq [edi], mm0      ; move to memory v2 and v3
                           sub esi, 4
                           movq [edi+8], mm1    ; move to memory v1 and v0
                           sub edi, 16
                           sub ecx, 4
                           jnz loop1_pass2
                           EMMS
                        }
                     }

                     sptr -= width_mmx;
                     dp -= width_mmx*4;
                     for (i = width; i; i--)
                     {
                        int j;

                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           *dp-- = *sptr;
                        }
                        sptr--;
                     }
                  }
                  else if (width) /* && ((pass == 4) || (pass == 5)) */
                  {
                     int width_mmx = ((width >> 3) << 3);

                     width -= width_mmx;
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub edi, 15
                           sub esi, 7
loop1_pass4:
                           movq mm0, [esi]      ; v0 v1 v2 v3 v4 v5 v6 v7
                           movq mm1, mm0        ; v0 v1 v2 v3 v4 v5 v6 v7
                           punpcklbw mm0, mm0   ; v4 v4 v5 v5 v6 v6 v7 v7
                           //movq mm1, mm0      ; v0 v0 v1 v1 v2 v2 v3 v3
                           punpckhbw mm1, mm1   ; v0 v0 v1 v1 v2 v2 v3 v3
                           movq [edi+8], mm1    ; move to memory v0 v1 v2 and v3
                           sub esi, 8
                           movq [edi], mm0      ; move to memory v4 v5 v6 and v7
                           //sub esi, 4
                           sub edi, 16
                           sub ecx, 8
                           jnz loop1_pass4
                           EMMS
                        }
                     }

                     sptr -= width_mmx;
                     dp -= width_mmx*2;
                     for (i = width; i; i--)
                     {
                        int j;

                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           *dp-- = *sptr;
                        }
                        sptr--;
                     }
                  }
               } /* end of pixel_bytes == 1 */
               else if (pixel_bytes == 2)
               {
                  if (((pass == 0) || (pass == 1)) && width)
                  {
                     int width_mmx = ((width >> 1) << 1);

                     width -= width_mmx;
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub esi, 2
                           sub edi, 30
loop2_pass0:
                           movd mm0, [esi]      ; X X X X v1 v0 v3 v2
                           punpcklwd mm0, mm0   ; v1 v0 v1 v0 v3 v2 v3 v2
                           movq mm1, mm0        ; v1 v0 v1 v0 v3 v2 v3 v2
                           punpckldq mm0, mm0   ; v3 v2 v3 v2 v3 v2 v3 v2
                           punpckhdq mm1, mm1   ; v1 v0 v1 v0 v1 v0 v1 v0
                           movq [edi], mm0
                           movq [edi+8], mm0
                           movq [edi+16], mm1
                           movq [edi+24], mm1
                           sub esi, 4
                           sub edi, 32
                           sub ecx, 2
                           jnz loop2_pass0
                           EMMS
                        }
                     }

                     sptr -= (width_mmx*2 - 2);   // sign fixed
                     dp -= (width_mmx*16 - 2);    // sign fixed
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;

                        sptr -= 2;
                        png_memcpy(v, sptr, 2);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           dp -= 2;
                           png_memcpy(dp, v, 2);
                        }
                     }
                  }
                  else if (((pass == 2) || (pass == 3)) && width)
                  {
                     int width_mmx = ((width >> 1) << 1);

                     width -= width_mmx;
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub esi, 2
                           sub edi, 14
loop2_pass2:
                           movd mm0, [esi]      ; X X X X v1 v0 v3 v2
                           punpcklwd mm0, mm0   ; v1 v0 v1 v0 v3 v2 v3 v2
                           movq mm1, mm0        ; v1 v0 v1 v0 v3 v2 v3 v2
                           punpckldq mm0, mm0   ; v3 v2 v3 v2 v3 v2 v3 v2
                           punpckhdq mm1, mm1   ; v1 v0 v1 v0 v1 v0 v1 v0
                           movq [edi], mm0
                           sub esi, 4
                           movq [edi+8], mm1
                           //sub esi, 4
                           sub edi, 16
                           sub ecx, 2
                           jnz loop2_pass2
                           EMMS
                        }
                     }

                     sptr -= (width_mmx*2 - 2);   // sign fixed
                     dp -= (width_mmx*8 - 2);     // sign fixed
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;

                        sptr -= 2;
                        png_memcpy(v, sptr, 2);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           dp -= 2;
                           png_memcpy(dp, v, 2);
                        }
                     }
                  }
                  else if (width)  // pass == 4 or 5
                  {
                     int width_mmx = ((width >> 1) << 1);

                     width -= width_mmx;
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub esi, 2
                           sub edi, 6
loop2_pass4:
                           movd mm0, [esi]      ; X X X X v1 v0 v3 v2
                           punpcklwd mm0, mm0   ; v1 v0 v1 v0 v3 v2 v3 v2
                           sub esi, 4
                           movq [edi], mm0
                           sub edi, 8
                           sub ecx, 2
                           jnz loop2_pass4
                           EMMS
                        }
                     }

                     sptr -= (width_mmx*2 - 2);   // sign fixed
                     dp -= (width_mmx*4 - 2);     // sign fixed
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;

                        sptr -= 2;
                        png_memcpy(v, sptr, 2);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           dp -= 2;
                           png_memcpy(dp, v, 2);
                        }
                     }
                  }
               } /* end of pixel_bytes == 2 */
               else if (pixel_bytes == 4)
               {
                  if (((pass == 0) || (pass == 1)) && width)
                  {
                     int width_mmx = ((width >> 1) << 1);

                     width -= width_mmx;
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub esi, 4
                           sub edi, 60
loop4_pass0:
                           movq mm0, [esi]      ; v3 v2 v1 v0 v7 v6 v5 v4
                           movq mm1, mm0        ; v3 v2 v1 v0 v7 v6 v5 v4
                           punpckldq mm0, mm0   ; v7 v6 v5 v4 v7 v6 v5 v4
                           punpckhdq mm1, mm1   ; v3 v2 v1 v0 v3 v2 v1 v0
                           movq [edi], mm0
                           movq [edi+8], mm0
                           movq [edi+16], mm0
                           movq [edi+24], mm0
                           movq [edi+32], mm1
                           movq [edi+40], mm1
                           movq [edi+48], mm1
                           sub esi, 8
                           movq [edi+56], mm1
                           sub edi, 64
                           sub ecx, 2
                           jnz loop4_pass0
                           EMMS
                        }
                     }

                     sptr -= (width_mmx*4 - 4);   // sign fixed
                     dp -= (width_mmx*32 - 4);    // sign fixed
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;

                        sptr -= 4;
                        png_memcpy(v, sptr, 4);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           dp -= 4;
                           png_memcpy(dp, v, 4);
                        }
                     }
                  }
                  else if (((pass == 2) || (pass == 3)) && width)
                  {
                     int width_mmx = ((width >> 1) << 1);

                     width -= width_mmx;
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub esi, 4
                           sub edi, 28
loop4_pass2:
                           movq mm0, [esi]      ; v3 v2 v1 v0 v7 v6 v5 v4
                           movq mm1, mm0        ; v3 v2 v1 v0 v7 v6 v5 v4
                           punpckldq mm0, mm0   ; v7 v6 v5 v4 v7 v6 v5 v4
                           punpckhdq mm1, mm1   ; v3 v2 v1 v0 v3 v2 v1 v0
                           movq [edi], mm0
                           movq [edi+8], mm0
                           movq [edi+16], mm1
                           movq [edi+24], mm1
                           sub esi, 8
                           sub edi, 32
                           sub ecx, 2
                           jnz loop4_pass2
                           EMMS
                        }
                     }

                     sptr -= (width_mmx*4 - 4);   // sign fixed
                     dp -= (width_mmx*16 - 4);    // sign fixed
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;

                        sptr -= 4;
                        png_memcpy(v, sptr, 4);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           dp -= 4;
                           png_memcpy(dp, v, 4);
                        }
                     }
                  }
                  else if (width)  // pass == 4 or 5
                  {
                     int width_mmx = ((width >> 1) << 1);

                     width -= width_mmx;
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub esi, 4
                           sub edi, 12
loop4_pass4:
                           movq mm0, [esi]      ; v3 v2 v1 v0 v7 v6 v5 v4
                           movq mm1, mm0        ; v3 v2 v1 v0 v7 v6 v5 v4
                           punpckldq mm0, mm0   ; v7 v6 v5 v4 v7 v6 v5 v4
                           punpckhdq mm1, mm1   ; v3 v2 v1 v0 v3 v2 v1 v0
                           movq [edi], mm0
                           sub esi, 8
                           movq [edi+8], mm1
                           sub edi, 16
                           sub ecx, 2
                           jnz loop4_pass4
                           EMMS
                        }
                     }

                     sptr -= (width_mmx*4 - 4);   // sign fixed
                     dp -= (width_mmx*8 - 4);     // sign fixed
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;

                        sptr -= 4;
                        png_memcpy(v, sptr, 4);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           dp -= 4;
                           png_memcpy(dp, v, 4);
                        }
                     }
                  }
               } /* end of pixel_bytes == 4 */
          else if (pixel_bytes == 6)
          {
            for (i = width; i; i--)
            {
              png_byte v[8];
              int j;
              png_memcpy(v, sptr, 6);
              for (j = 0; j < png_pass_inc[pass]; j++)
              {
                png_memcpy(dp, v, 6);
                dp -= 6;
              }
              sptr -= 6;
            }
          } /* end of pixel_bytes == 6 */
          else
          {
            for (i = width; i; i--)
            {
              png_byte v[8];
              int j;
              png_memcpy(v, sptr, pixel_bytes);
              for (j = 0; j < png_pass_inc[pass]; j++)
              {
                png_memcpy(dp, v, pixel_bytes);
                dp -= pixel_bytes;
              }
              sptr -= pixel_bytes;
            }
          }
        } /* end of mmx_supported */
        else /* MMX not supported: use modified C code - takes advantage
              * of inlining of memcpy for a constant */
        {
          if (pixel_bytes == 1)
          {
            for (i = width; i; i--)
            {
              int j;
              for (j = 0; j < png_pass_inc[pass]; j++)
                *dp-- = *sptr;
              sptr--;
            }
          }
          else if (pixel_bytes == 3)
          {
            for (i = width; i; i--)
            {
              png_byte v[8];
              int j;
              png_memcpy(v, sptr, pixel_bytes);
              for (j = 0; j < png_pass_inc[pass]; j++)
              {
                png_memcpy(dp, v, pixel_bytes);
                dp -= pixel_bytes;
              }
              sptr -= pixel_bytes;
            }
          }
          else if (pixel_bytes == 2)
          {
            for (i = width; i; i--)
            {
              png_byte v[8];
              int j;
              png_memcpy(v, sptr, pixel_bytes);
              for (j = 0; j < png_pass_inc[pass]; j++)
              {
                png_memcpy(dp, v, pixel_bytes);
                dp -= pixel_bytes;
              }
              sptr -= pixel_bytes;
            }
          }
          else if (pixel_bytes == 4)
          {
            for (i = width; i; i--)
            {
              png_byte v[8];
              int j;
              png_memcpy(v, sptr, pixel_bytes);
              for (j = 0; j < png_pass_inc[pass]; j++)
              {
                png_memcpy(dp, v, pixel_bytes);
                dp -= pixel_bytes;
              }
              sptr -= pixel_bytes;
            }
          }
          else if (pixel_bytes == 6)
          {
            for (i = width; i; i--)
            {
              png_byte v[8];
              int j;
              png_memcpy(v, sptr, pixel_bytes);
              for (j = 0; j < png_pass_inc[pass]; j++)
              {
                png_memcpy(dp, v, pixel_bytes);
                dp -= pixel_bytes;
              }
              sptr -= pixel_bytes;
            }
          }
          else
          {
            for (i = width; i; i--)
            {
              png_byte v[8];
              int j;
              png_memcpy(v, sptr, pixel_bytes);
              for (j = 0; j < png_pass_inc[pass]; j++)
              {
                png_memcpy(dp, v, pixel_bytes);
                dp -= pixel_bytes;
              }
              sptr -= pixel_bytes;
            }
          }
        } /* end of MMX not supported */
        break;
      }
    } /* end switch (row_info->pixel_depth) */
    row_info->width = final_width;
    row_info->rowbytes = PNG_ROWBYTES(row_info->pixel_depth, final_width);
  }
}
#endif /* PNG_READ_INTERLACING_SUPPORTED */

// These variables are utilized in the functions below.  They are declared
// globally here to ensure alignment on 8-byte boundaries.
union uAll {
  __int64 use;
  double align;
} LBCarryMask = {0x0101010101010101},
  HBClearMask = {0x7f7f7f7f7f7f7f7f},
  ActiveMask, ActiveMask2, ActiveMaskEnd, ShiftBpp, ShiftRem;
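
// For reference, a minimal scalar sketch of the Average defilter that
// png_read_filter_row_mmx_avg() below accelerates:
//   Raw(x) = Avg(x) + floor((Raw(x-bpp) + Prior(x)) / 2),
// with Raw(x-bpp) taken as 0 for the first bpp bytes.  Illustrative only
// (the helper name is hypothetical and the block is compiled out):
#if 0
static void
png_read_filter_row_c_avg(png_row_infop row_info, png_bytep row,
  png_bytep prev_row)
{
  int bpp = (row_info->pixel_depth + 7) >> 3;
  png_uint_32 i;
  for (i = 0; i < row_info->rowbytes; i++)
  {
    int a = (i >= (png_uint_32)bpp) ? row[i - bpp] : 0; // Raw(x-bpp)
    int b = prev_row[i];                                // Prior(x)
    row[i] = (png_byte)(row[i] + ((a + b) >> 1));       // truncating average
  }
}
#endif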
// Optimized code for PNG Average filter decoder
void /* PRIVATE */
png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
  png_bytep prev_row)
{
  int bpp;
  png_uint_32 FullLength;
  png_uint_32 MMXLength;
  int diff;
  bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
  FullLength = row_info->rowbytes; // # of bytes to filter
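  // The asm block below first decodes the leading bytes, then computes,
  // in C terms (a sketch, assuming 32-bit pointers and 8-byte MMX chunks):
  //   diff      = (((int)row + bpp + 15) & ~7) - (int)row;
  //   MMXLength = FullLength - ((FullLength - diff) & 7);
  // Bytes [0, diff) are decoded one at a time, [diff, MMXLength) with
  // 8-byte MMX operations, and [MMXLength, FullLength) in the clean-up
  // block after the switch.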
  _asm {
    // Init address pointers and offset
    mov edi, row // edi ==> Avg(x)
    xor ebx, ebx // ebx ==> x
    mov edx, edi
    mov esi, prev_row // esi ==> Prior(x)
    sub edx, bpp // edx ==> Raw(x-bpp)
    xor eax, eax
    // Compute the Raw value for the first bpp bytes
    // Raw(x) = Avg(x) + (Prior(x)/2)
  davgrlp:
    mov al, [esi + ebx] // Load al with Prior(x)
    inc ebx
    shr al, 1 // divide by 2
    add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
    cmp ebx, bpp
    mov [edi+ebx-1], al // Write back Raw(x);
    // mov does not affect flags; -1 to offset inc ebx
    jb davgrlp
    // get # of bytes to alignment
    mov diff, edi // take start of row
    add diff, ebx // add bpp
    add diff, 0xf // add 7 + 8 to incr past alignment boundary
    and diff, 0xfffffff8 // mask to alignment boundary
    sub diff, edi // subtract from start ==> value ebx at alignment
    jz davggo
    // fix alignment
    // Compute the Raw value for the bytes up to the alignment boundary
    // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
    xor ecx, ecx
  davglp1:
    xor eax, eax
    mov cl, [esi + ebx] // load cl with Prior(x)
    mov al, [edx + ebx] // load al with Raw(x-bpp)
    add ax, cx
    inc ebx
    shr ax, 1 // divide by 2
    add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
    cmp ebx, diff // Check if at alignment boundary
    mov [edi+ebx-1], al // Write back Raw(x);
    // mov does not affect flags; -1 to offset inc ebx
    jb davglp1 // Repeat until at alignment boundary
  davggo:
    mov eax, FullLength
    mov ecx, eax
    sub eax, ebx // subtract alignment fix
    and eax, 0x00000007 // calc bytes over mult of 8
    sub ecx, eax // drop over bytes from original length
    mov MMXLength, ecx
  } // end _asm block
  // Now do the math for the rest of the row
  switch ( bpp )
  {
    case 3:
    {
      ActiveMask.use = 0x0000000000ffffff;
      ShiftBpp.use = 24; // == 3 * 8
      ShiftRem.use = 40; // == 64 - 24
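      // Note on "active groups": with bpp == 3, Raw(x-bpp) for bytes 3-5
      // of an 8-byte block depends on bytes 0-2 of the same block, which
      // have only just been computed.  The loop below therefore adds
      // (Raw(x-bpp) + LBCarry)/2 in three dependent steps, masking in the
      // byte groups 0-2, 3-5 and 6-7 as each becomes valid.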
      _asm {
        // Re-init address pointers and offset
        movq mm7, ActiveMask
        mov ebx, diff // ebx ==> x = offset to alignment boundary
        movq mm5, LBCarryMask
        mov edi, row // edi ==> Avg(x)
        movq mm4, HBClearMask
        mov esi, prev_row // esi ==> Prior(x)
        // PRIME the pump (load the first Raw(x-bpp) data set)
        movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
        // (we correct position in loop below)
      davg3lp:
        movq mm0, [edi + ebx] // Load mm0 with Avg(x)
        // Add (Prev_row/2) to Average
        movq mm3, mm5
        psrlq mm2, ShiftRem // Correct position Raw(x-bpp) data
        movq mm1, [esi + ebx] // Load mm1 with Prior(x)
        movq mm6, mm7
        pand mm3, mm1 // get lsb for each prev_row byte
        psrlq mm1, 1 // divide prev_row bytes by 2
        pand mm1, mm4 // clear invalid bit 7 of each byte
        paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
        // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
        movq mm1, mm3 // now use mm1 for getting LBCarrys
        pand mm1, mm2 // get LBCarrys for each byte where both
        // lsb's were == 1 (Only valid for active group)
        psrlq mm2, 1 // divide raw bytes by 2
        pand mm2, mm4 // clear invalid bit 7 of each byte
        paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
        pand mm2, mm6 // Leave only Active Group 1 bytes to add to Avg
        paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
        // byte
        // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
        psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 3-5
        movq mm2, mm0 // mov updated Raws to mm2
        psllq mm2, ShiftBpp // shift data to position correctly
        movq mm1, mm3 // now use mm1 for getting LBCarrys
        pand mm1, mm2 // get LBCarrys for each byte where both
        // lsb's were == 1 (Only valid for active group)
        psrlq mm2, 1 // divide raw bytes by 2
        pand mm2, mm4 // clear invalid bit 7 of each byte
        paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
        pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
        paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
        // byte
        // Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry
        psllq mm6, ShiftBpp // shift the mm6 mask to cover the last two
        // bytes
        movq mm2, mm0 // mov updated Raws to mm2
        psllq mm2, ShiftBpp // shift data to position correctly
        // Data only needs to be shifted once here to
        // get the correct x-bpp offset.
        movq mm1, mm3 // now use mm1 for getting LBCarrys
        pand mm1, mm2 // get LBCarrys for each byte where both
        // lsb's were == 1 (Only valid for active group)
        psrlq mm2, 1 // divide raw bytes by 2
        pand mm2, mm4 // clear invalid bit 7 of each byte
        paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
        pand mm2, mm6 // Leave only Active Group 3 bytes to add to Avg
        add ebx, 8
        paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
        // byte
        // Now ready to write back to memory
        movq [edi + ebx - 8], mm0
        // Move updated Raw(x) to use as Raw(x-bpp) for next loop
        cmp ebx, MMXLength
        movq mm2, mm0 // mov updated Raw(x) to mm2
        jb davg3lp
      } // end _asm block
    }
    break;
    case 6:
    case 4:
    case 7:
    case 5:
    {
      ActiveMask.use = 0xffffffffffffffff; // use shift below to clear
      // appropriate inactive bytes
      ShiftBpp.use = bpp << 3;
      ShiftRem.use = 64 - ShiftBpp.use;
      _asm {
        movq mm4, HBClearMask
        // Re-init address pointers and offset
        mov ebx, diff // ebx ==> x = offset to alignment boundary
        // Load ActiveMask and clear all bytes except for 1st active group
        movq mm7, ActiveMask
        mov edi, row // edi ==> Avg(x)
        psrlq mm7, ShiftRem
        mov esi, prev_row // esi ==> Prior(x)
        movq mm6, mm7
        movq mm5, LBCarryMask
        psllq mm6, ShiftBpp // Create mask for 2nd active group
        // PRIME the pump (load the first Raw(x-bpp) data set)
        movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
        // (we correct position in loop below)
      davg4lp:
        movq mm0, [edi + ebx]
        psrlq mm2, ShiftRem // shift data to position correctly
        movq mm1, [esi + ebx]
        // Add (Prev_row/2) to Average
        movq mm3, mm5
        pand mm3, mm1 // get lsb for each prev_row byte
        psrlq mm1, 1 // divide prev_row bytes by 2
        pand mm1, mm4 // clear invalid bit 7 of each byte
        paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
        // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
        movq mm1, mm3 // now use mm1 for getting LBCarrys
        pand mm1, mm2 // get LBCarrys for each byte where both
        // lsb's were == 1 (Only valid for active group)
        psrlq mm2, 1 // divide raw bytes by 2
        pand mm2, mm4 // clear invalid bit 7 of each byte
        paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
        pand mm2, mm7 // Leave only Active Group 1 bytes to add to Avg
        paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
        // byte
        // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
        movq mm2, mm0 // mov updated Raws to mm2
        psllq mm2, ShiftBpp // shift data to position correctly
        add ebx, 8
        movq mm1, mm3 // now use mm1 for getting LBCarrys
        pand mm1, mm2 // get LBCarrys for each byte where both
        // lsb's were == 1 (Only valid for active group)
        psrlq mm2, 1 // divide raw bytes by 2
        pand mm2, mm4 // clear invalid bit 7 of each byte
        paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
        pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
        paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
        // byte
        cmp ebx, MMXLength
        // Now ready to write back to memory
        movq [edi + ebx - 8], mm0
        // Prep Raw(x-bpp) for next loop
        movq mm2, mm0 // mov updated Raws to mm2
        jb davg4lp
      } // end _asm block
    }
    break;
    case 2:
    {
      ActiveMask.use = 0x000000000000ffff;
      ShiftBpp.use = 16; // == 2 * 8 [BUGFIX]
      ShiftRem.use = 48; // == 64 - 16 [BUGFIX]
      _asm {
        // Load ActiveMask
        movq mm7, ActiveMask
        // Re-init address pointers and offset
        mov ebx, diff // ebx ==> x = offset to alignment boundary
        movq mm5, LBCarryMask
        mov edi, row // edi ==> Avg(x)
        movq mm4, HBClearMask
        mov esi, prev_row // esi ==> Prior(x)
        // PRIME the pump (load the first Raw(x-bpp) data set)
        movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
        // (we correct position in loop below)
      davg2lp:
        movq mm0, [edi + ebx]
        psrlq mm2, ShiftRem // shift data to position correctly [BUGFIX]
        movq mm1, [esi + ebx]
        // Add (Prev_row/2) to Average
        movq mm3, mm5
        pand mm3, mm1 // get lsb for each prev_row byte
        psrlq mm1, 1 // divide prev_row bytes by 2
        pand mm1, mm4 // clear invalid bit 7 of each byte
        movq mm6, mm7
        paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
        // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
        movq mm1, mm3 // now use mm1 for getting LBCarrys
        pand mm1, mm2 // get LBCarrys for each byte where both
        // lsb's were == 1 (Only valid for active group)
        psrlq mm2, 1 // divide raw bytes by 2
        pand mm2, mm4 // clear invalid bit 7 of each byte
        paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
        pand mm2, mm6 // Leave only Active Group 1 bytes to add to Avg
        paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
        // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
        psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 2 & 3
        movq mm2, mm0 // mov updated Raws to mm2
        psllq mm2, ShiftBpp // shift data to position correctly
        movq mm1, mm3 // now use mm1 for getting LBCarrys
        pand mm1, mm2 // get LBCarrys for each byte where both
        // lsb's were == 1 (Only valid for active group)
        psrlq mm2, 1 // divide raw bytes by 2
        pand mm2, mm4 // clear invalid bit 7 of each byte
        paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
        pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
        paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
        // Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry
        psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 4 & 5
        movq mm2, mm0 // mov updated Raws to mm2
        psllq mm2, ShiftBpp // shift data to position correctly
        // Data only needs to be shifted once here to
        // get the correct x-bpp offset.
        movq mm1, mm3 // now use mm1 for getting LBCarrys
        pand mm1, mm2 // get LBCarrys for each byte where both
        // lsb's were == 1 (Only valid for active group)
        psrlq mm2, 1 // divide raw bytes by 2
        pand mm2, mm4 // clear invalid bit 7 of each byte
        paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
        pand mm2, mm6 // Leave only Active Group 3 bytes to add to Avg
        paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
        // Add 4th active group (Raw(x-bpp)/2) to Average with LBCarry
        psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 6 & 7
        movq mm2, mm0 // mov updated Raws to mm2
        psllq mm2, ShiftBpp // shift data to position correctly
        // Data only needs to be shifted once here to
        // get the correct x-bpp offset.
        add ebx, 8
        movq mm1, mm3 // now use mm1 for getting LBCarrys
        pand mm1, mm2 // get LBCarrys for each byte where both
        // lsb's were == 1 (Only valid for active group)
        psrlq mm2, 1 // divide raw bytes by 2
        pand mm2, mm4 // clear invalid bit 7 of each byte
        paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
        pand mm2, mm6 // Leave only Active Group 4 bytes to add to Avg
        paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
        cmp ebx, MMXLength
        // Now ready to write back to memory
        movq [edi + ebx - 8], mm0
        // Prep Raw(x-bpp) for next loop
        movq mm2, mm0 // mov updated Raws to mm2
        jb davg2lp
      } // end _asm block
    }
    break;
    case 1: // bpp == 1
    {
      _asm {
        // Re-init address pointers and offset
        mov ebx, diff // ebx ==> x = offset to alignment boundary
        mov edi, row // edi ==> Avg(x)
        cmp ebx, FullLength // Test if offset at end of array
        jnb davg1end
        // Do Avg decode for remaining bytes
        mov esi, prev_row // esi ==> Prior(x)
        mov edx, edi
        xor ecx, ecx // zero ecx before using cl & cx in loop below
        sub edx, bpp // edx ==> Raw(x-bpp)
      davg1lp:
        // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
        xor eax, eax
        mov cl, [esi + ebx] // load cl with Prior(x)
        mov al, [edx + ebx] // load al with Raw(x-bpp)
        add ax, cx
        inc ebx
        shr ax, 1 // divide by 2
        add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
        cmp ebx, FullLength // Check if at end of array
        mov [edi+ebx-1], al // Write back Raw(x);
        // mov does not affect flags; -1 to offset inc ebx
        jb davg1lp
      davg1end:
      } // end _asm block
    }
    return;
    case 8: // bpp == 8
    {
      _asm {
        // Re-init address pointers and offset
        mov ebx, diff // ebx ==> x = offset to alignment boundary
        movq mm5, LBCarryMask
        mov edi, row // edi ==> Avg(x)
        movq mm4, HBClearMask
        mov esi, prev_row // esi ==> Prior(x)
        // PRIME the pump (load the first Raw(x-bpp) data set)
        movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
        // (NO NEED to correct position in loop below)
      davg8lp:
        movq mm0, [edi + ebx]
        movq mm3, mm5
        movq mm1, [esi + ebx]
        add ebx, 8
        pand mm3, mm1 // get lsb for each prev_row byte
        psrlq mm1, 1 // divide prev_row bytes by 2
        pand mm3, mm2 // get LBCarrys for each byte where both
        // lsb's were == 1
        psrlq mm2, 1 // divide raw bytes by 2
        pand mm1, mm4 // clear invalid bit 7 of each byte
        paddb mm0, mm3 // add LBCarrys to Avg for each byte
        pand mm2, mm4 // clear invalid bit 7 of each byte
        paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
        paddb mm0, mm2 // add (Raw/2) to Avg for each byte
        cmp ebx, MMXLength
        movq [edi + ebx - 8], mm0
        movq mm2, mm0 // reuse as Raw(x-bpp)
        jb davg8lp
      } // end _asm block
    }
    break;
    default: // bpp greater than 8
    {
      _asm {
        movq mm5, LBCarryMask
        // Re-init address pointers and offset
        mov ebx, diff // ebx ==> x = offset to alignment boundary
        mov edi, row // edi ==> Avg(x)
        movq mm4, HBClearMask
        mov edx, edi
        mov esi, prev_row // esi ==> Prior(x)
        sub edx, bpp // edx ==> Raw(x-bpp)
      davgAlp:
        movq mm0, [edi + ebx]
        movq mm3, mm5
        movq mm1, [esi + ebx]
        pand mm3, mm1 // get lsb for each prev_row byte
        movq mm2, [edx + ebx]
        psrlq mm1, 1 // divide prev_row bytes by 2
        pand mm3, mm2 // get LBCarrys for each byte where both
        // lsb's were == 1
        psrlq mm2, 1 // divide raw bytes by 2
        pand mm1, mm4 // clear invalid bit 7 of each byte
        paddb mm0, mm3 // add LBCarrys to Avg for each byte
        pand mm2, mm4 // clear invalid bit 7 of each byte
        paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
        add ebx, 8
        paddb mm0, mm2 // add (Raw/2) to Avg for each byte
        cmp ebx, MMXLength
        movq [edi + ebx - 8], mm0
        jb davgAlp
      } // end _asm block
    }
    break;
  } // end switch ( bpp )
  _asm {
    // MMX acceleration complete; now do clean-up
    // Check if any remaining bytes left to decode
    mov ebx, MMXLength // ebx ==> x = offset bytes remaining after MMX
    mov edi, row // edi ==> Avg(x)
    cmp ebx, FullLength // Test if offset at end of array
    jnb davgend
    // Do Avg decode for remaining bytes
    mov esi, prev_row // esi ==> Prior(x)
    mov edx, edi
    xor ecx, ecx // zero ecx before using cl & cx in loop below
    sub edx, bpp // edx ==> Raw(x-bpp)
  davglp2:
    // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
    xor eax, eax
    mov cl, [esi + ebx] // load cl with Prior(x)
    mov al, [edx + ebx] // load al with Raw(x-bpp)
    add ax, cx
    inc ebx
    shr ax, 1 // divide by 2
    add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
    cmp ebx, FullLength // Check if at end of array
    mov [edi+ebx-1], al // Write back Raw(x);
    // mov does not affect flags; -1 to offset inc ebx
    jb davglp2
  davgend:
    emms // End MMX instructions; prep for possible FP instrs.
  } // end _asm block
}
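
// For reference, a scalar sketch of the Paeth predictor from the PNG spec,
// which the MMX code below evaluates for several bytes in parallel
// (illustrative only; the helper name is hypothetical and compiled out):
#if 0
static png_byte
png_paeth_predictor(int a, int b, int c) // a=Raw(x-bpp), b=Prior(x), c=Prior(x-bpp)
{
  int p = a + b - c;                     // initial estimate
  int pa = p > a ? p - a : a - p;        // pa = abs(p - a) = abs(b - c)
  int pb = p > b ? p - b : b - p;        // pb = abs(p - b) = abs(a - c)
  int pc = p > c ? p - c : c - p;        // pc = abs(p - c)
  if (pa <= pb && pa <= pc)
    return (png_byte)a;
  return (png_byte)((pb <= pc) ? b : c);
}
#endif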
// Optimized code for PNG Paeth filter decoder
void /* PRIVATE */
png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
  png_bytep prev_row)
{
  png_uint_32 FullLength;
  png_uint_32 MMXLength;
  int bpp;
  int diff;
  int patemp, pbtemp, pctemp;
  bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
  FullLength = row_info->rowbytes; // # of bytes to filter
  _asm
  {
    xor ebx, ebx // ebx ==> x offset
    mov edi, row
    xor edx, edx // edx ==> x-bpp offset
    mov esi, prev_row
    xor eax, eax
    // Compute the Raw value for the first bpp bytes
    // Note: the formula works out to be always
    // Paeth(x) = Raw(x) + Prior(x) where x < bpp
    // (a = Raw(x-bpp) and c = Prior(x-bpp) are both 0 there, so the
    // predictor reduces to b = Prior(x))
  dpthrlp:
    mov al, [edi + ebx]
    add al, [esi + ebx]
    inc ebx
    cmp ebx, bpp
    mov [edi + ebx - 1], al
    jb dpthrlp
    // get # of bytes to alignment
    mov diff, edi // take start of row
    add diff, ebx // add bpp
    xor ecx, ecx
    add diff, 0xf // add 7 + 8 to incr past alignment boundary
    and diff, 0xfffffff8 // mask to alignment boundary
    sub diff, edi // subtract from start ==> value ebx at alignment
    jz dpthgo
    // fix alignment
  dpthlp1:
    xor eax, eax
    // pav = p - a = (a + b - c) - a = b - c
    mov al, [esi + ebx] // load Prior(x) into al
    mov cl, [esi + edx] // load Prior(x-bpp) into cl
    sub eax, ecx // subtract Prior(x-bpp)
    mov patemp, eax // Save pav for later use
    xor eax, eax
    // pbv = p - b = (a + b - c) - b = a - c
    mov al, [edi + edx] // load Raw(x-bpp) into al
    sub eax, ecx // subtract Prior(x-bpp)
    mov ecx, eax
    // pcv = p - c = (a + b - c) - c = (a - c) + (b - c) = pav + pbv
    add eax, patemp // pcv = pav + pbv
    // pc = abs(pcv)
    test eax, 0x80000000
    jz dpthpca
    neg eax // reverse sign of neg values
  dpthpca:
    mov pctemp, eax // save pc for later use
    // pb = abs(pbv)
    test ecx, 0x80000000
    jz dpthpba
    neg ecx // reverse sign of neg values
  dpthpba:
    mov pbtemp, ecx // save pb for later use
    // pa = abs(pav)
    mov eax, patemp
    test eax, 0x80000000
    jz dpthpaa
    neg eax // reverse sign of neg values
  dpthpaa:
    mov patemp, eax // save pa for later use
    // test if pa <= pb
    cmp eax, ecx
    jna dpthabb
    // pa > pb; now test if pb <= pc
    cmp ecx, pctemp
    jna dpthbbc
    // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
    mov cl, [esi + edx] // load Prior(x-bpp) into cl
    jmp dpthpaeth
  dpthbbc:
    // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
    mov cl, [esi + ebx] // load Prior(x) into cl
    jmp dpthpaeth
  dpthabb:
    // pa <= pb; now test if pa <= pc
    cmp eax, pctemp
    jna dpthabc
    // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
    mov cl, [esi + edx] // load Prior(x-bpp) into cl
    jmp dpthpaeth
  dpthabc:
    // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
    mov cl, [edi + edx] // load Raw(x-bpp) into cl
  dpthpaeth:
    inc ebx
    inc edx
    // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
    add [edi + ebx - 1], cl
    cmp ebx, diff
    jb dpthlp1
  dpthgo:
    mov ecx, FullLength
    mov eax, ecx
    sub eax, ebx // subtract alignment fix
    and eax, 0x00000007 // calc bytes over mult of 8
    sub ecx, eax // drop over bytes from original length
    mov MMXLength, ecx
  } // end _asm block
  // Now do the math for the rest of the row
  switch ( bpp )
  {
    case 3:
    {
      ActiveMask.use = 0x0000000000ffffff;
      ActiveMaskEnd.use = 0xffff000000000000;
      ShiftBpp.use = 24; // == bpp(3) * 8
      ShiftRem.use = 40; // == 64 - 24
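      // The loop below widens each byte to a 16-bit word (punpcklbw /
      // punpckhbw against a zeroed register) so pav, pbv and pcv can be
      // formed as signed word differences without overflow; the
      // pcmpgtw/pand/psubw sequences then take abs() of all words at once,
      // and packuswb re-narrows the selected predictor bytes.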
      _asm
      {
        mov ebx, diff
        mov edi, row
        mov esi, prev_row
        pxor mm0, mm0
        // PRIME the pump (load the first Raw(x-bpp) data set)
        movq mm1, [edi+ebx-8]
      dpth3lp:
        psrlq mm1, ShiftRem // shift last 3 bytes to 1st 3 bytes
        movq mm2, [esi + ebx] // load b=Prior(x)
        punpcklbw mm1, mm0 // Unpack Low bytes of a
        movq mm3, [esi+ebx-8] // Prep c=Prior(x-bpp) bytes
        punpcklbw mm2, mm0 // Unpack Low bytes of b
        psrlq mm3, ShiftRem // shift last 3 bytes to 1st 3 bytes
        // pav = p - a = (a + b - c) - a = b - c
        movq mm4, mm2
        punpcklbw mm3, mm0 // Unpack Low bytes of c
        // pbv = p - b = (a + b - c) - b = a - c
        movq mm5, mm1
        psubw mm4, mm3
        pxor mm7, mm7
        // pcv = p - c = (a + b - c) - c = (a - c) + (b - c) = pav + pbv
        movq mm6, mm4
        psubw mm5, mm3
        // pa = abs(p-a) = abs(pav)
        // pb = abs(p-b) = abs(pbv)
        // pc = abs(p-c) = abs(pcv)
        pcmpgtw mm0, mm4 // Create mask pav bytes < 0
        paddw mm6, mm5
        pand mm0, mm4 // Only pav bytes < 0 in mm0
        pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
        psubw mm4, mm0
        pand mm7, mm5 // Only pbv bytes < 0 in mm7
        psubw mm4, mm0
        psubw mm5, mm7
        pxor mm0, mm0
        pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
        pand mm0, mm6 // Only pcv bytes < 0 in mm0
        psubw mm5, mm7
        psubw mm6, mm0
        // test pa <= pb
        movq mm7, mm4
        psubw mm6, mm0
        pcmpgtw mm7, mm5 // pa > pb?
        movq mm0, mm7
        // use mm7 mask to merge pa & pb
        pand mm5, mm7
        // use mm0 mask copy to merge a & b
        pand mm2, mm0
        pandn mm7, mm4
        pandn mm0, mm1
        paddw mm7, mm5
        paddw mm0, mm2
        // test ((pa <= pb)? pa:pb) <= pc
        pcmpgtw mm7, mm6 // pab > pc?
        pxor mm1, mm1
        pand mm3, mm7
        pandn mm7, mm0
        paddw mm7, mm3
        pxor mm0, mm0
        packuswb mm7, mm1
        movq mm3, [esi + ebx] // load Prior(x); low bytes are c=Prior(x-bpp)
        // for the 2nd group
        pand mm7, ActiveMask
        movq mm2, mm3 // load b=Prior(x) step 1
        paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
        punpcklbw mm3, mm0 // Unpack Low bytes of c
        movq [edi + ebx], mm7 // write back updated value
        movq mm1, mm7 // Now mm1 will be used as Raw(x-bpp)
        // Now do Paeth for 2nd set of bytes (3-5)
        psrlq mm2, ShiftBpp // load b=Prior(x) step 2
        punpcklbw mm1, mm0 // Unpack Low bytes of a
        pxor mm7, mm7
        punpcklbw mm2, mm0 // Unpack Low bytes of b
        // pbv = p - b = (a + b - c) - b = a - c
        movq mm5, mm1
        // pav = p - a = (a + b - c) - a = b - c
        movq mm4, mm2
        psubw mm5, mm3
        psubw mm4, mm3
        // pcv = p - c = (a + b - c) - c = (a - c) + (b - c) =
        // pav + pbv = pbv + pav
        movq mm6, mm5
        paddw mm6, mm4
        // pa = abs(p-a) = abs(pav)
        // pb = abs(p-b) = abs(pbv)
        // pc = abs(p-c) = abs(pcv)
        pcmpgtw mm0, mm5 // Create mask pbv bytes < 0
        pcmpgtw mm7, mm4 // Create mask pav bytes < 0
        pand mm0, mm5 // Only pbv bytes < 0 in mm0
        pand mm7, mm4 // Only pav bytes < 0 in mm7
        psubw mm5, mm0
        psubw mm4, mm7
        psubw mm5, mm0
        psubw mm4, mm7
        pxor mm0, mm0
        pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
        pand mm0, mm6 // Only pcv bytes < 0 in mm0
        psubw mm6, mm0
        // test pa <= pb
        movq mm7, mm4
        psubw mm6, mm0
        pcmpgtw mm7, mm5 // pa > pb?
        movq mm0, mm7
        // use mm7 mask to merge pa & pb
        pand mm5, mm7
        // use mm0 mask copy to merge a & b
        pand mm2, mm0
        pandn mm7, mm4
        pandn mm0, mm1
        paddw mm7, mm5
        paddw mm0, mm2
        // test ((pa <= pb)? pa:pb) <= pc
        pcmpgtw mm7, mm6 // pab > pc?
        movq mm2, [esi + ebx] // load b=Prior(x)
        pand mm3, mm7
        pandn mm7, mm0
        pxor mm1, mm1
        paddw mm7, mm3
        pxor mm0, mm0
        packuswb mm7, mm1
        movq mm3, mm2 // load c=Prior(x-bpp) step 1
        pand mm7, ActiveMask
        punpckhbw mm2, mm0 // Unpack High bytes of b
        psllq mm7, ShiftBpp // Shift bytes to 2nd group of 3 bytes
        // pav = p - a = (a + b - c) - a = b - c
        movq mm4, mm2
        paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
        psllq mm3, ShiftBpp // load c=Prior(x-bpp) step 2
        movq [edi + ebx], mm7 // write back updated value
        movq mm1, mm7
        punpckhbw mm3, mm0 // Unpack High bytes of c
        psllq mm1, ShiftBpp // Shift bytes
        // Now mm1 will be used as Raw(x-bpp)
        // Now do Paeth for 3rd, and final, set of bytes (6-7)
        pxor mm7, mm7
        punpckhbw mm1, mm0 // Unpack High bytes of a
        psubw mm4, mm3
        // pbv = p - b = (a + b - c) - b = a - c
        movq mm5, mm1
        // pcv = p - c = (a + b - c) - c = (a - c) + (b - c) = pav + pbv
        movq mm6, mm4
        psubw mm5, mm3
        pxor mm0, mm0
        paddw mm6, mm5
        // pa = abs(p-a) = abs(pav)
        // pb = abs(p-b) = abs(pbv)
        // pc = abs(p-c) = abs(pcv)
        pcmpgtw mm0, mm4 // Create mask pav bytes < 0
        pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
        pand mm0, mm4 // Only pav bytes < 0 in mm0
        pand mm7, mm5 // Only pbv bytes < 0 in mm7
        psubw mm4, mm0
        psubw mm5, mm7
        psubw mm4, mm0
        psubw mm5, mm7
        pxor mm0, mm0
        pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
        pand mm0, mm6 // Only pcv bytes < 0 in mm0
        psubw mm6, mm0
        // test pa <= pb
        movq mm7, mm4
        psubw mm6, mm0
        pcmpgtw mm7, mm5 // pa > pb?
        movq mm0, mm7
        // use mm0 mask copy to merge a & b
        pand mm2, mm0
        // use mm7 mask to merge pa & pb
        pand mm5, mm7
        pandn mm0, mm1
        pandn mm7, mm4
        paddw mm0, mm2
        paddw mm7, mm5
        // test ((pa <= pb)? pa:pb) <= pc
        pcmpgtw mm7, mm6 // pab > pc?
        pand mm3, mm7
        pandn mm7, mm0
        paddw mm7, mm3
        pxor mm1, mm1
        packuswb mm1, mm7
        // Step ebx to next set of 8 bytes and repeat loop until done
        add ebx, 8
        pand mm1, ActiveMaskEnd
        paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
        cmp ebx, MMXLength
        pxor mm0, mm0 // pxor does not affect flags
        movq [edi + ebx - 8], mm1 // write back updated value
        // mm1 will be used as Raw(x-bpp) next loop
        // mm3 ready to be used as Prior(x-bpp) next loop
        jb dpth3lp
      } // end _asm block
    }
    break;
    case 6:
    case 7:
    case 5:
    {
      ActiveMask.use = 0x00000000ffffffff;
      ActiveMask2.use = 0xffffffff00000000;
      ShiftBpp.use = bpp << 3; // == bpp * 8
      ShiftRem.use = 64 - ShiftBpp.use;
      _asm
      {
        mov ebx, diff
        mov edi, row
        mov esi, prev_row
        // PRIME the pump (load the first Raw(x-bpp) data set)
        movq mm1, [edi+ebx-8]
        pxor mm0, mm0
      dpth6lp:
        // Must shift to position Raw(x-bpp) data
        psrlq mm1, ShiftRem
        // Do first set of 4 bytes
        movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
        punpcklbw mm1, mm0 // Unpack Low bytes of a
        movq mm2, [esi + ebx] // load b=Prior(x)
        punpcklbw mm2, mm0 // Unpack Low bytes of b
        // Must shift to position Prior(x-bpp) data
        psrlq mm3, ShiftRem
        // pav = p - a = (a + b - c) - a = b - c
        movq mm4, mm2
        punpcklbw mm3, mm0 // Unpack Low bytes of c
        // pbv = p - b = (a + b - c) - b = a - c
        movq mm5, mm1
        psubw mm4, mm3
        pxor mm7, mm7
        // pcv = p - c = (a + b - c) - c = (a - c) + (b - c) = pav + pbv
        movq mm6, mm4
        psubw mm5, mm3
        // pa = abs(p-a) = abs(pav)
        // pb = abs(p-b) = abs(pbv)
        // pc = abs(p-c) = abs(pcv)
        pcmpgtw mm0, mm4 // Create mask pav bytes < 0
        paddw mm6, mm5
        pand mm0, mm4 // Only pav bytes < 0 in mm0
        pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
        psubw mm4, mm0
        pand mm7, mm5 // Only pbv bytes < 0 in mm7
        psubw mm4, mm0
        psubw mm5, mm7
        pxor mm0, mm0
        pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
        pand mm0, mm6 // Only pcv bytes < 0 in mm0
        psubw mm5, mm7
        psubw mm6, mm0
        // test pa <= pb
        movq mm7, mm4
        psubw mm6, mm0
        pcmpgtw mm7, mm5 // pa > pb?
        movq mm0, mm7
        // use mm7 mask to merge pa & pb
        pand mm5, mm7
        // use mm0 mask copy to merge a & b
        pand mm2, mm0
        pandn mm7, mm4
        pandn mm0, mm1
        paddw mm7, mm5
        paddw mm0, mm2
        // test ((pa <= pb)? pa:pb) <= pc
        pcmpgtw mm7, mm6 // pab > pc?
        pxor mm1, mm1
        pand mm3, mm7
        pandn mm7, mm0
        paddw mm7, mm3
        pxor mm0, mm0
        packuswb mm7, mm1
        movq mm3, [esi + ebx - 8] // load c=Prior(x-bpp)
        pand mm7, ActiveMask
        psrlq mm3, ShiftRem
        movq mm2, [esi + ebx] // load b=Prior(x) step 1
        paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
        movq mm6, mm2
        movq [edi + ebx], mm7 // write back updated value
        movq mm1, [edi+ebx-8]
        psllq mm6, ShiftBpp
        movq mm5, mm7
        psrlq mm1, ShiftRem
        por mm3, mm6
        psllq mm5, ShiftBpp
        punpckhbw mm3, mm0 // Unpack High bytes of c
        por mm1, mm5
        // Do second set of 4 bytes
        punpckhbw mm2, mm0 // Unpack High bytes of b
        punpckhbw mm1, mm0 // Unpack High bytes of a
        // pav = p - a = (a + b - c) - a = b - c
        movq mm4, mm2
        // pbv = p - b = (a + b - c) - b = a - c
        movq mm5, mm1
        psubw mm4, mm3
        pxor mm7, mm7
        // pcv = p - c = (a + b - c) - c = (a - c) + (b - c) = pav + pbv
        movq mm6, mm4
        psubw mm5, mm3
        // pa = abs(p-a) = abs(pav)
        // pb = abs(p-b) = abs(pbv)
        // pc = abs(p-c) = abs(pcv)
        pcmpgtw mm0, mm4 // Create mask pav bytes < 0
        paddw mm6, mm5
        pand mm0, mm4 // Only pav bytes < 0 in mm0
        pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
        psubw mm4, mm0
        pand mm7, mm5 // Only pbv bytes < 0 in mm7
        psubw mm4, mm0
        psubw mm5, mm7
        pxor mm0, mm0
        pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
        pand mm0, mm6 // Only pcv bytes < 0 in mm0
        psubw mm5, mm7
        psubw mm6, mm0
        // test pa <= pb
        movq mm7, mm4
        psubw mm6, mm0
        pcmpgtw mm7, mm5 // pa > pb?
        movq mm0, mm7
        // use mm7 mask to merge pa & pb
        pand mm5, mm7
        // use mm0 mask copy to merge a & b
        pand mm2, mm0
        pandn mm7, mm4
        pandn mm0, mm1
        paddw mm7, mm5
        paddw mm0, mm2
        // test ((pa <= pb)? pa:pb) <= pc
        pcmpgtw mm7, mm6 // pab > pc?
        pxor mm1, mm1
        pand mm3, mm7
        pandn mm7, mm0
        pxor mm1, mm1
        paddw mm7, mm3
        pxor mm0, mm0
        // Step ebx to next set of 8 bytes and repeat loop until done
        add ebx, 8
        packuswb mm1, mm7
        paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
        cmp ebx, MMXLength
        movq [edi + ebx - 8], mm1 // write back updated value
        // mm1 will be used as Raw(x-bpp) next loop
        jb dpth6lp
      } // end _asm block
    }
    break;
    case 4:
    {
      ActiveMask.use = 0x00000000ffffffff;
      _asm {
        mov ebx, diff
        mov edi, row
        mov esi, prev_row
        pxor mm0, mm0
        // PRIME the pump (load the first Raw(x-bpp) data set)
        movq mm1, [edi+ebx-8] // Only time we should need to read
        // a=Raw(x-bpp) bytes
      dpth4lp:
        // Do first set of 4 bytes
        movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
        punpckhbw mm1, mm0 // Unpack High bytes of a
        movq mm2, [esi + ebx] // load b=Prior(x)
        punpcklbw mm2, mm0 // Unpack Low bytes of b
        // pav = p - a = (a + b - c) - a = b - c
        movq mm4, mm2
        punpckhbw mm3, mm0 // Unpack High bytes of c
        // pbv = p - b = (a + b - c) - b = a - c
        movq mm5, mm1
        psubw mm4, mm3
        pxor mm7, mm7
        // pcv = p - c = (a + b - c) - c = (a - c) + (b - c) = pav + pbv
        movq mm6, mm4
        psubw mm5, mm3
        // pa = abs(p-a) = abs(pav)
        // pb = abs(p-b) = abs(pbv)
        // pc = abs(p-c) = abs(pcv)
        pcmpgtw mm0, mm4 // Create mask pav bytes < 0
        paddw mm6, mm5
        pand mm0, mm4 // Only pav bytes < 0 in mm0
        pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
        psubw mm4, mm0
        pand mm7, mm5 // Only pbv bytes < 0 in mm7
        psubw mm4, mm0
        psubw mm5, mm7
        pxor mm0, mm0
        pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
        pand mm0, mm6 // Only pcv bytes < 0 in mm0
        psubw mm5, mm7
        psubw mm6, mm0
        // test pa <= pb
        movq mm7, mm4
        psubw mm6, mm0
        pcmpgtw mm7, mm5 // pa > pb?
        movq mm0, mm7
        // use mm7 mask to merge pa & pb
        pand mm5, mm7
        // use mm0 mask copy to merge a & b
        pand mm2, mm0
        pandn mm7, mm4
        pandn mm0, mm1
        paddw mm7, mm5
        paddw mm0, mm2
        // test ((pa <= pb)? pa:pb) <= pc
        pcmpgtw mm7, mm6 // pab > pc?
        pxor mm1, mm1
        pand mm3, mm7
        pandn mm7, mm0
        paddw mm7, mm3
        pxor mm0, mm0
        packuswb mm7, mm1
        movq mm3, [esi + ebx] // load Prior(x); low bytes are c=Prior(x-bpp)
        // for the second set
        pand mm7, ActiveMask
        movq mm2, mm3 // load b=Prior(x) step 1
        paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
        punpcklbw mm3, mm0 // Unpack Low bytes of c
        movq [edi + ebx], mm7 // write back updated value
        movq mm1, mm7 // Now mm1 will be used as Raw(x-bpp)
        // Do second set of 4 bytes
        punpckhbw mm2, mm0 // Unpack High bytes of b
        punpcklbw mm1, mm0 // Unpack Low bytes of a
        // pav = p - a = (a + b - c) - a = b - c
        movq mm4, mm2
        // pbv = p - b = (a + b - c) - b = a - c
        movq mm5, mm1
        psubw mm4, mm3
        pxor mm7, mm7
        // pcv = p - c = (a + b - c) - c = (a - c) + (b - c) = pav + pbv
        movq mm6, mm4
        psubw mm5, mm3
        // pa = abs(p-a) = abs(pav)
        // pb = abs(p-b) = abs(pbv)
        // pc = abs(p-c) = abs(pcv)
        pcmpgtw mm0, mm4 // Create mask pav bytes < 0
        paddw mm6, mm5
        pand mm0, mm4 // Only pav bytes < 0 in mm0
        pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
        psubw mm4, mm0
        pand mm7, mm5 // Only pbv bytes < 0 in mm7
        psubw mm4, mm0
        psubw mm5, mm7
        pxor mm0, mm0
        pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
        pand mm0, mm6 // Only pcv bytes < 0 in mm0
        psubw mm5, mm7
        psubw mm6, mm0
        // test pa <= pb
        movq mm7, mm4
        psubw mm6, mm0
        pcmpgtw mm7, mm5 // pa > pb?
        movq mm0, mm7
        // use mm7 mask to merge pa & pb
        pand mm5, mm7
        // use mm0 mask copy to merge a & b
        pand mm2, mm0
        pandn mm7, mm4
        pandn mm0, mm1
        paddw mm7, mm5
        paddw mm0, mm2
        // test ((pa <= pb)? pa:pb) <= pc
        pcmpgtw mm7, mm6 // pab > pc?
        pxor mm1, mm1
        pand mm3, mm7
        pandn mm7, mm0
        pxor mm1, mm1
        paddw mm7, mm3
        pxor mm0, mm0
        // Step ebx to next set of 8 bytes and repeat loop until done
        add ebx, 8
        packuswb mm1, mm7
        paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
        cmp ebx, MMXLength
        movq [edi + ebx - 8], mm1 // write back updated value
        // mm1 will be used as Raw(x-bpp) next loop
        jb dpth4lp
      } // end _asm block
    }
    break;
    case 8: // bpp == 8
    {
      ActiveMask.use = 0x00000000ffffffff;
      _asm {
        mov ebx, diff
        mov edi, row
        mov esi, prev_row
        pxor mm0, mm0
        // PRIME the pump (load the first Raw(x-bpp) data set)
        movq mm1, [edi+ebx-8] // Only time we should need to read
        // a=Raw(x-bpp) bytes
      dpth8lp:
        // Do first set of 4 bytes
        movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
        punpcklbw mm1, mm0 // Unpack Low bytes of a
        movq mm2, [esi + ebx] // load b=Prior(x)
        punpcklbw mm2, mm0 // Unpack Low bytes of b
        // pav = p - a = (a + b - c) - a = b - c
        movq mm4, mm2
        punpcklbw mm3, mm0 // Unpack Low bytes of c
        // pbv = p - b = (a + b - c) - b = a - c
        movq mm5, mm1
        psubw mm4, mm3
        pxor mm7, mm7
        // pcv = p - c = (a + b - c) - c = (a - c) + (b - c) = pav + pbv
        movq mm6, mm4
        psubw mm5, mm3
        // pa = abs(p-a) = abs(pav)
        // pb = abs(p-b) = abs(pbv)
        // pc = abs(p-c) = abs(pcv)
        pcmpgtw mm0, mm4 // Create mask pav bytes < 0
        paddw mm6, mm5
        pand mm0, mm4 // Only pav bytes < 0 in mm0
        pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
        psubw mm4, mm0
        pand mm7, mm5 // Only pbv bytes < 0 in mm7
        psubw mm4, mm0
        psubw mm5, mm7
        pxor mm0, mm0
        pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
        pand mm0, mm6 // Only pcv bytes < 0 in mm0
        psubw mm5, mm7
        psubw mm6, mm0
        // test pa <= pb
        movq mm7, mm4
        psubw mm6, mm0
        pcmpgtw mm7, mm5 // pa > pb?
        movq mm0, mm7
        // use mm7 mask to merge pa & pb
        pand mm5, mm7
        // use mm0 mask copy to merge a & b
        pand mm2, mm0
        pandn mm7, mm4
        pandn mm0, mm1
        paddw mm7, mm5
        paddw mm0, mm2
        // test ((pa <= pb)? pa:pb) <= pc
        pcmpgtw mm7, mm6 // pab > pc?
        pxor mm1, mm1
        pand mm3, mm7
        pandn mm7, mm0
        paddw mm7, mm3
        pxor mm0, mm0
        packuswb mm7, mm1
        movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
        pand mm7, ActiveMask
        movq mm2, [esi + ebx] // load b=Prior(x)
        paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
        punpckhbw mm3, mm0 // Unpack High bytes of c
        movq [edi + ebx], mm7 // write back updated value
        movq mm1, [edi+ebx-8] // read a=Raw(x-bpp) bytes
        // Do second set of 4 bytes
        punpckhbw mm2, mm0 // Unpack High bytes of b
        punpckhbw mm1, mm0 // Unpack High bytes of a
        // pav = p - a = (a + b - c) - a = b - c
        movq mm4, mm2
        // pbv = p - b = (a + b - c) - b = a - c
        movq mm5, mm1
        psubw mm4, mm3
        pxor mm7, mm7
        // pcv = p - c = (a + b - c) - c = (a - c) + (b - c) = pav + pbv
        movq mm6, mm4
        psubw mm5, mm3
        // pa = abs(p-a) = abs(pav)
        // pb = abs(p-b) = abs(pbv)
        // pc = abs(p-c) = abs(pcv)
        pcmpgtw mm0, mm4 // Create mask pav bytes < 0
        paddw mm6, mm5
        pand mm0, mm4 // Only pav bytes < 0 in mm0
        pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
        psubw mm4, mm0
        pand mm7, mm5 // Only pbv bytes < 0 in mm7
        psubw mm4, mm0
        psubw mm5, mm7
        pxor mm0, mm0
        pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
        pand mm0, mm6 // Only pcv bytes < 0 in mm0
        psubw mm5, mm7
        psubw mm6, mm0
        // test pa <= pb
        movq mm7, mm4
        psubw mm6, mm0
        pcmpgtw mm7, mm5 // pa > pb?
        movq mm0, mm7
        // use mm7 mask to merge pa & pb
        pand mm5, mm7
        // use mm0 mask copy to merge a & b
        pand mm2, mm0
        pandn mm7, mm4
        pandn mm0, mm1
        paddw mm7, mm5
        paddw mm0, mm2
        // test ((pa <= pb)? pa:pb) <= pc
        pcmpgtw mm7, mm6 // pab > pc?
        pxor mm1, mm1
        pand mm3, mm7
        pandn mm7, mm0
        pxor mm1, mm1
        paddw mm7, mm3
        pxor mm0, mm0
        // Step ebx to next set of 8 bytes and repeat loop until done
        add ebx, 8
        packuswb mm1, mm7
        paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
        cmp ebx, MMXLength
        movq [edi + ebx - 8], mm1 // write back updated value
        // mm1 will be used as Raw(x-bpp) next loop
        jb dpth8lp
      } // end _asm block
    }
    break;
    case 1: // bpp = 1
    case 2: // bpp = 2
    default: // bpp > 8
    {
      _asm {
        mov ebx, diff
        cmp ebx, FullLength
        jnb dpthdend
        mov edi, row
        mov esi, prev_row
        // Do Paeth decode for remaining bytes
        mov edx, ebx
        xor ecx, ecx // zero ecx before using cl & cx in loop below
        sub edx, bpp // Set edx = ebx - bpp
      dpthdlp:
        xor eax, eax
        // pav = p - a = (a + b - c) - a = b - c
        mov al, [esi + ebx] // load Prior(x) into al
        mov cl, [esi + edx] // load Prior(x-bpp) into cl
        sub eax, ecx // subtract Prior(x-bpp)
        mov patemp, eax // Save pav for later use
        xor eax, eax
        // pbv = p - b = (a + b - c) - b = a - c
        mov al, [edi + edx] // load Raw(x-bpp) into al
        sub eax, ecx // subtract Prior(x-bpp)
        mov ecx, eax
        // pcv = p - c = (a + b - c) - c = (a - c) + (b - c) = pav + pbv
        add eax, patemp // pcv = pav + pbv
        // pc = abs(pcv)
        test eax, 0x80000000
        jz dpthdpca
        neg eax // reverse sign of neg values
      dpthdpca:
        mov pctemp, eax // save pc for later use
        // pb = abs(pbv)
        test ecx, 0x80000000
        jz dpthdpba
        neg ecx // reverse sign of neg values
      dpthdpba:
        mov pbtemp, ecx // save pb for later use
        // pa = abs(pav)
        mov eax, patemp
        test eax, 0x80000000
        jz dpthdpaa
        neg eax // reverse sign of neg values
      dpthdpaa:
        mov patemp, eax // save pa for later use
        // test if pa <= pb
        cmp eax, ecx
        jna dpthdabb
        // pa > pb; now test if pb <= pc
        cmp ecx, pctemp
        jna dpthdbbc
        // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
        mov cl, [esi + edx] // load Prior(x-bpp) into cl
        jmp dpthdpaeth
      dpthdbbc:
        // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
        mov cl, [esi + ebx] // load Prior(x) into cl
        jmp dpthdpaeth
      dpthdabb:
        // pa <= pb; now test if pa <= pc
        cmp eax, pctemp
        jna dpthdabc
        // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
        mov cl, [esi + edx] // load Prior(x-bpp) into cl
        jmp dpthdpaeth
      dpthdabc:
        // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
        mov cl, [edi + edx] // load Raw(x-bpp) into cl
      dpthdpaeth:
        inc ebx
        inc edx
        // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
        add [edi + ebx - 1], cl
        cmp ebx, FullLength
        jb dpthdlp
      dpthdend:
      } // end _asm block
    }
    return; // No need to go further with this one
  } // end switch ( bpp )
  _asm
  {
    // MMX acceleration complete; now do clean-up
    // Check if any remaining bytes left to decode
    mov ebx, MMXLength
    cmp ebx, FullLength
    jnb dpthend
    mov edi, row
    mov esi, prev_row
    // Do Paeth decode for remaining bytes
    mov edx, ebx
    xor ecx, ecx // zero ecx before using cl & cx in loop below
    sub edx, bpp // Set edx = ebx - bpp
  dpthlp2:
    xor eax, eax
    // pav = p - a = (a + b - c) - a = b - c
    mov al, [esi + ebx] // load Prior(x) into al
    mov cl, [esi + edx] // load Prior(x-bpp) into cl
    sub eax, ecx // subtract Prior(x-bpp)
    mov patemp, eax // Save pav for later use
    xor eax, eax
    // pbv = p - b = (a + b - c) - b = a - c
    mov al, [edi + edx] // load Raw(x-bpp) into al
    sub eax, ecx // subtract Prior(x-bpp)
    mov ecx, eax
    // pcv = p - c = (a + b - c) - c = (a - c) + (b - c) = pav + pbv
    add eax, patemp // pcv = pav + pbv
    // pc = abs(pcv)
    test eax, 0x80000000
    jz dpthpca2
    neg eax // reverse sign of neg values
  dpthpca2:
    mov pctemp, eax // save pc for later use
    // pb = abs(pbv)
    test ecx, 0x80000000
    jz dpthpba2
    neg ecx // reverse sign of neg values
  dpthpba2:
    mov pbtemp, ecx // save pb for later use
    // pa = abs(pav)
    mov eax, patemp
    test eax, 0x80000000
    jz dpthpaa2
    neg eax // reverse sign of neg values
  dpthpaa2:
    mov patemp, eax // save pa for later use
    // test if pa <= pb
    cmp eax, ecx
    jna dpthabb2
    // pa > pb; now test if pb <= pc
    cmp ecx, pctemp
    jna dpthbbc2
    // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
    mov cl, [esi + edx] // load Prior(x-bpp) into cl
    jmp dpthpaeth2
  dpthbbc2:
    // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
    mov cl, [esi + ebx] // load Prior(x) into cl
    jmp dpthpaeth2
  dpthabb2:
    // pa <= pb; now test if pa <= pc
    cmp eax, pctemp
    jna dpthabc2
    // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
    mov cl, [esi + edx] // load Prior(x-bpp) into cl
    jmp dpthpaeth2
  dpthabc2:
    // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
    mov cl, [edi + edx] // load Raw(x-bpp) into cl
  dpthpaeth2:
    inc ebx
    inc edx
    // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
    add [edi + ebx - 1], cl
    cmp ebx, FullLength
    jb dpthlp2
  dpthend:
    emms // End MMX instructions; prep for possible FP instrs.
  } // end _asm block
}
// Optimized code for PNG Sub filter decoder
void /* PRIVATE */
png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
{
  int bpp;
  png_uint_32 FullLength;
  png_uint_32 MMXLength;
  int diff;
  bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
  FullLength = row_info->rowbytes - bpp; // # of bytes to filter
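  // Sub filter: Raw(x) = Sub(x) + Raw(x-bpp); in scalar C terms (sketch):
  //   for (x = bpp; x < row_info->rowbytes; x++)  row[x] += row[x - bpp];
  // FullLength above already excludes the first bpp bytes, which need no
  // change because Raw(x-bpp) is defined to be zero there.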
  _asm {
    mov edi, row
    mov esi, edi // lp = row
    add edi, bpp // rp = row + bpp
    xor eax, eax
    // get # of bytes to alignment
    mov diff, edi // take start of row
    add diff, 0xf // add 7 + 8 to incr past alignment boundary
    xor ebx, ebx
    and diff, 0xfffffff8 // mask to alignment boundary
    sub diff, edi // subtract from start ==> value ebx at alignment
    jz dsubgo
    // fix alignment
  dsublp1:
    mov al, [esi+ebx]
    add [edi+ebx], al
    inc ebx
    cmp ebx, diff
    jb dsublp1
  dsubgo:
    mov ecx, FullLength
    mov edx, ecx
    sub edx, ebx // subtract alignment fix
    and edx, 0x00000007 // calc bytes over mult of 8
    sub ecx, edx // drop over bytes from length
    mov MMXLength, ecx
  } // end _asm block
   // Now do the math for the rest of the row
   switch ( bpp )
   {
      case 3:
      {
         ActiveMask.use = 0x0000ffffff000000;
         ShiftBpp.use = 24;         // == 3 * 8
         ShiftRem.use = 40;         // == 64 - 24
         _asm {
            mov edi, row
            movq mm7, ActiveMask    // Load ActiveMask for 2nd active byte group
            mov esi, edi            // lp = row
            add edi, bpp            // rp = row + bpp
            movq mm6, mm7
            mov ebx, diff
            psllq mm6, ShiftBpp     // Move mask in mm6 to cover 3rd active byte group
            // PRIME the pump (load the first Raw(x-bpp) data set)
            movq mm1, [edi+ebx-8]
dsub3lp:
            psrlq mm1, ShiftRem     // Shift data for adding 1st bpp bytes
                                    // (no need for mask; shift clears inactive bytes)
            // Add 1st active group
            movq mm0, [edi+ebx]
            paddb mm0, mm1
            // Add 2nd active group
            movq mm1, mm0           // mov updated Raws to mm1
            psllq mm1, ShiftBpp     // shift data to position correctly
            pand mm1, mm7           // mask to use only 2nd active group
            paddb mm0, mm1
            // Add 3rd active group
            movq mm1, mm0           // mov updated Raws to mm1
            psllq mm1, ShiftBpp     // shift data to position correctly
            pand mm1, mm6           // mask to use only 3rd active group
            add ebx, 8
            paddb mm0, mm1
            cmp ebx, MMXLength
            movq [edi+ebx-8], mm0   // Write updated Raws back to array
            // Prep for doing 1st add at top of loop
            movq mm1, mm0
            jb dsub3lp
         } // end _asm block
      }
      break;
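      // Sketch of one dsub3lp iteration in C, using a 64-bit scalar for
      // the quad (illustrative only: scalar '+' can carry between byte
      // lanes, where the real paddb cannot; lanes are little-endian):
      //
      //    mm1  = prev >> 40;                          // ShiftRem: bytes 5-7
      //    mm0  = subq + mm1;                          // bytes 0-2 now final
      //    mm0 += (mm0 << 24) & 0x0000ffffff000000;    // bytes 3-5 (mm7 mask)
      //    mm0 += (mm0 << 24) & 0xffff000000000000;    // bytes 6-7 (mm6 mask)
      //    prev = mm0;                                 // feeds next iteration
      //
      // Each 24-bit shift pushes the just-finished pixel into the lanes of
      // the next one, so three add steps complete all eight bytes of the quad.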
      case 1:
      {
         // Kept here for reference: this duplicates the non-MMX code for
         // the SUB filter in png_read_filter_row below
         //
         // png_bytep rp;
         // png_bytep lp;
         // png_uint_32 i;
         // bpp = (row_info->pixel_depth + 7) >> 3;
         // for (i = (png_uint_32)bpp, rp = row + bpp, lp = row;
         //      i < row_info->rowbytes; i++, rp++, lp++)
         // {
         //    *rp = (png_byte)(((int)(*rp) + (int)(*lp)) & 0xff);
         // }
         _asm {
            mov ebx, diff
            mov edi, row
            cmp ebx, FullLength
            jnb dsub1end
            mov esi, edi            // lp = row
            xor eax, eax
            add edi, bpp            // rp = row + bpp
dsub1lp:
            mov al, [esi+ebx]
            add [edi+ebx], al
            inc ebx
            cmp ebx, FullLength
            jb dsub1lp
dsub1end:
         } // end _asm block
      }
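      // For bpp == 1 the byte loop above already runs to FullLength, so
      // return here and skip the common tail loop at the end of the function.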
      return;
      case 6:
      case 7:
      case 4:
      case 5:
      {
         ShiftBpp.use = bpp << 3;
         ShiftRem.use = 64 - ShiftBpp.use;
         _asm {
            mov edi, row
            mov ebx, diff
            mov esi, edi            // lp = row
            add edi, bpp            // rp = row + bpp
            // PRIME the pump (load the first Raw(x-bpp) data set)
            movq mm1, [edi+ebx-8]
dsub4lp:
            psrlq mm1, ShiftRem     // Shift data for adding 1st bpp bytes
                                    // (no need for mask; shift clears inactive bytes)
            movq mm0, [edi+ebx]
            paddb mm0, mm1
            // Add 2nd active group
            movq mm1, mm0           // mov updated Raws to mm1
            psllq mm1, ShiftBpp     // shift data to position correctly
                                    // (no mask needed; shift clears inactive bits/bytes)
            add ebx, 8
            paddb mm0, mm1
            cmp ebx, MMXLength
            movq [edi+ebx-8], mm0
            movq mm1, mm0           // Prep for doing 1st add at top of loop
            jb dsub4lp
         } // end _asm block
      }
      break;
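      // For bpp >= 4 a quadword spans at most two pixel boundaries, so the
      // two add steps above finish it; psllq by ShiftBpp zero-fills behind
      // the shifted data, which is why these sizes need no ActiveMask at all.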
      case 2:
      {
         ActiveMask.use = 0x00000000ffff0000;
         ShiftBpp.use = 16;         // == 2 * 8
         ShiftRem.use = 48;         // == 64 - 16
         _asm {
            movq mm7, ActiveMask    // Load ActiveMask for 2nd active byte group
            mov ebx, diff
            movq mm6, mm7
            mov edi, row
            psllq mm6, ShiftBpp     // Move mask in mm6 to cover 3rd active byte group
            mov esi, edi            // lp = row
            movq mm5, mm6
            add edi, bpp            // rp = row + bpp
            psllq mm5, ShiftBpp     // Move mask in mm5 to cover 4th active byte group
            // PRIME the pump (load the first Raw(x-bpp) data set)
            movq mm1, [edi+ebx-8]
dsub2lp:
            // Add 1st active group
            psrlq mm1, ShiftRem     // Shift data for adding 1st bpp bytes
                                    // (no need for mask; shift clears inactive bytes)
            movq mm0, [edi+ebx]
            paddb mm0, mm1
            // Add 2nd active group
            movq mm1, mm0           // mov updated Raws to mm1
            psllq mm1, ShiftBpp     // shift data to position correctly
            pand mm1, mm7           // mask to use only 2nd active group
            paddb mm0, mm1
            // Add 3rd active group
            movq mm1, mm0           // mov updated Raws to mm1
            psllq mm1, ShiftBpp     // shift data to position correctly
            pand mm1, mm6           // mask to use only 3rd active group
            paddb mm0, mm1
            // Add 4th active group
            movq mm1, mm0           // mov updated Raws to mm1
            psllq mm1, ShiftBpp     // shift data to position correctly
            pand mm1, mm5           // mask to use only 4th active group
            add ebx, 8
            paddb mm0, mm1
            cmp ebx, MMXLength
            movq [edi+ebx-8], mm0   // Write updated Raws back to array
            movq mm1, mm0           // Prep for doing 1st add at top of loop
            jb dsub2lp
         } // end _asm block
      }
      break;
      case 8:
      {
         _asm {
            mov edi, row
            mov ebx, diff
            mov esi, edi            // lp = row
            add edi, bpp            // rp = row + bpp
            mov ecx, MMXLength
            movq mm7, [edi+ebx-8]   // PRIME the pump (load the first
                                    // Raw(x-bpp) data set)
            and ecx, 0x0000003f     // calc bytes over mult of 64
dsub8lp:
            movq mm0, [edi+ebx]     // Load Sub(x) for 1st 8 bytes
            paddb mm0, mm7
            movq mm1, [edi+ebx+8]   // Load Sub(x) for 2nd 8 bytes
            movq [edi+ebx], mm0     // Write Raw(x) for 1st 8 bytes
            // Now mm0 will be used as Raw(x-bpp) for the 2nd group of
            // 8 bytes.  This is repeated for each group of 8 bytes with
            // the 8th group being used as the Raw(x-bpp) for the 1st
            // group of the next loop.
            paddb mm1, mm0
            movq mm2, [edi+ebx+16]  // Load Sub(x) for 3rd 8 bytes
            movq [edi+ebx+8], mm1   // Write Raw(x) for 2nd 8 bytes
            paddb mm2, mm1
            movq mm3, [edi+ebx+24]  // Load Sub(x) for 4th 8 bytes
            movq [edi+ebx+16], mm2  // Write Raw(x) for 3rd 8 bytes
            paddb mm3, mm2
            movq mm4, [edi+ebx+32]  // Load Sub(x) for 5th 8 bytes
            movq [edi+ebx+24], mm3  // Write Raw(x) for 4th 8 bytes
            paddb mm4, mm3
            movq mm5, [edi+ebx+40]  // Load Sub(x) for 6th 8 bytes
            movq [edi+ebx+32], mm4  // Write Raw(x) for 5th 8 bytes
            paddb mm5, mm4
            movq mm6, [edi+ebx+48]  // Load Sub(x) for 7th 8 bytes
            movq [edi+ebx+40], mm5  // Write Raw(x) for 6th 8 bytes
            paddb mm6, mm5
            movq mm7, [edi+ebx+56]  // Load Sub(x) for 8th 8 bytes
            movq [edi+ebx+48], mm6  // Write Raw(x) for 7th 8 bytes
            add ebx, 64
            paddb mm7, mm6
            cmp ebx, ecx
            movq [edi+ebx-8], mm7   // Write Raw(x) for 8th 8 bytes
            jb dsub8lp
            cmp ebx, MMXLength
            jnb dsub8lt8
dsub8lpA:
            movq mm0, [edi+ebx]
            add ebx, 8
            paddb mm0, mm7
            cmp ebx, MMXLength
            movq [edi+ebx-8], mm0   // use -8 to offset early add to ebx
            movq mm7, mm0           // Move calculated Raw(x) data to mm7 to
                                    // be the new Raw(x-bpp) for the next loop
            jb dsub8lpA
dsub8lt8:
         } // end _asm block
      }
      break;
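      // In C terms the bpp == 8 path reduces to a quad-at-a-time recurrence
      // (sketch; the j-loop stands in for paddb's eight independent,
      // carry-free byte adds):
      //
      //    png_bytep rp = row + bpp;                   // edi above
      //    for (i = diff; i < MMXLength; i += 8)
      //       for (j = 0; j < 8; j++)
      //          rp[i + j] = (png_byte)(rp[i + j] + rp[i + j - 8]);
      //
      // The unrolled dsub8lp loop pipelines eight such steps per iteration,
      // passing each freshly written Raw(x) quad to the next group through
      // a different MMX register.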
      default:                      // bpp greater than 8 bytes
      {
         _asm {
            mov ebx, diff
            mov edi, row
            mov esi, edi            // lp = row
            add edi, bpp            // rp = row + bpp
dsubAlp:
            movq mm0, [edi+ebx]
            movq mm1, [esi+ebx]
            add ebx, 8
            paddb mm0, mm1
            cmp ebx, MMXLength
            movq [edi+ebx-8], mm0   // movq does not affect flags; -8 to offset add ebx
            jb dsubAlp
         } // end _asm block
      }
      break;
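      // When bpp > 8, Raw(x-bpp) lies at least one full quadword behind
      // Raw(x), so the source quad at [esi+ebx] (lp) never overlaps the
      // quad being written through [edi+ebx] (rp) and no shift/mask
      // staging is required.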
   } // end switch ( bpp )

   _asm {
      mov ebx, MMXLength
      mov edi, row
      cmp ebx, FullLength
      jnb dsubend
      mov esi, edi               // lp = row
      xor eax, eax
      add edi, bpp               // rp = row + bpp
dsublp2:
      mov al, [esi+ebx]
      add [edi+ebx], al
      inc ebx
      cmp ebx, FullLength
      jb dsublp2
dsubend:
      emms                       // End MMX instructions; prep for possible FP instrs.
   } // end _asm block
}
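// png_read_filter_row_mmx_sub follows the shape shared by the other MMX
// filter decoders in this file: a byte loop up to the first aligned
// boundary (diff), an 8-byte-at-a-time MMX body up to MMXLength, and a
// final byte loop from MMXLength to FullLength for the leftover 0..7 bytes.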
// Optimized code for PNG Up filter decoder
void /* PRIVATE */
png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
                           png_bytep prev_row)
{
   png_uint_32 len;

   len = row_info->rowbytes;     // # of bytes to filter
   _asm {
      mov edi, row
      // get # of bytes to alignment
      mov ecx, edi
      xor ebx, ebx
      add ecx, 0x7
      xor eax, eax
      and ecx, 0xfffffff8
      mov esi, prev_row
      sub ecx, edi
      jz dupgo
      // fix alignment
duplp1:
      mov al, [edi+ebx]
      add al, [esi+ebx]
      inc ebx
      cmp ebx, ecx
      mov [edi + ebx - 1], al    // mov does not affect flags; -1 to offset inc ebx
      jb duplp1
dupgo:
      mov ecx, len
      mov edx, ecx
      sub edx, ebx               // subtract alignment fix
      and edx, 0x0000003f        // calc bytes over mult of 64
      sub ecx, edx               // drop over bytes from length
      // Unrolled loop - use all MMX registers and interleave to reduce
      // number of branch instructions (loops) and reduce partial stalls
duploop:
      movq mm1, [esi+ebx]
      movq mm0, [edi+ebx]
      movq mm3, [esi+ebx+8]
      paddb mm0, mm1
      movq mm2, [edi+ebx+8]
      movq [edi+ebx], mm0
      paddb mm2, mm3
      movq mm5, [esi+ebx+16]
      movq [edi+ebx+8], mm2
      movq mm4, [edi+ebx+16]
      movq mm7, [esi+ebx+24]
      paddb mm4, mm5
      movq mm6, [edi+ebx+24]
      movq [edi+ebx+16], mm4
      paddb mm6, mm7
      movq mm1, [esi+ebx+32]
      movq [edi+ebx+24], mm6
      movq mm0, [edi+ebx+32]
      movq mm3, [esi+ebx+40]
      paddb mm0, mm1
      movq mm2, [edi+ebx+40]
      movq [edi+ebx+32], mm0
      paddb mm2, mm3
      movq mm5, [esi+ebx+48]
      movq [edi+ebx+40], mm2
      movq mm4, [edi+ebx+48]
      movq mm7, [esi+ebx+56]
      paddb mm4, mm5
      movq mm6, [edi+ebx+56]
      movq [edi+ebx+48], mm4
      add ebx, 64
      paddb mm6, mm7
      cmp ebx, ecx
      movq [edi+ebx-8], mm6      // (+56) movq does not affect flags;
                                 // -8 to offset add ebx
      jb duploop
      cmp edx, 0                 // Test for bytes over mult of 64
      jz dupend
      // 2 lines added by lcreeve at netins.net
      // (mail 11 Jul 98 in png-implement list)
      cmp edx, 8                 // test for less than 8 bytes
      jb duplt8
      add ecx, edx
      and edx, 0x00000007        // calc bytes over mult of 8
      sub ecx, edx               // drop over bytes from length
      jz duplt8
      // Loop using MMX registers mm0 & mm1 to update 8 bytes simultaneously
duplpA:
      movq mm1, [esi+ebx]
      movq mm0, [edi+ebx]
      add ebx, 8
      paddb mm0, mm1
      cmp ebx, ecx
      movq [edi+ebx-8], mm0      // movq does not affect flags; -8 to offset add ebx
      jb duplpA
      cmp edx, 0                 // Test for bytes over mult of 8
      jz dupend
duplt8:
      xor eax, eax
      add ecx, edx               // move over byte count into counter
      // Loop using x86 registers to update remaining bytes
duplp2:
      mov al, [edi + ebx]
      add al, [esi + ebx]
      inc ebx
      cmp ebx, ecx
      mov [edi + ebx - 1], al    // mov does not affect flags; -1 to offset inc ebx
      jb duplp2
dupend:
      // Conversion of filtered row completed
      emms                       // End MMX instructions; prep for possible FP instrs.
   } // end _asm block
}
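// Both the MMX path above and the C fallback below implement the PNG Up
// filter reconstruction from the spec:
//
//    Raw(x) = (Up(x) + Prior(x)) mod 256
//
// where Prior(x) is the byte at the same offset in the previous
// (already-reconstructed) row.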
// Optimized png_read_filter_row routines
void /* PRIVATE */
png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
   row, png_bytep prev_row, int filter)
{
#ifdef PNG_DEBUG
   char filnm[10];
#endif

   if (mmx_supported == 2) {
#if !defined(PNG_1_0_X)
      /* this should have happened in png_init_mmx_flags() already */
      png_warning(png_ptr, "asm_flags may not have been initialized");
#endif
      png_mmx_support();
   }

#ifdef PNG_DEBUG
   png_debug(1, "in png_read_filter_row\n");
   switch (filter)
   {
      case 0: sprintf(filnm, "none");
         break;
#if !defined(PNG_1_0_X)
      case 1: sprintf(filnm, "sub-%s",
         (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB)? "MMX" : "x86");
         break;
      case 2: sprintf(filnm, "up-%s",
         (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP)? "MMX" : "x86");
         break;
      case 3: sprintf(filnm, "avg-%s",
         (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG)? "MMX" : "x86");
         break;
      case 4: sprintf(filnm, "Paeth-%s",
         (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH)? "MMX":"x86");
         break;
#else
      case 1: sprintf(filnm, "sub");
         break;
      case 2: sprintf(filnm, "up");
         break;
      case 3: sprintf(filnm, "avg");
         break;
      case 4: sprintf(filnm, "Paeth");
         break;
#endif
      default: sprintf(filnm, "unknw");
         break;
   }
   png_debug2(0, "row=%5d, %s, ", png_ptr->row_number, filnm);
   png_debug2(0, "pd=%2d, b=%d, ", (int)row_info->pixel_depth,
      (int)((row_info->pixel_depth + 7) >> 3));
   png_debug1(0, "len=%8d, ", row_info->rowbytes);
#endif /* PNG_DEBUG */
   switch (filter)
   {
      case PNG_FILTER_VALUE_NONE:
         break;

      case PNG_FILTER_VALUE_SUB:
      {
#if !defined(PNG_1_0_X)
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
#else
         if (mmx_supported)
#endif
         {
            png_read_filter_row_mmx_sub(row_info, row);
         }
         else
         {
            png_uint_32 i;
            png_uint_32 istop = row_info->rowbytes;
            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
            png_bytep rp = row + bpp;
            png_bytep lp = row;

            for (i = bpp; i < istop; i++)
            {
               *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
               rp++;
            }
         }
         break;
      }

      case PNG_FILTER_VALUE_UP:
      {
#if !defined(PNG_1_0_X)
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
#else
         if (mmx_supported)
#endif
         {
            png_read_filter_row_mmx_up(row_info, row, prev_row);
         }
         else
         {
            png_uint_32 i;
            png_uint_32 istop = row_info->rowbytes;
            png_bytep rp = row;
            png_bytep pp = prev_row;

            for (i = 0; i < istop; ++i)
            {
               *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
               rp++;
            }
         }
         break;
      }

      case PNG_FILTER_VALUE_AVG:
      {
#if !defined(PNG_1_0_X)
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
#else
         if (mmx_supported)
#endif
         {
            png_read_filter_row_mmx_avg(row_info, row, prev_row);
         }
         else
         {
            png_uint_32 i;
            png_bytep rp = row;
            png_bytep pp = prev_row;
            png_bytep lp = row;
            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
            png_uint_32 istop = row_info->rowbytes - bpp;

            // first bpp bytes have no Raw(x-bpp); the average is Prior(x)/2
            for (i = 0; i < bpp; i++)
            {
               *rp = (png_byte)(((int)(*rp) +
                  ((int)(*pp++) >> 1)) & 0xff);
               rp++;
            }

            for (i = 0; i < istop; i++)
            {
               *rp = (png_byte)(((int)(*rp) +
                  ((int)(*pp++ + *lp++) >> 1)) & 0xff);
               rp++;
            }
         }
         break;
      }

      case PNG_FILTER_VALUE_PAETH:
      {
#if !defined(PNG_1_0_X)
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
#else
         if (mmx_supported)
#endif
         {
            png_read_filter_row_mmx_paeth(row_info, row, prev_row);
         }
         else
         {
            png_uint_32 i;
            png_bytep rp = row;
            png_bytep pp = prev_row;
            png_bytep lp = row;
            png_bytep cp = prev_row;
            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
            png_uint_32 istop = row_info->rowbytes - bpp;

            // first bpp bytes: a and c fall off the row, so the predictor is b
            for (i = 0; i < bpp; i++)
            {
               *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
               rp++;
            }

            for (i = 0; i < istop; i++)   // use leftover rp, pp
            {
               int a, b, c, pa, pb, pc, p;

               a = *lp++;
               b = *pp++;
               c = *cp++;

               p = b - c;
               pc = a - c;

#ifdef PNG_USE_ABS
               pa = abs(p);
               pb = abs(pc);
               pc = abs(p + pc);
#else
               pa = p < 0 ? -p : p;
               pb = pc < 0 ? -pc : pc;
               pc = (p + pc) < 0 ? -(p + pc) : p + pc;
#endif
               /*
                  if (pa <= pb && pa <= pc)
                     p = a;
                  else if (pb <= pc)
                     p = b;
                  else
                     p = c;
                */
               p = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c;

               *rp = (png_byte)(((int)(*rp) + p) & 0xff);
               rp++;
            }
         }
         break;
      }

      default:
         png_warning(png_ptr, "Ignoring bad row filter type");
         *row = 0;
         break;
   }
}
#endif /* PNG_ASSEMBLER_CODE_SUPPORTED && PNG_USE_PNGVCRD */