sha256_sse4.cpp
  1. // Copyright (c) 2017 The Starwels developers
  2. // Distributed under the MIT software license, see the accompanying
  3. // file COPYING or http://www.opensource.org/licenses/mit-license.php.
  4. //
  5. // This is a translation to GCC extended asm syntax from YASM code by Intel
  6. // (available at the bottom of this file).
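// It exposes a single function, sha256_sse4::Transform, which runs the SHA-256
// compression function over 'blocks' consecutive 64-byte chunks pointed to by
// 'chunk', updating the eight 32-bit state words in 's'.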
  7. #include <stdint.h>
  8. #include <stdlib.h>
  9. #if defined(__x86_64__) || defined(__amd64__)
  10. namespace sha256_sse4
  11. {
  12. void Transform(uint32_t* s, const unsigned char* chunk, size_t blocks)
  13. {
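// The 64 SHA-256 round constants and the byte-shuffle masks (byte-order flip,
// xBxA->00BA, xDxC->DC00) used by the vectorized message schedule below.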
  14. static const uint32_t K256 alignas(16) [] = {
  15. 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
  16. 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
  17. 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
  18. 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
  19. 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
  20. 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
  21. 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
  22. 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
  23. 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
  24. 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
  25. 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
  26. 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
  27. 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
  28. 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
  29. 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
  30. 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
  31. };
  32. static const uint32_t FLIP_MASK alignas(16) [] = {0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f};
  33. static const uint32_t SHUF_00BA alignas(16) [] = {0x03020100, 0x0b0a0908, 0xffffffff, 0xffffffff};
  34. static const uint32_t SHUF_DC00 alignas(16) [] = {0xffffffff, 0xffffffff, 0x03020100, 0x0b0a0908};
  35. uint32_t a, b, c, d, f, g, h, y0, y1, y2;
  36. uint64_t tbl;
  37. uint64_t inp_end, inp;
  38. uint32_t xfer alignas(16) [4];
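// Operand map for the asm block (see the constraint lists at the bottom):
// %0=s, %1=chunk (reused as the round-group counter inside the loops),
// %2=blocks (its 32-bit half %k2 later holds the state word e),
// %3-%9=a,b,c,d,f,g,h, %10-%12=y0,y1,y2, %13=tbl, %14=inp_end, %15=inp,
// %16=xfer, %17=K256, %18=FLIP_MASK, %19=SHUF_00BA, %20=SHUF_DC00.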
  39. __asm__ __volatile__(
  40. "shl $0x6,%2;"
  41. "je Ldone_hash_%=;"
  42. "add %1,%2;"
  43. "mov %2,%14;"
  44. "mov (%0),%3;"
  45. "mov 0x4(%0),%4;"
  46. "mov 0x8(%0),%5;"
  47. "mov 0xc(%0),%6;"
  48. "mov 0x10(%0),%k2;"
  49. "mov 0x14(%0),%7;"
  50. "mov 0x18(%0),%8;"
  51. "mov 0x1c(%0),%9;"
  52. "movdqa %18,%%xmm12;"
  53. "movdqa %19,%%xmm10;"
  54. "movdqa %20,%%xmm11;"
  55. "Lloop0_%=:"
  56. "lea %17,%13;"
  57. "movdqu (%1),%%xmm4;"
  58. "pshufb %%xmm12,%%xmm4;"
  59. "movdqu 0x10(%1),%%xmm5;"
  60. "pshufb %%xmm12,%%xmm5;"
  61. "movdqu 0x20(%1),%%xmm6;"
  62. "pshufb %%xmm12,%%xmm6;"
  63. "movdqu 0x30(%1),%%xmm7;"
  64. "pshufb %%xmm12,%%xmm7;"
  65. "mov %1,%15;"
  66. "mov $3,%1;"
  67. "Lloop1_%=:"
  68. "movdqa 0x0(%13),%%xmm9;"
  69. "paddd %%xmm4,%%xmm9;"
  70. "movdqa %%xmm9,%16;"
  71. "movdqa %%xmm7,%%xmm0;"
  72. "mov %k2,%10;"
  73. "ror $0xe,%10;"
  74. "mov %3,%11;"
  75. "palignr $0x4,%%xmm6,%%xmm0;"
  76. "ror $0x9,%11;"
  77. "xor %k2,%10;"
  78. "mov %7,%12;"
  79. "ror $0x5,%10;"
  80. "movdqa %%xmm5,%%xmm1;"
  81. "xor %3,%11;"
  82. "xor %8,%12;"
  83. "paddd %%xmm4,%%xmm0;"
  84. "xor %k2,%10;"
  85. "and %k2,%12;"
  86. "ror $0xb,%11;"
  87. "palignr $0x4,%%xmm4,%%xmm1;"
  88. "xor %3,%11;"
  89. "ror $0x6,%10;"
  90. "xor %8,%12;"
  91. "movdqa %%xmm1,%%xmm2;"
  92. "ror $0x2,%11;"
  93. "add %10,%12;"
  94. "add %16,%12;"
  95. "movdqa %%xmm1,%%xmm3;"
  96. "mov %3,%10;"
  97. "add %12,%9;"
  98. "mov %3,%12;"
  99. "pslld $0x19,%%xmm1;"
  100. "or %5,%10;"
  101. "add %9,%6;"
  102. "and %5,%12;"
  103. "psrld $0x7,%%xmm2;"
  104. "and %4,%10;"
  105. "add %11,%9;"
  106. "por %%xmm2,%%xmm1;"
  107. "or %12,%10;"
  108. "add %10,%9;"
  109. "movdqa %%xmm3,%%xmm2;"
  110. "mov %6,%10;"
  111. "mov %9,%11;"
  112. "movdqa %%xmm3,%%xmm8;"
  113. "ror $0xe,%10;"
  114. "xor %6,%10;"
  115. "mov %k2,%12;"
  116. "ror $0x9,%11;"
  117. "pslld $0xe,%%xmm3;"
  118. "xor %9,%11;"
  119. "ror $0x5,%10;"
  120. "xor %7,%12;"
  121. "psrld $0x12,%%xmm2;"
  122. "ror $0xb,%11;"
  123. "xor %6,%10;"
  124. "and %6,%12;"
  125. "ror $0x6,%10;"
  126. "pxor %%xmm3,%%xmm1;"
  127. "xor %9,%11;"
  128. "xor %7,%12;"
  129. "psrld $0x3,%%xmm8;"
  130. "add %10,%12;"
  131. "add 4+%16,%12;"
  132. "ror $0x2,%11;"
  133. "pxor %%xmm2,%%xmm1;"
  134. "mov %9,%10;"
  135. "add %12,%8;"
  136. "mov %9,%12;"
  137. "pxor %%xmm8,%%xmm1;"
  138. "or %4,%10;"
  139. "add %8,%5;"
  140. "and %4,%12;"
  141. "pshufd $0xfa,%%xmm7,%%xmm2;"
  142. "and %3,%10;"
  143. "add %11,%8;"
  144. "paddd %%xmm1,%%xmm0;"
  145. "or %12,%10;"
  146. "add %10,%8;"
  147. "movdqa %%xmm2,%%xmm3;"
  148. "mov %5,%10;"
  149. "mov %8,%11;"
  150. "ror $0xe,%10;"
  151. "movdqa %%xmm2,%%xmm8;"
  152. "xor %5,%10;"
  153. "ror $0x9,%11;"
  154. "mov %6,%12;"
  155. "xor %8,%11;"
  156. "ror $0x5,%10;"
  157. "psrlq $0x11,%%xmm2;"
  158. "xor %k2,%12;"
  159. "psrlq $0x13,%%xmm3;"
  160. "xor %5,%10;"
  161. "and %5,%12;"
  162. "psrld $0xa,%%xmm8;"
  163. "ror $0xb,%11;"
  164. "xor %8,%11;"
  165. "xor %k2,%12;"
  166. "ror $0x6,%10;"
  167. "pxor %%xmm3,%%xmm2;"
  168. "add %10,%12;"
  169. "ror $0x2,%11;"
  170. "add 8+%16,%12;"
  171. "pxor %%xmm2,%%xmm8;"
  172. "mov %8,%10;"
  173. "add %12,%7;"
  174. "mov %8,%12;"
  175. "pshufb %%xmm10,%%xmm8;"
  176. "or %3,%10;"
  177. "add %7,%4;"
  178. "and %3,%12;"
  179. "paddd %%xmm8,%%xmm0;"
  180. "and %9,%10;"
  181. "add %11,%7;"
  182. "pshufd $0x50,%%xmm0,%%xmm2;"
  183. "or %12,%10;"
  184. "add %10,%7;"
  185. "movdqa %%xmm2,%%xmm3;"
  186. "mov %4,%10;"
  187. "ror $0xe,%10;"
  188. "mov %7,%11;"
  189. "movdqa %%xmm2,%%xmm4;"
  190. "ror $0x9,%11;"
  191. "xor %4,%10;"
  192. "mov %5,%12;"
  193. "ror $0x5,%10;"
  194. "psrlq $0x11,%%xmm2;"
  195. "xor %7,%11;"
  196. "xor %6,%12;"
  197. "psrlq $0x13,%%xmm3;"
  198. "xor %4,%10;"
  199. "and %4,%12;"
  200. "ror $0xb,%11;"
  201. "psrld $0xa,%%xmm4;"
  202. "xor %7,%11;"
  203. "ror $0x6,%10;"
  204. "xor %6,%12;"
  205. "pxor %%xmm3,%%xmm2;"
  206. "ror $0x2,%11;"
  207. "add %10,%12;"
  208. "add 12+%16,%12;"
  209. "pxor %%xmm2,%%xmm4;"
  210. "mov %7,%10;"
  211. "add %12,%k2;"
  212. "mov %7,%12;"
  213. "pshufb %%xmm11,%%xmm4;"
  214. "or %9,%10;"
  215. "add %k2,%3;"
  216. "and %9,%12;"
  217. "paddd %%xmm0,%%xmm4;"
  218. "and %8,%10;"
  219. "add %11,%k2;"
  220. "or %12,%10;"
  221. "add %10,%k2;"
  222. "movdqa 0x10(%13),%%xmm9;"
  223. "paddd %%xmm5,%%xmm9;"
  224. "movdqa %%xmm9,%16;"
  225. "movdqa %%xmm4,%%xmm0;"
  226. "mov %3,%10;"
  227. "ror $0xe,%10;"
  228. "mov %k2,%11;"
  229. "palignr $0x4,%%xmm7,%%xmm0;"
  230. "ror $0x9,%11;"
  231. "xor %3,%10;"
  232. "mov %4,%12;"
  233. "ror $0x5,%10;"
  234. "movdqa %%xmm6,%%xmm1;"
  235. "xor %k2,%11;"
  236. "xor %5,%12;"
  237. "paddd %%xmm5,%%xmm0;"
  238. "xor %3,%10;"
  239. "and %3,%12;"
  240. "ror $0xb,%11;"
  241. "palignr $0x4,%%xmm5,%%xmm1;"
  242. "xor %k2,%11;"
  243. "ror $0x6,%10;"
  244. "xor %5,%12;"
  245. "movdqa %%xmm1,%%xmm2;"
  246. "ror $0x2,%11;"
  247. "add %10,%12;"
  248. "add %16,%12;"
  249. "movdqa %%xmm1,%%xmm3;"
  250. "mov %k2,%10;"
  251. "add %12,%6;"
  252. "mov %k2,%12;"
  253. "pslld $0x19,%%xmm1;"
  254. "or %8,%10;"
  255. "add %6,%9;"
  256. "and %8,%12;"
  257. "psrld $0x7,%%xmm2;"
  258. "and %7,%10;"
  259. "add %11,%6;"
  260. "por %%xmm2,%%xmm1;"
  261. "or %12,%10;"
  262. "add %10,%6;"
  263. "movdqa %%xmm3,%%xmm2;"
  264. "mov %9,%10;"
  265. "mov %6,%11;"
  266. "movdqa %%xmm3,%%xmm8;"
  267. "ror $0xe,%10;"
  268. "xor %9,%10;"
  269. "mov %3,%12;"
  270. "ror $0x9,%11;"
  271. "pslld $0xe,%%xmm3;"
  272. "xor %6,%11;"
  273. "ror $0x5,%10;"
  274. "xor %4,%12;"
  275. "psrld $0x12,%%xmm2;"
  276. "ror $0xb,%11;"
  277. "xor %9,%10;"
  278. "and %9,%12;"
  279. "ror $0x6,%10;"
  280. "pxor %%xmm3,%%xmm1;"
  281. "xor %6,%11;"
  282. "xor %4,%12;"
  283. "psrld $0x3,%%xmm8;"
  284. "add %10,%12;"
  285. "add 4+%16,%12;"
  286. "ror $0x2,%11;"
  287. "pxor %%xmm2,%%xmm1;"
  288. "mov %6,%10;"
  289. "add %12,%5;"
  290. "mov %6,%12;"
  291. "pxor %%xmm8,%%xmm1;"
  292. "or %7,%10;"
  293. "add %5,%8;"
  294. "and %7,%12;"
  295. "pshufd $0xfa,%%xmm4,%%xmm2;"
  296. "and %k2,%10;"
  297. "add %11,%5;"
  298. "paddd %%xmm1,%%xmm0;"
  299. "or %12,%10;"
  300. "add %10,%5;"
  301. "movdqa %%xmm2,%%xmm3;"
  302. "mov %8,%10;"
  303. "mov %5,%11;"
  304. "ror $0xe,%10;"
  305. "movdqa %%xmm2,%%xmm8;"
  306. "xor %8,%10;"
  307. "ror $0x9,%11;"
  308. "mov %9,%12;"
  309. "xor %5,%11;"
  310. "ror $0x5,%10;"
  311. "psrlq $0x11,%%xmm2;"
  312. "xor %3,%12;"
  313. "psrlq $0x13,%%xmm3;"
  314. "xor %8,%10;"
  315. "and %8,%12;"
  316. "psrld $0xa,%%xmm8;"
  317. "ror $0xb,%11;"
  318. "xor %5,%11;"
  319. "xor %3,%12;"
  320. "ror $0x6,%10;"
  321. "pxor %%xmm3,%%xmm2;"
  322. "add %10,%12;"
  323. "ror $0x2,%11;"
  324. "add 8+%16,%12;"
  325. "pxor %%xmm2,%%xmm8;"
  326. "mov %5,%10;"
  327. "add %12,%4;"
  328. "mov %5,%12;"
  329. "pshufb %%xmm10,%%xmm8;"
  330. "or %k2,%10;"
  331. "add %4,%7;"
  332. "and %k2,%12;"
  333. "paddd %%xmm8,%%xmm0;"
  334. "and %6,%10;"
  335. "add %11,%4;"
  336. "pshufd $0x50,%%xmm0,%%xmm2;"
  337. "or %12,%10;"
  338. "add %10,%4;"
  339. "movdqa %%xmm2,%%xmm3;"
  340. "mov %7,%10;"
  341. "ror $0xe,%10;"
  342. "mov %4,%11;"
  343. "movdqa %%xmm2,%%xmm5;"
  344. "ror $0x9,%11;"
  345. "xor %7,%10;"
  346. "mov %8,%12;"
  347. "ror $0x5,%10;"
  348. "psrlq $0x11,%%xmm2;"
  349. "xor %4,%11;"
  350. "xor %9,%12;"
  351. "psrlq $0x13,%%xmm3;"
  352. "xor %7,%10;"
  353. "and %7,%12;"
  354. "ror $0xb,%11;"
  355. "psrld $0xa,%%xmm5;"
  356. "xor %4,%11;"
  357. "ror $0x6,%10;"
  358. "xor %9,%12;"
  359. "pxor %%xmm3,%%xmm2;"
  360. "ror $0x2,%11;"
  361. "add %10,%12;"
  362. "add 12+%16,%12;"
  363. "pxor %%xmm2,%%xmm5;"
  364. "mov %4,%10;"
  365. "add %12,%3;"
  366. "mov %4,%12;"
  367. "pshufb %%xmm11,%%xmm5;"
  368. "or %6,%10;"
  369. "add %3,%k2;"
  370. "and %6,%12;"
  371. "paddd %%xmm0,%%xmm5;"
  372. "and %5,%10;"
  373. "add %11,%3;"
  374. "or %12,%10;"
  375. "add %10,%3;"
  376. "movdqa 0x20(%13),%%xmm9;"
  377. "paddd %%xmm6,%%xmm9;"
  378. "movdqa %%xmm9,%16;"
  379. "movdqa %%xmm5,%%xmm0;"
  380. "mov %k2,%10;"
  381. "ror $0xe,%10;"
  382. "mov %3,%11;"
  383. "palignr $0x4,%%xmm4,%%xmm0;"
  384. "ror $0x9,%11;"
  385. "xor %k2,%10;"
  386. "mov %7,%12;"
  387. "ror $0x5,%10;"
  388. "movdqa %%xmm7,%%xmm1;"
  389. "xor %3,%11;"
  390. "xor %8,%12;"
  391. "paddd %%xmm6,%%xmm0;"
  392. "xor %k2,%10;"
  393. "and %k2,%12;"
  394. "ror $0xb,%11;"
  395. "palignr $0x4,%%xmm6,%%xmm1;"
  396. "xor %3,%11;"
  397. "ror $0x6,%10;"
  398. "xor %8,%12;"
  399. "movdqa %%xmm1,%%xmm2;"
  400. "ror $0x2,%11;"
  401. "add %10,%12;"
  402. "add %16,%12;"
  403. "movdqa %%xmm1,%%xmm3;"
  404. "mov %3,%10;"
  405. "add %12,%9;"
  406. "mov %3,%12;"
  407. "pslld $0x19,%%xmm1;"
  408. "or %5,%10;"
  409. "add %9,%6;"
  410. "and %5,%12;"
  411. "psrld $0x7,%%xmm2;"
  412. "and %4,%10;"
  413. "add %11,%9;"
  414. "por %%xmm2,%%xmm1;"
  415. "or %12,%10;"
  416. "add %10,%9;"
  417. "movdqa %%xmm3,%%xmm2;"
  418. "mov %6,%10;"
  419. "mov %9,%11;"
  420. "movdqa %%xmm3,%%xmm8;"
  421. "ror $0xe,%10;"
  422. "xor %6,%10;"
  423. "mov %k2,%12;"
  424. "ror $0x9,%11;"
  425. "pslld $0xe,%%xmm3;"
  426. "xor %9,%11;"
  427. "ror $0x5,%10;"
  428. "xor %7,%12;"
  429. "psrld $0x12,%%xmm2;"
  430. "ror $0xb,%11;"
  431. "xor %6,%10;"
  432. "and %6,%12;"
  433. "ror $0x6,%10;"
  434. "pxor %%xmm3,%%xmm1;"
  435. "xor %9,%11;"
  436. "xor %7,%12;"
  437. "psrld $0x3,%%xmm8;"
  438. "add %10,%12;"
  439. "add 4+%16,%12;"
  440. "ror $0x2,%11;"
  441. "pxor %%xmm2,%%xmm1;"
  442. "mov %9,%10;"
  443. "add %12,%8;"
  444. "mov %9,%12;"
  445. "pxor %%xmm8,%%xmm1;"
  446. "or %4,%10;"
  447. "add %8,%5;"
  448. "and %4,%12;"
  449. "pshufd $0xfa,%%xmm5,%%xmm2;"
  450. "and %3,%10;"
  451. "add %11,%8;"
  452. "paddd %%xmm1,%%xmm0;"
  453. "or %12,%10;"
  454. "add %10,%8;"
  455. "movdqa %%xmm2,%%xmm3;"
  456. "mov %5,%10;"
  457. "mov %8,%11;"
  458. "ror $0xe,%10;"
  459. "movdqa %%xmm2,%%xmm8;"
  460. "xor %5,%10;"
  461. "ror $0x9,%11;"
  462. "mov %6,%12;"
  463. "xor %8,%11;"
  464. "ror $0x5,%10;"
  465. "psrlq $0x11,%%xmm2;"
  466. "xor %k2,%12;"
  467. "psrlq $0x13,%%xmm3;"
  468. "xor %5,%10;"
  469. "and %5,%12;"
  470. "psrld $0xa,%%xmm8;"
  471. "ror $0xb,%11;"
  472. "xor %8,%11;"
  473. "xor %k2,%12;"
  474. "ror $0x6,%10;"
  475. "pxor %%xmm3,%%xmm2;"
  476. "add %10,%12;"
  477. "ror $0x2,%11;"
  478. "add 8+%16,%12;"
  479. "pxor %%xmm2,%%xmm8;"
  480. "mov %8,%10;"
  481. "add %12,%7;"
  482. "mov %8,%12;"
  483. "pshufb %%xmm10,%%xmm8;"
  484. "or %3,%10;"
  485. "add %7,%4;"
  486. "and %3,%12;"
  487. "paddd %%xmm8,%%xmm0;"
  488. "and %9,%10;"
  489. "add %11,%7;"
  490. "pshufd $0x50,%%xmm0,%%xmm2;"
  491. "or %12,%10;"
  492. "add %10,%7;"
  493. "movdqa %%xmm2,%%xmm3;"
  494. "mov %4,%10;"
  495. "ror $0xe,%10;"
  496. "mov %7,%11;"
  497. "movdqa %%xmm2,%%xmm6;"
  498. "ror $0x9,%11;"
  499. "xor %4,%10;"
  500. "mov %5,%12;"
  501. "ror $0x5,%10;"
  502. "psrlq $0x11,%%xmm2;"
  503. "xor %7,%11;"
  504. "xor %6,%12;"
  505. "psrlq $0x13,%%xmm3;"
  506. "xor %4,%10;"
  507. "and %4,%12;"
  508. "ror $0xb,%11;"
  509. "psrld $0xa,%%xmm6;"
  510. "xor %7,%11;"
  511. "ror $0x6,%10;"
  512. "xor %6,%12;"
  513. "pxor %%xmm3,%%xmm2;"
  514. "ror $0x2,%11;"
  515. "add %10,%12;"
  516. "add 12+%16,%12;"
  517. "pxor %%xmm2,%%xmm6;"
  518. "mov %7,%10;"
  519. "add %12,%k2;"
  520. "mov %7,%12;"
  521. "pshufb %%xmm11,%%xmm6;"
  522. "or %9,%10;"
  523. "add %k2,%3;"
  524. "and %9,%12;"
  525. "paddd %%xmm0,%%xmm6;"
  526. "and %8,%10;"
  527. "add %11,%k2;"
  528. "or %12,%10;"
  529. "add %10,%k2;"
  530. "movdqa 0x30(%13),%%xmm9;"
  531. "paddd %%xmm7,%%xmm9;"
  532. "movdqa %%xmm9,%16;"
  533. "add $0x40,%13;"
  534. "movdqa %%xmm6,%%xmm0;"
  535. "mov %3,%10;"
  536. "ror $0xe,%10;"
  537. "mov %k2,%11;"
  538. "palignr $0x4,%%xmm5,%%xmm0;"
  539. "ror $0x9,%11;"
  540. "xor %3,%10;"
  541. "mov %4,%12;"
  542. "ror $0x5,%10;"
  543. "movdqa %%xmm4,%%xmm1;"
  544. "xor %k2,%11;"
  545. "xor %5,%12;"
  546. "paddd %%xmm7,%%xmm0;"
  547. "xor %3,%10;"
  548. "and %3,%12;"
  549. "ror $0xb,%11;"
  550. "palignr $0x4,%%xmm7,%%xmm1;"
  551. "xor %k2,%11;"
  552. "ror $0x6,%10;"
  553. "xor %5,%12;"
  554. "movdqa %%xmm1,%%xmm2;"
  555. "ror $0x2,%11;"
  556. "add %10,%12;"
  557. "add %16,%12;"
  558. "movdqa %%xmm1,%%xmm3;"
  559. "mov %k2,%10;"
  560. "add %12,%6;"
  561. "mov %k2,%12;"
  562. "pslld $0x19,%%xmm1;"
  563. "or %8,%10;"
  564. "add %6,%9;"
  565. "and %8,%12;"
  566. "psrld $0x7,%%xmm2;"
  567. "and %7,%10;"
  568. "add %11,%6;"
  569. "por %%xmm2,%%xmm1;"
  570. "or %12,%10;"
  571. "add %10,%6;"
  572. "movdqa %%xmm3,%%xmm2;"
  573. "mov %9,%10;"
  574. "mov %6,%11;"
  575. "movdqa %%xmm3,%%xmm8;"
  576. "ror $0xe,%10;"
  577. "xor %9,%10;"
  578. "mov %3,%12;"
  579. "ror $0x9,%11;"
  580. "pslld $0xe,%%xmm3;"
  581. "xor %6,%11;"
  582. "ror $0x5,%10;"
  583. "xor %4,%12;"
  584. "psrld $0x12,%%xmm2;"
  585. "ror $0xb,%11;"
  586. "xor %9,%10;"
  587. "and %9,%12;"
  588. "ror $0x6,%10;"
  589. "pxor %%xmm3,%%xmm1;"
  590. "xor %6,%11;"
  591. "xor %4,%12;"
  592. "psrld $0x3,%%xmm8;"
  593. "add %10,%12;"
  594. "add 4+%16,%12;"
  595. "ror $0x2,%11;"
  596. "pxor %%xmm2,%%xmm1;"
  597. "mov %6,%10;"
  598. "add %12,%5;"
  599. "mov %6,%12;"
  600. "pxor %%xmm8,%%xmm1;"
  601. "or %7,%10;"
  602. "add %5,%8;"
  603. "and %7,%12;"
  604. "pshufd $0xfa,%%xmm6,%%xmm2;"
  605. "and %k2,%10;"
  606. "add %11,%5;"
  607. "paddd %%xmm1,%%xmm0;"
  608. "or %12,%10;"
  609. "add %10,%5;"
  610. "movdqa %%xmm2,%%xmm3;"
  611. "mov %8,%10;"
  612. "mov %5,%11;"
  613. "ror $0xe,%10;"
  614. "movdqa %%xmm2,%%xmm8;"
  615. "xor %8,%10;"
  616. "ror $0x9,%11;"
  617. "mov %9,%12;"
  618. "xor %5,%11;"
  619. "ror $0x5,%10;"
  620. "psrlq $0x11,%%xmm2;"
  621. "xor %3,%12;"
  622. "psrlq $0x13,%%xmm3;"
  623. "xor %8,%10;"
  624. "and %8,%12;"
  625. "psrld $0xa,%%xmm8;"
  626. "ror $0xb,%11;"
  627. "xor %5,%11;"
  628. "xor %3,%12;"
  629. "ror $0x6,%10;"
  630. "pxor %%xmm3,%%xmm2;"
  631. "add %10,%12;"
  632. "ror $0x2,%11;"
  633. "add 8+%16,%12;"
  634. "pxor %%xmm2,%%xmm8;"
  635. "mov %5,%10;"
  636. "add %12,%4;"
  637. "mov %5,%12;"
  638. "pshufb %%xmm10,%%xmm8;"
  639. "or %k2,%10;"
  640. "add %4,%7;"
  641. "and %k2,%12;"
  642. "paddd %%xmm8,%%xmm0;"
  643. "and %6,%10;"
  644. "add %11,%4;"
  645. "pshufd $0x50,%%xmm0,%%xmm2;"
  646. "or %12,%10;"
  647. "add %10,%4;"
  648. "movdqa %%xmm2,%%xmm3;"
  649. "mov %7,%10;"
  650. "ror $0xe,%10;"
  651. "mov %4,%11;"
  652. "movdqa %%xmm2,%%xmm7;"
  653. "ror $0x9,%11;"
  654. "xor %7,%10;"
  655. "mov %8,%12;"
  656. "ror $0x5,%10;"
  657. "psrlq $0x11,%%xmm2;"
  658. "xor %4,%11;"
  659. "xor %9,%12;"
  660. "psrlq $0x13,%%xmm3;"
  661. "xor %7,%10;"
  662. "and %7,%12;"
  663. "ror $0xb,%11;"
  664. "psrld $0xa,%%xmm7;"
  665. "xor %4,%11;"
  666. "ror $0x6,%10;"
  667. "xor %9,%12;"
  668. "pxor %%xmm3,%%xmm2;"
  669. "ror $0x2,%11;"
  670. "add %10,%12;"
  671. "add 12+%16,%12;"
  672. "pxor %%xmm2,%%xmm7;"
  673. "mov %4,%10;"
  674. "add %12,%3;"
  675. "mov %4,%12;"
  676. "pshufb %%xmm11,%%xmm7;"
  677. "or %6,%10;"
  678. "add %3,%k2;"
  679. "and %6,%12;"
  680. "paddd %%xmm0,%%xmm7;"
  681. "and %5,%10;"
  682. "add %11,%3;"
  683. "or %12,%10;"
  684. "add %10,%3;"
  685. "sub $0x1,%1;"
  686. "jne Lloop1_%=;"
  687. "mov $0x2,%1;"
  688. "Lloop2_%=:"
  689. "paddd 0x0(%13),%%xmm4;"
  690. "movdqa %%xmm4,%16;"
  691. "mov %k2,%10;"
  692. "ror $0xe,%10;"
  693. "mov %3,%11;"
  694. "xor %k2,%10;"
  695. "ror $0x9,%11;"
  696. "mov %7,%12;"
  697. "xor %3,%11;"
  698. "ror $0x5,%10;"
  699. "xor %8,%12;"
  700. "xor %k2,%10;"
  701. "ror $0xb,%11;"
  702. "and %k2,%12;"
  703. "xor %3,%11;"
  704. "ror $0x6,%10;"
  705. "xor %8,%12;"
  706. "add %10,%12;"
  707. "ror $0x2,%11;"
  708. "add %16,%12;"
  709. "mov %3,%10;"
  710. "add %12,%9;"
  711. "mov %3,%12;"
  712. "or %5,%10;"
  713. "add %9,%6;"
  714. "and %5,%12;"
  715. "and %4,%10;"
  716. "add %11,%9;"
  717. "or %12,%10;"
  718. "add %10,%9;"
  719. "mov %6,%10;"
  720. "ror $0xe,%10;"
  721. "mov %9,%11;"
  722. "xor %6,%10;"
  723. "ror $0x9,%11;"
  724. "mov %k2,%12;"
  725. "xor %9,%11;"
  726. "ror $0x5,%10;"
  727. "xor %7,%12;"
  728. "xor %6,%10;"
  729. "ror $0xb,%11;"
  730. "and %6,%12;"
  731. "xor %9,%11;"
  732. "ror $0x6,%10;"
  733. "xor %7,%12;"
  734. "add %10,%12;"
  735. "ror $0x2,%11;"
  736. "add 4+%16,%12;"
  737. "mov %9,%10;"
  738. "add %12,%8;"
  739. "mov %9,%12;"
  740. "or %4,%10;"
  741. "add %8,%5;"
  742. "and %4,%12;"
  743. "and %3,%10;"
  744. "add %11,%8;"
  745. "or %12,%10;"
  746. "add %10,%8;"
  747. "mov %5,%10;"
  748. "ror $0xe,%10;"
  749. "mov %8,%11;"
  750. "xor %5,%10;"
  751. "ror $0x9,%11;"
  752. "mov %6,%12;"
  753. "xor %8,%11;"
  754. "ror $0x5,%10;"
  755. "xor %k2,%12;"
  756. "xor %5,%10;"
  757. "ror $0xb,%11;"
  758. "and %5,%12;"
  759. "xor %8,%11;"
  760. "ror $0x6,%10;"
  761. "xor %k2,%12;"
  762. "add %10,%12;"
  763. "ror $0x2,%11;"
  764. "add 8+%16,%12;"
  765. "mov %8,%10;"
  766. "add %12,%7;"
  767. "mov %8,%12;"
  768. "or %3,%10;"
  769. "add %7,%4;"
  770. "and %3,%12;"
  771. "and %9,%10;"
  772. "add %11,%7;"
  773. "or %12,%10;"
  774. "add %10,%7;"
  775. "mov %4,%10;"
  776. "ror $0xe,%10;"
  777. "mov %7,%11;"
  778. "xor %4,%10;"
  779. "ror $0x9,%11;"
  780. "mov %5,%12;"
  781. "xor %7,%11;"
  782. "ror $0x5,%10;"
  783. "xor %6,%12;"
  784. "xor %4,%10;"
  785. "ror $0xb,%11;"
  786. "and %4,%12;"
  787. "xor %7,%11;"
  788. "ror $0x6,%10;"
  789. "xor %6,%12;"
  790. "add %10,%12;"
  791. "ror $0x2,%11;"
  792. "add 12+%16,%12;"
  793. "mov %7,%10;"
  794. "add %12,%k2;"
  795. "mov %7,%12;"
  796. "or %9,%10;"
  797. "add %k2,%3;"
  798. "and %9,%12;"
  799. "and %8,%10;"
  800. "add %11,%k2;"
  801. "or %12,%10;"
  802. "add %10,%k2;"
  803. "paddd 0x10(%13),%%xmm5;"
  804. "movdqa %%xmm5,%16;"
  805. "add $0x20,%13;"
  806. "mov %3,%10;"
  807. "ror $0xe,%10;"
  808. "mov %k2,%11;"
  809. "xor %3,%10;"
  810. "ror $0x9,%11;"
  811. "mov %4,%12;"
  812. "xor %k2,%11;"
  813. "ror $0x5,%10;"
  814. "xor %5,%12;"
  815. "xor %3,%10;"
  816. "ror $0xb,%11;"
  817. "and %3,%12;"
  818. "xor %k2,%11;"
  819. "ror $0x6,%10;"
  820. "xor %5,%12;"
  821. "add %10,%12;"
  822. "ror $0x2,%11;"
  823. "add %16,%12;"
  824. "mov %k2,%10;"
  825. "add %12,%6;"
  826. "mov %k2,%12;"
  827. "or %8,%10;"
  828. "add %6,%9;"
  829. "and %8,%12;"
  830. "and %7,%10;"
  831. "add %11,%6;"
  832. "or %12,%10;"
  833. "add %10,%6;"
  834. "mov %9,%10;"
  835. "ror $0xe,%10;"
  836. "mov %6,%11;"
  837. "xor %9,%10;"
  838. "ror $0x9,%11;"
  839. "mov %3,%12;"
  840. "xor %6,%11;"
  841. "ror $0x5,%10;"
  842. "xor %4,%12;"
  843. "xor %9,%10;"
  844. "ror $0xb,%11;"
  845. "and %9,%12;"
  846. "xor %6,%11;"
  847. "ror $0x6,%10;"
  848. "xor %4,%12;"
  849. "add %10,%12;"
  850. "ror $0x2,%11;"
  851. "add 4+%16,%12;"
  852. "mov %6,%10;"
  853. "add %12,%5;"
  854. "mov %6,%12;"
  855. "or %7,%10;"
  856. "add %5,%8;"
  857. "and %7,%12;"
  858. "and %k2,%10;"
  859. "add %11,%5;"
  860. "or %12,%10;"
  861. "add %10,%5;"
  862. "mov %8,%10;"
  863. "ror $0xe,%10;"
  864. "mov %5,%11;"
  865. "xor %8,%10;"
  866. "ror $0x9,%11;"
  867. "mov %9,%12;"
  868. "xor %5,%11;"
  869. "ror $0x5,%10;"
  870. "xor %3,%12;"
  871. "xor %8,%10;"
  872. "ror $0xb,%11;"
  873. "and %8,%12;"
  874. "xor %5,%11;"
  875. "ror $0x6,%10;"
  876. "xor %3,%12;"
  877. "add %10,%12;"
  878. "ror $0x2,%11;"
  879. "add 8+%16,%12;"
  880. "mov %5,%10;"
  881. "add %12,%4;"
  882. "mov %5,%12;"
  883. "or %k2,%10;"
  884. "add %4,%7;"
  885. "and %k2,%12;"
  886. "and %6,%10;"
  887. "add %11,%4;"
  888. "or %12,%10;"
  889. "add %10,%4;"
  890. "mov %7,%10;"
  891. "ror $0xe,%10;"
  892. "mov %4,%11;"
  893. "xor %7,%10;"
  894. "ror $0x9,%11;"
  895. "mov %8,%12;"
  896. "xor %4,%11;"
  897. "ror $0x5,%10;"
  898. "xor %9,%12;"
  899. "xor %7,%10;"
  900. "ror $0xb,%11;"
  901. "and %7,%12;"
  902. "xor %4,%11;"
  903. "ror $0x6,%10;"
  904. "xor %9,%12;"
  905. "add %10,%12;"
  906. "ror $0x2,%11;"
  907. "add 12+%16,%12;"
  908. "mov %4,%10;"
  909. "add %12,%3;"
  910. "mov %4,%12;"
  911. "or %6,%10;"
  912. "add %3,%k2;"
  913. "and %6,%12;"
  914. "and %5,%10;"
  915. "add %11,%3;"
  916. "or %12,%10;"
  917. "add %10,%3;"
  918. "movdqa %%xmm6,%%xmm4;"
  919. "movdqa %%xmm7,%%xmm5;"
  920. "sub $0x1,%1;"
  921. "jne Lloop2_%=;"
  922. "add (%0),%3;"
  923. "mov %3,(%0);"
  924. "add 0x4(%0),%4;"
  925. "mov %4,0x4(%0);"
  926. "add 0x8(%0),%5;"
  927. "mov %5,0x8(%0);"
  928. "add 0xc(%0),%6;"
  929. "mov %6,0xc(%0);"
  930. "add 0x10(%0),%k2;"
  931. "mov %k2,0x10(%0);"
  932. "add 0x14(%0),%7;"
  933. "mov %7,0x14(%0);"
  934. "add 0x18(%0),%8;"
  935. "mov %8,0x18(%0);"
  936. "add 0x1c(%0),%9;"
  937. "mov %9,0x1c(%0);"
  938. "mov %15,%1;"
  939. "add $0x40,%1;"
  940. "cmp %14,%1;"
  941. "jne Lloop0_%=;"
  942. "Ldone_hash_%=:"
  943. : "+r"(s), "+r"(chunk), "+r"(blocks), "=r"(a), "=r"(b), "=r"(c), "=r"(d), /* e = chunk */ "=r"(f), "=r"(g), "=r"(h), "=r"(y0), "=r"(y1), "=r"(y2), "=r"(tbl), "+m"(inp_end), "+m"(inp), "+m"(xfer)
  944. : "m"(K256), "m"(FLIP_MASK), "m"(SHUF_00BA), "m"(SHUF_DC00)
  945. : "cc", "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12"
  946. );
  947. }
  948. }
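// Usage sketch (illustrative, not exercised by this file): Transform() is only
// the SHA-256 compression function, so the caller supplies the standard initial
// hash state and fully padded 64-byte blocks. Compressing the padding block of
// the empty message, for example, should leave the well-known empty-string
// digest (e3b0c442...) in s[] as big-endian words:
//
//   uint32_t s[8] = {0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
//                    0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19};
//   unsigned char block[64] = {0x80}; // 0x80 terminator, then zeros; bit length = 0
//   sha256_sse4::Transform(s, block, 1);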
  949. /*
  950. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  951. ; Copyright (c) 2012, Intel Corporation
  952. ;
  953. ; All rights reserved.
  954. ;
  955. ; Redistribution and use in source and binary forms, with or without
  956. ; modification, are permitted provided that the following conditions are
  957. ; met:
  958. ;
  959. ; * Redistributions of source code must retain the above copyright
  960. ; notice, this list of conditions and the following disclaimer.
  961. ;
  962. ; * Redistributions in binary form must reproduce the above copyright
  963. ; notice, this list of conditions and the following disclaimer in the
  964. ; documentation and/or other materials provided with the
  965. ; distribution.
  966. ;
  967. ; * Neither the name of the Intel Corporation nor the names of its
  968. ; contributors may be used to endorse or promote products derived from
  969. ; this software without specific prior written permission.
  970. ;
  971. ;
  972. ; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
  973. ; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  974. ; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  975. ; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
  976. ; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  977. ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  978. ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  979. ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  980. ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  981. ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  982. ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  983. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  984. ;
  985. ; Example YASM command lines:
  986. ; Windows: yasm -Xvc -f x64 -rnasm -pnasm -o sha256_sse4.obj -g cv8 sha256_sse4.asm
  987. ; Linux: yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o sha256_sse4.o sha256_sse4.asm
  988. ;
  989. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  990. ;
  991. ; This code is described in an Intel White-Paper:
  992. ; "Fast SHA-256 Implementations on Intel Architecture Processors"
  993. ;
  994. ; To find it, surf to http://www.intel.com/p/en_US/embedded
  995. ; and search for that title.
  996. ; The paper is expected to be released roughly at the end of April, 2012
  997. ;
  998. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  999. ; This code schedules 1 blocks at a time, with 4 lanes per block
  1000. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  1001. %define MOVDQ movdqu ;; assume buffers not aligned
  1002. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
  1003. ; addm [mem], reg
  1004. ; Add reg to mem using reg-mem add and store
  1005. %macro addm 2
  1006. add %2, %1
  1007. mov %1, %2
  1008. %endm
  1009. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  1010. ; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
  1011. ; Load xmm with mem and byte swap each dword
  1012. %macro COPY_XMM_AND_BSWAP 3
  1013. MOVDQ %1, %2
  1014. pshufb %1, %3
  1015. %endmacro
  1016. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  1017. %define X0 xmm4
  1018. %define X1 xmm5
  1019. %define X2 xmm6
  1020. %define X3 xmm7
  1021. %define XTMP0 xmm0
  1022. %define XTMP1 xmm1
  1023. %define XTMP2 xmm2
  1024. %define XTMP3 xmm3
  1025. %define XTMP4 xmm8
  1026. %define XFER xmm9
  1027. %define SHUF_00BA xmm10 ; shuffle xBxA -> 00BA
  1028. %define SHUF_DC00 xmm11 ; shuffle xDxC -> DC00
  1029. %define BYTE_FLIP_MASK xmm12
  1030. %ifdef LINUX
  1031. %define NUM_BLKS rdx ; 3rd arg
  1032. %define CTX rsi ; 2nd arg
  1033. %define INP rdi ; 1st arg
  1034. %define SRND rdi ; clobbers INP
  1035. %define c ecx
  1036. %define d r8d
  1037. %define e edx
  1038. %else
  1039. %define NUM_BLKS r8 ; 3rd arg
  1040. %define CTX rdx ; 2nd arg
  1041. %define INP rcx ; 1st arg
  1042. %define SRND rcx ; clobbers INP
  1043. %define c edi
  1044. %define d esi
  1045. %define e r8d
  1046. %endif
  1047. %define TBL rbp
  1048. %define a eax
  1049. %define b ebx
  1050. %define f r9d
  1051. %define g r10d
  1052. %define h r11d
  1053. %define y0 r13d
  1054. %define y1 r14d
  1055. %define y2 r15d
  1056. _INP_END_SIZE equ 8
  1057. _INP_SIZE equ 8
  1058. _XFER_SIZE equ 8
  1059. %ifdef LINUX
  1060. _XMM_SAVE_SIZE equ 0
  1061. %else
  1062. _XMM_SAVE_SIZE equ 7*16
  1063. %endif
  1064. ; STACK_SIZE plus pushes must be an odd multiple of 8
  1065. _ALIGN_SIZE equ 8
  1066. _INP_END equ 0
  1067. _INP equ _INP_END + _INP_END_SIZE
  1068. _XFER equ _INP + _INP_SIZE
  1069. _XMM_SAVE equ _XFER + _XFER_SIZE + _ALIGN_SIZE
  1070. STACK_SIZE equ _XMM_SAVE + _XMM_SAVE_SIZE
  1071. ; rotate_Xs
  1072. ; Rotate values of symbols X0...X3
  1073. %macro rotate_Xs 0
  1074. %xdefine X_ X0
  1075. %xdefine X0 X1
  1076. %xdefine X1 X2
  1077. %xdefine X2 X3
  1078. %xdefine X3 X_
  1079. %endm
  1080. ; ROTATE_ARGS
  1081. ; Rotate values of symbols a...h
  1082. %macro ROTATE_ARGS 0
  1083. %xdefine TMP_ h
  1084. %xdefine h g
  1085. %xdefine g f
  1086. %xdefine f e
  1087. %xdefine e d
  1088. %xdefine d c
  1089. %xdefine c b
  1090. %xdefine b a
  1091. %xdefine a TMP_
  1092. %endm
  1093. %macro FOUR_ROUNDS_AND_SCHED 0
  1094. ;; compute s0 four at a time and s1 two at a time
  1095. ;; compute W[-16] + W[-7] 4 at a time
  1096. movdqa XTMP0, X3
  1097. mov y0, e ; y0 = e
  1098. ror y0, (25-11) ; y0 = e >> (25-11)
  1099. mov y1, a ; y1 = a
  1100. palignr XTMP0, X2, 4 ; XTMP0 = W[-7]
  1101. ror y1, (22-13) ; y1 = a >> (22-13)
  1102. xor y0, e ; y0 = e ^ (e >> (25-11))
  1103. mov y2, f ; y2 = f
  1104. ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
  1105. movdqa XTMP1, X1
  1106. xor y1, a ; y1 = a ^ (a >> (22-13)
  1107. xor y2, g ; y2 = f^g
  1108. paddd XTMP0, X0 ; XTMP0 = W[-7] + W[-16]
  1109. xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
  1110. and y2, e ; y2 = (f^g)&e
  1111. ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
  1112. ;; compute s0
  1113. palignr XTMP1, X0, 4 ; XTMP1 = W[-15]
  1114. xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
  1115. ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
  1116. xor y2, g ; y2 = CH = ((f^g)&e)^g
  1117. movdqa XTMP2, XTMP1 ; XTMP2 = W[-15]
  1118. ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
  1119. add y2, y0 ; y2 = S1 + CH
  1120. add y2, [rsp + _XFER + 0*4] ; y2 = k + w + S1 + CH
  1121. movdqa XTMP3, XTMP1 ; XTMP3 = W[-15]
  1122. mov y0, a ; y0 = a
  1123. add h, y2 ; h = h + S1 + CH + k + w
  1124. mov y2, a ; y2 = a
  1125. pslld XTMP1, (32-7)
  1126. or y0, c ; y0 = a|c
  1127. add d, h ; d = d + h + S1 + CH + k + w
  1128. and y2, c ; y2 = a&c
  1129. psrld XTMP2, 7
  1130. and y0, b ; y0 = (a|c)&b
  1131. add h, y1 ; h = h + S1 + CH + k + w + S0
  1132. por XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7
  1133. or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
  1134. add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
  1135. ROTATE_ARGS
  1136. movdqa XTMP2, XTMP3 ; XTMP2 = W[-15]
  1137. mov y0, e ; y0 = e
  1138. mov y1, a ; y1 = a
  1139. movdqa XTMP4, XTMP3 ; XTMP4 = W[-15]
  1140. ror y0, (25-11) ; y0 = e >> (25-11)
  1141. xor y0, e ; y0 = e ^ (e >> (25-11))
  1142. mov y2, f ; y2 = f
  1143. ror y1, (22-13) ; y1 = a >> (22-13)
  1144. pslld XTMP3, (32-18)
  1145. xor y1, a ; y1 = a ^ (a >> (22-13)
  1146. ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
  1147. xor y2, g ; y2 = f^g
  1148. psrld XTMP2, 18
  1149. ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
  1150. xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
  1151. and y2, e ; y2 = (f^g)&e
  1152. ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
  1153. pxor XTMP1, XTMP3
  1154. xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
  1155. xor y2, g ; y2 = CH = ((f^g)&e)^g
  1156. psrld XTMP4, 3 ; XTMP4 = W[-15] >> 3
  1157. add y2, y0 ; y2 = S1 + CH
  1158. add y2, [rsp + _XFER + 1*4] ; y2 = k + w + S1 + CH
  1159. ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
  1160. pxor XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
  1161. mov y0, a ; y0 = a
  1162. add h, y2 ; h = h + S1 + CH + k + w
  1163. mov y2, a ; y2 = a
  1164. pxor XTMP1, XTMP4 ; XTMP1 = s0
  1165. or y0, c ; y0 = a|c
  1166. add d, h ; d = d + h + S1 + CH + k + w
  1167. and y2, c ; y2 = a&c
  1168. ;; compute low s1
  1169. pshufd XTMP2, X3, 11111010b ; XTMP2 = W[-2] {BBAA}
  1170. and y0, b ; y0 = (a|c)&b
  1171. add h, y1 ; h = h + S1 + CH + k + w + S0
  1172. paddd XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0
  1173. or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
  1174. add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
  1175. ROTATE_ARGS
  1176. movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {BBAA}
  1177. mov y0, e ; y0 = e
  1178. mov y1, a ; y1 = a
  1179. ror y0, (25-11) ; y0 = e >> (25-11)
  1180. movdqa XTMP4, XTMP2 ; XTMP4 = W[-2] {BBAA}
  1181. xor y0, e ; y0 = e ^ (e >> (25-11))
  1182. ror y1, (22-13) ; y1 = a >> (22-13)
  1183. mov y2, f ; y2 = f
  1184. xor y1, a ; y1 = a ^ (a >> (22-13)
  1185. ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
  1186. psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xBxA}
  1187. xor y2, g ; y2 = f^g
  1188. psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xBxA}
  1189. xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
  1190. and y2, e ; y2 = (f^g)&e
  1191. psrld XTMP4, 10 ; XTMP4 = W[-2] >> 10 {BBAA}
  1192. ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
  1193. xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
  1194. xor y2, g ; y2 = CH = ((f^g)&e)^g
  1195. ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
  1196. pxor XTMP2, XTMP3
  1197. add y2, y0 ; y2 = S1 + CH
  1198. ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
  1199. add y2, [rsp + _XFER + 2*4] ; y2 = k + w + S1 + CH
  1200. pxor XTMP4, XTMP2 ; XTMP4 = s1 {xBxA}
  1201. mov y0, a ; y0 = a
  1202. add h, y2 ; h = h + S1 + CH + k + w
  1203. mov y2, a ; y2 = a
  1204. pshufb XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA}
  1205. or y0, c ; y0 = a|c
  1206. add d, h ; d = d + h + S1 + CH + k + w
  1207. and y2, c ; y2 = a&c
  1208. paddd XTMP0, XTMP4 ; XTMP0 = {..., ..., W[1], W[0]}
  1209. and y0, b ; y0 = (a|c)&b
  1210. add h, y1 ; h = h + S1 + CH + k + w + S0
  1211. ;; compute high s1
  1212. pshufd XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC}
  1213. or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
  1214. add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
  1215. ROTATE_ARGS
  1216. movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {DDCC}
  1217. mov y0, e ; y0 = e
  1218. ror y0, (25-11) ; y0 = e >> (25-11)
  1219. mov y1, a ; y1 = a
  1220. movdqa X0, XTMP2 ; X0 = W[-2] {DDCC}
  1221. ror y1, (22-13) ; y1 = a >> (22-13)
  1222. xor y0, e ; y0 = e ^ (e >> (25-11))
  1223. mov y2, f ; y2 = f
  1224. ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
  1225. psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xDxC}
  1226. xor y1, a ; y1 = a ^ (a >> (22-13)
  1227. xor y2, g ; y2 = f^g
  1228. psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xDxC}
  1229. xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
  1230. and y2, e ; y2 = (f^g)&e
  1231. ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
  1232. psrld X0, 10 ; X0 = W[-2] >> 10 {DDCC}
  1233. xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
  1234. ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
  1235. xor y2, g ; y2 = CH = ((f^g)&e)^g
  1236. pxor XTMP2, XTMP3
  1237. ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
  1238. add y2, y0 ; y2 = S1 + CH
  1239. add y2, [rsp + _XFER + 3*4] ; y2 = k + w + S1 + CH
  1240. pxor X0, XTMP2 ; X0 = s1 {xDxC}
  1241. mov y0, a ; y0 = a
  1242. add h, y2 ; h = h + S1 + CH + k + w
  1243. mov y2, a ; y2 = a
  1244. pshufb X0, SHUF_DC00 ; X0 = s1 {DC00}
  1245. or y0, c ; y0 = a|c
  1246. add d, h ; d = d + h + S1 + CH + k + w
  1247. and y2, c ; y2 = a&c
  1248. paddd X0, XTMP0 ; X0 = {W[3], W[2], W[1], W[0]}
  1249. and y0, b ; y0 = (a|c)&b
  1250. add h, y1 ; h = h + S1 + CH + k + w + S0
  1251. or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
  1252. add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
  1253. ROTATE_ARGS
  1254. rotate_Xs
  1255. %endm
  1256. ;; input is [rsp + _XFER + %1 * 4]
  1257. %macro DO_ROUND 1
  1258. mov y0, e ; y0 = e
  1259. ror y0, (25-11) ; y0 = e >> (25-11)
  1260. mov y1, a ; y1 = a
  1261. xor y0, e ; y0 = e ^ (e >> (25-11))
  1262. ror y1, (22-13) ; y1 = a >> (22-13)
  1263. mov y2, f ; y2 = f
  1264. xor y1, a ; y1 = a ^ (a >> (22-13)
  1265. ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
  1266. xor y2, g ; y2 = f^g
  1267. xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
  1268. ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
  1269. and y2, e ; y2 = (f^g)&e
  1270. xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
  1271. ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
  1272. xor y2, g ; y2 = CH = ((f^g)&e)^g
  1273. add y2, y0 ; y2 = S1 + CH
  1274. ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
  1275. add y2, [rsp + _XFER + %1 * 4] ; y2 = k + w + S1 + CH
  1276. mov y0, a ; y0 = a
  1277. add h, y2 ; h = h + S1 + CH + k + w
  1278. mov y2, a ; y2 = a
  1279. or y0, c ; y0 = a|c
  1280. add d, h ; d = d + h + S1 + CH + k + w
  1281. and y2, c ; y2 = a&c
  1282. and y0, b ; y0 = (a|c)&b
  1283. add h, y1 ; h = h + S1 + CH + k + w + S0
  1284. or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
  1285. add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
  1286. ROTATE_ARGS
  1287. %endm
  1288. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  1289. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  1290. ;; void sha256_sse4(void *input_data, UINT32 digest[8], UINT64 num_blks)
  1291. ;; arg 1 : pointer to input data
  1292. ;; arg 2 : pointer to digest
  1293. ;; arg 3 : Num blocks
  1294. section .text
  1295. global sha256_sse4
  1296. align 32
  1297. sha256_sse4:
  1298. push rbx
  1299. %ifndef LINUX
  1300. push rsi
  1301. push rdi
  1302. %endif
  1303. push rbp
  1304. push r13
  1305. push r14
  1306. push r15
  1307. sub rsp,STACK_SIZE
  1308. %ifndef LINUX
  1309. movdqa [rsp + _XMM_SAVE + 0*16],xmm6
  1310. movdqa [rsp + _XMM_SAVE + 1*16],xmm7
  1311. movdqa [rsp + _XMM_SAVE + 2*16],xmm8
  1312. movdqa [rsp + _XMM_SAVE + 3*16],xmm9
  1313. movdqa [rsp + _XMM_SAVE + 4*16],xmm10
  1314. movdqa [rsp + _XMM_SAVE + 5*16],xmm11
  1315. movdqa [rsp + _XMM_SAVE + 6*16],xmm12
  1316. %endif
  1317. shl NUM_BLKS, 6 ; convert to bytes
  1318. jz done_hash
  1319. add NUM_BLKS, INP ; pointer to end of data
  1320. mov [rsp + _INP_END], NUM_BLKS
  1321. ;; load initial digest
  1322. mov a,[4*0 + CTX]
  1323. mov b,[4*1 + CTX]
  1324. mov c,[4*2 + CTX]
  1325. mov d,[4*3 + CTX]
  1326. mov e,[4*4 + CTX]
  1327. mov f,[4*5 + CTX]
  1328. mov g,[4*6 + CTX]
  1329. mov h,[4*7 + CTX]
  1330. movdqa BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
  1331. movdqa SHUF_00BA, [_SHUF_00BA wrt rip]
  1332. movdqa SHUF_DC00, [_SHUF_DC00 wrt rip]
  1333. loop0:
  1334. lea TBL,[K256 wrt rip]
  1335. ;; byte swap first 16 dwords
  1336. COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK
  1337. COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK
  1338. COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK
  1339. COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK
  1340. mov [rsp + _INP], INP
  1341. ;; schedule 48 input dwords, by doing 3 rounds of 16 each
  1342. mov SRND, 3
  1343. align 16
  1344. loop1:
  1345. movdqa XFER, [TBL + 0*16]
  1346. paddd XFER, X0
  1347. movdqa [rsp + _XFER], XFER
  1348. FOUR_ROUNDS_AND_SCHED
  1349. movdqa XFER, [TBL + 1*16]
  1350. paddd XFER, X0
  1351. movdqa [rsp + _XFER], XFER
  1352. FOUR_ROUNDS_AND_SCHED
  1353. movdqa XFER, [TBL + 2*16]
  1354. paddd XFER, X0
  1355. movdqa [rsp + _XFER], XFER
  1356. FOUR_ROUNDS_AND_SCHED
  1357. movdqa XFER, [TBL + 3*16]
  1358. paddd XFER, X0
  1359. movdqa [rsp + _XFER], XFER
  1360. add TBL, 4*16
  1361. FOUR_ROUNDS_AND_SCHED
  1362. sub SRND, 1
  1363. jne loop1
  1364. mov SRND, 2
  1365. loop2:
  1366. paddd X0, [TBL + 0*16]
  1367. movdqa [rsp + _XFER], X0
  1368. DO_ROUND 0
  1369. DO_ROUND 1
  1370. DO_ROUND 2
  1371. DO_ROUND 3
  1372. paddd X1, [TBL + 1*16]
  1373. movdqa [rsp + _XFER], X1
  1374. add TBL, 2*16
  1375. DO_ROUND 0
  1376. DO_ROUND 1
  1377. DO_ROUND 2
  1378. DO_ROUND 3
  1379. movdqa X0, X2
  1380. movdqa X1, X3
  1381. sub SRND, 1
  1382. jne loop2
  1383. addm [4*0 + CTX],a
  1384. addm [4*1 + CTX],b
  1385. addm [4*2 + CTX],c
  1386. addm [4*3 + CTX],d
  1387. addm [4*4 + CTX],e
  1388. addm [4*5 + CTX],f
  1389. addm [4*6 + CTX],g
  1390. addm [4*7 + CTX],h
  1391. mov INP, [rsp + _INP]
  1392. add INP, 64
  1393. cmp INP, [rsp + _INP_END]
  1394. jne loop0
  1395. done_hash:
  1396. %ifndef LINUX
  1397. movdqa xmm6,[rsp + _XMM_SAVE + 0*16]
  1398. movdqa xmm7,[rsp + _XMM_SAVE + 1*16]
  1399. movdqa xmm8,[rsp + _XMM_SAVE + 2*16]
  1400. movdqa xmm9,[rsp + _XMM_SAVE + 3*16]
  1401. movdqa xmm10,[rsp + _XMM_SAVE + 4*16]
  1402. movdqa xmm11,[rsp + _XMM_SAVE + 5*16]
  1403. movdqa xmm12,[rsp + _XMM_SAVE + 6*16]
  1404. %endif
  1405. add rsp, STACK_SIZE
  1406. pop r15
  1407. pop r14
  1408. pop r13
  1409. pop rbp
  1410. %ifndef LINUX
  1411. pop rdi
  1412. pop rsi
  1413. %endif
  1414. pop rbx
  1415. ret
  1416. section .data
  1417. align 64
  1418. K256:
  1419. dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
  1420. dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
  1421. dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
  1422. dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
  1423. dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
  1424. dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
  1425. dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
  1426. dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
  1427. dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
  1428. dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
  1429. dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
  1430. dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
  1431. dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
  1432. dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
  1433. dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
  1434. dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
  1435. PSHUFFLE_BYTE_FLIP_MASK: ddq 0x0c0d0e0f08090a0b0405060700010203
  1436. ; shuffle xBxA -> 00BA
  1437. _SHUF_00BA: ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100
  1438. ; shuffle xDxC -> DC00
  1439. _SHUF_DC00: ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF
  1440. */
  1441. #endif