fix: SIMD instruction set detection on non-x86 hardware
[proxmark3-svn] / client / hardnested / hardnested_bf_core.c
1 //-----------------------------------------------------------------------------
2 // Copyright (C) 2016, 2017 by piwi
3 //
4 // This code is licensed to you under the terms of the GNU GPL, version 2 or,
5 // at your option, any later version. See the LICENSE.txt file for the text of
6 // the license.
7 //-----------------------------------------------------------------------------
8 // Implements a card only attack based on crypto text (encrypted nonces
9 // received during a nested authentication) only. Unlike other card only
10 // attacks this doesn't rely on implementation errors but only on the
11 // inherent weaknesses of the crypto1 cypher. Described in
12 // Carlo Meijer, Roel Verdult, "Ciphertext-only Cryptanalysis on Hardened
13 // Mifare Classic Cards" in Proceedings of the 22nd ACM SIGSAC Conference on
14 // Computer and Communications Security, 2015
15 //-----------------------------------------------------------------------------
16 //
17 // brute forcing is based on @aczid's bitsliced brute forcer
18 // https://github.com/aczid/crypto1_bs with some modifications. Mainly:
19 // - don't rollback. Start with 2nd byte of nonce instead
20 // - reuse results of filter subfunctions
21 // - reuse results of previous nonces if some first bits are identical
22 //
23 //-----------------------------------------------------------------------------
24 // aczid's Copyright notice:
25 //
26 // Bit-sliced Crypto-1 brute-forcing implementation
27 // Builds on the data structures returned by CraptEV1 craptev1_get_space(nonces, threshold, uid)
28 /*
29 Copyright (c) 2015-2016 Aram Verstegen
30
31 Permission is hereby granted, free of charge, to any person obtaining a copy
32 of this software and associated documentation files (the "Software"), to deal
33 in the Software without restriction, including without limitation the rights
34 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
35 copies of the Software, and to permit persons to whom the Software is
36 furnished to do so, subject to the following conditions:
37
38 The above copyright notice and this permission notice shall be included in
39 all copies or substantial portions of the Software.
40
41 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
42 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
43 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
44 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
45 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
46 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
47 THE SOFTWARE.
48 */
49
50 #include "hardnested_bf_core.h"
51
52 #include <stdint.h>
53 #include <stdbool.h>
54 #include <stdlib.h>
55 #ifndef __APPLE__
56 #include <malloc.h>
57 #endif
58 #include <stdio.h>
59 #include <string.h>
60 #include "crapto1/crapto1.h"
61 #include "parity.h"
62
// bitslice type
// The bitsliced boolean logic needs integer vector operations. While AVX
// supports 256 bit vector floating point operations, it therefore still uses
// 128 bit vectors here. Using larger vectors works but seems to generate more
// register pressure.
#if defined(__AVX512F__)
#define MAX_BITSLICES 512
#elif defined(__AVX2__)
#define MAX_BITSLICES 256
#elif defined(__AVX__) || defined(__SSE2__)
#define MAX_BITSLICES 128
#else // MMX or SSE or NOSIMD
#define MAX_BITSLICES 64
#endif

// size of one bitslice vector in bytes
#define VECTOR_SIZE (MAX_BITSLICES/8)

// vector of unsigned ints, VECTOR_SIZE bytes wide and aligned to VECTOR_SIZE
typedef unsigned int __attribute__((aligned(VECTOR_SIZE), vector_size(VECTOR_SIZE))) bitslice_value_t;

// one bitslice vector, accessible as a whole vector, as 64 bit words or as bytes
typedef union {
    bitslice_value_t value;
    uint64_t bytes64[MAX_BITSLICES/64];
    uint8_t bytes[VECTOR_SIZE];
} bitslice_t;
86
// filter function (f20)
// sourced from ``Wirelessly Pickpocketing a Mifare Classic Card'' by Flavio Garcia, Peter van Rossum, Roel Verdult and Ronny Wichers Schreur
// Every macro argument is parenthesized so that callers may pass arbitrary
// expressions (e.g. x|y) without operator precedence changing the result.
// The macros work on plain integers as well as on bitslice vectors.
#define f20a(a,b,c,d) ((((a)|(b))^((a)&(d)))^((c)&(((a)^(b))|(d))))
#define f20b(a,b,c,d) ((((a)&(b))|(c))^(((a)^(b))&((c)|(d))))
#define f20c(a,b,c,d,e) (((a)|(((b)|(e))&((d)^(e))))^(((a)^((b)&(d)))&(((c)^(d))|((b)&(e)))))

// bit indexing
// get_bit: bit n (0 = least significant) of word, as 0 or 1
#define get_bit(n, word) (((word) >> (n)) & 1)
// get_vector_bit: bit number 'slice' of a bitslice_t, addressed through its 64 bit words
#define get_vector_bit(slice, value) get_bit((slice)&0x3f, (value).bytes64[(slice)>>6])

// size of crypto-1 state
#define STATE_SIZE 48
// size of nonce to be decrypted
#define KEYSTREAM_SIZE 24

// endianness conversion: reverse the byte order of a 32 bit word.
// The argument is converted to uint32_t first, so that shifting a byte into
// bit 31 never happens on a signed int.
#define rev32(word) (((((uint32_t)(word)) & 0xff) << 24) | (((((uint32_t)(word)) >> 8) & 0xff) << 16) | (((((uint32_t)(word)) >> 16) & 0xff) << 8) | (((((uint32_t)(word)) >> 24) & 0xff)))
104
// This file is compiled several times, once for each instruction set.
// Each compilation gives its two worker functions a dedicated name which
// carries the instruction set as a suffix.

// name of the nonce bitslicing function of this compilation
#if defined (__AVX512F__)
#define BITSLICE_TEST_NONCES bitslice_test_nonces_AVX512
#elif defined (__AVX2__)
#define BITSLICE_TEST_NONCES bitslice_test_nonces_AVX2
#elif defined (__AVX__)
#define BITSLICE_TEST_NONCES bitslice_test_nonces_AVX
#elif defined (__SSE2__)
#define BITSLICE_TEST_NONCES bitslice_test_nonces_SSE2
#elif defined (__MMX__)
#define BITSLICE_TEST_NONCES bitslice_test_nonces_MMX
#else
#define BITSLICE_TEST_NONCES bitslice_test_nonces_NOSIMD
#endif

// name of the brute force function of this compilation
#if defined (__AVX512F__)
#define CRACK_STATES_BITSLICED crack_states_bitsliced_AVX512
#elif defined (__AVX2__)
#define CRACK_STATES_BITSLICED crack_states_bitsliced_AVX2
#elif defined (__AVX__)
#define CRACK_STATES_BITSLICED crack_states_bitsliced_AVX
#elif defined (__SSE2__)
#define CRACK_STATES_BITSLICED crack_states_bitsliced_SSE2
#elif defined (__MMX__)
#define CRACK_STATES_BITSLICED crack_states_bitsliced_MMX
#else
#define CRACK_STATES_BITSLICED crack_states_bitsliced_NOSIMD
#endif
126
127 // typedefs and declaration of functions:
128 typedef const uint64_t crack_states_bitsliced_t(uint32_t, uint8_t*, statelist_t*, uint32_t*, uint64_t*, uint32_t, uint8_t*, noncelist_t*);
129 crack_states_bitsliced_t crack_states_bitsliced_AVX512;
130 crack_states_bitsliced_t crack_states_bitsliced_AVX2;
131 crack_states_bitsliced_t crack_states_bitsliced_AVX;
132 crack_states_bitsliced_t crack_states_bitsliced_SSE2;
133 crack_states_bitsliced_t crack_states_bitsliced_MMX;
134 crack_states_bitsliced_t crack_states_bitsliced_NOSIMD;
135 crack_states_bitsliced_t crack_states_bitsliced_dispatch;
136
137 typedef void bitslice_test_nonces_t(uint32_t, uint32_t*, uint8_t*);
138 bitslice_test_nonces_t bitslice_test_nonces_AVX512;
139 bitslice_test_nonces_t bitslice_test_nonces_AVX2;
140 bitslice_test_nonces_t bitslice_test_nonces_AVX;
141 bitslice_test_nonces_t bitslice_test_nonces_SSE2;
142 bitslice_test_nonces_t bitslice_test_nonces_MMX;
143 bitslice_test_nonces_t bitslice_test_nonces_NOSIMD;
144 bitslice_test_nonces_t bitslice_test_nonces_dispatch;
145
// Allocation helpers for memory aligned to the size of a bitslice vector
// (MAX_BITSLICES/8 bytes). malloc_bitslice(x) yields x bytes or NULL,
// free_bitslice() releases them again.
#if defined (_WIN32)
#define malloc_bitslice(x) __builtin_assume_aligned(_aligned_malloc((x), MAX_BITSLICES/8), MAX_BITSLICES/8)
#define free_bitslice(x) _aligned_free(x)
#elif defined (__APPLE__)
static void *malloc_bitslice(size_t x) {
    void *block = NULL;
    if (posix_memalign(&block, MAX_BITSLICES/8, x) != 0) {
        return NULL;
    }
    // tell the compiler about the alignment we just asked for
    return __builtin_assume_aligned(block, MAX_BITSLICES/8);
}
#define free_bitslice(x) free(x)
#else
#define malloc_bitslice(x) memalign(MAX_BITSLICES/8, (x))
#define free_bitslice(x) free(x)
#endif
163
// selects the even or the odd half of a state; the values are used as array indices
typedef enum {
    EVEN_STATE = 0,
    ODD_STATE           // = 1
} odd_even_t;
168
169
// Bitsliced copies of the test nonces, filled by BITSLICE_TEST_NONCES() and read by
// CRACK_STATES_BITSLICED(). First index: nonce number (room for 256 nonces),
// second index: bit number. Each entry is either all ones or all zeroes.
170 // arrays of bitsliced states with identical values in all slices
171 static bitslice_t bitsliced_encrypted_nonces[256][KEYSTREAM_SIZE];
// same layout for the 4 parity bits stored per nonce
172 static bitslice_t bitsliced_encrypted_parity_bits[256][4];
173 // 1 and 0 vectors
// (set by BITSLICE_TEST_NONCES(); CRACK_STATES_BITSLICED() declares locals of the same names)
174 static bitslice_t bs_ones;
175 static bitslice_t bs_zeroes;
176
177
178 void BITSLICE_TEST_NONCES(uint32_t nonces_to_bruteforce, uint32_t *bf_test_nonce, uint8_t *bf_test_nonce_par) {
179
180 // initialize 1 and 0 vectors
181 memset(bs_ones.bytes, 0xff, VECTOR_SIZE);
182 memset(bs_zeroes.bytes, 0x00, VECTOR_SIZE);
183
184 // bitslice nonces' 2nd to 4th byte
185 for (uint32_t i = 0; i < nonces_to_bruteforce; i++) {
186 for(uint32_t bit_idx = 0; bit_idx < KEYSTREAM_SIZE; bit_idx++){
187 bool bit = get_bit(KEYSTREAM_SIZE-1-bit_idx, rev32(bf_test_nonce[i] << 8));
188 if(bit){
189 bitsliced_encrypted_nonces[i][bit_idx].value = bs_ones.value;
190 } else {
191 bitsliced_encrypted_nonces[i][bit_idx].value = bs_zeroes.value;
192 }
193 }
194 }
195 // bitslice nonces' parity (4 bits)
196 for (uint32_t i = 0; i < nonces_to_bruteforce; i++) {
197 for(uint32_t bit_idx = 0; bit_idx < 4; bit_idx++){
198 bool bit = get_bit(4-1-bit_idx, bf_test_nonce_par[i]);
199 if(bit){
200 bitsliced_encrypted_parity_bits[i][bit_idx].value = bs_ones.value;
201 } else {
202 bitsliced_encrypted_parity_bits[i][bit_idx].value = bs_zeroes.value;
203 }
204 }
205 }
206
207 }
208
209
// Bitsliced brute force over all combinations of the odd and even states in statelist p.
//
// The even states are bitsliced once, in blocks of MAX_BITSLICES states. Each odd state is then
// combined with each block and tested against the bitsliced test nonces (as prepared by
// BITSLICE_TEST_NONCES()): the nonce bits are decrypted and, after each completed byte, the
// decrypted parity bit is compared with the parity computed from the decrypted bits. A block is
// dropped as soon as none of its slices passes. Slices which pass all nonces are checked with
// verify_key().
//
//   cuid:                   passed to verify_key(); its top byte is used for the final rollback
//   best_first_bytes:       passed to verify_key(); element 0 is used for the final rollback
//   p:                      list of even and odd states to test
//   keys_found:             only read here; a non-zero value aborts the search early
//   num_keys_tested:        atomically incremented by the number of states tested in this call
//   nonces_to_bruteforce:   number of test nonces
//   bf_test_nonce_2nd_byte: used to determine how many leading bits consecutive test nonces share
//   nonces:                 passed to verify_key()
//
// Returns the key if verify_key() accepted a candidate, otherwise (uint64_t)-1.
// Calls exit(4) if memory cannot be allocated.
//
// NOTE(review): the block count expression (p->len[EVEN_STATE] - 1)/MAX_BITSLICES + 1 looks like
// it assumes at least one even state - confirm that callers never pass an empty list.
210 const uint64_t CRACK_STATES_BITSLICED(uint32_t cuid, uint8_t *best_first_bytes, statelist_t *p, uint32_t *keys_found, uint64_t *num_keys_tested, uint32_t nonces_to_bruteforce, uint8_t *bf_test_nonce_2nd_byte, noncelist_t *nonces){
211
212 // Unlike aczid's implementation this doesn't roll back at all when performing bitsliced bruteforce.
213 // We know that the best first byte is already shifted in. Testing with the remaining three bytes of
214 // the nonces is sufficient to eliminate most of them. The small rest is tested with a simple unsliced
215 // brute forcing (including roll back).
216
// state bits; state_p starts at &states[KEYSTREAM_SIZE] and moves down by one entry per processed nonce bit
217 bitslice_t states[KEYSTREAM_SIZE+STATE_SIZE];
218 bitslice_t * restrict state_p;
// result; stays at all ones if no key is found
219 uint64_t key = -1;
220 uint64_t bucket_states_tested = 0;
// number of real (not padded) even states per block
221 uint32_t bucket_size[(p->len[EVEN_STATE] - 1)/MAX_BITSLICES + 1];
222 uint32_t bitsliced_blocks = 0;
223 uint32_t const *restrict p_even_end = p->states[EVEN_STATE] + p->len[EVEN_STATE];
224 #if defined (DEBUG_BRUTE_FORCE)
225 uint32_t elimination_step = 0;
226 #define MAX_ELIMINATION_STEP 32
227 uint64_t keys_eliminated[MAX_ELIMINATION_STEP] = {0};
228 #endif
229 #ifdef DEBUG_KEY_ELIMINATION
230 bool bucket_contains_test_key[(p->len[EVEN_STATE] - 1)/MAX_BITSLICES + 1];
231 #endif
232
233 // constant ones/zeroes
// (locals; they hide the file scope variables of the same names within this function)
234 bitslice_t bs_ones;
235 memset(bs_ones.bytes, 0xff, VECTOR_SIZE);
236 bitslice_t bs_zeroes;
237 memset(bs_zeroes.bytes, 0x00, VECTOR_SIZE);
238
239 // bitslice all the even states
// one pointer per block of MAX_BITSLICES even states
240 bitslice_t **restrict bitsliced_even_states = (bitslice_t **)malloc(((p->len[EVEN_STATE] - 1)/MAX_BITSLICES + 1) * sizeof(bitslice_t *));
241 if (bitsliced_even_states == NULL) {
242 printf("Out of memory error in brute_force. Aborting...");
243 exit(4);
244 }
// one precomputed feedback vector per block (contribution of the even state bits)
245 bitslice_value_t *restrict bitsliced_even_feedback = malloc_bitslice(((p->len[EVEN_STATE] - 1)/MAX_BITSLICES + 1) * sizeof(bitslice_value_t));
246 if (bitsliced_even_feedback == NULL) {
247 printf("Out of memory error in brute_force. Aborting...");
248 exit(4);
249 }
250 for(uint32_t *restrict p_even = p->states[EVEN_STATE]; p_even < p_even_end; p_even += MAX_BITSLICES){
251 bitslice_t *restrict lstate_p = malloc_bitslice(STATE_SIZE/2*sizeof(bitslice_t));
252 if (lstate_p == NULL) {
253 printf("Out of memory error in brute_force. Aborting... \n");
254 exit(4);
255 }
256 memset(lstate_p, 0x00, STATE_SIZE/2*sizeof(bitslice_t)); // zero even bits
257 // bitslice even half-states
// the last block may hold fewer than MAX_BITSLICES states
258 const uint32_t max_slices = (p_even_end-p_even) < MAX_BITSLICES ? p_even_end-p_even : MAX_BITSLICES;
259 bucket_size[bitsliced_blocks] = max_slices;
260 #ifdef DEBUG_KEY_ELIMINATION
261 bucket_contains_test_key[bitsliced_blocks] = false;
262 #endif
263 uint32_t slice_idx;
264 for(slice_idx = 0; slice_idx < max_slices; ++slice_idx){
265 uint32_t e = *(p_even+slice_idx);
266 #ifdef DEBUG_KEY_ELIMINATION
267 if (known_target_key != -1 && e == test_state[EVEN_STATE]) {
268 bucket_contains_test_key[bitsliced_blocks] = true;
269 // printf("bucket %d contains test key even state\n", bitsliced_blocks);
270 // printf("in slice %d\n", slice_idx);
271 }
272 #endif
273 for(uint32_t bit_idx = 0; bit_idx < STATE_SIZE/2; bit_idx++, e >>= 1){
274 // set even bits
275 if(e&1){
276 lstate_p[bit_idx].bytes64[slice_idx>>6] |= 1ull << (slice_idx & 0x3f);
277 }
278 }
279 }
280 // padding with last even state
281 for ( ; slice_idx < MAX_BITSLICES; ++slice_idx) {
282 uint32_t e = *(p_even_end-1);
283 for(uint32_t bit_idx = 0; bit_idx < STATE_SIZE/2; bit_idx++, e >>= 1){
284 // set even bits
285 if(e&1){
286 lstate_p[bit_idx].bytes64[slice_idx>>6] |= 1ull << (slice_idx & 0x3f);
287 }
288 }
289 }
290 bitsliced_even_states[bitsliced_blocks] = lstate_p;
291 // bitsliced_even_feedback[bitsliced_blocks] = bs_ones;
292 bitsliced_even_feedback[bitsliced_blocks] = lstate_p[(47- 0)/2].value ^
293 lstate_p[(47-10)/2].value ^ lstate_p[(47-12)/2].value ^ lstate_p[(47-14)/2].value ^
294 lstate_p[(47-24)/2].value ^ lstate_p[(47-42)/2].value;
295 bitsliced_blocks++;
296 }
297 // bitslice every odd state to every block of even states
298 for(uint32_t const *restrict p_odd = p->states[ODD_STATE]; p_odd < p->states[ODD_STATE] + p->len[ODD_STATE]; ++p_odd){
299 // early abort
300 if(*keys_found){
301 goto out;
302 }
303
304 // set odd state bits and pre-compute first keystream bit vector. This is the same for all blocks of even states
305
306 state_p = &states[KEYSTREAM_SIZE];
307 uint32_t o = *p_odd;
308
309 // pre-compute the odd feedback bit
310 bool odd_feedback_bit = evenparity32(o&0x29ce5c);
311 const bitslice_value_t odd_feedback = odd_feedback_bit ? bs_ones.value : bs_zeroes.value;
312
313 // set odd state bits
// the odd state bits go to the even indices of state_p, identical in all slices
314 for (uint32_t state_idx = 0; state_idx < STATE_SIZE; o >>= 1, state_idx += 2) {
315 if (o & 1){
316 state_p[state_idx] = bs_ones;
317 } else {
318 state_p[state_idx] = bs_zeroes;
319 }
320 }
321
// results of the f20b subfunctions, stored per processed bit for reuse in later steps
322 bitslice_value_t crypto1_bs_f20b_2[16];
323 bitslice_value_t crypto1_bs_f20b_3[8];
324
325 crypto1_bs_f20b_2[0] = f20b(state_p[47-25].value, state_p[47-27].value, state_p[47-29].value, state_p[47-31].value);
326 crypto1_bs_f20b_3[0] = f20b(state_p[47-41].value, state_p[47-43].value, state_p[47-45].value, state_p[47-47].value);
327
// keystream vectors after 0..7 processed bits, reused for nonces sharing their first bits
328 bitslice_value_t ksb[8];
329 ksb[0] = f20c(f20a(state_p[47- 9].value, state_p[47-11].value, state_p[47-13].value, state_p[47-15].value),
330 f20b(state_p[47-17].value, state_p[47-19].value, state_p[47-21].value, state_p[47-23].value),
331 crypto1_bs_f20b_2[0],
332 f20a(state_p[47-33].value, state_p[47-35].value, state_p[47-37].value, state_p[47-39].value),
333 crypto1_bs_f20b_3[0]);
334
335 uint32_t *restrict p_even = p->states[EVEN_STATE];
336 for (uint32_t block_idx = 0; block_idx < bitsliced_blocks; ++block_idx, p_even += MAX_BITSLICES) {
337
338 #ifdef DEBUG_KEY_ELIMINATION
339 // if (known_target_key != -1 && bucket_contains_test_key[block_idx] && *p_odd == test_state[ODD_STATE]) {
340 // printf("Now testing known target key.\n");
341 // printf("block_idx = %d/%d\n", block_idx, bitsliced_blocks);
342 // }
343 #endif
344 // add the even state bits
// the even state bits go to the odd indices of state_p
345 const bitslice_t *restrict bitsliced_even_state = bitsliced_even_states[block_idx];
346 for(uint32_t state_idx = 1; state_idx < STATE_SIZE; state_idx += 2) {
347 state_p[state_idx] = bitsliced_even_state[state_idx/2];
348 }
349
350 // pre-compute first feedback bit vector. This is the same for all nonces
351 bitslice_value_t fbb[8];
352 fbb[0] = odd_feedback ^ bitsliced_even_feedback[block_idx];
353
354 // vector to contain test results (1 = passed, 0 = failed)
355 bitslice_t results = bs_ones;
356
357 // parity_bits
358 bitslice_value_t par[8];
359 par[0] = bs_zeroes.value;
360 uint32_t next_common_bits = 0;
361
362 for(uint32_t tests = 0; tests < nonces_to_bruteforce; ++tests){
363 // common bits with preceding test nonce
364 uint32_t common_bits = next_common_bits; //tests ? trailing_zeros(bf_test_nonce_2nd_byte[tests] ^ bf_test_nonce_2nd_byte[tests-1]) : 0;
// NOTE(review): next_common_bits is used as index into fbb/ksb/par (8 entries each); this presumably
// relies on consecutive test nonces having different 2nd bytes - confirm what trailing_zeros(0) returns
365 next_common_bits = tests < nonces_to_bruteforce - 1 ? trailing_zeros(bf_test_nonce_2nd_byte[tests] ^ bf_test_nonce_2nd_byte[tests+1]) : 0;
366 uint32_t parity_bit_idx = 1; // start checking with the parity of second nonce byte
367 bitslice_value_t fb_bits = fbb[common_bits]; // start with precomputed feedback bits from previous nonce
368 bitslice_value_t ks_bits = ksb[common_bits]; // ditto for first keystream bits
369 bitslice_value_t parity_bit_vector = par[common_bits]; // ditto for first parity vector
370 // bitslice_value_t fb_bits = fbb[0]; // start with precomputed feedback bits from previous nonce
371 // bitslice_value_t ks_bits = ksb[0]; // dito for first keystream bits
372 // bitslice_value_t parity_bit_vector = par[0]; // dito for first parity vector
373 state_p -= common_bits; // and reuse the already calculated state bits
374 // highest bit is transmitted/received first. We start with Bit 23 (highest bit of second nonce byte),
375 // or the highest bit which differs from the previous nonce
376 for (int32_t ks_idx = KEYSTREAM_SIZE-1-common_bits; ks_idx >= 0; --ks_idx) {
377
378 // decrypt nonce bits
379 const bitslice_value_t encrypted_nonce_bit_vector = bitsliced_encrypted_nonces[tests][ks_idx].value;
380 const bitslice_value_t decrypted_nonce_bit_vector = encrypted_nonce_bit_vector ^ ks_bits;
381
382 // compute real parity bits on the fly
383 parity_bit_vector ^= decrypted_nonce_bit_vector;
384
385 // update state
// the new state bit is stored one entry below the current state, which shifts the window by one bit
386 state_p--;
387 state_p[0].value = fb_bits ^ decrypted_nonce_bit_vector;
388
389 // update crypto1 subfunctions
// depending on the number of bits already processed, f20b_1 and f20b_2 are taken from the
// values stored in crypto1_bs_f20b_2[] / crypto1_bs_f20b_3[] instead of being recomputed
390 bitslice_value_t f20a_1, f20b_1, f20b_2, f20a_2, f20b_3;
391 f20a_2 = f20a(state_p[47-33].value, state_p[47-35].value, state_p[47-37].value, state_p[47-39].value);
392 f20b_3 = f20b(state_p[47-41].value, state_p[47-43].value, state_p[47-45].value, state_p[47-47].value);
393 if (ks_idx > KEYSTREAM_SIZE - 8) {
394 f20a_1 = f20a(state_p[47- 9].value, state_p[47-11].value, state_p[47-13].value, state_p[47-15].value);
395 f20b_1 = f20b(state_p[47-17].value, state_p[47-19].value, state_p[47-21].value, state_p[47-23].value);
396 f20b_2 = f20b(state_p[47-25].value, state_p[47-27].value, state_p[47-29].value, state_p[47-31].value);
397 crypto1_bs_f20b_2[KEYSTREAM_SIZE - ks_idx] = f20b_2;
398 crypto1_bs_f20b_3[KEYSTREAM_SIZE - ks_idx] = f20b_3;
399 } else if (ks_idx > KEYSTREAM_SIZE - 16) {
400 f20a_1 = f20a(state_p[47- 9].value, state_p[47-11].value, state_p[47-13].value, state_p[47-15].value);
401 f20b_1 = crypto1_bs_f20b_2[KEYSTREAM_SIZE - ks_idx - 8];
402 f20b_2 = f20b(state_p[47-25].value, state_p[47-27].value, state_p[47-29].value, state_p[47-31].value);
403 crypto1_bs_f20b_2[KEYSTREAM_SIZE - ks_idx] = f20b_2;
404 } else if (ks_idx > KEYSTREAM_SIZE - 24){
405 f20a_1 = f20a(state_p[47- 9].value, state_p[47-11].value, state_p[47-13].value, state_p[47-15].value);
406 f20b_1 = crypto1_bs_f20b_2[KEYSTREAM_SIZE - ks_idx - 8];
407 f20b_2 = crypto1_bs_f20b_3[KEYSTREAM_SIZE - ks_idx - 16];
408 } else {
409 f20a_1 = f20a(state_p[47- 9].value, state_p[47-11].value, state_p[47-13].value, state_p[47-15].value);
410 f20b_1 = f20b(state_p[47-17].value, state_p[47-19].value, state_p[47-21].value, state_p[47-23].value);
411 f20b_2 = f20b(state_p[47-25].value, state_p[47-27].value, state_p[47-29].value, state_p[47-31].value);
412 }
413 // update keystream bit
414 ks_bits = f20c(f20a_1, f20b_1, f20b_2, f20a_2, f20b_3);
415
416 // for each completed byte:
417 if ((ks_idx & 0x07) == 0) {
418 // get encrypted parity bits
419 const bitslice_value_t encrypted_parity_bit_vector = bitsliced_encrypted_parity_bits[tests][parity_bit_idx++].value;
420
421 // decrypt parity bits
422 const bitslice_value_t decrypted_parity_bit_vector = encrypted_parity_bit_vector ^ ks_bits;
423
424 // compare actual parity bits with decrypted parity bits and take count in results vector
425 results.value &= ~parity_bit_vector ^ decrypted_parity_bit_vector;
426
427 // make sure we still have a match in our set
428 // if(memcmp(&results, &bs_zeroes, sizeof(bitslice_t)) == 0){
429
430 // this is much faster on my gcc, because somehow a memcmp needlessly spills/fills all the xmm registers to/from the stack - ???
431 // the short-circuiting also helps
432 if(results.bytes64[0] == 0
433 #if MAX_BITSLICES > 64
434 && results.bytes64[1] == 0
435 #endif
436 #if MAX_BITSLICES > 128
437 && results.bytes64[2] == 0
438 && results.bytes64[3] == 0
439 #endif
440 ) {
441 #if defined (DEBUG_BRUTE_FORCE)
442 if (elimination_step < MAX_ELIMINATION_STEP) {
443 keys_eliminated[elimination_step] += MAX_BITSLICES;
444 }
445 #endif
446 #ifdef DEBUG_KEY_ELIMINATION
447 if (known_target_key != -1 && bucket_contains_test_key[block_idx] && *p_odd == test_state[ODD_STATE]) {
448 printf("Known target key eliminated in brute_force.\n");
449 printf("block_idx = %d/%d, nonce = %d/%d\n", block_idx, bitsliced_blocks, tests, nonces_to_bruteforce);
450 }
451 #endif
// no slice of this block is left: skip the remaining nonces and the verification
452 goto stop_tests;
453 }
454 // prepare for next nonce byte
455 #if defined (DEBUG_BRUTE_FORCE)
456 elimination_step++;
457 #endif
458 parity_bit_vector = bs_zeroes.value;
459 }
460 // update feedback bit vector
461 if (ks_idx != 0) {
462 fb_bits =
463 (state_p[47- 0].value ^ state_p[47- 5].value ^ state_p[47- 9].value ^
464 state_p[47-10].value ^ state_p[47-12].value ^ state_p[47-14].value ^
465 state_p[47-15].value ^ state_p[47-17].value ^ state_p[47-19].value ^
466 state_p[47-24].value ^ state_p[47-25].value ^ state_p[47-27].value ^
467 state_p[47-29].value ^ state_p[47-35].value ^ state_p[47-39].value ^
468 state_p[47-41].value ^ state_p[47-42].value ^ state_p[47-43].value);
469 }
470 // remember feedback and keystream vectors for later use
471 uint8_t bit = KEYSTREAM_SIZE - ks_idx;
472 if (bit <= next_common_bits) { // if needed and not yet stored
473 fbb[bit] = fb_bits;
474 ksb[bit] = ks_bits;
475 par[bit] = parity_bit_vector;
476 }
477 }
478 // prepare for next nonce. Revert to initial state
479 state_p = &states[KEYSTREAM_SIZE];
480 }
481
482 // all nonce tests were successful: we've found a possible key in this block!
// walk through the result bits; p_even_test follows the even state belonging to each bit
483 uint32_t *p_even_test = p_even;
484 for (uint32_t results_word = 0; results_word < MAX_BITSLICES / 64; ++results_word) {
485 uint64_t results64 = results.bytes64[results_word];
486 for (uint32_t results_bit = 0; results_bit < 64; results_bit++) {
487 if (results64 & 0x01) {
488 if (verify_key(cuid, nonces, best_first_bytes, *p_odd, *p_even_test)) {
// verified: roll back the first nonce byte and extract the key from the resulting state
489 struct Crypto1State pcs;
490 pcs.odd = *p_odd;
491 pcs.even = *p_even_test;
492 lfsr_rollback_byte(&pcs, (cuid >> 24) ^ best_first_bytes[0], true);
493 crypto1_get_lfsr(&pcs, &key);
494 bucket_states_tested += 64 * results_word + results_bit;
495 goto out;
496 }
497 #ifdef DEBUG_KEY_ELIMINATION
498 if (known_target_key != -1 && *p_even_test == test_state[EVEN_STATE] && *p_odd == test_state[ODD_STATE]) {
499 printf("Known target key eliminated in brute_force verification.\n");
500 printf("block_idx = %d/%d\n", block_idx, bitsliced_blocks);
501 }
502 #endif
503 }
504 #ifdef DEBUG_KEY_ELIMINATION
505 if (known_target_key != -1 && *p_even_test == test_state[EVEN_STATE] && *p_odd == test_state[ODD_STATE]) {
506 printf("Known target key eliminated in brute_force (results_bit == 0).\n");
507 printf("block_idx = %d/%d\n", block_idx, bitsliced_blocks);
508 }
509 #endif
510 results64 >>= 1;
511 p_even_test++;
// stop at the end of the even state list; the remaining slices of the last block are padding
512 if (p_even_test == p_even_end) {
513 goto stop_tests;
514 }
515 }
516 }
517 stop_tests:
518 #if defined (DEBUG_BRUTE_FORCE)
519 elimination_step = 0;
520 #endif
521 bucket_states_tested += bucket_size[block_idx];
522 // prepare to set new states
523 state_p = &states[KEYSTREAM_SIZE];
524 continue;
525 }
526 }
527 out:
// release the bitsliced even states and report the number of tested states
528 for(uint32_t block_idx = 0; block_idx < bitsliced_blocks; ++block_idx){
529 free_bitslice(bitsliced_even_states[block_idx]);
530 }
531 free(bitsliced_even_states);
532 free_bitslice(bitsliced_even_feedback);
533 __sync_fetch_and_add(num_keys_tested, bucket_states_tested);
534
535 #if defined (DEBUG_BRUTE_FORCE)
536 for (uint32_t i = 0; i < MAX_ELIMINATION_STEP; i++) {
537 printf("Eliminated after %2u test_bytes: %5.2f%%\n", i+1, (float)keys_eliminated[i] / bucket_states_tested * 100);
538 }
539 #endif
540 return key;
541 }
542
543
544
545 #ifndef __MMX__
546
547 // pointers to functions:
548 crack_states_bitsliced_t *crack_states_bitsliced_function_p = &crack_states_bitsliced_dispatch;
549 bitslice_test_nonces_t *bitslice_test_nonces_function_p = &bitslice_test_nonces_dispatch;
550
551 static SIMDExecInstr intSIMDInstr = SIMD_AUTO;
552
553 void SetSIMDInstr(SIMDExecInstr instr) {
554 intSIMDInstr = instr;
555
556 crack_states_bitsliced_function_p = &crack_states_bitsliced_dispatch;
557 bitslice_test_nonces_function_p = &bitslice_test_nonces_dispatch;
558 }
559
560 SIMDExecInstr GetSIMDInstr() {
561 SIMDExecInstr instr = SIMD_NONE;
562
563 #if defined (__i386__) || defined (__x86_64__)
564 #if !defined(__APPLE__) || (defined(__APPLE__) && (__clang_major__ > 8 || __clang_major__ == 8 && __clang_minor__ >= 1))
565 #if (__GNUC__ >= 5) && (__GNUC__ > 5 || __GNUC_MINOR__ > 2)
566 if (__builtin_cpu_supports("avx512f")) instr = SIMD_AVX512;
567 else if (__builtin_cpu_supports("avx2")) instr = SIMD_AVX2;
568 #else
569 if (__builtin_cpu_supports("avx2")) instr = SIMD_AVX2;
570 #endif
571 else if (__builtin_cpu_supports("avx")) instr = SIMD_AVX;
572 else if (__builtin_cpu_supports("sse2")) instr = SIMD_SSE2;
573 else if (__builtin_cpu_supports("mmx")) instr = SIMD_MMX;
574 else
575 #endif
576 #endif
577 instr = SIMD_NONE;
578
579 return instr;
580 }
581
582 SIMDExecInstr GetSIMDInstrAuto() {
583 SIMDExecInstr instr = intSIMDInstr;
584 if (instr == SIMD_AUTO)
585 return GetSIMDInstr();
586
587 return instr;
588 }
589
590 // determine the available instruction set at runtime and call the correct function
591 const uint64_t crack_states_bitsliced_dispatch(uint32_t cuid, uint8_t *best_first_bytes, statelist_t *p, uint32_t *keys_found, uint64_t *num_keys_tested, uint32_t nonces_to_bruteforce, uint8_t *bf_test_nonce_2nd_byte, noncelist_t *nonces) {
592 switch(GetSIMDInstrAuto()) {
593 #if defined (__i386__) || defined (__x86_64__)
594 #if !defined(__APPLE__) || (defined(__APPLE__) && (__clang_major__ > 8 || __clang_major__ == 8 && __clang_minor__ >= 1))
595 #if (__GNUC__ >= 5) && (__GNUC__ > 5 || __GNUC_MINOR__ > 2)
596 case SIMD_AVX512:
597 crack_states_bitsliced_function_p = &crack_states_bitsliced_AVX512;
598 break;
599 #endif
600 case SIMD_AVX2:
601 crack_states_bitsliced_function_p = &crack_states_bitsliced_AVX2;
602 break;
603 case SIMD_AVX:
604 crack_states_bitsliced_function_p = &crack_states_bitsliced_AVX;
605 break;
606 case SIMD_SSE2:
607 crack_states_bitsliced_function_p = &crack_states_bitsliced_SSE2;
608 break;
609 case SIMD_MMX:
610 crack_states_bitsliced_function_p = &crack_states_bitsliced_MMX;
611 break;
612 #endif
613 #endif
614 default:
615 crack_states_bitsliced_function_p = &crack_states_bitsliced_NOSIMD;
616 break;
617 }
618
619 // call the most optimized function for this CPU
620 return (*crack_states_bitsliced_function_p)(cuid, best_first_bytes, p, keys_found, num_keys_tested, nonces_to_bruteforce, bf_test_nonce_2nd_byte, nonces);
621 }
622
623 void bitslice_test_nonces_dispatch(uint32_t nonces_to_bruteforce, uint32_t *bf_test_nonce, uint8_t *bf_test_nonce_par) {
624 switch(GetSIMDInstrAuto()) {
625 #if defined (__i386__) || defined (__x86_64__)
626 #if !defined(__APPLE__) || (defined(__APPLE__) && (__clang_major__ > 8 || __clang_major__ == 8 && __clang_minor__ >= 1))
627 #if (__GNUC__ >= 5) && (__GNUC__ > 5 || __GNUC_MINOR__ > 2)
628 case SIMD_AVX512:
629 bitslice_test_nonces_function_p = &bitslice_test_nonces_AVX512;
630 break;
631 #endif
632 case SIMD_AVX2:
633 bitslice_test_nonces_function_p = &bitslice_test_nonces_AVX2;
634 break;
635 case SIMD_AVX:
636 bitslice_test_nonces_function_p = &bitslice_test_nonces_AVX;
637 break;
638 case SIMD_SSE2:
639 bitslice_test_nonces_function_p = &bitslice_test_nonces_SSE2;
640 break;
641 case SIMD_MMX:
642 bitslice_test_nonces_function_p = &bitslice_test_nonces_MMX;
643 break;
644 #endif
645 #endif
646 default:
647 bitslice_test_nonces_function_p = &bitslice_test_nonces_NOSIMD;
648 break;
649 }
650
651 // call the most optimized function for this CPU
652 (*bitslice_test_nonces_function_p)(nonces_to_bruteforce, bf_test_nonce, bf_test_nonce_par);
653 }
654
655 // Entries to dispatched function calls
656 const uint64_t crack_states_bitsliced(uint32_t cuid, uint8_t *best_first_bytes, statelist_t *p, uint32_t *keys_found, uint64_t *num_keys_tested, uint32_t nonces_to_bruteforce, uint8_t *bf_test_nonce_2nd_byte, noncelist_t *nonces) {
657 return (*crack_states_bitsliced_function_p)(cuid, best_first_bytes, p, keys_found, num_keys_tested, nonces_to_bruteforce, bf_test_nonce_2nd_byte, nonces);
658 }
659
// Public entry point for bitslicing the test nonces: forwards to whatever
// bitslice_test_nonces_function_p currently points to.
void bitslice_test_nonces(uint32_t nonces_to_bruteforce, uint32_t *bf_test_nonce, uint8_t *bf_test_nonce_par) {
    bitslice_test_nonces_function_p(nonces_to_bruteforce, bf_test_nonce, bf_test_nonce_par);
}
663
664 #endif
Impressum, Datenschutz