]>
Commit | Line | Data |
---|---|---|
1 | //----------------------------------------------------------------------------- | |
2 | // Copyright (C) 2016, 2017 by piwi | |
3 | // | |
4 | // This code is licensed to you under the terms of the GNU GPL, version 2 or, | |
5 | // at your option, any later version. See the LICENSE.txt file for the text of | |
6 | // the license. | |
7 | //----------------------------------------------------------------------------- | |
8 | // Implements a card only attack based on crypto text (encrypted nonces | |
9 | // received during a nested authentication) only. Unlike other card only | |
10 | // attacks this doesn't rely on implementation errors but only on the | |
11 | // inherent weaknesses of the crypto1 cypher. Described in | |
12 | // Carlo Meijer, Roel Verdult, "Ciphertext-only Cryptanalysis on Hardened | |
13 | // Mifare Classic Cards" in Proceedings of the 22nd ACM SIGSAC Conference on | |
14 | // Computer and Communications Security, 2015 | |
15 | //----------------------------------------------------------------------------- | |
16 | // | |
17 | // brute forcing is based on @aczids bitsliced brute forcer | |
18 | // https://github.com/aczid/crypto1_bs with some modifications. Mainly: | |
19 | // - don't rollback. Start with 2nd byte of nonce instead | |
20 | // - reuse results of filter subfunctions | |
21 | // - reuse results of previous nonces if some first bits are identical | |
22 | // | |
23 | //----------------------------------------------------------------------------- | |
24 | // aczid's Copyright notice: | |
25 | // | |
26 | // Bit-sliced Crypto-1 brute-forcing implementation | |
27 | // Builds on the data structures returned by CraptEV1 craptev1_get_space(nonces, threshold, uid) | |
28 | /* | |
29 | Copyright (c) 2015-2016 Aram Verstegen | |
30 | ||
31 | Permission is hereby granted, free of charge, to any person obtaining a copy | |
32 | of this software and associated documentation files (the "Software"), to deal | |
33 | in the Software without restriction, including without limitation the rights | |
34 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
35 | copies of the Software, and to permit persons to whom the Software is | |
36 | furnished to do so, subject to the following conditions: | |
37 | ||
38 | The above copyright notice and this permission notice shall be included in | |
39 | all copies or substantial portions of the Software. | |
40 | ||
41 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
42 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
43 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
44 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
45 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
46 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
47 | THE SOFTWARE. | |
48 | */ | |
49 | ||
50 | #include "hardnested_bf_core.h" | |
51 | ||
52 | #include <stdint.h> | |
53 | #include <stdbool.h> | |
54 | #include <stdlib.h> | |
55 | #ifndef __APPLE__ | |
56 | #include <malloc.h> | |
57 | #endif | |
58 | #include <stdio.h> | |
59 | #include <string.h> | |
60 | #include "crapto1/crapto1.h" | |
61 | #include "parity.h" | |
62 | ||
63 | // bitslice type | |
64 | // while AVX supports 256 bit vector floating point operations, we need integer operations for boolean logic | |
65 | // same for AVX2 and 512 bit vectors | |
66 | // using larger vectors works but seems to generate more register pressure | |
67 | #if defined(__AVX512F__) | |
68 | #define MAX_BITSLICES 512 | |
69 | #elif defined(__AVX2__) | |
70 | #define MAX_BITSLICES 256 | |
71 | #elif defined(__AVX__) | |
72 | #define MAX_BITSLICES 128 | |
73 | #elif defined(__SSE2__) | |
74 | #define MAX_BITSLICES 128 | |
75 | #else // MMX or SSE or NOSIMD | |
76 | #define MAX_BITSLICES 64 | |
77 | #endif | |
78 | ||
79 | #define VECTOR_SIZE (MAX_BITSLICES/8) | |
80 | typedef unsigned int __attribute__((aligned(VECTOR_SIZE))) __attribute__((vector_size(VECTOR_SIZE))) bitslice_value_t; | |
81 | typedef union { | |
82 | bitslice_value_t value; | |
83 | uint64_t bytes64[MAX_BITSLICES/64]; | |
84 | uint8_t bytes[MAX_BITSLICES/8]; | |
85 | } bitslice_t; | |
86 | ||
87 | // filter function (f20) | |
88 | // sourced from ``Wirelessly Pickpocketing a Mifare Classic Card'' by Flavio Garcia, Peter van Rossum, Roel Verdult and Ronny Wichers Schreur | |
89 | #define f20a(a,b,c,d) (((a|b)^(a&d))^(c&((a^b)|d))) | |
90 | #define f20b(a,b,c,d) (((a&b)|c)^((a^b)&(c|d))) | |
91 | #define f20c(a,b,c,d,e) ((a|((b|e)&(d^e)))^((a^(b&d))&((c^d)|(b&e)))) | |
92 | ||
93 | // bit indexing | |
94 | #define get_bit(n, word) (((word) >> (n)) & 1) | |
95 | #define get_vector_bit(slice, value) get_bit((slice)&0x3f, value.bytes64[(slice)>>6]) | |
96 | ||
97 | // size of crypto-1 state | |
98 | #define STATE_SIZE 48 | |
99 | // size of nonce to be decrypted | |
100 | #define KEYSTREAM_SIZE 24 | |
101 | ||
102 | // endianness conversion | |
103 | #define rev32(word) ((((word) & 0xff) << 24) | ((((word) >> 8) & 0xff) << 16) | ((((word) >> 16) & 0xff) << 8) | ((((word) >> 24) & 0xff))) | |
104 | ||
105 | // this needs to be compiled several times for each instruction set. | |
106 | // For each instruction set, define a dedicated function name: | |
107 | #if defined (__AVX512F__) | |
108 | #define BITSLICE_TEST_NONCES bitslice_test_nonces_AVX512 | |
109 | #define CRACK_STATES_BITSLICED crack_states_bitsliced_AVX512 | |
110 | #elif defined (__AVX2__) | |
111 | #define BITSLICE_TEST_NONCES bitslice_test_nonces_AVX2 | |
112 | #define CRACK_STATES_BITSLICED crack_states_bitsliced_AVX2 | |
113 | #elif defined (__AVX__) | |
114 | #define BITSLICE_TEST_NONCES bitslice_test_nonces_AVX | |
115 | #define CRACK_STATES_BITSLICED crack_states_bitsliced_AVX | |
116 | #elif defined (__SSE2__) | |
117 | #define BITSLICE_TEST_NONCES bitslice_test_nonces_SSE2 | |
118 | #define CRACK_STATES_BITSLICED crack_states_bitsliced_SSE2 | |
119 | #elif defined (__MMX__) | |
120 | #define BITSLICE_TEST_NONCES bitslice_test_nonces_MMX | |
121 | #define CRACK_STATES_BITSLICED crack_states_bitsliced_MMX | |
122 | #else | |
123 | #define BITSLICE_TEST_NONCES bitslice_test_nonces_NOSIMD | |
124 | #define CRACK_STATES_BITSLICED crack_states_bitsliced_NOSIMD | |
125 | #endif | |
126 | ||
127 | // typedefs and declaration of functions: | |
128 | typedef const uint64_t crack_states_bitsliced_t(uint32_t, uint8_t*, statelist_t*, uint32_t*, uint64_t*, uint32_t, uint8_t*, noncelist_t*); | |
129 | crack_states_bitsliced_t crack_states_bitsliced_AVX512; | |
130 | crack_states_bitsliced_t crack_states_bitsliced_AVX2; | |
131 | crack_states_bitsliced_t crack_states_bitsliced_AVX; | |
132 | crack_states_bitsliced_t crack_states_bitsliced_SSE2; | |
133 | crack_states_bitsliced_t crack_states_bitsliced_MMX; | |
134 | crack_states_bitsliced_t crack_states_bitsliced_NOSIMD; | |
135 | crack_states_bitsliced_t crack_states_bitsliced_dispatch; | |
136 | ||
137 | typedef void bitslice_test_nonces_t(uint32_t, uint32_t*, uint8_t*); | |
138 | bitslice_test_nonces_t bitslice_test_nonces_AVX512; | |
139 | bitslice_test_nonces_t bitslice_test_nonces_AVX2; | |
140 | bitslice_test_nonces_t bitslice_test_nonces_AVX; | |
141 | bitslice_test_nonces_t bitslice_test_nonces_SSE2; | |
142 | bitslice_test_nonces_t bitslice_test_nonces_MMX; | |
143 | bitslice_test_nonces_t bitslice_test_nonces_NOSIMD; | |
144 | bitslice_test_nonces_t bitslice_test_nonces_dispatch; | |
145 | ||
146 | #if defined (_WIN32) | |
147 | #define malloc_bitslice(x) __builtin_assume_aligned(_aligned_malloc((x), MAX_BITSLICES/8), MAX_BITSLICES/8) | |
148 | #define free_bitslice(x) _aligned_free(x) | |
149 | #elif defined (__APPLE__) | |
150 | static void *malloc_bitslice(size_t x) { | |
151 | char *allocated_memory; | |
152 | if (posix_memalign((void**)&allocated_memory, MAX_BITSLICES/8, x)) { | |
153 | return NULL; | |
154 | } else { | |
155 | return __builtin_assume_aligned(allocated_memory, MAX_BITSLICES/8); | |
156 | } | |
157 | } | |
158 | #define free_bitslice(x) free(x) | |
159 | #else | |
160 | #define malloc_bitslice(x) memalign(MAX_BITSLICES/8, (x)) | |
161 | #define free_bitslice(x) free(x) | |
162 | #endif | |
163 | ||
164 | typedef enum { | |
165 | EVEN_STATE = 0, | |
166 | ODD_STATE = 1 | |
167 | } odd_even_t; | |
168 | ||
169 | ||
170 | // arrays of bitsliced states with identical values in all slices | |
171 | static bitslice_t bitsliced_encrypted_nonces[256][KEYSTREAM_SIZE]; | |
172 | static bitslice_t bitsliced_encrypted_parity_bits[256][4]; | |
173 | // 1 and 0 vectors | |
174 | static bitslice_t bs_ones; | |
175 | static bitslice_t bs_zeroes; | |
176 | ||
177 | ||
178 | void BITSLICE_TEST_NONCES(uint32_t nonces_to_bruteforce, uint32_t *bf_test_nonce, uint8_t *bf_test_nonce_par) { | |
179 | ||
180 | // initialize 1 and 0 vectors | |
181 | memset(bs_ones.bytes, 0xff, VECTOR_SIZE); | |
182 | memset(bs_zeroes.bytes, 0x00, VECTOR_SIZE); | |
183 | ||
184 | // bitslice nonces' 2nd to 4th byte | |
185 | for (uint32_t i = 0; i < nonces_to_bruteforce; i++) { | |
186 | for(uint32_t bit_idx = 0; bit_idx < KEYSTREAM_SIZE; bit_idx++){ | |
187 | bool bit = get_bit(KEYSTREAM_SIZE-1-bit_idx, rev32(bf_test_nonce[i] << 8)); | |
188 | if(bit){ | |
189 | bitsliced_encrypted_nonces[i][bit_idx].value = bs_ones.value; | |
190 | } else { | |
191 | bitsliced_encrypted_nonces[i][bit_idx].value = bs_zeroes.value; | |
192 | } | |
193 | } | |
194 | } | |
195 | // bitslice nonces' parity (4 bits) | |
196 | for (uint32_t i = 0; i < nonces_to_bruteforce; i++) { | |
197 | for(uint32_t bit_idx = 0; bit_idx < 4; bit_idx++){ | |
198 | bool bit = get_bit(4-1-bit_idx, bf_test_nonce_par[i]); | |
199 | if(bit){ | |
200 | bitsliced_encrypted_parity_bits[i][bit_idx].value = bs_ones.value; | |
201 | } else { | |
202 | bitsliced_encrypted_parity_bits[i][bit_idx].value = bs_zeroes.value; | |
203 | } | |
204 | } | |
205 | } | |
206 | ||
207 | } | |
208 | ||
209 | ||
210 | const uint64_t CRACK_STATES_BITSLICED(uint32_t cuid, uint8_t *best_first_bytes, statelist_t *p, uint32_t *keys_found, uint64_t *num_keys_tested, uint32_t nonces_to_bruteforce, uint8_t *bf_test_nonce_2nd_byte, noncelist_t *nonces){ | |
211 | ||
212 | // Unlike aczid's implementation this doesn't roll back at all when performing bitsliced bruteforce. | |
213 | // We know that the best first byte is already shifted in. Testing with the remaining three bytes of | |
214 | // the nonces is sufficient to eliminate most of them. The small rest is tested with a simple unsliced | |
215 | // brute forcing (including roll back). | |
216 | ||
217 | bitslice_t states[KEYSTREAM_SIZE+STATE_SIZE]; | |
218 | bitslice_t * restrict state_p; | |
219 | uint64_t key = -1; | |
220 | uint64_t bucket_states_tested = 0; | |
221 | uint32_t bucket_size[(p->len[EVEN_STATE] - 1)/MAX_BITSLICES + 1]; | |
222 | uint32_t bitsliced_blocks = 0; | |
223 | uint32_t const *restrict p_even_end = p->states[EVEN_STATE] + p->len[EVEN_STATE]; | |
224 | #if defined (DEBUG_BRUTE_FORCE) | |
225 | uint32_t elimination_step = 0; | |
226 | #define MAX_ELIMINATION_STEP 32 | |
227 | uint64_t keys_eliminated[MAX_ELIMINATION_STEP] = {0}; | |
228 | #endif | |
229 | #ifdef DEBUG_KEY_ELIMINATION | |
230 | bool bucket_contains_test_key[(p->len[EVEN_STATE] - 1)/MAX_BITSLICES + 1]; | |
231 | #endif | |
232 | ||
233 | // constant ones/zeroes | |
234 | bitslice_t bs_ones; | |
235 | memset(bs_ones.bytes, 0xff, VECTOR_SIZE); | |
236 | bitslice_t bs_zeroes; | |
237 | memset(bs_zeroes.bytes, 0x00, VECTOR_SIZE); | |
238 | ||
239 | // bitslice all the even states | |
240 | bitslice_t **restrict bitsliced_even_states = (bitslice_t **)malloc(((p->len[EVEN_STATE] - 1)/MAX_BITSLICES + 1) * sizeof(bitslice_t *)); | |
241 | if (bitsliced_even_states == NULL) { | |
242 | printf("Out of memory error in brute_force. Aborting..."); | |
243 | exit(4); | |
244 | } | |
245 | bitslice_value_t *restrict bitsliced_even_feedback = malloc_bitslice(((p->len[EVEN_STATE] - 1)/MAX_BITSLICES + 1) * sizeof(bitslice_value_t)); | |
246 | if (bitsliced_even_feedback == NULL) { | |
247 | printf("Out of memory error in brute_force. Aborting..."); | |
248 | exit(4); | |
249 | } | |
250 | for(uint32_t *restrict p_even = p->states[EVEN_STATE]; p_even < p_even_end; p_even += MAX_BITSLICES){ | |
251 | bitslice_t *restrict lstate_p = malloc_bitslice(STATE_SIZE/2*sizeof(bitslice_t)); | |
252 | if (lstate_p == NULL) { | |
253 | printf("Out of memory error in brute_force. Aborting... \n"); | |
254 | exit(4); | |
255 | } | |
256 | memset(lstate_p, 0x00, STATE_SIZE/2*sizeof(bitslice_t)); // zero even bits | |
257 | // bitslice even half-states | |
258 | const uint32_t max_slices = (p_even_end-p_even) < MAX_BITSLICES ? p_even_end-p_even : MAX_BITSLICES; | |
259 | bucket_size[bitsliced_blocks] = max_slices; | |
260 | #ifdef DEBUG_KEY_ELIMINATION | |
261 | bucket_contains_test_key[bitsliced_blocks] = false; | |
262 | #endif | |
263 | uint32_t slice_idx; | |
264 | for(slice_idx = 0; slice_idx < max_slices; ++slice_idx){ | |
265 | uint32_t e = *(p_even+slice_idx); | |
266 | #ifdef DEBUG_KEY_ELIMINATION | |
267 | if (known_target_key != -1 && e == test_state[EVEN_STATE]) { | |
268 | bucket_contains_test_key[bitsliced_blocks] = true; | |
269 | // printf("bucket %d contains test key even state\n", bitsliced_blocks); | |
270 | // printf("in slice %d\n", slice_idx); | |
271 | } | |
272 | #endif | |
273 | for(uint32_t bit_idx = 0; bit_idx < STATE_SIZE/2; bit_idx++, e >>= 1){ | |
274 | // set even bits | |
275 | if(e&1){ | |
276 | lstate_p[bit_idx].bytes64[slice_idx>>6] |= 1ull << (slice_idx & 0x3f); | |
277 | } | |
278 | } | |
279 | } | |
280 | // padding with last even state | |
281 | for ( ; slice_idx < MAX_BITSLICES; ++slice_idx) { | |
282 | uint32_t e = *(p_even_end-1); | |
283 | for(uint32_t bit_idx = 0; bit_idx < STATE_SIZE/2; bit_idx++, e >>= 1){ | |
284 | // set even bits | |
285 | if(e&1){ | |
286 | lstate_p[bit_idx].bytes64[slice_idx>>6] |= 1ull << (slice_idx & 0x3f); | |
287 | } | |
288 | } | |
289 | } | |
290 | bitsliced_even_states[bitsliced_blocks] = lstate_p; | |
291 | // bitsliced_even_feedback[bitsliced_blocks] = bs_ones; | |
292 | bitsliced_even_feedback[bitsliced_blocks] = lstate_p[(47- 0)/2].value ^ | |
293 | lstate_p[(47-10)/2].value ^ lstate_p[(47-12)/2].value ^ lstate_p[(47-14)/2].value ^ | |
294 | lstate_p[(47-24)/2].value ^ lstate_p[(47-42)/2].value; | |
295 | bitsliced_blocks++; | |
296 | } | |
297 | // bitslice every odd state to every block of even states | |
298 | for(uint32_t const *restrict p_odd = p->states[ODD_STATE]; p_odd < p->states[ODD_STATE] + p->len[ODD_STATE]; ++p_odd){ | |
299 | // early abort | |
300 | if(*keys_found){ | |
301 | goto out; | |
302 | } | |
303 | ||
304 | // set odd state bits and pre-compute first keystream bit vector. This is the same for all blocks of even states | |
305 | ||
306 | state_p = &states[KEYSTREAM_SIZE]; | |
307 | uint32_t o = *p_odd; | |
308 | ||
309 | // pre-compute the odd feedback bit | |
310 | bool odd_feedback_bit = evenparity32(o&0x29ce5c); | |
311 | const bitslice_value_t odd_feedback = odd_feedback_bit ? bs_ones.value : bs_zeroes.value; | |
312 | ||
313 | // set odd state bits | |
314 | for (uint32_t state_idx = 0; state_idx < STATE_SIZE; o >>= 1, state_idx += 2) { | |
315 | if (o & 1){ | |
316 | state_p[state_idx] = bs_ones; | |
317 | } else { | |
318 | state_p[state_idx] = bs_zeroes; | |
319 | } | |
320 | } | |
321 | ||
322 | bitslice_value_t crypto1_bs_f20b_2[16]; | |
323 | bitslice_value_t crypto1_bs_f20b_3[8]; | |
324 | ||
325 | crypto1_bs_f20b_2[0] = f20b(state_p[47-25].value, state_p[47-27].value, state_p[47-29].value, state_p[47-31].value); | |
326 | crypto1_bs_f20b_3[0] = f20b(state_p[47-41].value, state_p[47-43].value, state_p[47-45].value, state_p[47-47].value); | |
327 | ||
328 | bitslice_value_t ksb[8]; | |
329 | ksb[0] = f20c(f20a(state_p[47- 9].value, state_p[47-11].value, state_p[47-13].value, state_p[47-15].value), | |
330 | f20b(state_p[47-17].value, state_p[47-19].value, state_p[47-21].value, state_p[47-23].value), | |
331 | crypto1_bs_f20b_2[0], | |
332 | f20a(state_p[47-33].value, state_p[47-35].value, state_p[47-37].value, state_p[47-39].value), | |
333 | crypto1_bs_f20b_3[0]); | |
334 | ||
335 | uint32_t *restrict p_even = p->states[EVEN_STATE]; | |
336 | for (uint32_t block_idx = 0; block_idx < bitsliced_blocks; ++block_idx, p_even += MAX_BITSLICES) { | |
337 | ||
338 | #ifdef DEBUG_KEY_ELIMINATION | |
339 | // if (known_target_key != -1 && bucket_contains_test_key[block_idx] && *p_odd == test_state[ODD_STATE]) { | |
340 | // printf("Now testing known target key.\n"); | |
341 | // printf("block_idx = %d/%d\n", block_idx, bitsliced_blocks); | |
342 | // } | |
343 | #endif | |
344 | // add the even state bits | |
345 | const bitslice_t *restrict bitsliced_even_state = bitsliced_even_states[block_idx]; | |
346 | for(uint32_t state_idx = 1; state_idx < STATE_SIZE; state_idx += 2) { | |
347 | state_p[state_idx] = bitsliced_even_state[state_idx/2]; | |
348 | } | |
349 | ||
350 | // pre-compute first feedback bit vector. This is the same for all nonces | |
351 | bitslice_value_t fbb[8]; | |
352 | fbb[0] = odd_feedback ^ bitsliced_even_feedback[block_idx]; | |
353 | ||
354 | // vector to contain test results (1 = passed, 0 = failed) | |
355 | bitslice_t results = bs_ones; | |
356 | ||
357 | // parity_bits | |
358 | bitslice_value_t par[8]; | |
359 | par[0] = bs_zeroes.value; | |
360 | uint32_t next_common_bits = 0; | |
361 | ||
362 | for(uint32_t tests = 0; tests < nonces_to_bruteforce; ++tests){ | |
363 | // common bits with preceding test nonce | |
364 | uint32_t common_bits = next_common_bits; //tests ? trailing_zeros(bf_test_nonce_2nd_byte[tests] ^ bf_test_nonce_2nd_byte[tests-1]) : 0; | |
365 | next_common_bits = tests < nonces_to_bruteforce - 1 ? trailing_zeros(bf_test_nonce_2nd_byte[tests] ^ bf_test_nonce_2nd_byte[tests+1]) : 0; | |
366 | uint32_t parity_bit_idx = 1; // start checking with the parity of second nonce byte | |
367 | bitslice_value_t fb_bits = fbb[common_bits]; // start with precomputed feedback bits from previous nonce | |
368 | bitslice_value_t ks_bits = ksb[common_bits]; // dito for first keystream bits | |
369 | bitslice_value_t parity_bit_vector = par[common_bits]; // dito for first parity vector | |
370 | // bitslice_value_t fb_bits = fbb[0]; // start with precomputed feedback bits from previous nonce | |
371 | // bitslice_value_t ks_bits = ksb[0]; // dito for first keystream bits | |
372 | // bitslice_value_t parity_bit_vector = par[0]; // dito for first parity vector | |
373 | state_p -= common_bits; // and reuse the already calculated state bits | |
374 | // highest bit is transmitted/received first. We start with Bit 23 (highest bit of second nonce byte), | |
375 | // or the highest bit which differs from the previous nonce | |
376 | for (int32_t ks_idx = KEYSTREAM_SIZE-1-common_bits; ks_idx >= 0; --ks_idx) { | |
377 | ||
378 | // decrypt nonce bits | |
379 | const bitslice_value_t encrypted_nonce_bit_vector = bitsliced_encrypted_nonces[tests][ks_idx].value; | |
380 | const bitslice_value_t decrypted_nonce_bit_vector = encrypted_nonce_bit_vector ^ ks_bits; | |
381 | ||
382 | // compute real parity bits on the fly | |
383 | parity_bit_vector ^= decrypted_nonce_bit_vector; | |
384 | ||
385 | // update state | |
386 | state_p--; | |
387 | state_p[0].value = fb_bits ^ decrypted_nonce_bit_vector; | |
388 | ||
389 | // update crypto1 subfunctions | |
390 | bitslice_value_t f20a_1, f20b_1, f20b_2, f20a_2, f20b_3; | |
391 | f20a_2 = f20a(state_p[47-33].value, state_p[47-35].value, state_p[47-37].value, state_p[47-39].value); | |
392 | f20b_3 = f20b(state_p[47-41].value, state_p[47-43].value, state_p[47-45].value, state_p[47-47].value); | |
393 | if (ks_idx > KEYSTREAM_SIZE - 8) { | |
394 | f20a_1 = f20a(state_p[47- 9].value, state_p[47-11].value, state_p[47-13].value, state_p[47-15].value); | |
395 | f20b_1 = f20b(state_p[47-17].value, state_p[47-19].value, state_p[47-21].value, state_p[47-23].value); | |
396 | f20b_2 = f20b(state_p[47-25].value, state_p[47-27].value, state_p[47-29].value, state_p[47-31].value); | |
397 | crypto1_bs_f20b_2[KEYSTREAM_SIZE - ks_idx] = f20b_2; | |
398 | crypto1_bs_f20b_3[KEYSTREAM_SIZE - ks_idx] = f20b_3; | |
399 | } else if (ks_idx > KEYSTREAM_SIZE - 16) { | |
400 | f20a_1 = f20a(state_p[47- 9].value, state_p[47-11].value, state_p[47-13].value, state_p[47-15].value); | |
401 | f20b_1 = crypto1_bs_f20b_2[KEYSTREAM_SIZE - ks_idx - 8]; | |
402 | f20b_2 = f20b(state_p[47-25].value, state_p[47-27].value, state_p[47-29].value, state_p[47-31].value); | |
403 | crypto1_bs_f20b_2[KEYSTREAM_SIZE - ks_idx] = f20b_2; | |
404 | } else if (ks_idx > KEYSTREAM_SIZE - 24){ | |
405 | f20a_1 = f20a(state_p[47- 9].value, state_p[47-11].value, state_p[47-13].value, state_p[47-15].value); | |
406 | f20b_1 = crypto1_bs_f20b_2[KEYSTREAM_SIZE - ks_idx - 8]; | |
407 | f20b_2 = crypto1_bs_f20b_3[KEYSTREAM_SIZE - ks_idx - 16]; | |
408 | } else { | |
409 | f20a_1 = f20a(state_p[47- 9].value, state_p[47-11].value, state_p[47-13].value, state_p[47-15].value); | |
410 | f20b_1 = f20b(state_p[47-17].value, state_p[47-19].value, state_p[47-21].value, state_p[47-23].value); | |
411 | f20b_2 = f20b(state_p[47-25].value, state_p[47-27].value, state_p[47-29].value, state_p[47-31].value); | |
412 | } | |
413 | // update keystream bit | |
414 | ks_bits = f20c(f20a_1, f20b_1, f20b_2, f20a_2, f20b_3); | |
415 | ||
416 | // for each completed byte: | |
417 | if ((ks_idx & 0x07) == 0) { | |
418 | // get encrypted parity bits | |
419 | const bitslice_value_t encrypted_parity_bit_vector = bitsliced_encrypted_parity_bits[tests][parity_bit_idx++].value; | |
420 | ||
421 | // decrypt parity bits | |
422 | const bitslice_value_t decrypted_parity_bit_vector = encrypted_parity_bit_vector ^ ks_bits; | |
423 | ||
424 | // compare actual parity bits with decrypted parity bits and take count in results vector | |
425 | results.value &= ~parity_bit_vector ^ decrypted_parity_bit_vector; | |
426 | ||
427 | // make sure we still have a match in our set | |
428 | // if(memcmp(&results, &bs_zeroes, sizeof(bitslice_t)) == 0){ | |
429 | ||
430 | // this is much faster on my gcc, because somehow a memcmp needlessly spills/fills all the xmm registers to/from the stack - ??? | |
431 | // the short-circuiting also helps | |
432 | if(results.bytes64[0] == 0 | |
433 | #if MAX_BITSLICES > 64 | |
434 | && results.bytes64[1] == 0 | |
435 | #endif | |
436 | #if MAX_BITSLICES > 128 | |
437 | && results.bytes64[2] == 0 | |
438 | && results.bytes64[3] == 0 | |
439 | #endif | |
440 | ) { | |
441 | #if defined (DEBUG_BRUTE_FORCE) | |
442 | if (elimination_step < MAX_ELIMINATION_STEP) { | |
443 | keys_eliminated[elimination_step] += MAX_BITSLICES; | |
444 | } | |
445 | #endif | |
446 | #ifdef DEBUG_KEY_ELIMINATION | |
447 | if (known_target_key != -1 && bucket_contains_test_key[block_idx] && *p_odd == test_state[ODD_STATE]) { | |
448 | printf("Known target key eliminated in brute_force.\n"); | |
449 | printf("block_idx = %d/%d, nonce = %d/%d\n", block_idx, bitsliced_blocks, tests, nonces_to_bruteforce); | |
450 | } | |
451 | #endif | |
452 | goto stop_tests; | |
453 | } | |
454 | // prepare for next nonce byte | |
455 | #if defined (DEBUG_BRUTE_FORCE) | |
456 | elimination_step++; | |
457 | #endif | |
458 | parity_bit_vector = bs_zeroes.value; | |
459 | } | |
460 | // update feedback bit vector | |
461 | if (ks_idx != 0) { | |
462 | fb_bits = | |
463 | (state_p[47- 0].value ^ state_p[47- 5].value ^ state_p[47- 9].value ^ | |
464 | state_p[47-10].value ^ state_p[47-12].value ^ state_p[47-14].value ^ | |
465 | state_p[47-15].value ^ state_p[47-17].value ^ state_p[47-19].value ^ | |
466 | state_p[47-24].value ^ state_p[47-25].value ^ state_p[47-27].value ^ | |
467 | state_p[47-29].value ^ state_p[47-35].value ^ state_p[47-39].value ^ | |
468 | state_p[47-41].value ^ state_p[47-42].value ^ state_p[47-43].value); | |
469 | } | |
470 | // remember feedback and keystream vectors for later use | |
471 | uint8_t bit = KEYSTREAM_SIZE - ks_idx; | |
472 | if (bit <= next_common_bits) { // if needed and not yet stored | |
473 | fbb[bit] = fb_bits; | |
474 | ksb[bit] = ks_bits; | |
475 | par[bit] = parity_bit_vector; | |
476 | } | |
477 | } | |
478 | // prepare for next nonce. Revert to initial state | |
479 | state_p = &states[KEYSTREAM_SIZE]; | |
480 | } | |
481 | ||
482 | // all nonce tests were successful: we've found a possible key in this block! | |
483 | uint32_t *p_even_test = p_even; | |
484 | for (uint32_t results_word = 0; results_word < MAX_BITSLICES / 64; ++results_word) { | |
485 | uint64_t results64 = results.bytes64[results_word]; | |
486 | for (uint32_t results_bit = 0; results_bit < 64; results_bit++) { | |
487 | if (results64 & 0x01) { | |
488 | if (verify_key(cuid, nonces, best_first_bytes, *p_odd, *p_even_test)) { | |
489 | struct Crypto1State pcs; | |
490 | pcs.odd = *p_odd; | |
491 | pcs.even = *p_even_test; | |
492 | lfsr_rollback_byte(&pcs, (cuid >> 24) ^ best_first_bytes[0], true); | |
493 | crypto1_get_lfsr(&pcs, &key); | |
494 | bucket_states_tested += 64 * results_word + results_bit; | |
495 | goto out; | |
496 | } | |
497 | #ifdef DEBUG_KEY_ELIMINATION | |
498 | if (known_target_key != -1 && *p_even_test == test_state[EVEN_STATE] && *p_odd == test_state[ODD_STATE]) { | |
499 | printf("Known target key eliminated in brute_force verification.\n"); | |
500 | printf("block_idx = %d/%d\n", block_idx, bitsliced_blocks); | |
501 | } | |
502 | #endif | |
503 | } | |
504 | #ifdef DEBUG_KEY_ELIMINATION | |
505 | if (known_target_key != -1 && *p_even_test == test_state[EVEN_STATE] && *p_odd == test_state[ODD_STATE]) { | |
506 | printf("Known target key eliminated in brute_force (results_bit == 0).\n"); | |
507 | printf("block_idx = %d/%d\n", block_idx, bitsliced_blocks); | |
508 | } | |
509 | #endif | |
510 | results64 >>= 1; | |
511 | p_even_test++; | |
512 | if (p_even_test == p_even_end) { | |
513 | goto stop_tests; | |
514 | } | |
515 | } | |
516 | } | |
517 | stop_tests: | |
518 | #if defined (DEBUG_BRUTE_FORCE) | |
519 | elimination_step = 0; | |
520 | #endif | |
521 | bucket_states_tested += bucket_size[block_idx]; | |
522 | // prepare to set new states | |
523 | state_p = &states[KEYSTREAM_SIZE]; | |
524 | continue; | |
525 | } | |
526 | } | |
527 | out: | |
528 | for(uint32_t block_idx = 0; block_idx < bitsliced_blocks; ++block_idx){ | |
529 | free_bitslice(bitsliced_even_states[block_idx]); | |
530 | } | |
531 | free(bitsliced_even_states); | |
532 | free_bitslice(bitsliced_even_feedback); | |
533 | __sync_fetch_and_add(num_keys_tested, bucket_states_tested); | |
534 | ||
535 | #if defined (DEBUG_BRUTE_FORCE) | |
536 | for (uint32_t i = 0; i < MAX_ELIMINATION_STEP; i++) { | |
537 | printf("Eliminated after %2u test_bytes: %5.2f%%\n", i+1, (float)keys_eliminated[i] / bucket_states_tested * 100); | |
538 | } | |
539 | #endif | |
540 | return key; | |
541 | } | |
542 | ||
543 | ||
544 | ||
545 | #ifndef __MMX__ | |
546 | ||
547 | // pointers to functions: | |
548 | crack_states_bitsliced_t *crack_states_bitsliced_function_p = &crack_states_bitsliced_dispatch; | |
549 | bitslice_test_nonces_t *bitslice_test_nonces_function_p = &bitslice_test_nonces_dispatch; | |
550 | ||
551 | static SIMDExecInstr intSIMDInstr = SIMD_AUTO; | |
552 | ||
553 | void SetSIMDInstr(SIMDExecInstr instr) { | |
554 | intSIMDInstr = instr; | |
555 | ||
556 | crack_states_bitsliced_function_p = &crack_states_bitsliced_dispatch; | |
557 | bitslice_test_nonces_function_p = &bitslice_test_nonces_dispatch; | |
558 | } | |
559 | ||
560 | SIMDExecInstr GetSIMDInstr() { | |
561 | SIMDExecInstr instr = SIMD_NONE; | |
562 | ||
563 | #if defined (__i386__) || defined (__x86_64__) | |
564 | #if !defined(__APPLE__) || (defined(__APPLE__) && (__clang_major__ > 8 || __clang_major__ == 8 && __clang_minor__ >= 1)) | |
565 | #if (__GNUC__ >= 5) && (__GNUC__ > 5 || __GNUC_MINOR__ > 2) | |
566 | if (__builtin_cpu_supports("avx512f")) instr = SIMD_AVX512; | |
567 | else if (__builtin_cpu_supports("avx2")) instr = SIMD_AVX2; | |
568 | #else | |
569 | if (__builtin_cpu_supports("avx2")) instr = SIMD_AVX2; | |
570 | #endif | |
571 | else if (__builtin_cpu_supports("avx")) instr = SIMD_AVX; | |
572 | else if (__builtin_cpu_supports("sse2")) instr = SIMD_SSE2; | |
573 | else if (__builtin_cpu_supports("mmx")) instr = SIMD_MMX; | |
574 | else | |
575 | #endif | |
576 | #endif | |
577 | instr = SIMD_NONE; | |
578 | ||
579 | return instr; | |
580 | } | |
581 | ||
582 | SIMDExecInstr GetSIMDInstrAuto() { | |
583 | SIMDExecInstr instr = intSIMDInstr; | |
584 | if (instr == SIMD_AUTO) | |
585 | return GetSIMDInstr(); | |
586 | ||
587 | return instr; | |
588 | } | |
589 | ||
590 | // determine the available instruction set at runtime and call the correct function | |
591 | const uint64_t crack_states_bitsliced_dispatch(uint32_t cuid, uint8_t *best_first_bytes, statelist_t *p, uint32_t *keys_found, uint64_t *num_keys_tested, uint32_t nonces_to_bruteforce, uint8_t *bf_test_nonce_2nd_byte, noncelist_t *nonces) { | |
592 | switch(GetSIMDInstrAuto()) { | |
593 | #if defined (__i386__) || defined (__x86_64__) | |
594 | #if !defined(__APPLE__) || (defined(__APPLE__) && (__clang_major__ > 8 || __clang_major__ == 8 && __clang_minor__ >= 1)) | |
595 | #if (__GNUC__ >= 5) && (__GNUC__ > 5 || __GNUC_MINOR__ > 2) | |
596 | case SIMD_AVX512: | |
597 | crack_states_bitsliced_function_p = &crack_states_bitsliced_AVX512; | |
598 | break; | |
599 | #endif | |
600 | case SIMD_AVX2: | |
601 | crack_states_bitsliced_function_p = &crack_states_bitsliced_AVX2; | |
602 | break; | |
603 | case SIMD_AVX: | |
604 | crack_states_bitsliced_function_p = &crack_states_bitsliced_AVX; | |
605 | break; | |
606 | case SIMD_SSE2: | |
607 | crack_states_bitsliced_function_p = &crack_states_bitsliced_SSE2; | |
608 | break; | |
609 | case SIMD_MMX: | |
610 | crack_states_bitsliced_function_p = &crack_states_bitsliced_MMX; | |
611 | break; | |
612 | #endif | |
613 | #endif | |
614 | default: | |
615 | crack_states_bitsliced_function_p = &crack_states_bitsliced_NOSIMD; | |
616 | break; | |
617 | } | |
618 | ||
619 | // call the most optimized function for this CPU | |
620 | return (*crack_states_bitsliced_function_p)(cuid, best_first_bytes, p, keys_found, num_keys_tested, nonces_to_bruteforce, bf_test_nonce_2nd_byte, nonces); | |
621 | } | |
622 | ||
623 | void bitslice_test_nonces_dispatch(uint32_t nonces_to_bruteforce, uint32_t *bf_test_nonce, uint8_t *bf_test_nonce_par) { | |
624 | switch(GetSIMDInstrAuto()) { | |
625 | #if defined (__i386__) || defined (__x86_64__) | |
626 | #if !defined(__APPLE__) || (defined(__APPLE__) && (__clang_major__ > 8 || __clang_major__ == 8 && __clang_minor__ >= 1)) | |
627 | #if (__GNUC__ >= 5) && (__GNUC__ > 5 || __GNUC_MINOR__ > 2) | |
628 | case SIMD_AVX512: | |
629 | bitslice_test_nonces_function_p = &bitslice_test_nonces_AVX512; | |
630 | break; | |
631 | #endif | |
632 | case SIMD_AVX2: | |
633 | bitslice_test_nonces_function_p = &bitslice_test_nonces_AVX2; | |
634 | break; | |
635 | case SIMD_AVX: | |
636 | bitslice_test_nonces_function_p = &bitslice_test_nonces_AVX; | |
637 | break; | |
638 | case SIMD_SSE2: | |
639 | bitslice_test_nonces_function_p = &bitslice_test_nonces_SSE2; | |
640 | break; | |
641 | case SIMD_MMX: | |
642 | bitslice_test_nonces_function_p = &bitslice_test_nonces_MMX; | |
643 | break; | |
644 | #endif | |
645 | #endif | |
646 | default: | |
647 | bitslice_test_nonces_function_p = &bitslice_test_nonces_NOSIMD; | |
648 | break; | |
649 | } | |
650 | ||
651 | // call the most optimized function for this CPU | |
652 | (*bitslice_test_nonces_function_p)(nonces_to_bruteforce, bf_test_nonce, bf_test_nonce_par); | |
653 | } | |
654 | ||
655 | // Entries to dispatched function calls | |
656 | const uint64_t crack_states_bitsliced(uint32_t cuid, uint8_t *best_first_bytes, statelist_t *p, uint32_t *keys_found, uint64_t *num_keys_tested, uint32_t nonces_to_bruteforce, uint8_t *bf_test_nonce_2nd_byte, noncelist_t *nonces) { | |
657 | return (*crack_states_bitsliced_function_p)(cuid, best_first_bytes, p, keys_found, num_keys_tested, nonces_to_bruteforce, bf_test_nonce_2nd_byte, nonces); | |
658 | } | |
659 | ||
660 | void bitslice_test_nonces(uint32_t nonces_to_bruteforce, uint32_t *bf_test_nonce, uint8_t *bf_test_nonce_par) { | |
661 | (*bitslice_test_nonces_function_p)(nonces_to_bruteforce, bf_test_nonce, bf_test_nonce_par); | |
662 | } | |
663 | ||
664 | #endif |