| 1 | //----------------------------------------------------------------------------- |
| 2 | // Copyright (C) 2016, 2017 by piwi |
| 3 | // |
| 4 | // This code is licensed to you under the terms of the GNU GPL, version 2 or, |
| 5 | // at your option, any later version. See the LICENSE.txt file for the text of |
| 6 | // the license. |
| 7 | //----------------------------------------------------------------------------- |
| 8 | // Implements a card only attack based on crypto text (encrypted nonces |
| 9 | // received during a nested authentication) only. Unlike other card only |
| 10 | // attacks this doesn't rely on implementation errors but only on the |
| 11 | // inherent weaknesses of the crypto1 cypher. Described in |
| 12 | // Carlo Meijer, Roel Verdult, "Ciphertext-only Cryptanalysis on Hardened |
| 13 | // Mifare Classic Cards" in Proceedings of the 22nd ACM SIGSAC Conference on |
| 14 | // Computer and Communications Security, 2015 |
| 15 | //----------------------------------------------------------------------------- |
| 16 | // |
| 17 | // brute forcing is based on @aczids bitsliced brute forcer |
| 18 | // https://github.com/aczid/crypto1_bs with some modifications. Mainly: |
| 19 | // - don't rollback. Start with 2nd byte of nonce instead |
| 20 | // - reuse results of filter subfunctions |
| 21 | // - reuse results of previous nonces if some first bits are identical |
| 22 | // |
| 23 | //----------------------------------------------------------------------------- |
| 24 | // aczid's Copyright notice: |
| 25 | // |
| 26 | // Bit-sliced Crypto-1 brute-forcing implementation |
| 27 | // Builds on the data structures returned by CraptEV1 craptev1_get_space(nonces, threshold, uid) |
| 28 | /* |
| 29 | Copyright (c) 2015-2016 Aram Verstegen |
| 30 | |
| 31 | Permission is hereby granted, free of charge, to any person obtaining a copy |
| 32 | of this software and associated documentation files (the "Software"), to deal |
| 33 | in the Software without restriction, including without limitation the rights |
| 34 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
| 35 | copies of the Software, and to permit persons to whom the Software is |
| 36 | furnished to do so, subject to the following conditions: |
| 37 | |
| 38 | The above copyright notice and this permission notice shall be included in |
| 39 | all copies or substantial portions of the Software. |
| 40 | |
| 41 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 42 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 43 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| 44 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| 45 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 46 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
| 47 | THE SOFTWARE. |
| 48 | */ |
| 49 | |
| 50 | #include "hardnested_bf_core.h" |
| 51 | |
| 52 | #include <stdint.h> |
| 53 | #include <stdbool.h> |
| 54 | #include <stdlib.h> |
| 55 | #ifndef __APPLE__ |
| 56 | #include <malloc.h> |
| 57 | #endif |
| 58 | #include <stdio.h> |
| 59 | #include <string.h> |
| 60 | #include "crapto1/crapto1.h" |
| 61 | #include "parity.h" |
| 62 | |
| 63 | // bitslice type |
| 64 | // while AVX supports 256 bit vector floating point operations, we need integer operations for boolean logic |
| 65 | // same for AVX2 and 512 bit vectors |
| 66 | // using larger vectors works but seems to generate more register pressure |
| 67 | #if defined(__AVX512F__) |
| 68 | #define MAX_BITSLICES 512 |
| 69 | #elif defined(__AVX2__) |
| 70 | #define MAX_BITSLICES 256 |
| 71 | #elif defined(__AVX__) |
| 72 | #define MAX_BITSLICES 128 |
| 73 | #elif defined(__SSE2__) |
| 74 | #define MAX_BITSLICES 128 |
| 75 | #else // MMX or SSE or NOSIMD |
| 76 | #define MAX_BITSLICES 64 |
| 77 | #endif |
| 78 | |
| 79 | #define VECTOR_SIZE (MAX_BITSLICES/8) |
| 80 | typedef unsigned int __attribute__((aligned(VECTOR_SIZE))) __attribute__((vector_size(VECTOR_SIZE))) bitslice_value_t; |
| 81 | typedef union { |
| 82 | bitslice_value_t value; |
| 83 | uint64_t bytes64[MAX_BITSLICES/64]; |
| 84 | uint8_t bytes[MAX_BITSLICES/8]; |
| 85 | } bitslice_t; |
| 86 | |
| 87 | // filter function (f20) |
| 88 | // sourced from ``Wirelessly Pickpocketing a Mifare Classic Card'' by Flavio Garcia, Peter van Rossum, Roel Verdult and Ronny Wichers Schreur |
| 89 | #define f20a(a,b,c,d) (((a|b)^(a&d))^(c&((a^b)|d))) |
| 90 | #define f20b(a,b,c,d) (((a&b)|c)^((a^b)&(c|d))) |
| 91 | #define f20c(a,b,c,d,e) ((a|((b|e)&(d^e)))^((a^(b&d))&((c^d)|(b&e)))) |
| 92 | |
| 93 | // bit indexing |
| 94 | #define get_bit(n, word) (((word) >> (n)) & 1) |
| 95 | #define get_vector_bit(slice, value) get_bit((slice)&0x3f, value.bytes64[(slice)>>6]) |
| 96 | |
| 97 | // size of crypto-1 state |
| 98 | #define STATE_SIZE 48 |
| 99 | // size of nonce to be decrypted |
| 100 | #define KEYSTREAM_SIZE 24 |
| 101 | |
| 102 | // endianness conversion |
| 103 | #define rev32(word) ((((word) & 0xff) << 24) | ((((word) >> 8) & 0xff) << 16) | ((((word) >> 16) & 0xff) << 8) | ((((word) >> 24) & 0xff))) |
| 104 | |
| 105 | // this needs to be compiled several times for each instruction set. |
| 106 | // For each instruction set, define a dedicated function name: |
| 107 | #if defined (__AVX512F__) |
| 108 | #define BITSLICE_TEST_NONCES bitslice_test_nonces_AVX512 |
| 109 | #define CRACK_STATES_BITSLICED crack_states_bitsliced_AVX512 |
| 110 | #elif defined (__AVX2__) |
| 111 | #define BITSLICE_TEST_NONCES bitslice_test_nonces_AVX2 |
| 112 | #define CRACK_STATES_BITSLICED crack_states_bitsliced_AVX2 |
| 113 | #elif defined (__AVX__) |
| 114 | #define BITSLICE_TEST_NONCES bitslice_test_nonces_AVX |
| 115 | #define CRACK_STATES_BITSLICED crack_states_bitsliced_AVX |
| 116 | #elif defined (__SSE2__) |
| 117 | #define BITSLICE_TEST_NONCES bitslice_test_nonces_SSE2 |
| 118 | #define CRACK_STATES_BITSLICED crack_states_bitsliced_SSE2 |
| 119 | #elif defined (__MMX__) |
| 120 | #define BITSLICE_TEST_NONCES bitslice_test_nonces_MMX |
| 121 | #define CRACK_STATES_BITSLICED crack_states_bitsliced_MMX |
| 122 | #else |
| 123 | #define BITSLICE_TEST_NONCES bitslice_test_nonces_NOSIMD |
| 124 | #define CRACK_STATES_BITSLICED crack_states_bitsliced_NOSIMD |
| 125 | #endif |
| 126 | |
| 127 | // typedefs and declaration of functions: |
| 128 | typedef const uint64_t crack_states_bitsliced_t(uint32_t, uint8_t*, statelist_t*, uint32_t*, uint64_t*, uint32_t, uint8_t*, noncelist_t*); |
| 129 | crack_states_bitsliced_t crack_states_bitsliced_AVX512; |
| 130 | crack_states_bitsliced_t crack_states_bitsliced_AVX2; |
| 131 | crack_states_bitsliced_t crack_states_bitsliced_AVX; |
| 132 | crack_states_bitsliced_t crack_states_bitsliced_SSE2; |
| 133 | crack_states_bitsliced_t crack_states_bitsliced_MMX; |
| 134 | crack_states_bitsliced_t crack_states_bitsliced_NOSIMD; |
| 135 | crack_states_bitsliced_t crack_states_bitsliced_dispatch; |
| 136 | |
| 137 | typedef void bitslice_test_nonces_t(uint32_t, uint32_t*, uint8_t*); |
| 138 | bitslice_test_nonces_t bitslice_test_nonces_AVX512; |
| 139 | bitslice_test_nonces_t bitslice_test_nonces_AVX2; |
| 140 | bitslice_test_nonces_t bitslice_test_nonces_AVX; |
| 141 | bitslice_test_nonces_t bitslice_test_nonces_SSE2; |
| 142 | bitslice_test_nonces_t bitslice_test_nonces_MMX; |
| 143 | bitslice_test_nonces_t bitslice_test_nonces_NOSIMD; |
| 144 | bitslice_test_nonces_t bitslice_test_nonces_dispatch; |
| 145 | |
| 146 | #if defined (_WIN32) |
| 147 | #define malloc_bitslice(x) __builtin_assume_aligned(_aligned_malloc((x), MAX_BITSLICES/8), MAX_BITSLICES/8) |
| 148 | #define free_bitslice(x) _aligned_free(x) |
| 149 | #elif defined (__APPLE__) |
| 150 | static void *malloc_bitslice(size_t x) { |
| 151 | char *allocated_memory; |
| 152 | if (posix_memalign((void**)&allocated_memory, MAX_BITSLICES/8, x)) { |
| 153 | return NULL; |
| 154 | } else { |
| 155 | return __builtin_assume_aligned(allocated_memory, MAX_BITSLICES/8); |
| 156 | } |
| 157 | } |
| 158 | #define free_bitslice(x) free(x) |
| 159 | #else |
| 160 | #define malloc_bitslice(x) memalign(MAX_BITSLICES/8, (x)) |
| 161 | #define free_bitslice(x) free(x) |
| 162 | #endif |
| 163 | |
| 164 | typedef enum { |
| 165 | EVEN_STATE = 0, |
| 166 | ODD_STATE = 1 |
| 167 | } odd_even_t; |
| 168 | |
| 169 | |
| 170 | // arrays of bitsliced states with identical values in all slices |
| 171 | static bitslice_t bitsliced_encrypted_nonces[256][KEYSTREAM_SIZE]; |
| 172 | static bitslice_t bitsliced_encrypted_parity_bits[256][4]; |
| 173 | // 1 and 0 vectors |
| 174 | static bitslice_t bs_ones; |
| 175 | static bitslice_t bs_zeroes; |
| 176 | |
| 177 | |
| 178 | void BITSLICE_TEST_NONCES(uint32_t nonces_to_bruteforce, uint32_t *bf_test_nonce, uint8_t *bf_test_nonce_par) { |
| 179 | |
| 180 | // initialize 1 and 0 vectors |
| 181 | memset(bs_ones.bytes, 0xff, VECTOR_SIZE); |
| 182 | memset(bs_zeroes.bytes, 0x00, VECTOR_SIZE); |
| 183 | |
| 184 | // bitslice nonces' 2nd to 4th byte |
| 185 | for (uint32_t i = 0; i < nonces_to_bruteforce; i++) { |
| 186 | for(uint32_t bit_idx = 0; bit_idx < KEYSTREAM_SIZE; bit_idx++){ |
| 187 | bool bit = get_bit(KEYSTREAM_SIZE-1-bit_idx, rev32(bf_test_nonce[i] << 8)); |
| 188 | if(bit){ |
| 189 | bitsliced_encrypted_nonces[i][bit_idx].value = bs_ones.value; |
| 190 | } else { |
| 191 | bitsliced_encrypted_nonces[i][bit_idx].value = bs_zeroes.value; |
| 192 | } |
| 193 | } |
| 194 | } |
| 195 | // bitslice nonces' parity (4 bits) |
| 196 | for (uint32_t i = 0; i < nonces_to_bruteforce; i++) { |
| 197 | for(uint32_t bit_idx = 0; bit_idx < 4; bit_idx++){ |
| 198 | bool bit = get_bit(4-1-bit_idx, bf_test_nonce_par[i]); |
| 199 | if(bit){ |
| 200 | bitsliced_encrypted_parity_bits[i][bit_idx].value = bs_ones.value; |
| 201 | } else { |
| 202 | bitsliced_encrypted_parity_bits[i][bit_idx].value = bs_zeroes.value; |
| 203 | } |
| 204 | } |
| 205 | } |
| 206 | |
| 207 | } |
| 208 | |
| 209 | |
| 210 | const uint64_t CRACK_STATES_BITSLICED(uint32_t cuid, uint8_t *best_first_bytes, statelist_t *p, uint32_t *keys_found, uint64_t *num_keys_tested, uint32_t nonces_to_bruteforce, uint8_t *bf_test_nonce_2nd_byte, noncelist_t *nonces){ |
| 211 | |
| 212 | // Unlike aczid's implementation this doesn't roll back at all when performing bitsliced bruteforce. |
| 213 | // We know that the best first byte is already shifted in. Testing with the remaining three bytes of |
| 214 | // the nonces is sufficient to eliminate most of them. The small rest is tested with a simple unsliced |
| 215 | // brute forcing (including roll back). |
| 216 | |
| 217 | bitslice_t states[KEYSTREAM_SIZE+STATE_SIZE]; |
| 218 | bitslice_t * restrict state_p; |
| 219 | uint64_t key = -1; |
| 220 | uint64_t bucket_states_tested = 0; |
| 221 | uint32_t bucket_size[(p->len[EVEN_STATE] - 1)/MAX_BITSLICES + 1]; |
| 222 | uint32_t bitsliced_blocks = 0; |
| 223 | uint32_t const *restrict p_even_end = p->states[EVEN_STATE] + p->len[EVEN_STATE]; |
| 224 | #if defined (DEBUG_BRUTE_FORCE) |
| 225 | uint32_t elimination_step = 0; |
| 226 | #define MAX_ELIMINATION_STEP 32 |
| 227 | uint64_t keys_eliminated[MAX_ELIMINATION_STEP] = {0}; |
| 228 | #endif |
| 229 | #ifdef DEBUG_KEY_ELIMINATION |
| 230 | bool bucket_contains_test_key[(p->len[EVEN_STATE] - 1)/MAX_BITSLICES + 1]; |
| 231 | #endif |
| 232 | |
| 233 | // constant ones/zeroes |
| 234 | bitslice_t bs_ones; |
| 235 | memset(bs_ones.bytes, 0xff, VECTOR_SIZE); |
| 236 | bitslice_t bs_zeroes; |
| 237 | memset(bs_zeroes.bytes, 0x00, VECTOR_SIZE); |
| 238 | |
| 239 | // bitslice all the even states |
| 240 | bitslice_t **restrict bitsliced_even_states = (bitslice_t **)malloc(((p->len[EVEN_STATE] - 1)/MAX_BITSLICES + 1) * sizeof(bitslice_t *)); |
| 241 | if (bitsliced_even_states == NULL) { |
| 242 | printf("Out of memory error in brute_force. Aborting..."); |
| 243 | exit(4); |
| 244 | } |
| 245 | bitslice_value_t *restrict bitsliced_even_feedback = malloc_bitslice(((p->len[EVEN_STATE] - 1)/MAX_BITSLICES + 1) * sizeof(bitslice_value_t)); |
| 246 | if (bitsliced_even_feedback == NULL) { |
| 247 | printf("Out of memory error in brute_force. Aborting..."); |
| 248 | exit(4); |
| 249 | } |
| 250 | for(uint32_t *restrict p_even = p->states[EVEN_STATE]; p_even < p_even_end; p_even += MAX_BITSLICES){ |
| 251 | bitslice_t *restrict lstate_p = malloc_bitslice(STATE_SIZE/2*sizeof(bitslice_t)); |
| 252 | if (lstate_p == NULL) { |
| 253 | printf("Out of memory error in brute_force. Aborting... \n"); |
| 254 | exit(4); |
| 255 | } |
| 256 | memset(lstate_p, 0x00, STATE_SIZE/2*sizeof(bitslice_t)); // zero even bits |
| 257 | // bitslice even half-states |
| 258 | const uint32_t max_slices = (p_even_end-p_even) < MAX_BITSLICES ? p_even_end-p_even : MAX_BITSLICES; |
| 259 | bucket_size[bitsliced_blocks] = max_slices; |
| 260 | #ifdef DEBUG_KEY_ELIMINATION |
| 261 | bucket_contains_test_key[bitsliced_blocks] = false; |
| 262 | #endif |
| 263 | uint32_t slice_idx; |
| 264 | for(slice_idx = 0; slice_idx < max_slices; ++slice_idx){ |
| 265 | uint32_t e = *(p_even+slice_idx); |
| 266 | #ifdef DEBUG_KEY_ELIMINATION |
| 267 | if (known_target_key != -1 && e == test_state[EVEN_STATE]) { |
| 268 | bucket_contains_test_key[bitsliced_blocks] = true; |
| 269 | // printf("bucket %d contains test key even state\n", bitsliced_blocks); |
| 270 | // printf("in slice %d\n", slice_idx); |
| 271 | } |
| 272 | #endif |
| 273 | for(uint32_t bit_idx = 0; bit_idx < STATE_SIZE/2; bit_idx++, e >>= 1){ |
| 274 | // set even bits |
| 275 | if(e&1){ |
| 276 | lstate_p[bit_idx].bytes64[slice_idx>>6] |= 1ull << (slice_idx & 0x3f); |
| 277 | } |
| 278 | } |
| 279 | } |
| 280 | // padding with last even state |
| 281 | for ( ; slice_idx < MAX_BITSLICES; ++slice_idx) { |
| 282 | uint32_t e = *(p_even_end-1); |
| 283 | for(uint32_t bit_idx = 0; bit_idx < STATE_SIZE/2; bit_idx++, e >>= 1){ |
| 284 | // set even bits |
| 285 | if(e&1){ |
| 286 | lstate_p[bit_idx].bytes64[slice_idx>>6] |= 1ull << (slice_idx & 0x3f); |
| 287 | } |
| 288 | } |
| 289 | } |
| 290 | bitsliced_even_states[bitsliced_blocks] = lstate_p; |
| 291 | // bitsliced_even_feedback[bitsliced_blocks] = bs_ones; |
| 292 | bitsliced_even_feedback[bitsliced_blocks] = lstate_p[(47- 0)/2].value ^ |
| 293 | lstate_p[(47-10)/2].value ^ lstate_p[(47-12)/2].value ^ lstate_p[(47-14)/2].value ^ |
| 294 | lstate_p[(47-24)/2].value ^ lstate_p[(47-42)/2].value; |
| 295 | bitsliced_blocks++; |
| 296 | } |
| 297 | // bitslice every odd state to every block of even states |
| 298 | for(uint32_t const *restrict p_odd = p->states[ODD_STATE]; p_odd < p->states[ODD_STATE] + p->len[ODD_STATE]; ++p_odd){ |
| 299 | // early abort |
| 300 | if(*keys_found){ |
| 301 | goto out; |
| 302 | } |
| 303 | |
| 304 | // set odd state bits and pre-compute first keystream bit vector. This is the same for all blocks of even states |
| 305 | |
| 306 | state_p = &states[KEYSTREAM_SIZE]; |
| 307 | uint32_t o = *p_odd; |
| 308 | |
| 309 | // pre-compute the odd feedback bit |
| 310 | bool odd_feedback_bit = evenparity32(o&0x29ce5c); |
| 311 | const bitslice_value_t odd_feedback = odd_feedback_bit ? bs_ones.value : bs_zeroes.value; |
| 312 | |
| 313 | // set odd state bits |
| 314 | for (uint32_t state_idx = 0; state_idx < STATE_SIZE; o >>= 1, state_idx += 2) { |
| 315 | if (o & 1){ |
| 316 | state_p[state_idx] = bs_ones; |
| 317 | } else { |
| 318 | state_p[state_idx] = bs_zeroes; |
| 319 | } |
| 320 | } |
| 321 | |
| 322 | bitslice_value_t crypto1_bs_f20b_2[16]; |
| 323 | bitslice_value_t crypto1_bs_f20b_3[8]; |
| 324 | |
| 325 | crypto1_bs_f20b_2[0] = f20b(state_p[47-25].value, state_p[47-27].value, state_p[47-29].value, state_p[47-31].value); |
| 326 | crypto1_bs_f20b_3[0] = f20b(state_p[47-41].value, state_p[47-43].value, state_p[47-45].value, state_p[47-47].value); |
| 327 | |
| 328 | bitslice_value_t ksb[8]; |
| 329 | ksb[0] = f20c(f20a(state_p[47- 9].value, state_p[47-11].value, state_p[47-13].value, state_p[47-15].value), |
| 330 | f20b(state_p[47-17].value, state_p[47-19].value, state_p[47-21].value, state_p[47-23].value), |
| 331 | crypto1_bs_f20b_2[0], |
| 332 | f20a(state_p[47-33].value, state_p[47-35].value, state_p[47-37].value, state_p[47-39].value), |
| 333 | crypto1_bs_f20b_3[0]); |
| 334 | |
| 335 | uint32_t *restrict p_even = p->states[EVEN_STATE]; |
| 336 | for (uint32_t block_idx = 0; block_idx < bitsliced_blocks; ++block_idx, p_even += MAX_BITSLICES) { |
| 337 | |
| 338 | #ifdef DEBUG_KEY_ELIMINATION |
| 339 | // if (known_target_key != -1 && bucket_contains_test_key[block_idx] && *p_odd == test_state[ODD_STATE]) { |
| 340 | // printf("Now testing known target key.\n"); |
| 341 | // printf("block_idx = %d/%d\n", block_idx, bitsliced_blocks); |
| 342 | // } |
| 343 | #endif |
| 344 | // add the even state bits |
| 345 | const bitslice_t *restrict bitsliced_even_state = bitsliced_even_states[block_idx]; |
| 346 | for(uint32_t state_idx = 1; state_idx < STATE_SIZE; state_idx += 2) { |
| 347 | state_p[state_idx] = bitsliced_even_state[state_idx/2]; |
| 348 | } |
| 349 | |
| 350 | // pre-compute first feedback bit vector. This is the same for all nonces |
| 351 | bitslice_value_t fbb[8]; |
| 352 | fbb[0] = odd_feedback ^ bitsliced_even_feedback[block_idx]; |
| 353 | |
| 354 | // vector to contain test results (1 = passed, 0 = failed) |
| 355 | bitslice_t results = bs_ones; |
| 356 | |
| 357 | // parity_bits |
| 358 | bitslice_value_t par[8]; |
| 359 | par[0] = bs_zeroes.value; |
| 360 | uint32_t next_common_bits = 0; |
| 361 | |
| 362 | for(uint32_t tests = 0; tests < nonces_to_bruteforce; ++tests){ |
| 363 | // common bits with preceding test nonce |
| 364 | uint32_t common_bits = next_common_bits; //tests ? trailing_zeros(bf_test_nonce_2nd_byte[tests] ^ bf_test_nonce_2nd_byte[tests-1]) : 0; |
| 365 | next_common_bits = tests < nonces_to_bruteforce - 1 ? trailing_zeros(bf_test_nonce_2nd_byte[tests] ^ bf_test_nonce_2nd_byte[tests+1]) : 0; |
| 366 | uint32_t parity_bit_idx = 1; // start checking with the parity of second nonce byte |
| 367 | bitslice_value_t fb_bits = fbb[common_bits]; // start with precomputed feedback bits from previous nonce |
| 368 | bitslice_value_t ks_bits = ksb[common_bits]; // dito for first keystream bits |
| 369 | bitslice_value_t parity_bit_vector = par[common_bits]; // dito for first parity vector |
| 370 | // bitslice_value_t fb_bits = fbb[0]; // start with precomputed feedback bits from previous nonce |
| 371 | // bitslice_value_t ks_bits = ksb[0]; // dito for first keystream bits |
| 372 | // bitslice_value_t parity_bit_vector = par[0]; // dito for first parity vector |
| 373 | state_p -= common_bits; // and reuse the already calculated state bits |
| 374 | // highest bit is transmitted/received first. We start with Bit 23 (highest bit of second nonce byte), |
| 375 | // or the highest bit which differs from the previous nonce |
| 376 | for (int32_t ks_idx = KEYSTREAM_SIZE-1-common_bits; ks_idx >= 0; --ks_idx) { |
| 377 | |
| 378 | // decrypt nonce bits |
| 379 | const bitslice_value_t encrypted_nonce_bit_vector = bitsliced_encrypted_nonces[tests][ks_idx].value; |
| 380 | const bitslice_value_t decrypted_nonce_bit_vector = encrypted_nonce_bit_vector ^ ks_bits; |
| 381 | |
| 382 | // compute real parity bits on the fly |
| 383 | parity_bit_vector ^= decrypted_nonce_bit_vector; |
| 384 | |
| 385 | // update state |
| 386 | state_p--; |
| 387 | state_p[0].value = fb_bits ^ decrypted_nonce_bit_vector; |
| 388 | |
| 389 | // update crypto1 subfunctions |
| 390 | bitslice_value_t f20a_1, f20b_1, f20b_2, f20a_2, f20b_3; |
| 391 | f20a_2 = f20a(state_p[47-33].value, state_p[47-35].value, state_p[47-37].value, state_p[47-39].value); |
| 392 | f20b_3 = f20b(state_p[47-41].value, state_p[47-43].value, state_p[47-45].value, state_p[47-47].value); |
| 393 | if (ks_idx > KEYSTREAM_SIZE - 8) { |
| 394 | f20a_1 = f20a(state_p[47- 9].value, state_p[47-11].value, state_p[47-13].value, state_p[47-15].value); |
| 395 | f20b_1 = f20b(state_p[47-17].value, state_p[47-19].value, state_p[47-21].value, state_p[47-23].value); |
| 396 | f20b_2 = f20b(state_p[47-25].value, state_p[47-27].value, state_p[47-29].value, state_p[47-31].value); |
| 397 | crypto1_bs_f20b_2[KEYSTREAM_SIZE - ks_idx] = f20b_2; |
| 398 | crypto1_bs_f20b_3[KEYSTREAM_SIZE - ks_idx] = f20b_3; |
| 399 | } else if (ks_idx > KEYSTREAM_SIZE - 16) { |
| 400 | f20a_1 = f20a(state_p[47- 9].value, state_p[47-11].value, state_p[47-13].value, state_p[47-15].value); |
| 401 | f20b_1 = crypto1_bs_f20b_2[KEYSTREAM_SIZE - ks_idx - 8]; |
| 402 | f20b_2 = f20b(state_p[47-25].value, state_p[47-27].value, state_p[47-29].value, state_p[47-31].value); |
| 403 | crypto1_bs_f20b_2[KEYSTREAM_SIZE - ks_idx] = f20b_2; |
| 404 | } else if (ks_idx > KEYSTREAM_SIZE - 24){ |
| 405 | f20a_1 = f20a(state_p[47- 9].value, state_p[47-11].value, state_p[47-13].value, state_p[47-15].value); |
| 406 | f20b_1 = crypto1_bs_f20b_2[KEYSTREAM_SIZE - ks_idx - 8]; |
| 407 | f20b_2 = crypto1_bs_f20b_3[KEYSTREAM_SIZE - ks_idx - 16]; |
| 408 | } else { |
| 409 | f20a_1 = f20a(state_p[47- 9].value, state_p[47-11].value, state_p[47-13].value, state_p[47-15].value); |
| 410 | f20b_1 = f20b(state_p[47-17].value, state_p[47-19].value, state_p[47-21].value, state_p[47-23].value); |
| 411 | f20b_2 = f20b(state_p[47-25].value, state_p[47-27].value, state_p[47-29].value, state_p[47-31].value); |
| 412 | } |
| 413 | // update keystream bit |
| 414 | ks_bits = f20c(f20a_1, f20b_1, f20b_2, f20a_2, f20b_3); |
| 415 | |
| 416 | // for each completed byte: |
| 417 | if ((ks_idx & 0x07) == 0) { |
| 418 | // get encrypted parity bits |
| 419 | const bitslice_value_t encrypted_parity_bit_vector = bitsliced_encrypted_parity_bits[tests][parity_bit_idx++].value; |
| 420 | |
| 421 | // decrypt parity bits |
| 422 | const bitslice_value_t decrypted_parity_bit_vector = encrypted_parity_bit_vector ^ ks_bits; |
| 423 | |
| 424 | // compare actual parity bits with decrypted parity bits and take count in results vector |
| 425 | results.value &= ~parity_bit_vector ^ decrypted_parity_bit_vector; |
| 426 | |
| 427 | // make sure we still have a match in our set |
| 428 | // if(memcmp(&results, &bs_zeroes, sizeof(bitslice_t)) == 0){ |
| 429 | |
| 430 | // this is much faster on my gcc, because somehow a memcmp needlessly spills/fills all the xmm registers to/from the stack - ??? |
| 431 | // the short-circuiting also helps |
| 432 | if(results.bytes64[0] == 0 |
| 433 | #if MAX_BITSLICES > 64 |
| 434 | && results.bytes64[1] == 0 |
| 435 | #endif |
| 436 | #if MAX_BITSLICES > 128 |
| 437 | && results.bytes64[2] == 0 |
| 438 | && results.bytes64[3] == 0 |
| 439 | #endif |
| 440 | ) { |
| 441 | #if defined (DEBUG_BRUTE_FORCE) |
| 442 | if (elimination_step < MAX_ELIMINATION_STEP) { |
| 443 | keys_eliminated[elimination_step] += MAX_BITSLICES; |
| 444 | } |
| 445 | #endif |
| 446 | #ifdef DEBUG_KEY_ELIMINATION |
| 447 | if (known_target_key != -1 && bucket_contains_test_key[block_idx] && *p_odd == test_state[ODD_STATE]) { |
| 448 | printf("Known target key eliminated in brute_force.\n"); |
| 449 | printf("block_idx = %d/%d, nonce = %d/%d\n", block_idx, bitsliced_blocks, tests, nonces_to_bruteforce); |
| 450 | } |
| 451 | #endif |
| 452 | goto stop_tests; |
| 453 | } |
| 454 | // prepare for next nonce byte |
| 455 | #if defined (DEBUG_BRUTE_FORCE) |
| 456 | elimination_step++; |
| 457 | #endif |
| 458 | parity_bit_vector = bs_zeroes.value; |
| 459 | } |
| 460 | // update feedback bit vector |
| 461 | if (ks_idx != 0) { |
| 462 | fb_bits = |
| 463 | (state_p[47- 0].value ^ state_p[47- 5].value ^ state_p[47- 9].value ^ |
| 464 | state_p[47-10].value ^ state_p[47-12].value ^ state_p[47-14].value ^ |
| 465 | state_p[47-15].value ^ state_p[47-17].value ^ state_p[47-19].value ^ |
| 466 | state_p[47-24].value ^ state_p[47-25].value ^ state_p[47-27].value ^ |
| 467 | state_p[47-29].value ^ state_p[47-35].value ^ state_p[47-39].value ^ |
| 468 | state_p[47-41].value ^ state_p[47-42].value ^ state_p[47-43].value); |
| 469 | } |
| 470 | // remember feedback and keystream vectors for later use |
| 471 | uint8_t bit = KEYSTREAM_SIZE - ks_idx; |
| 472 | if (bit <= next_common_bits) { // if needed and not yet stored |
| 473 | fbb[bit] = fb_bits; |
| 474 | ksb[bit] = ks_bits; |
| 475 | par[bit] = parity_bit_vector; |
| 476 | } |
| 477 | } |
| 478 | // prepare for next nonce. Revert to initial state |
| 479 | state_p = &states[KEYSTREAM_SIZE]; |
| 480 | } |
| 481 | |
| 482 | // all nonce tests were successful: we've found a possible key in this block! |
| 483 | uint32_t *p_even_test = p_even; |
| 484 | for (uint32_t results_word = 0; results_word < MAX_BITSLICES / 64; ++results_word) { |
| 485 | uint64_t results64 = results.bytes64[results_word]; |
| 486 | for (uint32_t results_bit = 0; results_bit < 64; results_bit++) { |
| 487 | if (results64 & 0x01) { |
| 488 | if (verify_key(cuid, nonces, best_first_bytes, *p_odd, *p_even_test)) { |
| 489 | struct Crypto1State pcs; |
| 490 | pcs.odd = *p_odd; |
| 491 | pcs.even = *p_even_test; |
| 492 | lfsr_rollback_byte(&pcs, (cuid >> 24) ^ best_first_bytes[0], true); |
| 493 | crypto1_get_lfsr(&pcs, &key); |
| 494 | bucket_states_tested += 64 * results_word + results_bit; |
| 495 | goto out; |
| 496 | } |
| 497 | #ifdef DEBUG_KEY_ELIMINATION |
| 498 | if (known_target_key != -1 && *p_even_test == test_state[EVEN_STATE] && *p_odd == test_state[ODD_STATE]) { |
| 499 | printf("Known target key eliminated in brute_force verification.\n"); |
| 500 | printf("block_idx = %d/%d\n", block_idx, bitsliced_blocks); |
| 501 | } |
| 502 | #endif |
| 503 | } |
| 504 | #ifdef DEBUG_KEY_ELIMINATION |
| 505 | if (known_target_key != -1 && *p_even_test == test_state[EVEN_STATE] && *p_odd == test_state[ODD_STATE]) { |
| 506 | printf("Known target key eliminated in brute_force (results_bit == 0).\n"); |
| 507 | printf("block_idx = %d/%d\n", block_idx, bitsliced_blocks); |
| 508 | } |
| 509 | #endif |
| 510 | results64 >>= 1; |
| 511 | p_even_test++; |
| 512 | if (p_even_test == p_even_end) { |
| 513 | goto stop_tests; |
| 514 | } |
| 515 | } |
| 516 | } |
| 517 | stop_tests: |
| 518 | #if defined (DEBUG_BRUTE_FORCE) |
| 519 | elimination_step = 0; |
| 520 | #endif |
| 521 | bucket_states_tested += bucket_size[block_idx]; |
| 522 | // prepare to set new states |
| 523 | state_p = &states[KEYSTREAM_SIZE]; |
| 524 | continue; |
| 525 | } |
| 526 | } |
| 527 | out: |
| 528 | for(uint32_t block_idx = 0; block_idx < bitsliced_blocks; ++block_idx){ |
| 529 | free_bitslice(bitsliced_even_states[block_idx]); |
| 530 | } |
| 531 | free(bitsliced_even_states); |
| 532 | free_bitslice(bitsliced_even_feedback); |
| 533 | __sync_fetch_and_add(num_keys_tested, bucket_states_tested); |
| 534 | |
| 535 | #if defined (DEBUG_BRUTE_FORCE) |
| 536 | for (uint32_t i = 0; i < MAX_ELIMINATION_STEP; i++) { |
| 537 | printf("Eliminated after %2u test_bytes: %5.2f%%\n", i+1, (float)keys_eliminated[i] / bucket_states_tested * 100); |
| 538 | } |
| 539 | #endif |
| 540 | return key; |
| 541 | } |
| 542 | |
| 543 | |
| 544 | |
| 545 | #ifndef __MMX__ |
| 546 | |
| 547 | // pointers to functions: |
| 548 | crack_states_bitsliced_t *crack_states_bitsliced_function_p = &crack_states_bitsliced_dispatch; |
| 549 | bitslice_test_nonces_t *bitslice_test_nonces_function_p = &bitslice_test_nonces_dispatch; |
| 550 | |
| 551 | // determine the available instruction set at runtime and call the correct function |
| 552 | const uint64_t crack_states_bitsliced_dispatch(uint32_t cuid, uint8_t *best_first_bytes, statelist_t *p, uint32_t *keys_found, uint64_t *num_keys_tested, uint32_t nonces_to_bruteforce, uint8_t *bf_test_nonce_2nd_byte, noncelist_t *nonces) { |
| 553 | #if defined (__i386__) || defined (__x86_64__) |
| 554 | #if !defined(__APPLE__) || (defined(__APPLE__) && (__clang_major__ > 8 || __clang_major__ == 8 && __clang_minor__ >= 1)) |
| 555 | #if (__GNUC__ >= 5) && (__GNUC__ > 5 || __GNUC_MINOR__ > 2) |
| 556 | if (__builtin_cpu_supports("avx512f")) crack_states_bitsliced_function_p = &crack_states_bitsliced_AVX512; |
| 557 | else if (__builtin_cpu_supports("avx2")) crack_states_bitsliced_function_p = &crack_states_bitsliced_AVX2; |
| 558 | #else |
| 559 | if (__builtin_cpu_supports("avx2")) crack_states_bitsliced_function_p = &crack_states_bitsliced_AVX2; |
| 560 | #endif |
| 561 | else if (__builtin_cpu_supports("avx")) crack_states_bitsliced_function_p = &crack_states_bitsliced_AVX; |
| 562 | else if (__builtin_cpu_supports("sse2")) crack_states_bitsliced_function_p = &crack_states_bitsliced_SSE2; |
| 563 | else if (__builtin_cpu_supports("mmx")) crack_states_bitsliced_function_p = &crack_states_bitsliced_MMX; |
| 564 | else |
| 565 | #endif |
| 566 | #endif |
| 567 | crack_states_bitsliced_function_p = &crack_states_bitsliced_NOSIMD; |
| 568 | |
| 569 | // call the most optimized function for this CPU |
| 570 | return (*crack_states_bitsliced_function_p)(cuid, best_first_bytes, p, keys_found, num_keys_tested, nonces_to_bruteforce, bf_test_nonce_2nd_byte, nonces); |
| 571 | } |
| 572 | |
| 573 | void bitslice_test_nonces_dispatch(uint32_t nonces_to_bruteforce, uint32_t *bf_test_nonce, uint8_t *bf_test_nonce_par) { |
| 574 | #if defined (__i386__) || defined (__x86_64__) |
| 575 | #if !defined(__APPLE__) || (defined(__APPLE__) && (__clang_major__ > 8 || __clang_major__ == 8 && __clang_minor__ >= 1)) |
| 576 | #if (__GNUC__ >= 5) && (__GNUC__ > 5 || __GNUC_MINOR__ > 2) |
| 577 | if (__builtin_cpu_supports("avx512f")) bitslice_test_nonces_function_p = &bitslice_test_nonces_AVX512; |
| 578 | else if (__builtin_cpu_supports("avx2")) bitslice_test_nonces_function_p = &bitslice_test_nonces_AVX2; |
| 579 | #else |
| 580 | if (__builtin_cpu_supports("avx2")) bitslice_test_nonces_function_p = &bitslice_test_nonces_AVX2; |
| 581 | #endif |
| 582 | else if (__builtin_cpu_supports("avx")) bitslice_test_nonces_function_p = &bitslice_test_nonces_AVX; |
| 583 | else if (__builtin_cpu_supports("sse2")) bitslice_test_nonces_function_p = &bitslice_test_nonces_SSE2; |
| 584 | else if (__builtin_cpu_supports("mmx")) bitslice_test_nonces_function_p = &bitslice_test_nonces_MMX; |
| 585 | else |
| 586 | #endif |
| 587 | #endif |
| 588 | bitslice_test_nonces_function_p = &bitslice_test_nonces_NOSIMD; |
| 589 | |
| 590 | // call the most optimized function for this CPU |
| 591 | (*bitslice_test_nonces_function_p)(nonces_to_bruteforce, bf_test_nonce, bf_test_nonce_par); |
| 592 | } |
| 593 | |
| 594 | // Entries to dispatched function calls |
| 595 | const uint64_t crack_states_bitsliced(uint32_t cuid, uint8_t *best_first_bytes, statelist_t *p, uint32_t *keys_found, uint64_t *num_keys_tested, uint32_t nonces_to_bruteforce, uint8_t *bf_test_nonce_2nd_byte, noncelist_t *nonces) { |
| 596 | return (*crack_states_bitsliced_function_p)(cuid, best_first_bytes, p, keys_found, num_keys_tested, nonces_to_bruteforce, bf_test_nonce_2nd_byte, nonces); |
| 597 | } |
| 598 | |
| 599 | void bitslice_test_nonces(uint32_t nonces_to_bruteforce, uint32_t *bf_test_nonce, uint8_t *bf_test_nonce_par) { |
| 600 | (*bitslice_test_nonces_function_p)(nonces_to_bruteforce, bf_test_nonce, bf_test_nonce_par); |
| 601 | } |
| 602 | |
| 603 | #endif |