New: implementing hf mf hardnested
authorpwpiwi <pwpiwi@users.noreply.github.com>
Mon, 29 May 2017 08:56:37 +0000 (10:56 +0200)
committerpwpiwi <pwpiwi@users.noreply.github.com>
Wed, 31 May 2017 05:30:56 +0000 (07:30 +0200)
This implements the attack described in
Carlo Meijer, Roel Verdult, "Ciphertext-only Cryptanalysis on Hardened
Mifare Classic Cards" in Proceedings of the 22nd ACM SIGSAC Conference on
Computer and Communications Security, 2015
It uses precomputed tables for many bitflip properties (not only two as in the paper)
and is therefore quite efficient. To prevent failing it doesn't do
differential analysis with several nonce bytes' Sum(a8) properties (each of them
may be wrongly guessed) - instead it concentrates on one nonce byte and tries all
Sum(a8) property guesses sequentially (ordered by probability). The brute force phase
makes use of aczid's bit sliced brute forcer (https://github.com/aczid/crypto1_bs).
Includes runtime CPU-detection to leverage modern (and old) SIMD instructions
with a single executable.

370 files changed:
.gitattributes
.gitignore
armsrc/appmain.c
armsrc/apps.h
client/Makefile
client/cmdhfmf.c
client/cmdhfmfhard.c [new file with mode: 0644]
client/cmdhfmfhard.h [new file with mode: 0644]
client/hardnested/bf_bench_data.bin [new file with mode: 0644]
client/hardnested/hardnested_bf_core.c [new file with mode: 0644]
client/hardnested/hardnested_bf_core.h [new file with mode: 0644]
client/hardnested/hardnested_bitarray_core.c [new file with mode: 0644]
client/hardnested/hardnested_bitarray_core.h [new file with mode: 0644]
client/hardnested/hardnested_bruteforce.c [new file with mode: 0644]
client/hardnested/hardnested_bruteforce.h [new file with mode: 0644]
client/hardnested/hardnested_tables.c [new file with mode: 0644]
client/hardnested/tables/bitflip_0_001_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_003_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_005_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_007_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_009_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_00b_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_00d_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_00f_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_010_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_014_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_01c_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_021_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_023_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_025_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_027_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_029_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_02b_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_02d_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_02f_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_030_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_034_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_03c_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_040_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_044_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_04c_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_051_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_053_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_055_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_057_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_059_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_05b_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_05d_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_05f_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_064_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_06c_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_071_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_073_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_075_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_077_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_079_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_07b_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_07f_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_081_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_083_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_085_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_087_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_089_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_08b_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_08d_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_08f_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_090_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_094_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_09c_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_0a1_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_0a3_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_0a5_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_0a7_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_0a9_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_0ab_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_0ad_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_0af_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_0b0_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_0b4_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_0bc_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_0c0_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_0c4_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_0cc_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_0d1_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_0d3_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_0d5_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_0d7_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_0d9_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_0db_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_0dd_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_0df_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_0e4_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_0ec_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_0f3_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_0f5_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_0f7_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_0f9_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_0fb_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_0fd_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_0ff_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_104_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_10c_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_111_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_113_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_115_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_117_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_119_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_11b_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_11d_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_11f_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_124_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_12c_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_131_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_133_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_135_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_137_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_139_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_13b_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_13d_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_13f_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_141_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_143_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_145_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_147_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_149_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_14b_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_14d_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_14f_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_150_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_154_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_15c_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_161_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_163_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_165_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_167_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_169_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_16b_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_16d_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_16f_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_170_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_174_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_17c_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_184_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_18c_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_191_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_193_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_195_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_197_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_199_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_19b_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_19d_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_19f_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_1a4_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_1ac_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_1b1_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_1b3_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_1b5_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_1b7_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_1b9_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_1bb_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_1bd_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_1bf_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_1c1_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_1c3_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_1c5_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_1c9_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_1cb_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_1d0_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_1d4_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_1dc_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_1e1_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_1e3_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_1e5_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_1e7_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_1e9_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_1eb_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_1ed_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_1ef_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_1f0_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_1f4_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_1fc_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_210_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_225_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_227_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_22d_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_22f_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_240_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_275_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_277_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_27f_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_294_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_2a1_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_2a3_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_2a9_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_2ab_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_2c4_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_2f3_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_2f9_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_2fb_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_335_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_337_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_33d_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_33f_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_350_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_365_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_367_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_36d_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_36f_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_384_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_3b1_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_3b3_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_3b9_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_3bb_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_3d4_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_3e1_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_3e3_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_3e9_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_0_3eb_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_002_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_008_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_00a_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_012_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_018_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_01a_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_020_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_028_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_02a_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_02e_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_032_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_036_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_038_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_03a_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_03e_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_040_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_042_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_046_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_048_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_04a_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_04e_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_052_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_056_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_058_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_05a_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_05e_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_060_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_062_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_066_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_068_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_06a_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_06e_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_072_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_076_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_078_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_07a_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_07e_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_080_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_082_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_086_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_088_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_08a_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_08e_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_092_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_096_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_098_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_09a_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_09e_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_0a0_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_0a2_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_0a6_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_0a8_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_0aa_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_0ae_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_0b2_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_0b6_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_0b8_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_0ba_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_0be_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_0c0_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_0c2_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_0c6_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_0c8_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_0ca_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_0ce_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_0d2_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_0d6_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_0d8_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_0da_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_0de_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_0e0_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_0e8_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_0f8_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_108_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_111_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_113_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_115_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_117_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_118_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_11a_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_11b_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_120_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_122_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_128_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_131_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_135_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_138_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_145_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_147_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_148_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_158_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_160_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_161_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_163_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_165_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_168_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_178_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_180_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_188_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_191_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_198_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_199_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_19d_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_19f_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_1a0_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_1a8_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_1b3_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_1b5_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_1b7_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_1b8_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_1b9_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_1bd_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_1c1_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_1c3_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_1c8_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_1c9_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_1cd_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_1cf_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_1d8_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_1e0_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_1e1_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_1e5_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_1e7_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_1e8_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_1e9_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_1eb_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_1ed_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_1f8_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_208_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_220_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_24a_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_24e_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_25a_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_25e_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_262_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_266_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_272_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_276_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_280_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_2a8_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_2c2_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_2c6_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_2d2_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_2d6_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_328_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_388_states.bin [new file with mode: 0644]
client/hardnested/tables/bitflip_1_3a0_states.bin [new file with mode: 0644]
client/lualibs/usb_cmd.lua [deleted file]
client/obj/hardnested/.dummy [new file with mode: 0644]
client/util.c
client/util.h
include/usb_cmd.h

index ef225ce2ba1f521a4e339cc8875e9f003cd914cf..2a8d66c2c2d63be47229abb16e68fa94b0882b79 100644 (file)
@@ -1,4 +1,4 @@
 # .gitattributes
 # prevent binary files from CRLF handling, diff and merge:
 fpga/fpga.bit -crlf -diff
\ No newline at end of file
+*.bin -crlf -diff
index 422d53e52897fbf19babac164928044232385276..8181884073ad4f3f8d5de491911898ebfce4dbd7 100644 (file)
 *.s19
 *.map
 *.bin
+!client/hardnested/*.bin
 *.dll
 *.moc.cpp
 *.z
+usb_cmd.lua
 version.c
 client/ui/ui_overlays.h
 
 *.exe
+hardnested_stats.txt
 proxmark3
 flasher
 lua
index 4c475541b41ba28e572533b0c7f3121575316091..30d5ed58aed840731bc2e462247fe4d57dde6b42 100644 (file)
@@ -1163,6 +1163,9 @@ void UsbPacketReceived(uint8_t *packet, int len)
                case CMD_MIFAREU_WRITEBL:
                        MifareUWriteBlock(c->arg[0], c->arg[1], c->d.asBytes);
                        break;
+               case CMD_MIFARE_ACQUIRE_ENCRYPTED_NONCES:
+                       MifareAcquireEncryptedNonces(c->arg[0], c->arg[1], c->arg[2], c->d.asBytes);
+                       break;
                case CMD_MIFARE_NESTED:
                        MifareNested(c->arg[0], c->arg[1], c->arg[2], c->d.asBytes);
                        break;
index ee63816951ff469a4d9dc47338b52dc1b3487da6..1f140587d2842184afdda11a48284e3065818005 100644 (file)
@@ -126,6 +126,7 @@ void MifareWriteBlock(uint8_t arg0, uint8_t arg1, uint8_t arg2, uint8_t *datain)
 //void MifareUWriteBlockCompat(uint8_t arg0,uint8_t *datain);
 void MifareUWriteBlock(uint8_t arg0, uint8_t arg1, uint8_t *datain);
 void MifareNested(uint32_t arg0, uint32_t arg1, uint32_t arg2, uint8_t *datain);
+void MifareAcquireEncryptedNonces(uint32_t arg0, uint32_t arg1, uint32_t flags, uint8_t *datain);
 void MifareChkKeys(uint16_t arg0, uint8_t arg1, uint8_t arg2, uint8_t *datain);
 void Mifare1ksim(uint8_t arg0, uint8_t arg1, uint8_t arg2, uint8_t *datain);
 void MifareSetDbgLvl(uint32_t arg0, uint32_t arg1, uint32_t arg2, uint8_t *datain);
index 3faa6f9378efda66d37f8f91b3f121d2f2dad033..cdb414795ed0c0d253f0de6bd3c7361fc274d3a7 100644 (file)
@@ -112,6 +112,8 @@ CMDSRCS =   crapto1/crapto1.c\
                        cmdhficlass.c \
                        cmdhfmf.c \
                        cmdhfmfu.c \
+                       cmdhfmfhard.c \
+                       hardnested/hardnested_bruteforce.c \
                        cmdhftopaz.c \
                        cmdhw.c \
                        cmdlf.c \
@@ -153,6 +155,8 @@ CMDSRCS =   crapto1/crapto1.c\
                        reveng/poly.c\
                        reveng/getopt.c\
 
+MULTIARCHSRCS = hardnested/hardnested_bf_core.c hardnested/hardnested_bitarray_core.c
+
 ZLIBSRCS = deflate.c adler32.c trees.c zutil.c inflate.c inffast.c inftrees.c
 ZLIBFLAGS = -DZ_SOLO -DZ_PREFIX -DNO_GZIP -DZLIB_PM3_TUNED 
 #-DDEBUG -Dverbose=1
@@ -162,10 +166,16 @@ QTGUISRCS = proxgui.cpp proxguiqt.cpp proxguiqt.moc.cpp guidummy.cpp
 COREOBJS = $(CORESRCS:%.c=$(OBJDIR)/%.o)
 CMDOBJS = $(CMDSRCS:%.c=$(OBJDIR)/%.o)
 ZLIBOBJS = $(ZLIBSRCS:%.c=$(OBJDIR)/%.o)
-
+MULTIARCHOBJS = $(MULTIARCHSRCS:%.c=$(OBJDIR)/%_NOSIMD.o) \
+                       $(MULTIARCHSRCS:%.c=$(OBJDIR)/%_MMX.o) \
+                       $(MULTIARCHSRCS:%.c=$(OBJDIR)/%_SSE2.o) \
+                       $(MULTIARCHSRCS:%.c=$(OBJDIR)/%_AVX.o) \
+                       $(MULTIARCHSRCS:%.c=$(OBJDIR)/%_AVX2.o) \
+                       $(MULTIARCHSRCS:%.c=$(OBJDIR)/%_AVX512.o)
+                       
 BINS = proxmark3 flasher fpga_compress
 WINBINS = $(patsubst %, %.exe, $(BINS))
-CLEAN = $(BINS) $(WINBINS) $(COREOBJS) $(CMDOBJS) $(ZLIBOBJS) $(QTGUIOBJS) $(OBJDIR)/*.o *.moc.cpp ui/ui_overlays.h
+CLEAN = $(BINS) $(WINBINS) $(COREOBJS) $(CMDOBJS) $(ZLIBOBJS) $(QTGUIOBJS) $(MULTIARCHOBJS) $(OBJDIR)/*.o *.moc.cpp ui/ui_overlays.h
 
 # need to assign dependancies to build these first...
 all: ui/ui_overlays.h lua_build $(BINS)
@@ -174,8 +184,8 @@ all-static: LDLIBS:=-static $(LDLIBS)
 all-static: proxmark3 flasher fpga_compress
 
 proxmark3: LDLIBS+=$(LUALIB) $(QTLDLIBS)
-proxmark3: $(OBJDIR)/proxmark3.o $(COREOBJS) $(CMDOBJS) $(QTGUIOBJS) lualibs/usb_cmd.lua
-       $(LD) $(LDFLAGS) $(OBJDIR)/proxmark3.o $(COREOBJS) $(CMDOBJS) $(QTGUIOBJS) $(LDLIBS) -o $@
+proxmark3: $(OBJDIR)/proxmark3.o $(COREOBJS) $(CMDOBJS) $(QTGUIOBJS) $(MULTIARCHOBJS) lualibs/usb_cmd.lua
+       $(LD) $(LDFLAGS) $(OBJDIR)/proxmark3.o $(COREOBJS) $(CMDOBJS) $(QTGUIOBJS) $(MULTIARCHOBJS) $(LDLIBS) -o $@
 
 flasher: $(OBJDIR)/flash.o $(OBJDIR)/flasher.o $(COREOBJS)
        $(LD) $(LDFLAGS) $^ $(LDLIBS) -o $@
@@ -205,6 +215,24 @@ lua_build:
 
 .PHONY: all clean
 
+$(OBJDIR)/%_NOSIMD.o : %.c $(OBJDIR)/%.d
+       $(CC) $(DEPFLAGS) $(CFLAGS) -mno-mmx -mno-sse2 -mno-avx -mno-avx2 -mno-avx512f -c -o $@ $<
+
+$(OBJDIR)/%_MMX.o : %.c $(OBJDIR)/%.d
+       $(CC) $(DEPFLAGS) $(CFLAGS) -mmmx -mno-sse2 -mno-avx -mno-avx2 -mno-avx512f -c -o $@ $<
+
+$(OBJDIR)/%_SSE2.o : %.c $(OBJDIR)/%.d
+       $(CC) $(DEPFLAGS) $(CFLAGS) -mmmx -msse2 -mno-avx -mno-avx2 -mno-avx512f -c -o $@ $<
+
+$(OBJDIR)/%_AVX.o : %.c $(OBJDIR)/%.d
+       $(CC) $(DEPFLAGS) $(CFLAGS) -mmmx -msse2 -mavx -mno-avx2 -mno-avx512f -c -o $@ $<
+
+$(OBJDIR)/%_AVX2.o : %.c $(OBJDIR)/%.d
+       $(CC) $(DEPFLAGS) $(CFLAGS) -mmmx -msse2 -mavx -mavx2 -mno-avx512f -c -o $@ $<
+
+$(OBJDIR)/%_AVX512.o : %.c $(OBJDIR)/%.d
+       $(CC) $(DEPFLAGS) $(CFLAGS) -mmmx -msse2 -mavx -mavx2 -mavx512f -c -o $@ $<
+
 %.o: %.c
 $(OBJDIR)/%.o : %.c $(OBJDIR)/%.d
        $(CC) $(DEPFLAGS) $(CFLAGS) $(ZLIBFLAGS) -c -o $@ $<
index 7a6aaa3b9a3cd367a517d2c1f7a2ea56d49ed392..5b4a0b2a401a48ead9d8e7375c3211047a40061b 100644 (file)
@@ -8,6 +8,8 @@
 // High frequency MIFARE commands\r
 //-----------------------------------------------------------------------------\r
 \r
+#include "cmdhfmf.h"\r
+\r
 #include <inttypes.h>\r
 #include <string.h>\r
 #include <stdio.h>\r
@@ -15,7 +17,9 @@
 #include <ctype.h>\r
 #include "proxmark3.h"\r
 #include "cmdmain.h"\r
+#include "cmdhfmfhard.h"\r
 #include "util.h"\r
+#include "usb_cmd.h"\r
 #include "ui.h"\r
 #include "mifarehost.h"\r
 #include "mifare.h"\r
 \r
 static int CmdHelp(const char *Cmd);\r
 \r
-\r
 int CmdHF14AMifare(const char *Cmd)\r
 {\r
        int isOK = 0;\r
        uint64_t key = 0;\r
-\r
        isOK = mfDarkside(&key);\r
        switch (isOK) {\r
                case -1 : PrintAndLog("Button pressed. Aborted."); return 1;\r
                case -2 : PrintAndLog("Card is not vulnerable to Darkside attack (doesn't send NACK on authentication requests)."); return 1;\r
                case -3 : PrintAndLog("Card is not vulnerable to Darkside attack (its random number generator is not predictable)."); return 1;\r
                case -4 : PrintAndLog("Card is not vulnerable to Darkside attack (its random number generator seems to be based on the wellknown");\r
-                                       PrintAndLog("generating polynomial with 16 effective bits only, but shows unexpected behaviour."); return 1;\r
+                                 PrintAndLog("generating polynomial with 16 effective bits only, but shows unexpected behaviour."); return 1;\r
                case -5 : PrintAndLog("Aborted via keyboard.");  return 1;\r
                default : PrintAndLog("Found valid key:%012" PRIx64 "\n", key);\r
        }\r
@@ -764,6 +766,127 @@ int CmdHF14AMfNested(const char *Cmd)
        return 0;\r
 }\r
 \r
+\r
+int CmdHF14AMfNestedHard(const char *Cmd)\r
+{\r
+       uint8_t blockNo = 0;\r
+       uint8_t keyType = 0;\r
+       uint8_t trgBlockNo = 0;\r
+       uint8_t trgKeyType = 0;\r
+       uint8_t key[6] = {0, 0, 0, 0, 0, 0};\r
+       uint8_t trgkey[6] = {0, 0, 0, 0, 0, 0};\r
+       \r
+       char ctmp;\r
+       ctmp = param_getchar(Cmd, 0);\r
+\r
+       if (ctmp != 'R' && ctmp != 'r' && ctmp != 'T' && ctmp != 't' && strlen(Cmd) < 20) {\r
+               PrintAndLog("Usage:");\r
+               PrintAndLog("      hf mf hardnested <block number> <key A|B> <key (12 hex symbols)>");\r
+               PrintAndLog("                       <target block number> <target key A|B> [known target key (12 hex symbols)] [w] [s]");\r
+               PrintAndLog("  or  hf mf hardnested r [known target key]");\r
+               PrintAndLog(" ");\r
+               PrintAndLog("Options: ");\r
+               PrintAndLog("      w: Acquire nonces and write them to binary file nonces.bin");\r
+               PrintAndLog("      s: Slower acquisition (required by some non standard cards)");\r
+               PrintAndLog("      r: Read nonces.bin and start attack");\r
+               PrintAndLog(" ");\r
+               PrintAndLog("      sample1: hf mf hardnested 0 A FFFFFFFFFFFF 4 A");\r
+               PrintAndLog("      sample2: hf mf hardnested 0 A FFFFFFFFFFFF 4 A w");\r
+               PrintAndLog("      sample3: hf mf hardnested 0 A FFFFFFFFFFFF 4 A w s");\r
+               PrintAndLog("      sample4: hf mf hardnested r");\r
+               PrintAndLog(" ");\r
+               PrintAndLog("Add the known target key to check if it is present in the remaining key space:");\r
+               PrintAndLog("      sample5: hf mf hardnested 0 A A0A1A2A3A4A5 4 A FFFFFFFFFFFF");\r
+               return 0;\r
+       }       \r
+       \r
+       bool know_target_key = false;\r
+       bool nonce_file_read = false;\r
+       bool nonce_file_write = false;\r
+       bool slow = false;\r
+       int tests = 0;\r
+       \r
+       \r
+       if (ctmp == 'R' || ctmp == 'r') {\r
+               nonce_file_read = true;\r
+               if (!param_gethex(Cmd, 1, trgkey, 12)) {\r
+                       know_target_key = true;\r
+               }\r
+       } else if (ctmp == 'T' || ctmp == 't') {\r
+               tests = param_get32ex(Cmd, 1, 100, 10);\r
+               if (!param_gethex(Cmd, 2, trgkey, 12)) {\r
+                       know_target_key = true;\r
+               }\r
+       } else {\r
+               blockNo = param_get8(Cmd, 0);\r
+               ctmp = param_getchar(Cmd, 1);\r
+               if (ctmp != 'a' && ctmp != 'A' && ctmp != 'b' && ctmp != 'B') {\r
+                       PrintAndLog("Key type must be A or B");\r
+                       return 1;\r
+               }\r
+               if (ctmp != 'A' && ctmp != 'a') { \r
+                       keyType = 1;\r
+               }\r
+               \r
+               if (param_gethex(Cmd, 2, key, 12)) {\r
+                       PrintAndLog("Key must include 12 HEX symbols");\r
+                       return 1;\r
+               }\r
+               \r
+               trgBlockNo = param_get8(Cmd, 3);\r
+               ctmp = param_getchar(Cmd, 4);\r
+               if (ctmp != 'a' && ctmp != 'A' && ctmp != 'b' && ctmp != 'B') {\r
+                       PrintAndLog("Target key type must be A or B");\r
+                       return 1;\r
+               }\r
+               if (ctmp != 'A' && ctmp != 'a') {\r
+                       trgKeyType = 1;\r
+               }\r
+\r
+               uint16_t i = 5;\r
+\r
+               if (!param_gethex(Cmd, 5, trgkey, 12)) {\r
+                       know_target_key = true;\r
+                       i++;\r
+               }\r
+\r
+               while ((ctmp = param_getchar(Cmd, i))) {\r
+                       if (ctmp == 's' || ctmp == 'S') {\r
+                               slow = true;\r
+                       } else if (ctmp == 'w' || ctmp == 'W') {\r
+                               nonce_file_write = true;\r
+                       } else {\r
+                               PrintAndLog("Possible options are w and/or s");\r
+                               return 1;\r
+                       }\r
+                       i++;\r
+               }\r
+       }\r
+\r
+       PrintAndLog("--target block no:%3d, target key type:%c, known target key: 0x%02x%02x%02x%02x%02x%02x%s, file action: %s, Slow: %s, Tests: %d ", \r
+                       trgBlockNo, \r
+                       trgKeyType?'B':'A',\r
+                       trgkey[0], trgkey[1], trgkey[2], trgkey[3], trgkey[4], trgkey[5],\r
+                       know_target_key?"":" (not set)",\r
+                       nonce_file_write?"write":nonce_file_read?"read":"none",\r
+                       slow?"Yes":"No",\r
+                       tests);\r
+\r
+       int16_t isOK = mfnestedhard(blockNo, keyType, key, trgBlockNo, trgKeyType, know_target_key?trgkey:NULL, nonce_file_read, nonce_file_write, slow, tests);\r
+\r
+       if (isOK) {\r
+               switch (isOK) {\r
+                       case 1 : PrintAndLog("Error: No response from Proxmark.\n"); break;\r
+                       case 2 : PrintAndLog("Button pressed. Aborted.\n"); break;\r
+                       default : break;\r
+               }\r
+               return 2;\r
+       }\r
+\r
+       return 0;\r
+}\r
+\r
+\r
 int CmdHF14AMfChk(const char *Cmd)\r
 {\r
        if (strlen(Cmd)<3) {\r
@@ -2183,33 +2306,34 @@ int CmdDecryptTraceCmds(const char *Cmd){
 \r
 static command_t CommandTable[] =\r
 {\r
-  {"help",             CmdHelp,                                1, "This help"},\r
-  {"dbg",              CmdHF14AMfDbg,                  0, "Set default debug mode"},\r
-  {"rdbl",             CmdHF14AMfRdBl,                 0, "Read MIFARE classic block"},\r
-  {"rdsc",             CmdHF14AMfRdSc,                 0, "Read MIFARE classic sector"},\r
-  {"dump",             CmdHF14AMfDump,                 0, "Dump MIFARE classic tag to binary file"},\r
-  {"restore",  CmdHF14AMfRestore,              0, "Restore MIFARE classic binary file to BLANK tag"},\r
-  {"wrbl",             CmdHF14AMfWrBl,                 0, "Write MIFARE classic block"},\r
-  {"chk",              CmdHF14AMfChk,                  0, "Test block keys"},\r
-  {"mifare",   CmdHF14AMifare,                 0, "Read parity error messages."},\r
-  {"nested",   CmdHF14AMfNested,               0, "Test nested authentication"},\r
-  {"sniff",            CmdHF14AMfSniff,                0, "Sniff card-reader communication"},\r
-  {"sim",              CmdHF14AMf1kSim,                0, "Simulate MIFARE card"},\r
-  {"eclr",             CmdHF14AMfEClear,               0, "Clear simulator memory block"},\r
-  {"eget",             CmdHF14AMfEGet,                 0, "Get simulator memory block"},\r
-  {"eset",             CmdHF14AMfESet,                 0, "Set simulator memory block"},\r
-  {"eload",            CmdHF14AMfELoad,                0, "Load from file emul dump"},\r
-  {"esave",            CmdHF14AMfESave,                0, "Save to file emul dump"},\r
-  {"ecfill",   CmdHF14AMfECFill,               0, "Fill simulator memory with help of keys from simulator"},\r
-  {"ekeyprn",  CmdHF14AMfEKeyPrn,              0, "Print keys from simulator memory"},\r
-  {"csetuid",  CmdHF14AMfCSetUID,              0, "Set UID for magic Chinese card"},\r
-  {"csetblk",  CmdHF14AMfCSetBlk,              0, "Write block - Magic Chinese card"},\r
-  {"cgetblk",  CmdHF14AMfCGetBlk,              0, "Read block - Magic Chinese card"},\r
-  {"cgetsc",   CmdHF14AMfCGetSc,               0, "Read sector - Magic Chinese card"},\r
-  {"cload",            CmdHF14AMfCLoad,                0, "Load dump into magic Chinese card"},\r
-  {"csave",            CmdHF14AMfCSave,                0, "Save dump from magic Chinese card into file or emulator"},\r
-  {"decrypt", CmdDecryptTraceCmds,1, "[nt] [ar_enc] [at_enc] [data] - to decrypt snoop or trace"},\r
-  {NULL, NULL, 0, NULL}\r
+  {"help",             CmdHelp,                 1, "This help"},\r
+  {"dbg",              CmdHF14AMfDbg,           0, "Set default debug mode"},\r
+  {"rdbl",             CmdHF14AMfRdBl,          0, "Read MIFARE classic block"},\r
+  {"rdsc",             CmdHF14AMfRdSc,          0, "Read MIFARE classic sector"},\r
+  {"dump",             CmdHF14AMfDump,          0, "Dump MIFARE classic tag to binary file"},\r
+  {"restore",                 CmdHF14AMfRestore,       0, "Restore MIFARE classic binary file to BLANK tag"},\r
+  {"wrbl",             CmdHF14AMfWrBl,          0, "Write MIFARE classic block"},\r
+  {"chk",              CmdHF14AMfChk,           0, "Test block keys"},\r
+  {"mifare",           CmdHF14AMifare,          0, "Read parity error messages."},\r
+  {"hardnested",       CmdHF14AMfNestedHard,    0, "Nested attack for hardened Mifare cards"},\r
+  {"nested",           CmdHF14AMfNested,        0, "Test nested authentication"},\r
+  {"sniff",            CmdHF14AMfSniff,         0, "Sniff card-reader communication"},\r
+  {"sim",              CmdHF14AMf1kSim,         0, "Simulate MIFARE card"},\r
+  {"eclr",             CmdHF14AMfEClear,        0, "Clear simulator memory block"},\r
+  {"eget",             CmdHF14AMfEGet,          0, "Get simulator memory block"},\r
+  {"eset",             CmdHF14AMfESet,          0, "Set simulator memory block"},\r
+  {"eload",            CmdHF14AMfELoad,         0, "Load from file emul dump"},\r
+  {"esave",            CmdHF14AMfESave,         0, "Save to file emul dump"},\r
+  {"ecfill",           CmdHF14AMfECFill,        0, "Fill simulator memory with help of keys from simulator"},\r
+  {"ekeyprn",          CmdHF14AMfEKeyPrn,       0, "Print keys from simulator memory"},\r
+  {"csetuid",          CmdHF14AMfCSetUID,       0, "Set UID for magic Chinese card"},\r
+  {"csetblk",          CmdHF14AMfCSetBlk,       0, "Write block - Magic Chinese card"},\r
+  {"cgetblk",          CmdHF14AMfCGetBlk,       0, "Read block - Magic Chinese card"},\r
+  {"cgetsc",           CmdHF14AMfCGetSc,        0, "Read sector - Magic Chinese card"},\r
+  {"cload",            CmdHF14AMfCLoad,         0, "Load dump into magic Chinese card"},\r
+  {"csave",            CmdHF14AMfCSave,         0, "Save dump from magic Chinese card into file or emulator"},\r
+  {"decrypt",          CmdDecryptTraceCmds,     1, "[nt] [ar_enc] [at_enc] [data] - to decrypt snoop or trace"},\r
+  {NULL,               NULL,                    0, NULL}\r
 };\r
 \r
 int CmdHFMF(const char *Cmd)\r
diff --git a/client/cmdhfmfhard.c b/client/cmdhfmfhard.c
new file mode 100644 (file)
index 0000000..59bdc5f
--- /dev/null
@@ -0,0 +1,2687 @@
+//-----------------------------------------------------------------------------
+// Copyright (C) 2015, 2016 by piwi
+//
+// This code is licensed to you under the terms of the GNU GPL, version 2 or,
+// at your option, any later version. See the LICENSE.txt file for the text of
+// the license.
+//-----------------------------------------------------------------------------
+// Implements a card only attack based on crypto text (encrypted nonces
+// received during a nested authentication) only. Unlike other card only
+// attacks this doesn't rely on implementation errors but only on the
+// inherent weaknesses of the crypto1 cypher. Described in
+//   Carlo Meijer, Roel Verdult, "Ciphertext-only Cryptanalysis on Hardened
+//   Mifare Classic Cards" in Proceedings of the 22nd ACM SIGSAC Conference on 
+//   Computer and Communications Security, 2015
+//-----------------------------------------------------------------------------
+
+#include "cmdhfmfhard.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <inttypes.h>
+#include <string.h>
+#include <time.h>
+#include <pthread.h>
+#include <locale.h>
+#include <math.h>
+#include "proxmark3.h"
+#include "cmdmain.h"
+#include "ui.h"
+#include "util.h"
+#include "crapto1/crapto1.h"
+#include "parity.h"
+#include "hardnested/hardnested_bruteforce.h"
+#include "hardnested/hardnested_bitarray_core.h"
+
+#define NUM_CHECK_BITFLIPS_THREADS             (num_CPUs())
+#define NUM_REDUCTION_WORKING_THREADS  (num_CPUs())
+
+#define IGNORE_BITFLIP_THRESHOLD               0.99    // ignore bitflip arrays which have nearly only valid states
+
+#define STATE_FILES_DIRECTORY                  "hardnested/tables/"
+#define STATE_FILE_TEMPLATE                            "bitflip_%d_%03" PRIx16 "_states.bin"
+
+#define DEBUG_KEY_ELIMINATION
+// #define DEBUG_REDUCTION
+
+static uint16_t sums[NUM_SUMS] = {0, 32, 56, 64, 80, 96, 104, 112, 120, 128, 136, 144, 152, 160, 176, 192, 200, 224, 256}; // possible sum property values
+
+#define NUM_PART_SUMS                                  9               // number of possible partial sum property values
+
+typedef enum {
+       EVEN_STATE = 0,
+       ODD_STATE = 1
+} odd_even_t;
+
+static uint32_t num_acquired_nonces = 0;
+static uint64_t start_time = 0;
+static uint16_t effective_bitflip[2][0x400];
+static uint16_t num_effective_bitflips[2] = {0, 0};
+static uint16_t all_effective_bitflip[0x400];
+static uint16_t num_all_effective_bitflips = 0;
+static uint16_t num_1st_byte_effective_bitflips = 0;
+#define CHECK_1ST_BYTES                        0x01
+#define CHECK_2ND_BYTES                        0x02
+static uint8_t hardnested_stage = CHECK_1ST_BYTES;
+static uint64_t known_target_key;
+static uint32_t test_state[2] = {0,0};
+static float brute_force_per_second;
+
+
+static void get_SIMD_instruction_set(char* instruction_set) {
+       if (__builtin_cpu_supports("avx512f")) strcpy(instruction_set, "AVX512F");
+       else if (__builtin_cpu_supports("avx2")) strcpy(instruction_set, "AVX2");
+       else if (__builtin_cpu_supports("avx")) strcpy(instruction_set, "AVX");
+       else if (__builtin_cpu_supports("sse2")) strcpy(instruction_set, "SSE2");
+       else if (__builtin_cpu_supports("mmx")) strcpy(instruction_set, "MMX");
+       else strcpy(instruction_set, "unsupported");
+}
+
+
+static void print_progress_header(void) {
+       char progress_text[80];
+       char instr_set[12] = "";
+       get_SIMD_instruction_set(instr_set);
+       sprintf(progress_text, "Start using %d threads and %s SIMD core", num_CPUs(), instr_set);
+       PrintAndLog("\n\n");
+       PrintAndLog(" time    | #nonces | Activity                                                | expected to brute force");
+       PrintAndLog("         |         |                                                         | #states         | time ");
+       PrintAndLog("------------------------------------------------------------------------------------------------------");
+       PrintAndLog("       0 |       0 | %-55s |                 |", progress_text);
+}
+
+
+void hardnested_print_progress(uint32_t nonces, char *activity, float brute_force, uint64_t min_diff_print_time) {
+       static uint64_t last_print_time = 0;
+       if (msclock() - last_print_time > min_diff_print_time) {
+               last_print_time = msclock();
+               uint64_t total_time = msclock() - start_time;
+               float brute_force_time = brute_force / brute_force_per_second;
+               char brute_force_time_string[20];
+               if (brute_force_time < 90) {
+                       sprintf(brute_force_time_string, "%2.0fs", brute_force_time);
+               } else if (brute_force_time < 60 * 90) {
+                       sprintf(brute_force_time_string, "%2.0fmin", brute_force_time/60);
+               } else if (brute_force_time < 60 * 60 * 36) {
+                       sprintf(brute_force_time_string, "%2.0fh", brute_force_time/(60*60));
+               } else {
+                       sprintf(brute_force_time_string, "%2.0fd", brute_force_time/(60*60*24));
+               }
+               PrintAndLog(" %7.0f | %7d | %-55s | %15.0f | %5s", (float)total_time/1000.0, nonces, activity, brute_force, brute_force_time_string);
+       }
+}
+
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// bitarray functions
+
+static inline void clear_bitarray24(uint32_t *bitarray)
+{
+       memset(bitarray, 0x00, sizeof(uint32_t) * (1<<19));
+}
+
+
+static inline void set_bitarray24(uint32_t *bitarray)
+{
+       memset(bitarray, 0xff, sizeof(uint32_t) * (1<<19));
+}
+
+
+static inline void set_bit24(uint32_t *bitarray, uint32_t index)
+{
+       bitarray[index>>5] |= 0x80000000>>(index&0x0000001f);
+}
+
+
+static inline void clear_bit24(uint32_t *bitarray, uint32_t index)
+{
+       bitarray[index>>5] &= ~(0x80000000>>(index&0x0000001f));
+}
+
+
+static inline uint32_t test_bit24(uint32_t *bitarray, uint32_t index)
+{
+       return  bitarray[index>>5] & (0x80000000>>(index&0x0000001f));
+}
+
+
+static inline uint32_t next_state(uint32_t *bitarray, uint32_t state)
+{
+       if (++state == 1<<24) return 1<<24;
+       uint32_t index = state >> 5;
+       uint_fast8_t bit = state & 0x1f;
+       uint32_t line = bitarray[index] << bit;
+       while (bit <= 0x1f) {
+               if (line & 0x80000000) return state;
+               state++;
+               bit++;
+               line <<= 1;
+       }
+       index++;
+       while (bitarray[index] == 0x00000000 && state < 1<<24) {
+               index++;
+               state += 0x20;
+       }
+       if (state >= 1<<24) return 1<<24;
+#if defined __GNUC__
+       return state + __builtin_clz(bitarray[index]);
+#else
+       bit = 0x00;
+       line = bitarray[index];
+       while (bit <= 0x1f) {
+               if (line & 0x80000000) return state;
+               state++;
+               bit++;
+               line <<= 1;
+       }
+       return 1<<24;
+#endif
+}
+
+
+static inline uint32_t next_not_state(uint32_t *bitarray, uint32_t state)
+{
+       if (++state == 1<<24) return 1<<24;
+       uint32_t index = state >> 5;
+       uint_fast8_t bit = state & 0x1f;
+       uint32_t line = bitarray[index] << bit;
+       while (bit <= 0x1f) {
+               if ((line & 0x80000000) == 0) return state;
+               state++;
+               bit++;
+               line <<= 1;
+       }
+       index++;
+       while (bitarray[index] == 0xffffffff && state < 1<<24) {
+               index++;
+               state += 0x20;
+       }
+       if (state >= 1<<24) return 1<<24;
+#if defined __GNUC__
+       return state + __builtin_clz(~bitarray[index]);
+#else
+       bit = 0x00;
+       line = bitarray[index];
+       while (bit <= 0x1f) {
+               if ((line & 0x80000000) == 0) return state;
+               state++;
+               bit++;
+               line <<= 1;
+       }
+       return 1<<24;
+#endif
+}
+
+
+
+
+#define BITFLIP_2ND_BYTE                               0x0200
+
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// bitflip property bitarrays
+
+static uint32_t *bitflip_bitarrays[2][0x400];
+static uint32_t count_bitflip_bitarrays[2][0x400];
+
+static int compare_count_bitflip_bitarrays(const void *b1, const void *b2)
+{
+       uint64_t count1 = (uint64_t)count_bitflip_bitarrays[ODD_STATE][*(uint16_t *)b1] * count_bitflip_bitarrays[EVEN_STATE][*(uint16_t *)b1];
+       uint64_t count2 = (uint64_t)count_bitflip_bitarrays[ODD_STATE][*(uint16_t *)b2] * count_bitflip_bitarrays[EVEN_STATE][*(uint16_t *)b2];
+       return (count1 > count2) - (count2 > count1);
+}
+
+
+static void init_bitflip_bitarrays(void)
+{
+#if defined (DEBUG_REDUCTION)
+       uint8_t line = 0;
+#endif 
+
+       char state_files_path[strlen(get_my_executable_directory()) + strlen(STATE_FILES_DIRECTORY) + strlen(STATE_FILE_TEMPLATE) + 1];
+       char state_file_name[strlen(STATE_FILE_TEMPLATE)];
+       
+       for (odd_even_t odd_even = EVEN_STATE; odd_even <= ODD_STATE; odd_even++) {
+               num_effective_bitflips[odd_even] = 0;
+               for (uint16_t bitflip = 0x001; bitflip < 0x400; bitflip++) {
+                       bitflip_bitarrays[odd_even][bitflip] = NULL;
+                       count_bitflip_bitarrays[odd_even][bitflip] = 1<<24;
+                       sprintf(state_file_name, STATE_FILE_TEMPLATE, odd_even, bitflip);
+                       strcpy(state_files_path, get_my_executable_directory());
+                       strcat(state_files_path, STATE_FILES_DIRECTORY);
+                       strcat(state_files_path, state_file_name);
+                       FILE *statesfile = fopen(state_files_path, "rb");
+                       if (statesfile == NULL) {
+                               continue;
+                       } else {
+                               uint32_t *bitset = (uint32_t *)malloc_bitarray(sizeof(uint32_t) * (1<<19));
+                               if (bitset == NULL) {
+                                       printf("Out of memory error in init_bitflip_statelists(). Aborting...\n");
+                                       fclose(statesfile);
+                                       exit(4);
+                               }
+                               size_t bytesread = fread(bitset, 1, sizeof(uint32_t) * (1<<19), statesfile);
+                               if (bytesread != sizeof(uint32_t) * (1<<19)) {
+                                       printf("File read error with %s. Aborting...", state_file_name);
+                                       fclose(statesfile);
+                                       free_bitarray(bitset);
+                                       exit(5);
+                               }
+                               fclose(statesfile);
+                               uint32_t count = count_states(bitset);
+                               if ((float)count/(1<<24) < IGNORE_BITFLIP_THRESHOLD) {
+                                       effective_bitflip[odd_even][num_effective_bitflips[odd_even]++] = bitflip;
+                                       bitflip_bitarrays[odd_even][bitflip] = bitset;
+                                       count_bitflip_bitarrays[odd_even][bitflip] = count;
+#if defined (DEBUG_REDUCTION)
+                                       printf("(%03" PRIx16 " %s:%5.1f%%) ", bitflip, odd_even?"odd ":"even", (float)count/(1<<24)*100.0);
+                                       line++;
+                                       if (line == 8) {
+                                               printf("\n");
+                                               line = 0;
+                                       }
+#endif
+                               } else {
+                                       free_bitarray(bitset);
+                               }
+                       }
+               }
+               effective_bitflip[odd_even][num_effective_bitflips[odd_even]] = 0x400;  // EndOfList marker
+       }
+
+       uint16_t i = 0;
+       uint16_t j = 0;
+       num_all_effective_bitflips = 0;
+       num_1st_byte_effective_bitflips = 0;
+       while (i < num_effective_bitflips[EVEN_STATE] || j < num_effective_bitflips[ODD_STATE]) {
+               if (effective_bitflip[EVEN_STATE][i] < effective_bitflip[ODD_STATE][j]) {
+                       all_effective_bitflip[num_all_effective_bitflips++] = effective_bitflip[EVEN_STATE][i];
+                       i++;
+               } else if (effective_bitflip[EVEN_STATE][i] > effective_bitflip[ODD_STATE][j]) {
+                       all_effective_bitflip[num_all_effective_bitflips++] = effective_bitflip[ODD_STATE][j];
+                       j++;
+               } else {
+                       all_effective_bitflip[num_all_effective_bitflips++] = effective_bitflip[EVEN_STATE][i];
+                       i++; j++;
+               }
+               if (!(all_effective_bitflip[num_all_effective_bitflips-1] & BITFLIP_2ND_BYTE)) {
+                       num_1st_byte_effective_bitflips = num_all_effective_bitflips;
+               }
+       }
+       qsort(all_effective_bitflip, num_1st_byte_effective_bitflips, sizeof(uint16_t), compare_count_bitflip_bitarrays);
+#if defined (DEBUG_REDUCTION)
+       printf("\n1st byte effective bitflips (%d): \n", num_1st_byte_effective_bitflips);
+       for(uint16_t i = 0; i < num_1st_byte_effective_bitflips; i++) {
+               printf("%03x ",  all_effective_bitflip[i]);
+       }
+#endif 
+       qsort(all_effective_bitflip+num_1st_byte_effective_bitflips, num_all_effective_bitflips - num_1st_byte_effective_bitflips, sizeof(uint16_t), compare_count_bitflip_bitarrays);
+#if defined (DEBUG_REDUCTION)
+       printf("\n2nd byte effective bitflips (%d): \n", num_all_effective_bitflips - num_1st_byte_effective_bitflips);
+       for(uint16_t i = num_1st_byte_effective_bitflips; i < num_all_effective_bitflips; i++) {
+               printf("%03x ",  all_effective_bitflip[i]);
+       }
+#endif 
+       char progress_text[80];
+       sprintf(progress_text, "Using %d precalculated bitflip state tables", num_all_effective_bitflips);
+       hardnested_print_progress(0, progress_text, (float)(1LL<<47), 0);
+}
+
+
+static void    free_bitflip_bitarrays(void)
+{
+       for (int16_t bitflip = 0x3ff; bitflip > 0x000; bitflip--) {
+               free_bitarray(bitflip_bitarrays[ODD_STATE][bitflip]);
+       }
+       for (int16_t bitflip = 0x3ff; bitflip > 0x000; bitflip--) {
+               free_bitarray(bitflip_bitarrays[EVEN_STATE][bitflip]);
+       }
+}
+
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// sum property bitarrays
+
+static uint32_t *part_sum_a0_bitarrays[2][NUM_PART_SUMS];
+static uint32_t *part_sum_a8_bitarrays[2][NUM_PART_SUMS];
+static uint32_t *sum_a0_bitarrays[2][NUM_SUMS];        
+
+static uint16_t PartialSumProperty(uint32_t state, odd_even_t odd_even)
+{ 
+       uint16_t sum = 0;
+       for (uint16_t j = 0; j < 16; j++) {
+               uint32_t st = state;
+               uint16_t part_sum = 0;
+               if (odd_even == ODD_STATE) {
+                       for (uint16_t i = 0; i < 5; i++) {
+                               part_sum ^= filter(st);
+                               st = (st << 1) | ((j >> (3-i)) & 0x01) ;
+                       }
+                       part_sum ^= 1;          // XOR 1 cancelled out for the other 8 bits
+               } else {
+                       for (uint16_t i = 0; i < 4; i++) {
+                               st = (st << 1) | ((j >> (3-i)) & 0x01) ;
+                               part_sum ^= filter(st);
+                       }
+               }
+               sum += part_sum;
+       }
+       return sum;
+}
+
+
+static void init_part_sum_bitarrays(void)
+{
+       for (odd_even_t odd_even = EVEN_STATE; odd_even <= ODD_STATE; odd_even++) {
+               for (uint16_t part_sum_a0 = 0; part_sum_a0 < NUM_PART_SUMS; part_sum_a0++) {
+                       part_sum_a0_bitarrays[odd_even][part_sum_a0] = (uint32_t *)malloc_bitarray(sizeof(uint32_t) * (1<<19));
+                       if (part_sum_a0_bitarrays[odd_even][part_sum_a0] == NULL) {
+                               printf("Out of memory error in init_part_suma0_statelists(). Aborting...\n");
+                               exit(4);
+                       }
+                       clear_bitarray24(part_sum_a0_bitarrays[odd_even][part_sum_a0]);
+               }
+       }
+       for (odd_even_t odd_even = EVEN_STATE; odd_even <= ODD_STATE; odd_even++) {
+               //printf("(%d, %" PRIu16 ")...", odd_even, part_sum_a0);                        
+               for (uint32_t state = 0; state < (1<<20); state++) {
+                       uint16_t part_sum_a0 = PartialSumProperty(state, odd_even) / 2;
+                       for (uint16_t low_bits = 0; low_bits < 1<<4; low_bits++) {
+                               set_bit24(part_sum_a0_bitarrays[odd_even][part_sum_a0], state<<4 | low_bits);
+                       }
+               }
+       }
+
+       for (odd_even_t odd_even = EVEN_STATE; odd_even <= ODD_STATE; odd_even++) {
+               for (uint16_t part_sum_a8 = 0; part_sum_a8 < NUM_PART_SUMS; part_sum_a8++) {
+                       part_sum_a8_bitarrays[odd_even][part_sum_a8] = (uint32_t *)malloc_bitarray(sizeof(uint32_t) * (1<<19));
+                       if (part_sum_a8_bitarrays[odd_even][part_sum_a8] == NULL) {
+                               printf("Out of memory error in init_part_suma8_statelists(). Aborting...\n");
+                               exit(4);
+                       }
+                       clear_bitarray24(part_sum_a8_bitarrays[odd_even][part_sum_a8]);
+               }
+       }
+       for (odd_even_t odd_even = EVEN_STATE; odd_even <= ODD_STATE; odd_even++) {
+               //printf("(%d, %" PRIu16 ")...", odd_even, part_sum_a8); 
+               for (uint32_t state = 0; state < (1<<20); state++) {
+                       uint16_t part_sum_a8 = PartialSumProperty(state, odd_even) / 2;
+                       for (uint16_t high_bits = 0; high_bits < 1<<4; high_bits++) {
+                               set_bit24(part_sum_a8_bitarrays[odd_even][part_sum_a8], state | high_bits<<20);
+                       }
+               }
+       }
+}
+
+
+static void free_part_sum_bitarrays(void) 
+{
+       for (int16_t part_sum_a8 = (NUM_PART_SUMS-1); part_sum_a8 >= 0; part_sum_a8--) {
+               free_bitarray(part_sum_a8_bitarrays[ODD_STATE][part_sum_a8]);
+       }
+       for (int16_t part_sum_a8 = (NUM_PART_SUMS-1); part_sum_a8 >= 0; part_sum_a8--) {
+               free_bitarray(part_sum_a8_bitarrays[EVEN_STATE][part_sum_a8]);
+       }
+       for (int16_t part_sum_a0 = (NUM_PART_SUMS-1); part_sum_a0 >= 0; part_sum_a0--) {
+               free_bitarray(part_sum_a0_bitarrays[ODD_STATE][part_sum_a0]);
+       }
+       for (int16_t part_sum_a0 = (NUM_PART_SUMS-1); part_sum_a0 >= 0; part_sum_a0--) {
+               free_bitarray(part_sum_a0_bitarrays[EVEN_STATE][part_sum_a0]);
+       }
+}
+
+
+static void init_sum_bitarrays(void)
+{
+       for (uint16_t sum_a0 = 0; sum_a0 < NUM_SUMS; sum_a0++) {
+               for (odd_even_t odd_even = EVEN_STATE; odd_even <= ODD_STATE; odd_even++) {
+                       sum_a0_bitarrays[odd_even][sum_a0] = (uint32_t *)malloc_bitarray(sizeof(uint32_t) * (1<<19));
+                       if (sum_a0_bitarrays[odd_even][sum_a0] == NULL) {
+                               printf("Out of memory error in init_sum_bitarrays(). Aborting...\n");
+                               exit(4);
+                       }
+                       clear_bitarray24(sum_a0_bitarrays[odd_even][sum_a0]);
+               }
+       }
+       for (uint8_t p = 0; p < NUM_PART_SUMS; p++) {
+               for (uint8_t q = 0; q < NUM_PART_SUMS; q++) {
+                       uint16_t sum_a0 = 2*p*(16-2*q) + (16-2*p)*2*q;
+                       uint16_t sum_a0_idx = 0;
+                       while (sums[sum_a0_idx] != sum_a0) sum_a0_idx++;
+                       bitarray_OR(sum_a0_bitarrays[EVEN_STATE][sum_a0_idx], part_sum_a0_bitarrays[EVEN_STATE][q]);
+                       bitarray_OR(sum_a0_bitarrays[ODD_STATE][sum_a0_idx], part_sum_a0_bitarrays[ODD_STATE][p]);
+               }
+       }
+       // for (uint16_t sum_a0 = 0; sum_a0 < NUM_SUMS; sum_a0++) {
+               // for (odd_even_t odd_even = EVEN_STATE; odd_even <= ODD_STATE; odd_even++) {
+                       // uint32_t count = count_states(sum_a0_bitarrays[odd_even][sum_a0]);
+                       // printf("sum_a0_bitarray[%s][%d] has %d states (%5.2f%%)\n", odd_even==EVEN_STATE?"even":"odd ", sums[sum_a0], count, (float)count/(1<<24)*100.0);
+               // }
+       // }
+}
+
+
+static void free_sum_bitarrays(void)
+{
+       for (int8_t sum_a0 = NUM_SUMS-1; sum_a0 >= 0; sum_a0--) {
+               free_bitarray(sum_a0_bitarrays[ODD_STATE][sum_a0]);
+               free_bitarray(sum_a0_bitarrays[EVEN_STATE][sum_a0]);
+       }
+}
+
+
+#ifdef DEBUG_KEY_ELIMINATION
+char failstr[250] = "";
+#endif
+
+static const float p_K0[NUM_SUMS] = {          // the probability that a random nonce has a Sum Property K 
+       0.0290, 0.0083, 0.0006, 0.0339, 0.0048, 0.0934, 0.0119, 0.0489, 0.0602, 0.4180, 0.0602, 0.0489, 0.0119, 0.0934, 0.0048, 0.0339, 0.0006, 0.0083, 0.0290 
+       };
+
+static float my_p_K[NUM_SUMS]; 
+
+static const float *p_K;
+       
+static uint32_t cuid;
+static noncelist_t nonces[256];
+static uint8_t best_first_bytes[256];
+static uint64_t maximum_states = 0;
+static uint8_t best_first_byte_smallest_bitarray = 0;
+static uint16_t first_byte_Sum = 0;
+static uint16_t first_byte_num = 0;
+static bool write_stats = false;
+static FILE *fstats = NULL;
+static uint32_t *all_bitflips_bitarray[2];
+static uint32_t num_all_bitflips_bitarray[2];
+static bool all_bitflips_bitarray_dirty[2];
+static uint64_t last_sample_clock = 0;
+static uint64_t sample_period = 0;
+static uint64_t num_keys_tested = 0;
+static statelist_t *candidates = NULL;
+
+
+static int add_nonce(uint32_t nonce_enc, uint8_t par_enc) 
+{
+       uint8_t first_byte = nonce_enc >> 24;
+       noncelistentry_t *p1 = nonces[first_byte].first;
+       noncelistentry_t *p2 = NULL;
+
+       if (p1 == NULL) {                       // first nonce with this 1st byte
+               first_byte_num++;
+               first_byte_Sum += evenparity32((nonce_enc & 0xff000000) | (par_enc & 0x08));
+       }
+
+       while (p1 != NULL && (p1->nonce_enc & 0x00ff0000) < (nonce_enc & 0x00ff0000)) {
+               p2 = p1;
+               p1 = p1->next;
+       }
+       
+       if (p1 == NULL) {                                                                                                                                       // need to add at the end of the list
+               if (p2 == NULL) {                       // list is empty yet. Add first entry.
+                       p2 = nonces[first_byte].first = malloc(sizeof(noncelistentry_t));
+               } else {                                        // add new entry at end of existing list.
+                       p2 = p2->next = malloc(sizeof(noncelistentry_t));
+               }
+       } else if ((p1->nonce_enc & 0x00ff0000) != (nonce_enc & 0x00ff0000)) {                          // found distinct 2nd byte. Need to insert.
+               if (p2 == NULL) {                       // need to insert at start of list
+                       p2 = nonces[first_byte].first = malloc(sizeof(noncelistentry_t));
+               } else {
+                       p2 = p2->next = malloc(sizeof(noncelistentry_t));
+               }
+       } else {                                                                                                                                                        // we have seen this 2nd byte before. Nothing to add or insert. 
+               return (0);
+       }
+
+       // add or insert new data
+       p2->next = p1;
+       p2->nonce_enc = nonce_enc;
+       p2->par_enc = par_enc;
+
+       nonces[first_byte].num++;
+       nonces[first_byte].Sum += evenparity32((nonce_enc & 0x00ff0000) | (par_enc & 0x04));
+       nonces[first_byte].sum_a8_guess_dirty = true;   // indicates that we need to recalculate the Sum(a8) probability for this first byte
+       return (1);                             // new nonce added
+}
+
+
+static void init_nonce_memory(void)
+{
+       for (uint16_t i = 0; i < 256; i++) {
+               nonces[i].num = 0;
+               nonces[i].Sum = 0;
+               nonces[i].first = NULL;
+               for (uint16_t j = 0; j < NUM_SUMS; j++) {
+                       nonces[i].sum_a8_guess[j].sum_a8_idx = j;
+                       nonces[i].sum_a8_guess[j].prob = 0.0;
+               }
+               nonces[i].sum_a8_guess_dirty = false;
+               for (uint16_t bitflip = 0x000; bitflip < 0x400; bitflip++) {
+                       nonces[i].BitFlips[bitflip] = 0;
+               }
+               nonces[i].states_bitarray[EVEN_STATE] = (uint32_t *)malloc_bitarray(sizeof(uint32_t) * (1<<19));
+               if (nonces[i].states_bitarray[EVEN_STATE] == NULL) {
+                       printf("Out of memory error in init_nonce_memory(). Aborting...\n");
+                       exit(4);
+               }
+               set_bitarray24(nonces[i].states_bitarray[EVEN_STATE]);
+               nonces[i].num_states_bitarray[EVEN_STATE] = 1 << 24;
+               nonces[i].states_bitarray[ODD_STATE] = (uint32_t *)malloc_bitarray(sizeof(uint32_t) * (1<<19));
+               if (nonces[i].states_bitarray[ODD_STATE] == NULL) {
+                       printf("Out of memory error in init_nonce_memory(). Aborting...\n");
+                       exit(4);
+               }
+               set_bitarray24(nonces[i].states_bitarray[ODD_STATE]);
+               nonces[i].num_states_bitarray[ODD_STATE] = 1 << 24;
+               nonces[i].all_bitflips_dirty[EVEN_STATE] = false;
+               nonces[i].all_bitflips_dirty[ODD_STATE] = false;
+       }
+       first_byte_num = 0;
+       first_byte_Sum = 0;
+}
+
+
+static void free_nonce_list(noncelistentry_t *p)
+{
+       if (p == NULL) {
+               return;
+       } else {
+               free_nonce_list(p->next);
+               free(p);
+       }
+}
+
+
+static void free_nonces_memory(void)
+{
+       for (uint16_t i = 0; i < 256; i++) {
+               free_nonce_list(nonces[i].first);
+       }
+       for (int i = 255; i >= 0; i--) {
+               free_bitarray(nonces[i].states_bitarray[ODD_STATE]);
+               free_bitarray(nonces[i].states_bitarray[EVEN_STATE]);
+       }
+}
+
+
+// static double p_hypergeometric_cache[257][NUM_SUMS][257];
+
+// #define CACHE_INVALID -1.0
+// static void init_p_hypergeometric_cache(void)
+// {
+       // for (uint16_t n = 0; n <= 256; n++) {
+               // for (uint16_t i_K = 0; i_K < NUM_SUMS; i_K++) {
+                       // for (uint16_t k = 0; k <= 256; k++) {
+                               // p_hypergeometric_cache[n][i_K][k] = CACHE_INVALID;
+                       // }
+               // }
+       // }
+// }
+
+
+static double p_hypergeometric(uint16_t i_K, uint16_t n, uint16_t k) 
+{
+       // for efficient computation we are using the recursive definition
+       //                                              (K-k+1) * (n-k+1)
+       // P(X=k) = P(X=k-1) * --------------------
+       //                                                 k * (N-K-n+k)
+       // and
+       //           (N-K)*(N-K-1)*...*(N-K-n+1)
+       // P(X=0) = -----------------------------
+       //               N*(N-1)*...*(N-n+1)
+
+       
+       uint16_t const N = 256;
+       uint16_t K = sums[i_K];
+
+       // if (p_hypergeometric_cache[n][i_K][k] != CACHE_INVALID) {
+               // return p_hypergeometric_cache[n][i_K][k];
+       // }
+       
+       if (n-k > N-K || k > K) return 0.0;     // avoids log(x<=0) in calculation below
+       if (k == 0) {
+               // use logarithms to avoid overflow with huge factorials (double type can only hold 170!)
+               double log_result = 0.0;
+               for (int16_t i = N-K; i >= N-K-n+1; i--) {
+                       log_result += log(i);
+               } 
+               for (int16_t i = N; i >= N-n+1; i--) {
+                       log_result -= log(i);
+               }
+               // p_hypergeometric_cache[n][i_K][k] = exp(log_result);
+               return exp(log_result);
+       } else {
+               if (n-k == N-K) {       // special case. The published recursion below would fail with a divide by zero exception
+                       double log_result = 0.0;
+                       for (int16_t i = k+1; i <= n; i++) {
+                               log_result += log(i);
+                       }
+                       for (int16_t i = K+1; i <= N; i++) {
+                               log_result -= log(i);
+                       }
+                       // p_hypergeometric_cache[n][i_K][k] = exp(log_result);
+                       return exp(log_result);
+               } else {                        // recursion
+                       return (p_hypergeometric(i_K, n, k-1) * (K-k+1) * (n-k+1) / (k * (N-K-n+k)));
+               }
+       }
+}
+       
+       
+static float sum_probability(uint16_t i_K, uint16_t n, uint16_t k)
+{
+       if (k > sums[i_K]) return 0.0;
+
+       double p_T_is_k_when_S_is_K = p_hypergeometric(i_K, n, k);
+       double p_S_is_K = p_K[i_K];
+       double p_T_is_k = 0;
+       for (uint16_t i = 0; i < NUM_SUMS; i++) {
+               p_T_is_k += p_K[i] * p_hypergeometric(i, n, k);
+       }
+       return(p_T_is_k_when_S_is_K * p_S_is_K / p_T_is_k);
+}
+
+
+static uint32_t part_sum_count[2][NUM_PART_SUMS][NUM_PART_SUMS];
+
+static void init_allbitflips_array(void)
+{
+       for (odd_even_t odd_even = EVEN_STATE; odd_even <= ODD_STATE; odd_even++) {
+               uint32_t *bitset = all_bitflips_bitarray[odd_even] = (uint32_t *)malloc_bitarray(sizeof(uint32_t) * (1<<19));
+               if (bitset == NULL) {
+                       printf("Out of memory in init_allbitflips_array(). Aborting...");
+                       exit(4);
+               }
+               set_bitarray24(bitset);
+               all_bitflips_bitarray_dirty[odd_even] = false;
+               num_all_bitflips_bitarray[odd_even] = 1<<24;
+       }
+}
+
+
+static void update_allbitflips_array(void)
+{      
+       if (hardnested_stage & CHECK_2ND_BYTES) {
+               for (uint16_t i = 0; i < 256; i++) {
+                       for (odd_even_t odd_even = EVEN_STATE; odd_even <= ODD_STATE; odd_even++) {
+                               if (nonces[i].all_bitflips_dirty[odd_even]) {
+                                       uint32_t old_count = num_all_bitflips_bitarray[odd_even];
+                                       num_all_bitflips_bitarray[odd_even] = count_bitarray_low20_AND(all_bitflips_bitarray[odd_even], nonces[i].states_bitarray[odd_even]);
+                                       nonces[i].all_bitflips_dirty[odd_even] = false;
+                                       if (num_all_bitflips_bitarray[odd_even] != old_count) {
+                                               all_bitflips_bitarray_dirty[odd_even] = true;
+                                       }
+                               }
+                       }
+               }
+       }
+}
+
+       
+static uint32_t estimated_num_states_part_sum_coarse(uint16_t part_sum_a0_idx, uint16_t part_sum_a8_idx, odd_even_t odd_even) 
+{
+       return part_sum_count[odd_even][part_sum_a0_idx][part_sum_a8_idx];
+}
+
+
+static uint32_t estimated_num_states_part_sum(uint8_t first_byte, uint16_t part_sum_a0_idx, uint16_t part_sum_a8_idx, odd_even_t odd_even) 
+{
+       if (odd_even == ODD_STATE) {
+               return count_bitarray_AND3(part_sum_a0_bitarrays[odd_even][part_sum_a0_idx], 
+                                          part_sum_a8_bitarrays[odd_even][part_sum_a8_idx], 
+                                              nonces[first_byte].states_bitarray[odd_even]);
+       } else {
+               return count_bitarray_AND4(part_sum_a0_bitarrays[odd_even][part_sum_a0_idx], 
+                                          part_sum_a8_bitarrays[odd_even][part_sum_a8_idx], 
+                                              nonces[first_byte].states_bitarray[odd_even],
+                                          nonces[first_byte^0x80].states_bitarray[odd_even]);
+       }
+
+       // estimate reduction by all_bitflips_match()
+       // if (odd_even) {
+               // float p_bitflip = (float)nonces[first_byte ^ 0x80].num_states_bitarray[ODD_STATE] / num_all_bitflips_bitarray[ODD_STATE];
+               // return (float)count * p_bitflip;             //(p_bitflip - 0.25*p_bitflip*p_bitflip);
+       // } else {
+               // return count;
+       // }
+}
+
+
+static uint64_t estimated_num_states(uint8_t first_byte, uint16_t sum_a0, uint16_t sum_a8)
+{
+       uint64_t num_states = 0;
+       for (uint8_t p = 0; p < NUM_PART_SUMS; p++) {
+               for (uint8_t q = 0; q < NUM_PART_SUMS; q++) {
+                       if (2*p*(16-2*q) + (16-2*p)*2*q == sum_a0) {
+                               for (uint8_t r = 0; r < NUM_PART_SUMS; r++) {
+                                       for (uint8_t s = 0; s < NUM_PART_SUMS; s++) {
+                                               if (2*r*(16-2*s) + (16-2*r)*2*s == sum_a8) {
+                                                       num_states += (uint64_t)estimated_num_states_part_sum(first_byte, p, r, ODD_STATE) 
+                                                                               * estimated_num_states_part_sum(first_byte, q, s, EVEN_STATE);
+                                               }
+                                       }
+                               }
+                       }
+               }
+       }
+       return num_states;
+}
+
+
+static uint64_t estimated_num_states_coarse(uint16_t sum_a0, uint16_t sum_a8)
+{
+       uint64_t num_states = 0;
+       for (uint8_t p = 0; p < NUM_PART_SUMS; p++) {
+               for (uint8_t q = 0; q < NUM_PART_SUMS; q++) {
+                       if (2*p*(16-2*q) + (16-2*p)*2*q == sum_a0) {
+                               for (uint8_t r = 0; r < NUM_PART_SUMS; r++) {
+                                       for (uint8_t s = 0; s < NUM_PART_SUMS; s++) {
+                                               if (2*r*(16-2*s) + (16-2*r)*2*s == sum_a8) {
+                                                       num_states += (uint64_t)estimated_num_states_part_sum_coarse(p, r, ODD_STATE) 
+                                                                               * estimated_num_states_part_sum_coarse(q, s, EVEN_STATE);
+                                               }
+                                       }
+                               }
+                       }
+               }
+       }
+       return num_states;
+}
+
+
+static void update_p_K(void)
+{
+       if (hardnested_stage & CHECK_2ND_BYTES) {
+               uint64_t total_count = 0;
+               uint16_t sum_a0 = sums[first_byte_Sum];
+               for (uint8_t sum_a8_idx = 0; sum_a8_idx < NUM_SUMS; sum_a8_idx++) {
+                       uint16_t sum_a8 = sums[sum_a8_idx];
+                       total_count += estimated_num_states_coarse(sum_a0, sum_a8);
+               }
+               for (uint8_t sum_a8_idx = 0; sum_a8_idx < NUM_SUMS; sum_a8_idx++) {
+                       uint16_t sum_a8 = sums[sum_a8_idx];
+                       my_p_K[sum_a8_idx] = (float)estimated_num_states_coarse(sum_a0, sum_a8) / total_count;
+               }
+               // printf("my_p_K = [");
+               // for (uint8_t sum_a8_idx = 0; sum_a8_idx < NUM_SUMS; sum_a8_idx++) {
+                       // printf("%7.4f ", my_p_K[sum_a8_idx]);
+               // }
+               p_K = my_p_K;
+       }
+}
+
+
+static void update_sum_bitarrays(odd_even_t odd_even)
+{
+       if (all_bitflips_bitarray_dirty[odd_even]) {
+               for (uint8_t part_sum = 0; part_sum < NUM_PART_SUMS; part_sum++) {
+                       bitarray_AND(part_sum_a0_bitarrays[odd_even][part_sum], all_bitflips_bitarray[odd_even]);
+                       bitarray_AND(part_sum_a8_bitarrays[odd_even][part_sum], all_bitflips_bitarray[odd_even]);
+               }
+               for (uint16_t i = 0; i < 256; i++) {
+                       nonces[i].num_states_bitarray[odd_even] = count_bitarray_AND(nonces[i].states_bitarray[odd_even], all_bitflips_bitarray[odd_even]);
+               }
+               for (uint8_t part_sum_a0 = 0; part_sum_a0 < NUM_PART_SUMS; part_sum_a0++) {
+                       for (uint8_t part_sum_a8 = 0; part_sum_a8 < NUM_PART_SUMS; part_sum_a8++) {
+                               part_sum_count[odd_even][part_sum_a0][part_sum_a8] 
+                                   += count_bitarray_AND2(part_sum_a0_bitarrays[odd_even][part_sum_a0], part_sum_a8_bitarrays[odd_even][part_sum_a8]);
+                       }
+               }
+               all_bitflips_bitarray_dirty[odd_even] = false;
+       }
+}
+
+
+static int compare_expected_num_brute_force(const void *b1, const void *b2)
+{
+       uint8_t index1 = *(uint8_t *)b1;
+       uint8_t index2 = *(uint8_t *)b2;
+       float score1 = nonces[index1].expected_num_brute_force;
+       float score2 = nonces[index2].expected_num_brute_force;
+       return (score1 > score2) - (score1 < score2);
+}
+
+
+static int compare_sum_a8_guess(const void *b1, const void *b2)
+{
+       float prob1 = ((guess_sum_a8_t *)b1)->prob;
+       float prob2 = ((guess_sum_a8_t *)b2)->prob;
+       return (prob1 < prob2) - (prob1 > prob2);
+
+}
+
+
+static float check_smallest_bitflip_bitarrays(void) 
+{
+       uint32_t num_odd, num_even;
+       uint64_t smallest = 1LL << 48;
+       // initialize best_first_bytes, do a rough estimation on remaining states
+       for (uint16_t i = 0; i < 256; i++) {
+               num_odd = nonces[i].num_states_bitarray[ODD_STATE];
+               num_even = nonces[i].num_states_bitarray[EVEN_STATE];   // * (float)nonces[i^0x80].num_states_bitarray[EVEN_STATE] / num_all_bitflips_bitarray[EVEN_STATE];
+               if ((uint64_t)num_odd * num_even < smallest) {
+                       smallest = (uint64_t)num_odd * num_even;
+                       best_first_byte_smallest_bitarray = i;
+               }
+       }
+
+#if defined (DEBUG_REDUCTION)
+       num_odd = nonces[best_first_byte_smallest_bitarray].num_states_bitarray[ODD_STATE];
+       num_even = nonces[best_first_byte_smallest_bitarray].num_states_bitarray[EVEN_STATE];   // * (float)nonces[best_first_byte_smallest_bitarray^0x80].num_states_bitarray[EVEN_STATE] / num_all_bitflips_bitarray[EVEN_STATE];
+       printf("0x%02x: %8d * %8d = %12" PRIu64 " (2^%1.1f)\n", best_first_byte_smallest_bitarray, num_odd, num_even, (uint64_t)num_odd * num_even, log((uint64_t)num_odd * num_even)/log(2.0));
+#endif
+       return (float)smallest/2.0;
+}
+
+
+static void update_expected_brute_force(uint8_t best_byte) {
+
+       float total_prob = 0.0;
+       for (uint8_t i = 0; i < NUM_SUMS; i++) {
+               total_prob += nonces[best_byte].sum_a8_guess[i].prob;
+       }
+       // linear adjust probabilities to result in total_prob = 1.0;
+       for (uint8_t i = 0; i < NUM_SUMS; i++) {
+               nonces[best_byte].sum_a8_guess[i].prob /= total_prob;
+       }
+       float prob_all_failed = 1.0;
+       nonces[best_byte].expected_num_brute_force = 0.0;
+       for (uint8_t i = 0; i < NUM_SUMS; i++) {
+               nonces[best_byte].expected_num_brute_force += nonces[best_byte].sum_a8_guess[i].prob * (float)nonces[best_byte].sum_a8_guess[i].num_states / 2.0;
+               prob_all_failed -= nonces[best_byte].sum_a8_guess[i].prob;
+               nonces[best_byte].expected_num_brute_force += prob_all_failed * (float)nonces[best_byte].sum_a8_guess[i].num_states / 2.0;
+       }
+       return;
+}
+
+
+static float sort_best_first_bytes(void)
+{
+       
+       // initialize best_first_bytes, do a rough estimation on remaining states for each Sum_a8 property
+       // and the expected number of states to brute force
+       for (uint16_t i = 0; i < 256; i++) {
+               best_first_bytes[i] = i;
+               float prob_all_failed = 1.0;
+               nonces[i].expected_num_brute_force = 0.0;
+               for (uint8_t j = 0; j < NUM_SUMS; j++) {
+                       nonces[i].sum_a8_guess[j].num_states = estimated_num_states_coarse(sums[first_byte_Sum], sums[nonces[i].sum_a8_guess[j].sum_a8_idx]);
+                       nonces[i].expected_num_brute_force += nonces[i].sum_a8_guess[j].prob * (float)nonces[i].sum_a8_guess[j].num_states / 2.0;
+                       prob_all_failed -= nonces[i].sum_a8_guess[j].prob;
+                       nonces[i].expected_num_brute_force += prob_all_failed * (float)nonces[i].sum_a8_guess[j].num_states / 2.0;
+               }
+       }
+       
+       // sort based on expected number of states to brute force
+       qsort(best_first_bytes, 256, 1, compare_expected_num_brute_force);
+
+       // printf("refine estimations: ");
+       #define NUM_REFINES     1
+       // refine scores for the best:
+       for (uint16_t i = 0; i < NUM_REFINES; i++) {
+               // printf("%d...", i);
+               uint16_t first_byte = best_first_bytes[i];
+               for (uint8_t j = 0; j < NUM_SUMS && nonces[first_byte].sum_a8_guess[j].prob > 0.05; j++) {
+                       nonces[first_byte].sum_a8_guess[j].num_states = estimated_num_states(first_byte, sums[first_byte_Sum], sums[nonces[first_byte].sum_a8_guess[j].sum_a8_idx]);
+               }
+               // while (nonces[first_byte].sum_a8_guess[0].num_states == 0
+                               // || nonces[first_byte].sum_a8_guess[1].num_states == 0
+                               // || nonces[first_byte].sum_a8_guess[2].num_states == 0) {
+                       // if (nonces[first_byte].sum_a8_guess[0].num_states == 0) {
+                               // nonces[first_byte].sum_a8_guess[0].prob = 0.0;
+                               // printf("(0x%02x,%d)", first_byte, 0);
+                       // }
+                       // if (nonces[first_byte].sum_a8_guess[1].num_states == 0) {
+                               // nonces[first_byte].sum_a8_guess[1].prob = 0.0;
+                               // printf("(0x%02x,%d)", first_byte, 1);
+                       // }
+                       // if (nonces[first_byte].sum_a8_guess[2].num_states == 0) {
+                               // nonces[first_byte].sum_a8_guess[2].prob = 0.0;
+                               // printf("(0x%02x,%d)", first_byte, 2);
+                       // }
+                       // printf("|");
+                       // qsort(nonces[first_byte].sum_a8_guess, NUM_SUMS, sizeof(guess_sum_a8_t), compare_sum_a8_guess);
+                       // for (uint8_t j = 0; j < NUM_SUMS && nonces[first_byte].sum_a8_guess[j].prob > 0.05; j++) {
+                               // nonces[first_byte].sum_a8_guess[j].num_states = estimated_num_states(first_byte, sums[first_byte_Sum], sums[nonces[first_byte].sum_a8_guess[j].sum_a8_idx]);
+                       // }
+               // }
+               // float fix_probs = 0.0;
+               // for (uint8_t j = 0; j < NUM_SUMS; j++) {
+                       // fix_probs += nonces[first_byte].sum_a8_guess[j].prob;
+               // }
+               // for (uint8_t j = 0; j < NUM_SUMS; j++) {
+                       // nonces[first_byte].sum_a8_guess[j].prob /= fix_probs;
+               // }
+               // for (uint8_t j = 0; j < NUM_SUMS && nonces[first_byte].sum_a8_guess[j].prob > 0.05; j++) {
+                       // nonces[first_byte].sum_a8_guess[j].num_states = estimated_num_states(first_byte, sums[first_byte_Sum], sums[nonces[first_byte].sum_a8_guess[j].sum_a8_idx]);
+               // }
+               float prob_all_failed = 1.0;
+               nonces[first_byte].expected_num_brute_force = 0.0;
+               for (uint8_t j = 0; j < NUM_SUMS; j++) {
+                       nonces[first_byte].expected_num_brute_force += nonces[first_byte].sum_a8_guess[j].prob * (float)nonces[first_byte].sum_a8_guess[j].num_states / 2.0;
+                       prob_all_failed -= nonces[first_byte].sum_a8_guess[j].prob;
+                       nonces[first_byte].expected_num_brute_force += prob_all_failed * (float)nonces[first_byte].sum_a8_guess[j].num_states / 2.0;
+               }
+       }
+
+       // copy best byte to front:
+       float least_expected_brute_force = (1LL << 48);
+       uint8_t best_byte = 0;
+       for (uint16_t i = 0; i < 10; i++) {
+               uint16_t first_byte = best_first_bytes[i];
+               if (nonces[first_byte].expected_num_brute_force < least_expected_brute_force) {
+                       least_expected_brute_force = nonces[first_byte].expected_num_brute_force;
+                       best_byte = i;
+               }
+       }
+       if (best_byte != 0) {
+               // printf("0x%02x <-> 0x%02x", best_first_bytes[0], best_first_bytes[best_byte]);
+               uint8_t tmp = best_first_bytes[0];
+               best_first_bytes[0] = best_first_bytes[best_byte];
+               best_first_bytes[best_byte] = tmp;
+       }
+
+       return nonces[best_first_bytes[0]].expected_num_brute_force;
+}
+
+
+static float update_reduction_rate(float last, bool init) 
+{
+#define QUEUE_LEN      4
+       static float queue[QUEUE_LEN];
+       
+       for (uint16_t i = 0; i < QUEUE_LEN-1; i++) {
+               if (init) {
+                       queue[i] = (float)(1LL << 48);
+               } else {
+                       queue[i] = queue[i+1];
+               }
+       }
+       if (init) {
+               queue[QUEUE_LEN-1] = (float)(1LL << 48);
+       } else {
+               queue[QUEUE_LEN-1] = last;
+       }
+       
+       // linear regression
+       float avg_y = 0.0;
+       float avg_x = 0.0;
+       for (uint16_t i = 0; i < QUEUE_LEN; i++) {
+               avg_x += i;
+               avg_y += queue[i];
+       }
+       avg_x /= QUEUE_LEN;
+       avg_y /= QUEUE_LEN;
+       
+       float dev_xy = 0.0;
+       float dev_x2 = 0.0;
+       for (uint16_t i = 0; i < QUEUE_LEN; i++) {
+               dev_xy += (i - avg_x)*(queue[i] - avg_y);
+               dev_x2 += (i - avg_x)*(i - avg_x);
+       }
+
+       float reduction_rate = -1.0 * dev_xy / dev_x2;  // the negative slope of the linear regression
+
+#if defined (DEBUG_REDUCTION)  
+       printf("update_reduction_rate(%1.0f) = %1.0f per sample, brute_force_per_sample = %1.0f\n", last, reduction_rate, brute_force_per_second * (float)sample_period / 1000.0);
+#endif 
+       return reduction_rate;
+}
+
+
+static bool shrink_key_space(float *brute_forces)
+{
+#if defined(DEBUG_REDUCTION)
+       printf("shrink_key_space() with stage = 0x%02x\n", hardnested_stage);
+#endif
+       float brute_forces1 = check_smallest_bitflip_bitarrays();
+       float brute_forces2 = (float)(1LL << 47);
+       if (hardnested_stage & CHECK_2ND_BYTES) {
+               brute_forces2 = sort_best_first_bytes();
+       }
+       *brute_forces = MIN(brute_forces1, brute_forces2);
+       float reduction_rate = update_reduction_rate(*brute_forces, false);
+       return ((hardnested_stage & CHECK_2ND_BYTES) 
+               && reduction_rate >= 0.0 && reduction_rate < brute_force_per_second * sample_period / 1000.0);
+}
+
+       
+static void estimate_sum_a8(void) 
+{
+       if (first_byte_num == 256) {
+               for (uint16_t i = 0; i < 256; i++) {
+                       if (nonces[i].sum_a8_guess_dirty) {
+                               for (uint16_t j = 0; j < NUM_SUMS; j++ ) {
+                                       uint16_t sum_a8_idx = nonces[i].sum_a8_guess[j].sum_a8_idx;
+                                       nonces[i].sum_a8_guess[j].prob = sum_probability(sum_a8_idx, nonces[i].num, nonces[i].Sum);
+                               }
+                               qsort(nonces[i].sum_a8_guess, NUM_SUMS, sizeof(guess_sum_a8_t), compare_sum_a8_guess);
+                               nonces[i].sum_a8_guess_dirty = false;
+                       }
+               }
+       }
+}      
+
+
+static int read_nonce_file(void)
+{
+       FILE *fnonces = NULL;
+       size_t bytes_read;
+       uint8_t trgBlockNo;
+       uint8_t trgKeyType;
+       uint8_t read_buf[9];
+       uint32_t nt_enc1, nt_enc2;
+       uint8_t par_enc;
+       
+       num_acquired_nonces = 0;
+       if ((fnonces = fopen("nonces.bin","rb")) == NULL) { 
+               PrintAndLog("Could not open file nonces.bin");
+               return 1;
+       }
+
+       hardnested_print_progress(0, "Reading nonces from file nonces.bin...", (float)(1LL<<47), 0);
+       bytes_read = fread(read_buf, 1, 6, fnonces);
+       if (bytes_read != 6) {
+               PrintAndLog("File reading error.");
+               fclose(fnonces);
+               return 1;
+       }
+       cuid = bytes_to_num(read_buf, 4);
+       trgBlockNo = bytes_to_num(read_buf+4, 1);
+       trgKeyType = bytes_to_num(read_buf+5, 1);
+
+       bytes_read = fread(read_buf, 1, 9, fnonces);
+       while (bytes_read == 9) {
+               nt_enc1 = bytes_to_num(read_buf, 4);
+               nt_enc2 = bytes_to_num(read_buf+4, 4);
+               par_enc = bytes_to_num(read_buf+8, 1);
+               add_nonce(nt_enc1, par_enc >> 4);
+               add_nonce(nt_enc2, par_enc & 0x0f);
+               num_acquired_nonces += 2;
+               bytes_read = fread(read_buf, 1, 9, fnonces);
+       }
+       fclose(fnonces);
+       
+       char progress_string[80];
+       sprintf(progress_string, "Read %d nonces from file. cuid=%08x", num_acquired_nonces, cuid); 
+       hardnested_print_progress(num_acquired_nonces, progress_string, (float)(1LL<<47), 0);
+       sprintf(progress_string, "Target Block=%d, Keytype=%c", trgBlockNo, trgKeyType==0?'A':'B');
+       hardnested_print_progress(num_acquired_nonces, progress_string, (float)(1LL<<47), 0);
+
+       for (uint16_t i = 0; i < NUM_SUMS; i++) {
+               if (first_byte_Sum == sums[i]) {
+                       first_byte_Sum = i;
+                       break;
+               }
+       }
+       
+       return 0;
+}
+
+
+noncelistentry_t *SearchFor2ndByte(uint8_t b1, uint8_t b2)
+{
+       noncelistentry_t *p = nonces[b1].first;
+       while (p != NULL) {
+               if ((p->nonce_enc >> 16 & 0xff) == b2) {
+                       return p;
+               }
+               p = p->next;
+       }
+       return NULL;
+}
+
+
+static bool timeout(void)
+{
+       return (msclock() > last_sample_clock + sample_period);
+}
+
+
+static void *check_for_BitFlipProperties_thread(void *args)
+{
+       uint8_t first_byte = ((uint8_t *)args)[0];
+       uint8_t last_byte = ((uint8_t *)args)[1];
+       uint8_t time_budget = ((uint8_t *)args)[2];
+       
+       if (hardnested_stage & CHECK_1ST_BYTES) {
+               // for (uint16_t bitflip = 0x001; bitflip < 0x200; bitflip++) {
+               for (uint16_t bitflip_idx = 0; bitflip_idx < num_1st_byte_effective_bitflips; bitflip_idx++) {
+                       uint16_t bitflip = all_effective_bitflip[bitflip_idx];
+                       if (time_budget & timeout()) {
+#if defined (DEBUG_REDUCTION)                          
+                               printf("break at bitflip_idx %d...", bitflip_idx);
+#endif                         
+                               return NULL;
+                       }
+                       for (uint16_t i = first_byte; i <= last_byte; i++) {
+                               if (nonces[i].BitFlips[bitflip] == 0 && nonces[i].BitFlips[bitflip ^ 0x100] == 0
+                                       && nonces[i].first != NULL && nonces[i^(bitflip&0xff)].first != NULL) {
+                                       uint8_t parity1 = (nonces[i].first->par_enc) >> 3;                                      // parity of first byte
+                                       uint8_t parity2 = (nonces[i^(bitflip&0xff)].first->par_enc) >> 3;       // parity of nonce with bits flipped
+                                       if ((parity1 == parity2 && !(bitflip & 0x100))                  // bitflip
+                                               || (parity1 != parity2 && (bitflip & 0x100))) {         // not bitflip
+                                               nonces[i].BitFlips[bitflip] = 1;
+                                               for (odd_even_t odd_even = EVEN_STATE; odd_even <= ODD_STATE; odd_even++) {
+                                                       if (bitflip_bitarrays[odd_even][bitflip] != NULL) {
+                                                               uint32_t old_count = nonces[i].num_states_bitarray[odd_even];
+                                                               nonces[i].num_states_bitarray[odd_even] = count_bitarray_AND(nonces[i].states_bitarray[odd_even], bitflip_bitarrays[odd_even][bitflip]);
+                                                               if (nonces[i].num_states_bitarray[odd_even] != old_count) {
+                                                                       nonces[i].all_bitflips_dirty[odd_even] = true;
+                                                               }
+                                                               // printf("bitflip: %d old: %d, new: %d ", bitflip, old_count, nonces[i].num_states_bitarray[odd_even]);
+                                                       }
+                                               }
+                                       }
+                               }
+                       }
+                       ((uint8_t *)args)[1] = num_1st_byte_effective_bitflips - bitflip_idx - 1;  // bitflips still to go in stage 1
+               }
+       }
+
+       ((uint8_t *)args)[1] = 0;  // stage 1 definitely completed
+
+       if (hardnested_stage & CHECK_2ND_BYTES) {
+               for (uint16_t bitflip_idx = num_1st_byte_effective_bitflips; bitflip_idx < num_all_effective_bitflips; bitflip_idx++) {
+                       uint16_t bitflip = all_effective_bitflip[bitflip_idx];
+                       if (time_budget & timeout()) {
+#if defined (DEBUG_REDUCTION)
+                               printf("break at bitflip_idx %d...", bitflip_idx);
+#endif
+                               return NULL;
+                       }
+                       for (uint16_t i = first_byte; i <= last_byte; i++) {
+                               // Check for Bit Flip Property of 2nd bytes
+                               if (nonces[i].BitFlips[bitflip] == 0) {
+                                       for (uint16_t j = 0; j < 256; j++) {    // for each 2nd Byte
+                                               noncelistentry_t *byte1 = SearchFor2ndByte(i, j);
+                                               noncelistentry_t *byte2 = SearchFor2ndByte(i, j^(bitflip&0xff));
+                                               if (byte1 != NULL && byte2 != NULL) {
+                                                       uint8_t parity1 = byte1->par_enc >> 2 & 0x01;   // parity of 2nd byte
+                                                       uint8_t parity2 = byte2->par_enc >> 2 & 0x01;   // parity of 2nd byte with bits flipped
+                                                       if ((parity1 == parity2 && !(bitflip&0x100))            // bitflip
+                                                               || (parity1 != parity2 && (bitflip&0x100))) { // not bitflip
+                                                               nonces[i].BitFlips[bitflip] = 1;
+                                                               for (odd_even_t odd_even = EVEN_STATE; odd_even <= ODD_STATE; odd_even++) {
+                                                                       if (bitflip_bitarrays[odd_even][bitflip] != NULL) {
+                                                                               uint32_t old_count = nonces[i].num_states_bitarray[odd_even];
+                                                                               nonces[i].num_states_bitarray[odd_even] = count_bitarray_AND(nonces[i].states_bitarray[odd_even], bitflip_bitarrays[odd_even][bitflip]);
+                                                                               if (nonces[i].num_states_bitarray[odd_even] != old_count) {
+                                                                                       nonces[i].all_bitflips_dirty[odd_even] = true;
+                                                                               }
+                                                                       }
+                                                               }
+                                                               break;
+                                                       }
+                                               }
+                                       }
+                               }
+                               // printf("states_bitarray[0][%" PRIu16 "] contains %d ones.\n", i, count_states(nonces[i].states_bitarray[EVEN_STATE]));
+                               // printf("states_bitarray[1][%" PRIu16 "] contains %d ones.\n", i, count_states(nonces[i].states_bitarray[ODD_STATE]));
+                       }
+               }
+       }
+
+       return NULL;
+}
+
+
+static void check_for_BitFlipProperties(bool time_budget)
+{
+       // create and run worker threads
+       pthread_t thread_id[NUM_CHECK_BITFLIPS_THREADS];
+               
+       uint8_t args[NUM_CHECK_BITFLIPS_THREADS][3];
+       uint16_t bytes_per_thread = (256 + (NUM_CHECK_BITFLIPS_THREADS/2)) / NUM_CHECK_BITFLIPS_THREADS; 
+       for (uint8_t i = 0; i < NUM_CHECK_BITFLIPS_THREADS; i++) {
+               args[i][0] = i * bytes_per_thread;
+               args[i][1] = MIN(args[i][0]+bytes_per_thread-1, 255);
+               args[i][2] = time_budget;
+       }
+       args[NUM_CHECK_BITFLIPS_THREADS-1][1] = MAX(args[NUM_CHECK_BITFLIPS_THREADS-1][1], 255);
+       
+       // start threads
+       for (uint8_t i = 0; i < NUM_CHECK_BITFLIPS_THREADS; i++) {
+               pthread_create(&thread_id[i], NULL, check_for_BitFlipProperties_thread, args[i]);
+       }
+       
+       // wait for threads to terminate:
+       for (uint8_t i = 0; i < NUM_CHECK_BITFLIPS_THREADS; i++) {
+               pthread_join(thread_id[i], NULL);
+       }
+       
+       if (hardnested_stage & CHECK_2ND_BYTES) {
+               hardnested_stage &= ~CHECK_1ST_BYTES;   // we are done with 1st stage, except...
+               for (uint16_t i = 0; i < NUM_CHECK_BITFLIPS_THREADS; i++) {
+                       if (args[i][1] != 0) {
+                               hardnested_stage |= CHECK_1ST_BYTES;  // ... when any of the threads didn't complete in time
+                               break;
+                       }
+               }
+       }
+#if defined (DEBUG_REDUCTION)  
+       if (hardnested_stage & CHECK_1ST_BYTES) printf("stage 1 not completed yet\n");
+#endif
+}
+
+
+static void update_nonce_data(bool time_budget)
+{
+       check_for_BitFlipProperties(time_budget);
+       update_allbitflips_array();
+       update_sum_bitarrays(EVEN_STATE);
+       update_sum_bitarrays(ODD_STATE);
+       update_p_K();
+       estimate_sum_a8();
+}
+
+
+static void apply_sum_a0(void)
+{
+       uint32_t old_count = num_all_bitflips_bitarray[EVEN_STATE];
+       num_all_bitflips_bitarray[EVEN_STATE] = count_bitarray_AND(all_bitflips_bitarray[EVEN_STATE], sum_a0_bitarrays[EVEN_STATE][first_byte_Sum]);
+       if (num_all_bitflips_bitarray[EVEN_STATE] != old_count) {
+               all_bitflips_bitarray_dirty[EVEN_STATE] = true;
+       }
+       old_count = num_all_bitflips_bitarray[ODD_STATE];
+       num_all_bitflips_bitarray[ODD_STATE] = count_bitarray_AND(all_bitflips_bitarray[ODD_STATE], sum_a0_bitarrays[ODD_STATE][first_byte_Sum]);
+       if (num_all_bitflips_bitarray[ODD_STATE] != old_count) {
+               all_bitflips_bitarray_dirty[ODD_STATE] = true;
+       }
+}
+
+
+static void simulate_MFplus_RNG(uint32_t test_cuid, uint64_t test_key, uint32_t *nt_enc, uint8_t *par_enc)
+{
+       struct Crypto1State sim_cs = {0, 0};
+
+       // init cryptostate with key:
+       for(int8_t i = 47; i > 0; i -= 2) {
+               sim_cs.odd  = sim_cs.odd  << 1 | BIT(test_key, (i - 1) ^ 7);
+               sim_cs.even = sim_cs.even << 1 | BIT(test_key, i ^ 7);
+       }
+
+       *par_enc = 0;
+       uint32_t nt = (rand() & 0xff) << 24 | (rand() & 0xff) << 16 | (rand() & 0xff) << 8 | (rand() & 0xff);
+       for (int8_t byte_pos = 3; byte_pos >= 0; byte_pos--) {
+               uint8_t nt_byte_dec = (nt >> (8*byte_pos)) & 0xff;
+               uint8_t nt_byte_enc = crypto1_byte(&sim_cs, nt_byte_dec ^ (test_cuid >> (8*byte_pos)), false) ^ nt_byte_dec;    // encode the nonce byte
+               *nt_enc = (*nt_enc << 8) | nt_byte_enc;         
+               uint8_t ks_par = filter(sim_cs.odd);                                                                                    // the keystream bit to encode/decode the parity bit
+               uint8_t nt_byte_par_enc = ks_par ^ oddparity8(nt_byte_dec);                                             // determine the nt byte's parity and encode it
+               *par_enc = (*par_enc << 1) | nt_byte_par_enc;
+       }
+       
+}
+
+
+static void simulate_acquire_nonces()
+{
+       time_t time1 = time(NULL);
+       last_sample_clock = 0;
+       sample_period = 1000;           // for simulation
+       hardnested_stage = CHECK_1ST_BYTES;
+       bool acquisition_completed = false;
+       uint32_t total_num_nonces = 0;
+       float brute_force;
+       bool reported_suma8 = false;
+       
+       cuid = (rand() & 0xff) << 24 | (rand() & 0xff) << 16 | (rand() & 0xff) << 8 | (rand() & 0xff);
+       if (known_target_key == -1) {
+               known_target_key = ((uint64_t)rand() & 0xfff) << 36 | ((uint64_t)rand() & 0xfff) << 24 | ((uint64_t)rand() & 0xfff) << 12 | ((uint64_t)rand() & 0xfff);
+       }
+
+       char progress_text[80];
+       sprintf(progress_text, "Simulating key %012" PRIx64 ", cuid %08" PRIx32 " ...", known_target_key, cuid);
+       hardnested_print_progress(0, progress_text, (float)(1LL<<47), 0);
+       fprintf(fstats, "%012" PRIx64 ";%" PRIx32 ";", known_target_key, cuid);
+
+       num_acquired_nonces = 0;
+       
+       do {
+               uint32_t nt_enc = 0;
+               uint8_t par_enc = 0;
+
+               for (uint16_t i = 0; i < 113; i++) {
+                       simulate_MFplus_RNG(cuid, known_target_key, &nt_enc, &par_enc);
+                       num_acquired_nonces += add_nonce(nt_enc, par_enc);
+                       total_num_nonces++;
+               }
+
+               last_sample_clock = msclock();
+       
+               if (first_byte_num == 256 ) {
+                       if (hardnested_stage == CHECK_1ST_BYTES) {
+                               for (uint16_t i = 0; i < NUM_SUMS; i++) {
+                                       if (first_byte_Sum == sums[i]) {
+                                               first_byte_Sum = i;
+                                               break;
+                                       }
+                               }
+                               hardnested_stage |= CHECK_2ND_BYTES;
+                               apply_sum_a0();
+                       } 
+                       update_nonce_data(true);
+                       acquisition_completed = shrink_key_space(&brute_force);
+                       if (!reported_suma8) {
+                               char progress_string[80];
+                               sprintf(progress_string, "Apply Sum property. Sum(a0) = %d", sums[first_byte_Sum]);
+                               hardnested_print_progress(num_acquired_nonces, progress_string, brute_force, 0);
+                               reported_suma8 = true;
+                       } else {
+                               hardnested_print_progress(num_acquired_nonces, "Apply bit flip properties", brute_force, 0);
+                       }
+               } else {
+                       update_nonce_data(true);
+                       acquisition_completed = shrink_key_space(&brute_force);
+                       hardnested_print_progress(num_acquired_nonces, "Apply bit flip properties", brute_force, 0);
+               }
+       } while (!acquisition_completed);
+
+       time_t end_time = time(NULL);
+       // PrintAndLog("Acquired a total of %" PRId32" nonces in %1.0f seconds (%1.0f nonces/minute)", 
+               // num_acquired_nonces, 
+               // difftime(end_time, time1), 
+               // difftime(end_time, time1)!=0.0?(float)total_num_nonces*60.0/difftime(end_time, time1):INFINITY
+               // );
+
+       fprintf(fstats, "%" PRId32 ";%" PRId32 ";%1.0f;", total_num_nonces, num_acquired_nonces, difftime(end_time,time1));
+               
+}
+
+
+static int acquire_nonces(uint8_t blockNo, uint8_t keyType, uint8_t *key, uint8_t trgBlockNo, uint8_t trgKeyType, bool nonce_file_write, bool slow)
+{
+       last_sample_clock = msclock();
+       sample_period = 2000;   // initial rough estimate. Will be refined.
+       bool initialize = true;
+       bool field_off = false;
+       hardnested_stage = CHECK_1ST_BYTES;
+       bool acquisition_completed = false;
+       uint32_t flags = 0;
+       uint8_t write_buf[9];
+       uint32_t total_num_nonces = 0;
+       float brute_force;
+       bool reported_suma8 = false;
+       FILE *fnonces = NULL;
+       UsbCommand resp;
+
+       num_acquired_nonces = 0;
+       
+       clearCommandBuffer();
+
+       do {
+               flags = 0;
+               flags |= initialize ? 0x0001 : 0;
+               flags |= slow ? 0x0002 : 0;
+               flags |= field_off ? 0x0004 : 0;
+               UsbCommand c = {CMD_MIFARE_ACQUIRE_ENCRYPTED_NONCES, {blockNo + keyType * 0x100, trgBlockNo + trgKeyType * 0x100, flags}};
+               memcpy(c.d.asBytes, key, 6);
+
+               SendCommand(&c);
+               
+               if (field_off) break;
+               
+               if (initialize) {
+                       if (!WaitForResponseTimeout(CMD_ACK, &resp, 3000)) return 1;
+
+                       if (resp.arg[0]) return resp.arg[0];  // error during nested_hard
+
+                       cuid = resp.arg[1];
+                       // PrintAndLog("Acquiring nonces for CUID 0x%08x", cuid); 
+                       if (nonce_file_write && fnonces == NULL) {
+                               if ((fnonces = fopen("nonces.bin","wb")) == NULL) { 
+                                       PrintAndLog("Could not create file nonces.bin");
+                                       return 3;
+                               }
+                               hardnested_print_progress(0, "Writing acquired nonces to binary file nonces.bin", (float)(1LL<<47), 0);
+                               num_to_bytes(cuid, 4, write_buf);
+                               fwrite(write_buf, 1, 4, fnonces);
+                               fwrite(&trgBlockNo, 1, 1, fnonces);
+                               fwrite(&trgKeyType, 1, 1, fnonces);
+                       }
+               }
+
+               if (!initialize) {
+                       uint32_t nt_enc1, nt_enc2;
+                       uint8_t par_enc;
+                       uint16_t num_sampled_nonces = resp.arg[2];
+                       uint8_t *bufp = resp.d.asBytes;
+                       for (uint16_t i = 0; i < num_sampled_nonces; i+=2) {
+                               nt_enc1 = bytes_to_num(bufp, 4);
+                               nt_enc2 = bytes_to_num(bufp+4, 4);
+                               par_enc = bytes_to_num(bufp+8, 1);
+                               
+                               //printf("Encrypted nonce: %08x, encrypted_parity: %02x\n", nt_enc1, par_enc >> 4);
+                               num_acquired_nonces += add_nonce(nt_enc1, par_enc >> 4);
+                               //printf("Encrypted nonce: %08x, encrypted_parity: %02x\n", nt_enc2, par_enc & 0x0f);
+                               num_acquired_nonces += add_nonce(nt_enc2, par_enc & 0x0f);
+
+                               if (nonce_file_write) {
+                                       fwrite(bufp, 1, 9, fnonces);
+                               }
+                               bufp += 9;
+                       }
+                       total_num_nonces += num_sampled_nonces;
+               
+                       if (first_byte_num == 256 ) {
+                               if (hardnested_stage == CHECK_1ST_BYTES) {
+                                       for (uint16_t i = 0; i < NUM_SUMS; i++) {
+                                               if (first_byte_Sum == sums[i]) {
+                                                       first_byte_Sum = i;
+                                                       break;
+                                               }
+                                       }
+                                       hardnested_stage |= CHECK_2ND_BYTES;
+                                       apply_sum_a0();
+                               }
+                               update_nonce_data(true);
+                               acquisition_completed = shrink_key_space(&brute_force);
+                               if (!reported_suma8) {
+                                       char progress_string[80];
+                                       sprintf(progress_string, "Apply Sum property. Sum(a0) = %d", sums[first_byte_Sum]);
+                                       hardnested_print_progress(num_acquired_nonces, progress_string, brute_force, 0);
+                                       reported_suma8 = true;
+                               } else {
+                                       hardnested_print_progress(num_acquired_nonces, "Apply bit flip properties", brute_force, 0);
+                               }
+                       } else {
+                               update_nonce_data(true);
+                               acquisition_completed = shrink_key_space(&brute_force);
+                               hardnested_print_progress(num_acquired_nonces, "Apply bit flip properties", brute_force, 0);
+                       }
+               }
+               
+               if (acquisition_completed) {
+                       field_off = true;       // switch off field with next SendCommand and then finish
+               }
+
+               if (!initialize) {
+                       if (!WaitForResponseTimeout(CMD_ACK, &resp, 3000)) {
+                               if (nonce_file_write) {
+                                       fclose(fnonces);
+                               }
+                               return 1;
+                       }
+                       if (resp.arg[0]) {
+                               if (nonce_file_write) {
+                                       fclose(fnonces);
+                               }
+                               return resp.arg[0];  // error during nested_hard
+                       }
+               }
+
+               initialize = false;
+
+               if (msclock() - last_sample_clock < sample_period) {
+                       sample_period = msclock() - last_sample_clock;
+               }
+               last_sample_clock = msclock();
+
+       } while (!acquisition_completed || field_off);
+
+       if (nonce_file_write) {
+               fclose(fnonces);
+       }
+       
+       // PrintAndLog("Sampled a total of %d nonces in %d seconds (%0.0f nonces/minute)", 
+               // total_num_nonces, 
+               // time(NULL)-time1, 
+               // (float)total_num_nonces*60.0/(time(NULL)-time1));
+       
+       return 0;
+}
+
+
+static inline bool invariant_holds(uint_fast8_t byte_diff, uint_fast32_t state1, uint_fast32_t state2, uint_fast8_t bit, uint_fast8_t state_bit)
+{
+       uint_fast8_t j_1_bit_mask = 0x01 << (bit-1);
+       uint_fast8_t bit_diff = byte_diff & j_1_bit_mask;                                                                               // difference of (j-1)th bit
+       uint_fast8_t filter_diff = filter(state1 >> (4-state_bit)) ^ filter(state2 >> (4-state_bit));   // difference in filter function
+       uint_fast8_t mask_y12_y13 = 0xc0 >> state_bit;
+       uint_fast8_t state_bits_diff = (state1 ^ state2) & mask_y12_y13;                                                // difference in state bits 12 and 13
+       uint_fast8_t all_diff = evenparity8(bit_diff ^ state_bits_diff ^ filter_diff);                  // use parity function to XOR all bits
+       return !all_diff;
+}
+
+
+static inline bool invalid_state(uint_fast8_t byte_diff, uint_fast32_t state1, uint_fast32_t state2, uint_fast8_t bit, uint_fast8_t state_bit)
+{
+       uint_fast8_t j_bit_mask = 0x01 << bit;
+       uint_fast8_t bit_diff = byte_diff & j_bit_mask;                                                                                 // difference of jth bit
+       uint_fast8_t mask_y13_y16 = 0x48 >> state_bit;
+       uint_fast8_t state_bits_diff = (state1 ^ state2) & mask_y13_y16;                                                // difference in state bits 13 and 16
+       uint_fast8_t all_diff = evenparity8(bit_diff ^ state_bits_diff);                                                // use parity function to XOR all bits
+       return all_diff;
+}
+
+
+static inline bool remaining_bits_match(uint_fast8_t num_common_bits, uint_fast8_t byte_diff, uint_fast32_t state1, uint_fast32_t state2, odd_even_t odd_even)
+{
+       if (odd_even) {
+               // odd bits
+               switch (num_common_bits) {
+                       case 0: if (!invariant_holds(byte_diff, state1, state2, 1, 0)) return true;
+                       case 1: if (invalid_state(byte_diff, state1, state2, 1, 0)) return false;
+                       case 2: if (!invariant_holds(byte_diff, state1, state2, 3, 1)) return true;
+                       case 3: if (invalid_state(byte_diff, state1, state2, 3, 1)) return false;
+                       case 4: if (!invariant_holds(byte_diff, state1, state2, 5, 2)) return true;
+                       case 5: if (invalid_state(byte_diff, state1, state2, 5, 2)) return false;
+                       case 6: if (!invariant_holds(byte_diff, state1, state2, 7, 3)) return true;
+                       case 7: if (invalid_state(byte_diff, state1, state2, 7, 3)) return false;
+               }
+       } else {
+               // even bits
+               switch (num_common_bits) {      
+                       case 0: if (invalid_state(byte_diff, state1, state2, 0, 0)) return false;
+                       case 1: if (!invariant_holds(byte_diff, state1, state2, 2, 1)) return true;
+                       case 2: if (invalid_state(byte_diff, state1, state2, 2, 1)) return false;
+                       case 3: if (!invariant_holds(byte_diff, state1, state2, 4, 2)) return true;
+                       case 4: if (invalid_state(byte_diff, state1, state2, 4, 2)) return false;
+                       case 5: if (!invariant_holds(byte_diff, state1, state2, 6, 3)) return true;
+                       case 6: if (invalid_state(byte_diff, state1, state2, 6, 3)) return false;
+               }
+       }
+       
+       return true;                                    // valid state
+}
+
+
+static pthread_mutex_t statelist_cache_mutex;
+static pthread_mutex_t book_of_work_mutex;
+
+
+typedef enum {
+       TO_BE_DONE,
+       WORK_IN_PROGRESS,
+       COMPLETED
+} work_status_t;
+
+static struct sl_cache_entry {
+       uint32_t *sl;
+       uint32_t len;
+       work_status_t cache_status;
+       } sl_cache[NUM_PART_SUMS][NUM_PART_SUMS][2];
+
+
+static void init_statelist_cache(void)
+{
+       pthread_mutex_lock(&statelist_cache_mutex);
+       for (uint16_t i = 0; i < NUM_PART_SUMS; i++) {
+               for (uint16_t j = 0; j < NUM_PART_SUMS; j++) {
+                       for (uint16_t k = 0; k < 2; k++) {
+                               sl_cache[i][j][k].sl = NULL;
+                               sl_cache[i][j][k].len = 0;
+                               sl_cache[i][j][k].cache_status = TO_BE_DONE;
+                       }
+               }
+       }               
+       pthread_mutex_unlock(&statelist_cache_mutex);
+}
+
+
+static void free_statelist_cache(void)
+{
+       pthread_mutex_lock(&statelist_cache_mutex);
+       for (uint16_t i = 0; i < NUM_PART_SUMS; i++) {
+               for (uint16_t j = 0; j < NUM_PART_SUMS; j++) {
+                       for (uint16_t k = 0; k < 2; k++) {
+                               free(sl_cache[i][j][k].sl);
+                       }
+               }
+       }               
+       pthread_mutex_unlock(&statelist_cache_mutex);
+}
+
+
+#ifdef DEBUG_KEY_ELIMINATION
+static inline bool bitflips_match(uint8_t byte, uint32_t state, odd_even_t odd_even, bool quiet)
+#else
+static inline bool bitflips_match(uint8_t byte, uint32_t state, odd_even_t odd_even)
+#endif 
+{
+       uint32_t *bitset = nonces[byte].states_bitarray[odd_even];
+       bool possible = test_bit24(bitset, state);
+       if (!possible) {
+#ifdef DEBUG_KEY_ELIMINATION
+               if (!quiet && known_target_key != -1 && state == test_state[odd_even]) {
+                       printf("Initial state lists: %s test state eliminated by bitflip property.\n", odd_even==EVEN_STATE?"even":"odd");
+                       sprintf(failstr, "Initial %s Byte Bitflip property", odd_even==EVEN_STATE?"even":"odd");
+               }
+#endif
+               return false;
+       } else {
+               return true;
+       }
+}
+       
+       
+static uint_fast8_t reverse(uint_fast8_t byte)
+{
+       uint_fast8_t rev_byte = 0;
+       
+       for (uint8_t i = 0; i < 8; i++) {
+               rev_byte <<= 1;
+               rev_byte |= (byte >> i) & 0x01;
+       }
+       
+       return rev_byte;
+}
+
+
+static bool all_bitflips_match(uint8_t byte, uint32_t state, odd_even_t odd_even) 
+{
+       uint32_t masks[2][8] = {{0x00fffff0, 0x00fffff8, 0x00fffff8, 0x00fffffc, 0x00fffffc, 0x00fffffe, 0x00fffffe, 0x00ffffff},
+                                                       {0x00fffff0, 0x00fffff0, 0x00fffff8, 0x00fffff8, 0x00fffffc, 0x00fffffc, 0x00fffffe, 0x00fffffe} };
+       
+       for (uint16_t i = 1; i < 256; i++) {
+               uint_fast8_t bytes_diff = reverse(i);   // start with most common bits
+               uint_fast8_t byte2 = byte ^ bytes_diff;
+               uint_fast8_t num_common = trailing_zeros(bytes_diff);
+               uint32_t mask = masks[odd_even][num_common];
+               bool found_match = false;
+               for (uint8_t remaining_bits = 0; remaining_bits <= (~mask & 0xff); remaining_bits++) {
+                       if (remaining_bits_match(num_common, bytes_diff, state, (state & mask) | remaining_bits, odd_even)) {
+#ifdef DEBUG_KEY_ELIMINATION
+                               if (bitflips_match(byte2, (state & mask) | remaining_bits, odd_even, true)) {
+#else
+                               if (bitflips_match(byte2, (state & mask) | remaining_bits, odd_even)) {
+#endif                                         
+                                       found_match = true;
+                                       break;
+                               }
+                       }
+               }
+               if (!found_match) {
+#ifdef DEBUG_KEY_ELIMINATION                           
+                       if (known_target_key != -1 && state == test_state[odd_even]) {
+                               printf("all_bitflips_match() 1st Byte: %s test state (0x%06x): Eliminated. Bytes = %02x, %02x, Common Bits = %d\n", 
+                                       odd_even==ODD_STATE?"odd":"even",
+                                       test_state[odd_even],
+                                       byte, byte2, num_common);
+                               if (failstr[0] == '\0') {
+                                       sprintf(failstr, "Other 1st Byte %s, all_bitflips_match(), no match", odd_even?"odd":"even");
+                               }
+                       }
+#endif
+                       return false;
+               }
+       }
+
+       return true;
+}
+
+
+static void    bitarray_to_list(uint8_t byte, uint32_t *bitarray, uint32_t *state_list, uint32_t *len, odd_even_t odd_even)
+{
+       uint32_t *p = state_list;
+       for (uint32_t state = next_state(bitarray, -1L); state < (1<<24); state = next_state(bitarray, state)) {
+               if (all_bitflips_match(byte, state, odd_even)) {
+                       *p++ = state;
+               }
+       }
+       // add End Of List marker
+       *p = 0xffffffff;
+       *len = p - state_list;
+}
+
+
+static void add_cached_states(statelist_t *candidates, uint16_t part_sum_a0, uint16_t part_sum_a8, odd_even_t odd_even)
+{
+       candidates->states[odd_even] = sl_cache[part_sum_a0/2][part_sum_a8/2][odd_even].sl;
+       candidates->len[odd_even] = sl_cache[part_sum_a0/2][part_sum_a8/2][odd_even].len;
+       return;
+}
+
+
+static void add_matching_states(statelist_t *candidates, uint8_t part_sum_a0, uint8_t part_sum_a8, odd_even_t odd_even)
+{
+       uint32_t worstcase_size = 1<<20;
+       candidates->states[odd_even] = (uint32_t *)malloc(sizeof(uint32_t) * worstcase_size);
+       if (candidates->states[odd_even] == NULL) {
+               PrintAndLog("Out of memory error in add_matching_states() - statelist.\n");
+               exit(4);
+       }
+       uint32_t *candidates_bitarray = (uint32_t *)malloc_bitarray(sizeof(uint32_t) * (1<<19));
+       if (candidates_bitarray == NULL) {
+               PrintAndLog("Out of memory error in add_matching_states() - bitarray.\n");
+               free(candidates->states[odd_even]);
+               exit(4);
+       }
+       
+       uint32_t *bitarray_a0 = part_sum_a0_bitarrays[odd_even][part_sum_a0/2];
+       uint32_t *bitarray_a8 = part_sum_a8_bitarrays[odd_even][part_sum_a8/2];
+       uint32_t *bitarray_bitflips = nonces[best_first_bytes[0]].states_bitarray[odd_even];
+
+       // for (uint32_t i = 0; i < (1<<19); i++) {
+               // candidates_bitarray[i] = bitarray_a0[i] & bitarray_a8[i] & bitarray_bitflips[i];
+       // }
+       bitarray_AND4(candidates_bitarray, bitarray_a0, bitarray_a8, bitarray_bitflips);
+       
+       bitarray_to_list(best_first_bytes[0], candidates_bitarray, candidates->states[odd_even], &(candidates->len[odd_even]), odd_even);
+       if (candidates->len[odd_even] == 0) {
+               free(candidates->states[odd_even]);
+               candidates->states[odd_even] = NULL;
+       } else if (candidates->len[odd_even] + 1 < worstcase_size) {
+               candidates->states[odd_even] = realloc(candidates->states[odd_even], sizeof(uint32_t) * (candidates->len[odd_even] + 1));
+       }
+       free_bitarray(candidates_bitarray);
+
+
+       pthread_mutex_lock(&statelist_cache_mutex);
+       sl_cache[part_sum_a0/2][part_sum_a8/2][odd_even].sl = candidates->states[odd_even];
+       sl_cache[part_sum_a0/2][part_sum_a8/2][odd_even].len = candidates->len[odd_even];
+       sl_cache[part_sum_a0/2][part_sum_a8/2][odd_even].cache_status = COMPLETED;
+       pthread_mutex_unlock(&statelist_cache_mutex);
+
+       return;
+}
+
+
+static statelist_t *add_more_candidates(void)
+{
+       statelist_t *new_candidates = candidates;
+       if (candidates == NULL) {
+               candidates = (statelist_t *)malloc(sizeof(statelist_t));
+               new_candidates = candidates;
+       } else {
+               new_candidates = candidates;
+               while (new_candidates->next != NULL) {
+                       new_candidates = new_candidates->next;
+               }
+               new_candidates = new_candidates->next = (statelist_t *)malloc(sizeof(statelist_t));
+       }
+       new_candidates->next = NULL;
+       new_candidates->len[ODD_STATE] = 0;
+       new_candidates->len[EVEN_STATE] = 0;
+       new_candidates->states[ODD_STATE] = NULL;
+       new_candidates->states[EVEN_STATE] = NULL;
+       return new_candidates;
+}
+
+
+static void add_bitflip_candidates(uint8_t byte)
+{
+       statelist_t *candidates = add_more_candidates();
+
+       for (odd_even_t odd_even = EVEN_STATE; odd_even <= ODD_STATE; odd_even++) {
+               uint32_t worstcase_size = nonces[byte].num_states_bitarray[odd_even] + 1;
+               candidates->states[odd_even] = (uint32_t *)malloc(sizeof(uint32_t) * worstcase_size);
+               if (candidates->states[odd_even] == NULL) {
+                       PrintAndLog("Out of memory error in add_bitflip_candidates().\n");
+                       exit(4);
+               }
+       
+               bitarray_to_list(byte, nonces[byte].states_bitarray[odd_even], candidates->states[odd_even], &(candidates->len[odd_even]), odd_even);
+
+               if (candidates->len[odd_even] + 1 < worstcase_size) {
+                       candidates->states[odd_even] = realloc(candidates->states[odd_even], sizeof(uint32_t) * (candidates->len[odd_even] + 1));
+               }
+       }
+       return;
+}
+
+
+static bool TestIfKeyExists(uint64_t key)
+{
+       struct Crypto1State *pcs;
+       pcs = crypto1_create(key);
+       crypto1_byte(pcs, (cuid >> 24) ^ best_first_bytes[0], true);
+
+       uint32_t state_odd = pcs->odd & 0x00ffffff;
+       uint32_t state_even = pcs->even & 0x00ffffff;
+       
+       uint64_t count = 0;
+       for (statelist_t *p = candidates; p != NULL; p = p->next) {
+               bool found_odd = false;
+               bool found_even = false;
+               uint32_t *p_odd = p->states[ODD_STATE];
+               uint32_t *p_even = p->states[EVEN_STATE];
+               if (p_odd != NULL && p_even != NULL) {
+                       while (*p_odd != 0xffffffff) {
+                               if ((*p_odd & 0x00ffffff) == state_odd) {
+                                       found_odd = true;
+                                       break;
+                               }
+                               p_odd++;
+                       }
+                       while (*p_even != 0xffffffff) {
+                               if ((*p_even & 0x00ffffff) == state_even) {
+                                       found_even = true;
+                               }
+                               p_even++;
+                       }
+                       count += (uint64_t)(p_odd - p->states[ODD_STATE]) * (uint64_t)(p_even - p->states[EVEN_STATE]);
+               }
+               if (found_odd && found_even) {
+                       num_keys_tested += count;
+                       hardnested_print_progress(num_acquired_nonces, "(Test: Key found)", 0.0, 0);
+                       crypto1_destroy(pcs);
+                       return true;
+               }
+       }
+
+       num_keys_tested += count;
+       hardnested_print_progress(num_acquired_nonces, "(Test: Key NOT found)", 0.0, 0);
+
+       crypto1_destroy(pcs);
+       return false;
+}
+
+
+static work_status_t book_of_work[NUM_PART_SUMS][NUM_PART_SUMS][NUM_PART_SUMS][NUM_PART_SUMS];
+
+
+static void init_book_of_work(void)
+{
+       for (uint8_t p = 0; p < NUM_PART_SUMS; p++) {
+               for (uint8_t q = 0; q < NUM_PART_SUMS; q++) {
+                       for (uint8_t r = 0; r < NUM_PART_SUMS; r++) {
+                               for (uint8_t s = 0; s < NUM_PART_SUMS; s++) {
+                                       book_of_work[p][q][r][s] = TO_BE_DONE;
+                               }
+                       }
+               }
+       }
+}
+
+
+static void *generate_candidates_worker_thread(void *args)
+{
+       uint16_t *sum_args = (uint16_t *)args;
+       uint16_t sum_a0 = sums[sum_args[0]];
+       uint16_t sum_a8 = sums[sum_args[1]];
+       // uint16_t my_thread_number = sums[2];
+       
+       bool there_might_be_more_work = true;
+       do {
+               there_might_be_more_work = false;
+               for (uint8_t p = 0; p < NUM_PART_SUMS; p++) {
+                       for (uint8_t q = 0; q < NUM_PART_SUMS; q++) {
+                               if (2*p*(16-2*q) + (16-2*p)*2*q == sum_a0) {
+                                       // printf("Reducing Partial Statelists (p,q) = (%d,%d) with lengths %d, %d\n", 
+                                                       // p, q, partial_statelist[p].len[ODD_STATE], partial_statelist[q].len[EVEN_STATE]);
+                                       for (uint8_t r = 0; r < NUM_PART_SUMS; r++) {
+                                               for (uint8_t s = 0; s < NUM_PART_SUMS; s++) {
+                                                       if (2*r*(16-2*s) + (16-2*r)*2*s == sum_a8) {
+                                                               pthread_mutex_lock(&book_of_work_mutex);
+                                                               if (book_of_work[p][q][r][s] != TO_BE_DONE) {  // this has been done or is currently been done by another thread. Look for some other work.
+                                                                       pthread_mutex_unlock(&book_of_work_mutex);
+                                                                       continue;
+                                                               }
+
+                                                               pthread_mutex_lock(&statelist_cache_mutex);
+                                                               if (sl_cache[p][r][ODD_STATE].cache_status == WORK_IN_PROGRESS
+                                                                       || sl_cache[q][s][EVEN_STATE].cache_status == WORK_IN_PROGRESS) { // defer until not blocked by another thread.
+                                                                       pthread_mutex_unlock(&statelist_cache_mutex);
+                                                                       pthread_mutex_unlock(&book_of_work_mutex);
+                                                                       there_might_be_more_work = true;
+                                                                       continue;
+                                                               }
+
+                                                               // we finally can do some work.
+                                                               book_of_work[p][q][r][s] = WORK_IN_PROGRESS;
+                                                               statelist_t *current_candidates = add_more_candidates();
+
+                                                               // Check for cached results and add them first
+                                                               bool odd_completed = false;
+                                                               if (sl_cache[p][r][ODD_STATE].cache_status == COMPLETED) {
+                                                                       add_cached_states(current_candidates, 2*p, 2*r, ODD_STATE);
+                                                                       odd_completed = true;
+                                                               }
+                                                               bool even_completed = false;
+                                                               if (sl_cache[q][s][EVEN_STATE].cache_status == COMPLETED) {
+                                                                       add_cached_states(current_candidates, 2*q, 2*s, EVEN_STATE);
+                                                                       even_completed = true;
+                                                               }
+                                                               
+                                                               bool work_required = true;
+
+                                                               // if there had been two cached results, there is no more work to do
+                                                               if (even_completed && odd_completed) {
+                                                                       work_required = false;
+                                                               }       
+
+                                                               // if there had been one cached empty result, there is no need to calculate the other part:
+                                                               if (work_required) {
+                                                                       if (even_completed && !current_candidates->len[EVEN_STATE]) {
+                                                                               current_candidates->len[ODD_STATE] = 0;
+                                                                               current_candidates->states[ODD_STATE] = NULL;
+                                                                               work_required = false;
+                                                                       }                                                                       
+                                                                       if (odd_completed && !current_candidates->len[ODD_STATE]) {
+                                                                               current_candidates->len[EVEN_STATE] = 0;
+                                                                               current_candidates->states[EVEN_STATE] = NULL;
+                                                                               work_required = false;
+                                                                       }
+                                                               }
+
+                                                               if (!work_required) {
+                                                                       pthread_mutex_unlock(&statelist_cache_mutex);
+                                                                       pthread_mutex_unlock(&book_of_work_mutex);
+                                                               } else {
+                                                                       // we really need to calculate something
+                                                                       if (even_completed) { // we had one cache hit with non-zero even states
+                                                                               // printf("Thread #%u: start working on  odd states p=%2d, r=%2d...\n", my_thread_number, p, r);
+                                                                               sl_cache[p][r][ODD_STATE].cache_status = WORK_IN_PROGRESS;
+                                                                               pthread_mutex_unlock(&statelist_cache_mutex);
+                                                                               pthread_mutex_unlock(&book_of_work_mutex);
+                                                                               add_matching_states(current_candidates, 2*p, 2*r, ODD_STATE);
+                                                                               work_required = false;
+                                                                       } else if (odd_completed) { // we had one cache hit with non-zero odd_states
+                                                                               // printf("Thread #%u: start working on even states q=%2d, s=%2d...\n", my_thread_number, q, s);
+                                                                               sl_cache[q][s][EVEN_STATE].cache_status = WORK_IN_PROGRESS;
+                                                                               pthread_mutex_unlock(&statelist_cache_mutex);
+                                                                               pthread_mutex_unlock(&book_of_work_mutex);
+                                                                               add_matching_states(current_candidates, 2*q, 2*s, EVEN_STATE);
+                                                                               work_required = false;
+                                                                       }
+                                                               }
+                                                                       
+                                                               if (work_required) { // we had no cached result. Need to calculate both odd and even
+                                                                       sl_cache[p][r][ODD_STATE].cache_status = WORK_IN_PROGRESS;
+                                                                       sl_cache[q][s][EVEN_STATE].cache_status = WORK_IN_PROGRESS;
+                                                                       pthread_mutex_unlock(&statelist_cache_mutex);
+                                                                       pthread_mutex_unlock(&book_of_work_mutex);
+
+                                                                       add_matching_states(current_candidates, 2*p, 2*r, ODD_STATE);
+                                                                       if(current_candidates->len[ODD_STATE]) {
+                                                                               // printf("Thread #%u: start working on even states q=%2d, s=%2d...\n", my_thread_number, q, s);
+                                                                               add_matching_states(current_candidates, 2*q, 2*s, EVEN_STATE);
+                                                                       } else { // no need to calculate even states yet
+                                                                               pthread_mutex_lock(&statelist_cache_mutex);
+                                                                               sl_cache[q][s][EVEN_STATE].cache_status = TO_BE_DONE;
+                                                                               pthread_mutex_unlock(&statelist_cache_mutex);
+                                                                               current_candidates->len[EVEN_STATE] = 0;
+                                                                               current_candidates->states[EVEN_STATE] = NULL;
+                                                                       }
+                                                               }
+
+                                                               // update book of work
+                                                               pthread_mutex_lock(&book_of_work_mutex);
+                                                               book_of_work[p][q][r][s] = COMPLETED;
+                                                               pthread_mutex_unlock(&book_of_work_mutex);
+
+                                                               // if ((uint64_t)current_candidates->len[ODD_STATE] * current_candidates->len[EVEN_STATE]) {
+                                                                       // printf("Candidates for p=%2u, q=%2u, r=%2u, s=%2u: %" PRIu32 " * %" PRIu32 " = %" PRIu64 " (2^%0.1f)\n",
+                                                                               // 2*p, 2*q, 2*r, 2*s, current_candidates->len[ODD_STATE], current_candidates->len[EVEN_STATE],
+                                                                               // (uint64_t)current_candidates->len[ODD_STATE] * current_candidates->len[EVEN_STATE],
+                                                                               // log((uint64_t)current_candidates->len[ODD_STATE] * current_candidates->len[EVEN_STATE])/log(2));
+                                                                       // uint32_t estimated_odd = estimated_num_states_part_sum(best_first_bytes[0], p, r, ODD_STATE);
+                                                                       // uint32_t estimated_even= estimated_num_states_part_sum(best_first_bytes[0], q, s, EVEN_STATE);
+                                                                       // uint64_t estimated_total = (uint64_t)estimated_odd * estimated_even; 
+                                                                       // printf("Estimated: %" PRIu32 " * %" PRIu32 " = %" PRIu64 " (2^%0.1f)\n", estimated_odd, estimated_even, estimated_total, log(estimated_total) / log(2));
+                                                                       // if (estimated_odd < current_candidates->len[ODD_STATE] || estimated_even < current_candidates->len[EVEN_STATE]) {
+                                                                               // printf("############################################################################ERROR! ESTIMATED < REAL !!!\n"); 
+                                                                               // //exit(2);
+                                                                               // }
+                                                               // }
+                                                       }
+                                               }
+                                       }
+                               }
+                       }
+               }
+       } while (there_might_be_more_work);
+       
+       return NULL;
+}
+
+
+static void generate_candidates(uint8_t sum_a0_idx, uint8_t sum_a8_idx)
+{
+       // printf("Generating crypto1 state candidates... \n");
+       
+       // estimate maximum candidate states
+       // maximum_states = 0;
+       // for (uint16_t sum_odd = 0; sum_odd <= 16; sum_odd += 2) {
+               // for (uint16_t sum_even = 0; sum_even <= 16; sum_even += 2) {
+                       // if (sum_odd*(16-sum_even) + (16-sum_odd)*sum_even == sum_a0) {
+                               // maximum_states += (uint64_t)count_states(part_sum_a0_bitarrays[EVEN_STATE][sum_even/2]) 
+                                                               // * count_states(part_sum_a0_bitarrays[ODD_STATE][sum_odd/2]);
+                       // }
+               // }
+       // }
+       // printf("Number of possible keys with Sum(a0) = %d: %" PRIu64 " (2^%1.1f)\n", sum_a0, maximum_states, log(maximum_states)/log(2.0));
+       
+       init_statelist_cache();
+       init_book_of_work();
+
+       // create mutexes for accessing the statelist cache and our "book of work"
+       pthread_mutex_init(&statelist_cache_mutex, NULL);
+       pthread_mutex_init(&book_of_work_mutex, NULL);
+
+       // create and run worker threads
+       pthread_t thread_id[NUM_REDUCTION_WORKING_THREADS];
+               
+       uint16_t sums[NUM_REDUCTION_WORKING_THREADS][3];
+       for (uint16_t i = 0; i < NUM_REDUCTION_WORKING_THREADS; i++) {
+               sums[i][0] = sum_a0_idx;
+               sums[i][1] = sum_a8_idx;
+               sums[i][2] = i+1;
+               pthread_create(thread_id + i, NULL, generate_candidates_worker_thread, sums[i]);
+       }
+       
+       // wait for threads to terminate:
+       for (uint16_t i = 0; i < NUM_REDUCTION_WORKING_THREADS; i++) {
+               pthread_join(thread_id[i], NULL);
+       }
+
+       // clean up mutex
+       pthread_mutex_destroy(&statelist_cache_mutex);
+       
+       maximum_states = 0;
+       for (statelist_t *sl = candidates; sl != NULL; sl = sl->next) {
+               maximum_states += (uint64_t)sl->len[ODD_STATE] * sl->len[EVEN_STATE];
+       }
+
+       for (uint8_t i = 0; i < NUM_SUMS; i++) {
+               if (nonces[best_first_bytes[0]].sum_a8_guess[i].sum_a8_idx == sum_a8_idx) {
+                       nonces[best_first_bytes[0]].sum_a8_guess[i].num_states = maximum_states;
+                       break;
+               }
+       }
+       update_expected_brute_force(best_first_bytes[0]);
+
+       hardnested_print_progress(num_acquired_nonces, "Apply Sum(a8) and all bytes bitflip properties", nonces[best_first_bytes[0]].expected_num_brute_force, 0);
+}
+
+
+static void    free_candidates_memory(statelist_t *sl)
+{
+       if (sl == NULL) {
+               return;
+       } else {
+               free_candidates_memory(sl->next);
+               free(sl);
+       }
+}
+
+
+static void pre_XOR_nonces(void)
+{
+       // prepare acquired nonces for faster brute forcing. 
+       
+       // XOR the cryptoUID and its parity
+       for (uint16_t i = 0; i < 256; i++) {
+               noncelistentry_t *test_nonce = nonces[i].first;
+               while (test_nonce != NULL) {
+                       test_nonce->nonce_enc ^= cuid;
+                       test_nonce->par_enc ^= oddparity8(cuid >>  0 & 0xff) << 0;
+                       test_nonce->par_enc ^= oddparity8(cuid >>  8 & 0xff) << 1;
+                       test_nonce->par_enc ^= oddparity8(cuid >> 16 & 0xff) << 2;
+                       test_nonce->par_enc ^= oddparity8(cuid >> 24 & 0xff) << 3;
+                       test_nonce = test_nonce->next;
+               }
+       }
+}
+       
+
+static bool brute_force(void)
+{
+       if (known_target_key != -1) {
+               TestIfKeyExists(known_target_key);
+       }
+       return brute_force_bs(NULL, candidates, cuid, num_acquired_nonces, maximum_states, nonces, best_first_bytes);
+}
+
+
+static uint16_t SumProperty(struct Crypto1State *s)
+{
+       uint16_t sum_odd = PartialSumProperty(s->odd, ODD_STATE);
+       uint16_t sum_even = PartialSumProperty(s->even, EVEN_STATE);
+       return (sum_odd*(16-sum_even) + (16-sum_odd)*sum_even);
+}
+
+
+static void Tests()
+{
+
+/*     #define NUM_STATISTICS 100000
+       uint32_t statistics_odd[17];
+       uint64_t statistics[257];
+       uint32_t statistics_even[17];
+       struct Crypto1State cs;
+       uint64_t time1 = msclock();
+
+       for (uint16_t i = 0; i < 257; i++) {
+               statistics[i] = 0;
+       }
+       for (uint16_t i = 0; i < 17; i++) {
+               statistics_odd[i] = 0;
+               statistics_even[i] = 0;
+       }
+       
+       for (uint64_t i = 0; i < NUM_STATISTICS; i++) {
+               cs.odd = (rand() & 0xfff) << 12 | (rand() & 0xfff);
+               cs.even = (rand() & 0xfff) << 12 | (rand() & 0xfff);
+               uint16_t sum_property = SumProperty(&cs);
+               statistics[sum_property] += 1;
+               sum_property = PartialSumProperty(cs.even, EVEN_STATE);
+               statistics_even[sum_property]++;
+               sum_property = PartialSumProperty(cs.odd, ODD_STATE);
+               statistics_odd[sum_property]++;
+               if (i%(NUM_STATISTICS/100) == 0) printf("."); 
+       }
+       
+       printf("\nTests: Calculated %d Sum properties in %0.3f seconds (%0.0f calcs/second)\n", NUM_STATISTICS, ((float)msclock() - time1)/1000.0, NUM_STATISTICS/((float)msclock() - time1)*1000.0);
+       for (uint16_t i = 0; i < 257; i++) {
+               if (statistics[i] != 0) {
+                       printf("probability[%3d] = %0.5f\n", i, (float)statistics[i]/NUM_STATISTICS);
+               }
+       }
+       for (uint16_t i = 0; i <= 16; i++) {
+               if (statistics_odd[i] != 0) {
+                       printf("probability odd [%2d] = %0.5f\n", i, (float)statistics_odd[i]/NUM_STATISTICS);
+               }
+       }
+       for (uint16_t i = 0; i <= 16; i++) {
+               if (statistics_odd[i] != 0) {
+                       printf("probability even [%2d] = %0.5f\n", i, (float)statistics_even[i]/NUM_STATISTICS);
+               }
+       }
+ */
+
+/*     #define NUM_STATISTICS 100000000LL
+       uint64_t statistics_a0[257];
+       uint64_t statistics_a8[257][257];
+       struct Crypto1State cs;
+       uint64_t time1 = msclock();
+
+       for (uint16_t i = 0; i < 257; i++) {
+               statistics_a0[i] = 0;
+               for (uint16_t j = 0; j < 257; j++) {
+                       statistics_a8[i][j] = 0;
+               }
+       }
+       
+       for (uint64_t i = 0; i < NUM_STATISTICS; i++) {
+               cs.odd = (rand() & 0xfff) << 12 | (rand() & 0xfff);
+               cs.even = (rand() & 0xfff) << 12 | (rand() & 0xfff);
+               uint16_t sum_property_a0 = SumProperty(&cs);
+               statistics_a0[sum_property_a0]++;
+               uint8_t first_byte = rand() & 0xff;
+               crypto1_byte(&cs, first_byte, true);
+               uint16_t sum_property_a8 = SumProperty(&cs);
+               statistics_a8[sum_property_a0][sum_property_a8] += 1;
+               if (i%(NUM_STATISTICS/100) == 0) printf("."); 
+       }
+       
+       printf("\nTests: Probability Distribution of a8 depending on a0:\n");
+       printf("\n      ");
+       for (uint16_t i = 0; i < NUM_SUMS; i++) {
+               printf("%7d ", sums[i]);
+       }
+       printf("\n-------------------------------------------------------------------------------------------------------------------------------------------\n");
+       printf("a0:   ");
+       for (uint16_t i = 0; i < NUM_SUMS; i++) {
+               printf("%7.5f ", (float)statistics_a0[sums[i]] / NUM_STATISTICS);
+       }
+       printf("\n");
+       for (uint16_t i = 0; i < NUM_SUMS; i++) {
+               printf("%3d   ", sums[i]);
+               for (uint16_t j = 0; j < NUM_SUMS; j++) {
+                       printf("%7.5f ", (float)statistics_a8[sums[i]][sums[j]] / statistics_a0[sums[i]]);
+                       }
+               printf("\n");
+       }
+       printf("\nTests: Calculated %"lld" Sum properties in %0.3f seconds (%0.0f calcs/second)\n", NUM_STATISTICS, ((float)msclock() - time1)/1000.0, NUM_STATISTICS/((float)msclock() - time1)*1000.0);
+ */            
+/*     #define NUM_STATISTICS 100000LL
+       uint64_t statistics_a8[257];
+       struct Crypto1State cs;
+       uint64_t time1 = msclock();
+
+       printf("\nTests: Probability Distribution of a8 depending on first byte:\n");
+       printf("\n      ");
+       for (uint16_t i = 0; i < NUM_SUMS; i++) {
+               printf("%7d ", sums[i]);
+       }
+       printf("\n-------------------------------------------------------------------------------------------------------------------------------------------\n");
+       for (uint16_t first_byte = 0; first_byte < 256; first_byte++) {
+               for (uint16_t i = 0; i < 257; i++) {
+                       statistics_a8[i] = 0;
+               }
+               for (uint64_t i = 0; i < NUM_STATISTICS; i++) {
+                       cs.odd = (rand() & 0xfff) << 12 | (rand() & 0xfff);
+                       cs.even = (rand() & 0xfff) << 12 | (rand() & 0xfff);
+                       crypto1_byte(&cs, first_byte, true);
+                       uint16_t sum_property_a8 = SumProperty(&cs);
+                       statistics_a8[sum_property_a8] += 1;
+               }
+               printf("%03x   ", first_byte);
+               for (uint16_t j = 0; j < NUM_SUMS; j++) {
+                       printf("%7.5f ", (float)statistics_a8[sums[j]] / NUM_STATISTICS);
+               }
+               printf("\n");
+       }
+       printf("\nTests: Calculated %"lld" Sum properties in %0.3f seconds (%0.0f calcs/second)\n", NUM_STATISTICS, ((float)msclock() - time1)/1000.0, NUM_STATISTICS/((float)msclock() - time1)*1000.0);
+*/     
+
+/*     printf("Tests: Sum Probabilities based on Partial Sums\n");
+       for (uint16_t i = 0; i < 257; i++) {
+               statistics[i] = 0;
+       }
+       uint64_t num_states = 0;
+       for (uint16_t oddsum = 0; oddsum <= 16; oddsum += 2) {
+               for (uint16_t evensum = 0; evensum <= 16; evensum += 2) {
+                       uint16_t sum = oddsum*(16-evensum) + (16-oddsum)*evensum;
+                       statistics[sum] += (uint64_t)partial_statelist[oddsum].len[ODD_STATE] * partial_statelist[evensum].len[EVEN_STATE] * (1<<8);
+                       num_states += (uint64_t)partial_statelist[oddsum].len[ODD_STATE] * partial_statelist[evensum].len[EVEN_STATE] * (1<<8);
+               }
+       }
+       printf("num_states = %"lld", expected %"lld"\n", num_states, (1LL<<48));
+       for (uint16_t i = 0; i < 257; i++) {
+               if (statistics[i] != 0) {
+                       printf("probability[%3d] = %0.5f\n", i, (float)statistics[i]/num_states);
+               }
+       }
+ */    
+
+/*     struct Crypto1State *pcs;
+       pcs = crypto1_create(0xffffffffffff);
+       printf("\nTests: for key = 0xffffffffffff:\nSum(a0) = %d\nodd_state =  0x%06x\neven_state = 0x%06x\n", 
+               SumProperty(pcs), pcs->odd & 0x00ffffff, pcs->even & 0x00ffffff);
+       crypto1_byte(pcs, (cuid >> 24) ^ best_first_bytes[0], true);
+       printf("After adding best first byte 0x%02x:\nSum(a8) = %d\nodd_state =  0x%06x\neven_state = 0x%06x\n",
+               best_first_bytes[0],
+               SumProperty(pcs),
+               pcs->odd & 0x00ffffff, pcs->even & 0x00ffffff);
+       //test_state_odd = pcs->odd & 0x00ffffff;
+       //test_state_even = pcs->even & 0x00ffffff;
+       crypto1_destroy(pcs);
+       pcs = crypto1_create(0xa0a1a2a3a4a5);
+       printf("Tests: for key = 0xa0a1a2a3a4a5:\nSum(a0) = %d\nodd_state =  0x%06x\neven_state = 0x%06x\n",
+               SumProperty(pcs), pcs->odd & 0x00ffffff, pcs->even & 0x00ffffff);
+       crypto1_byte(pcs, (cuid >> 24) ^ best_first_bytes[0], true);
+       printf("After adding best first byte 0x%02x:\nSum(a8) = %d\nodd_state =  0x%06x\neven_state = 0x%06x\n",
+               best_first_bytes[0],
+               SumProperty(pcs),
+               pcs->odd & 0x00ffffff, pcs->even & 0x00ffffff);
+       //test_state_odd = pcs->odd & 0x00ffffff;
+       //test_state_even = pcs->even & 0x00ffffff;
+       crypto1_destroy(pcs);
+       pcs = crypto1_create(0xa6b9aa97b955);
+       printf("Tests: for key = 0xa6b9aa97b955:\nSum(a0) = %d\nodd_state =  0x%06x\neven_state = 0x%06x\n",
+               SumProperty(pcs), pcs->odd & 0x00ffffff, pcs->even & 0x00ffffff);
+       crypto1_byte(pcs, (cuid >> 24) ^ best_first_bytes[0], true);
+       printf("After adding best first byte 0x%02x:\nSum(a8) = %d\nodd_state =  0x%06x\neven_state = 0x%06x\n",
+               best_first_bytes[0],
+               SumProperty(pcs),
+               pcs->odd & 0x00ffffff, pcs->even & 0x00ffffff);
+       test_state_odd = pcs->odd & 0x00ffffff;
+       test_state_even = pcs->even & 0x00ffffff;
+       crypto1_destroy(pcs);
+ */
+
+       // printf("\nTests: Sorted First Bytes:\n");
+       // for (uint16_t i = 0; i < 20; i++) {
+               // uint8_t best_byte = best_first_bytes[i];
+               // //printf("#%03d Byte: %02x, n = %3d, k = %3d, Sum(a8): %3d, Confidence: %5.1f%%\n", 
+               // printf("#%03d Byte: %02x, n = %3d, k = %3d, Sum(a8) = ", i, best_byte, nonces[best_byte].num, nonces[best_byte].Sum);
+               // for (uint16_t j = 0; j < 3; j++) {
+                       // printf("%3d @ %4.1f%%, ", sums[nonces[best_byte].sum_a8_guess[j].sum_a8_idx], nonces[best_byte].sum_a8_guess[j].prob * 100.0);
+               // }
+               // printf(" %12" PRIu64 ", %12" PRIu64 ", %12" PRIu64 ", exp_brute: %12.0f\n", 
+                       // nonces[best_byte].sum_a8_guess[0].num_states, 
+                       // nonces[best_byte].sum_a8_guess[1].num_states,
+                       // nonces[best_byte].sum_a8_guess[2].num_states,
+                       // nonces[best_byte].expected_num_brute_force);
+       // }
+
+       // printf("\nTests: Actual BitFlipProperties of best byte:\n");
+       // printf("[%02x]:", best_first_bytes[0]);
+       // for (uint16_t bitflip_idx = 0; bitflip_idx < num_all_effective_bitflips; bitflip_idx++) {
+               // uint16_t bitflip_prop = all_effective_bitflip[bitflip_idx];
+               // if (nonces[best_first_bytes[0]].BitFlips[bitflip_prop]) {
+                       // printf(" %03" PRIx16 , bitflip_prop);
+               // }
+       // }
+       // printf("\n");
+       
+       // printf("\nTests2: Actual BitFlipProperties of first_byte_smallest_bitarray:\n");
+       // printf("[%02x]:", best_first_byte_smallest_bitarray);
+       // for (uint16_t bitflip_idx = 0; bitflip_idx < num_all_effective_bitflips; bitflip_idx++) {
+               // uint16_t bitflip_prop = all_effective_bitflip[bitflip_idx];
+               // if (nonces[best_first_byte_smallest_bitarray].BitFlips[bitflip_prop]) {
+                       // printf(" %03" PRIx16 , bitflip_prop);
+               // }
+       // }
+       // printf("\n");
+
+       if (known_target_key != -1) {
+               for (odd_even_t odd_even = EVEN_STATE; odd_even <= ODD_STATE; odd_even++) {
+                       uint32_t *bitset = nonces[best_first_bytes[0]].states_bitarray[odd_even];
+                       if (!test_bit24(bitset, test_state[odd_even])) {
+                               printf("\nBUG: known target key's %s state is not member of first nonce byte's (0x%02x) states_bitarray!\n", 
+                                       odd_even==EVEN_STATE?"even":"odd ", 
+                                       best_first_bytes[0]);
+                       }
+               }
+       }
+
+       if (known_target_key != -1) {
+               for (odd_even_t odd_even = EVEN_STATE; odd_even <= ODD_STATE; odd_even++) {
+                       uint32_t *bitset = all_bitflips_bitarray[odd_even];
+                       if (!test_bit24(bitset, test_state[odd_even])) {
+                               printf("\nBUG: known target key's %s state is not member of all_bitflips_bitarray!\n", 
+                                       odd_even==EVEN_STATE?"even":"odd ");
+                       }
+               }
+       }       
+
+       // if (known_target_key != -1) {
+               // int16_t p = -1, q = -1, r = -1, s = -1;
+
+               // printf("\nTests: known target key is member of these partial sum_a0 bitsets:\n");
+               // for (odd_even_t odd_even = EVEN_STATE; odd_even <= ODD_STATE; odd_even++) {
+                       // printf("%s", odd_even==EVEN_STATE?"even:":"odd: ");
+                       // for (uint16_t i = 0; i < NUM_PART_SUMS; i++) {
+                               // uint32_t *bitset = part_sum_a0_bitarrays[odd_even][i];
+                               // if (test_bit24(bitset, test_state[odd_even])) {
+                                       // printf("%d ", i);
+                                       // if (odd_even == ODD_STATE) {
+                                               // p = 2*i;
+                                       // } else {
+                                               // q = 2*i;
+                                       // }
+                               // }
+                       // }
+                       // printf("\n");
+               // }
+
+               // printf("\nTests: known target key is member of these partial sum_a8 bitsets:\n");
+               // for (odd_even_t odd_even = EVEN_STATE; odd_even <= ODD_STATE; odd_even++) {
+                       // printf("%s", odd_even==EVEN_STATE?"even:":"odd: ");
+                       // for (uint16_t i = 0; i < NUM_PART_SUMS; i++) {
+                               // uint32_t *bitset = part_sum_a8_bitarrays[odd_even][i];
+                               // if (test_bit24(bitset, test_state[odd_even])) {
+                                       // printf("%d ", i);
+                                       // if (odd_even == ODD_STATE) {
+                                               // r = 2*i;
+                                       // } else {
+                                               // s = 2*i;
+                                       // }
+                               // }
+                       // }
+                       // printf("\n");
+               // }
+
+               // printf("Sum(a0) = p*(16-q) + (16-p)*q = %d*(16-%d) + (16-%d)*%d = %d\n", p, q, p, q, p*(16-q)+(16-p)*q);
+               // printf("Sum(a8) = r*(16-s) + (16-r)*s = %d*(16-%d) + (16-%d)*%d = %d\n", r, s, r, s, r*(16-s)+(16-r)*s);
+       // }
+       
+       /*      printf("\nTests: parity performance\n");
+       uint64_t time1p = msclock();
+       uint32_t par_sum = 0;
+       for (uint32_t i = 0; i < 100000000; i++) {
+               par_sum += parity(i);
+       }
+       printf("parsum oldparity = %d, time = %1.5fsec\n", par_sum, (float)(msclock() - time1p)/1000.0);
+
+       time1p = msclock();
+       par_sum = 0;
+       for (uint32_t i = 0; i < 100000000; i++) {
+               par_sum += evenparity32(i);
+       }
+       printf("parsum newparity = %d, time = %1.5fsec\n", par_sum, (float)(msclock() - time1p)/1000.0);
+ */
+
+}
+
+
+static void Tests2(void) 
+{
+       if (known_target_key != -1) {
+               for (odd_even_t odd_even = EVEN_STATE; odd_even <= ODD_STATE; odd_even++) {
+                       uint32_t *bitset = nonces[best_first_byte_smallest_bitarray].states_bitarray[odd_even];
+                       if (!test_bit24(bitset, test_state[odd_even])) {
+                               printf("\nBUG: known target key's %s state is not member of first nonce byte's (0x%02x) states_bitarray!\n",
+                                       odd_even==EVEN_STATE?"even":"odd ", 
+                                       best_first_byte_smallest_bitarray);
+                       }
+               }
+       }       
+
+       if (known_target_key != -1) {
+               for (odd_even_t odd_even = EVEN_STATE; odd_even <= ODD_STATE; odd_even++) {
+                       uint32_t *bitset = all_bitflips_bitarray[odd_even];
+                       if (!test_bit24(bitset, test_state[odd_even])) {
+                               printf("\nBUG: known target key's %s state is not member of all_bitflips_bitarray!\n", 
+                                       odd_even==EVEN_STATE?"even":"odd ");
+                       }
+               }
+       }       
+       
+}
+
+
+static uint16_t real_sum_a8 = 0;
+
+static void set_test_state(uint8_t byte) 
+{
+       struct Crypto1State *pcs;
+       pcs = crypto1_create(known_target_key);
+       crypto1_byte(pcs, (cuid >> 24) ^ byte, true);
+       test_state[ODD_STATE] = pcs->odd & 0x00ffffff;
+       test_state[EVEN_STATE] = pcs->even & 0x00ffffff;
+       real_sum_a8 = SumProperty(pcs);
+       crypto1_destroy(pcs);
+}
+
+
+int mfnestedhard(uint8_t blockNo, uint8_t keyType, uint8_t *key, uint8_t trgBlockNo, uint8_t trgKeyType, uint8_t *trgkey, bool nonce_file_read, bool nonce_file_write, bool slow, int tests) 
+{
+       char progress_text[80];
+
+       srand((unsigned) time(NULL));
+       brute_force_per_second = brute_force_benchmark();
+       write_stats = false;
+
+       if (tests) {
+               // set the correct locale for the stats printing
+               write_stats = true;
+               setlocale(LC_NUMERIC, "");
+               if ((fstats = fopen("hardnested_stats.txt","a")) == NULL) { 
+                       PrintAndLog("Could not create/open file hardnested_stats.txt");
+                       return 3;
+               }
+               for (uint32_t i = 0; i < tests; i++) {
+                       start_time = msclock();
+                       print_progress_header();
+                       sprintf(progress_text, "Brute force benchmark: %1.0f million (2^%1.1f) keys/s", brute_force_per_second/1000000, log(brute_force_per_second)/log(2.0));
+                       hardnested_print_progress(0, progress_text, (float)(1LL<<47), 0);
+                       sprintf(progress_text, "Starting Test #%" PRIu32 " ...", i+1);
+                       hardnested_print_progress(0, progress_text, (float)(1LL<<47), 0);
+                       if (trgkey != NULL) {
+                               known_target_key = bytes_to_num(trgkey, 6);
+                       } else {
+                               known_target_key = -1;
+                       }
+
+                       init_bitflip_bitarrays();
+                       init_part_sum_bitarrays();
+                       init_sum_bitarrays();
+                       init_allbitflips_array();
+                       init_nonce_memory();
+                       update_reduction_rate(0.0, true);
+                       
+                       simulate_acquire_nonces();
+
+                       set_test_state(best_first_bytes[0]);
+
+                       Tests();
+                       free_bitflip_bitarrays();
+
+                       fprintf(fstats, "%" PRIu16 ";%1.1f;", sums[first_byte_Sum], log(p_K0[first_byte_Sum])/log(2.0));
+                       fprintf(fstats, "%" PRIu16 ";%1.1f;", sums[nonces[best_first_bytes[0]].sum_a8_guess[0].sum_a8_idx], log(p_K[nonces[best_first_bytes[0]].sum_a8_guess[0].sum_a8_idx])/log(2.0));
+                       fprintf(fstats, "%" PRIu16 ";", real_sum_a8);
+       
+#ifdef DEBUG_KEY_ELIMINATION
+                       failstr[0] = '\0';
+#endif
+                       bool key_found = false;
+                       num_keys_tested = 0;
+                       uint32_t num_odd = nonces[best_first_byte_smallest_bitarray].num_states_bitarray[ODD_STATE];
+                       uint32_t num_even = nonces[best_first_byte_smallest_bitarray].num_states_bitarray[EVEN_STATE];
+                       float expected_brute_force1 = (float)num_odd * num_even / 2.0;
+                       float expected_brute_force2 = nonces[best_first_bytes[0]].expected_num_brute_force;
+                       fprintf(fstats, "%1.1f;%1.1f;", log(expected_brute_force1)/log(2.0), log(expected_brute_force2)/log(2.0));
+                       if (expected_brute_force1 < expected_brute_force2) {
+                               hardnested_print_progress(num_acquired_nonces, "(Ignoring Sum(a8) properties)", expected_brute_force1, 0);
+                               set_test_state(best_first_byte_smallest_bitarray);
+                               add_bitflip_candidates(best_first_byte_smallest_bitarray);
+                               Tests2();
+                               maximum_states = 0;
+                               for (statelist_t *sl = candidates; sl != NULL; sl = sl->next) {
+                                       maximum_states += (uint64_t)sl->len[ODD_STATE] * sl->len[EVEN_STATE];
+                               }
+                               //printf("Number of remaining possible keys: %" PRIu64 " (2^%1.1f)\n", maximum_states, log(maximum_states)/log(2.0));
+                               // fprintf("fstats, "%" PRIu64 ";", maximum_states);
+                               best_first_bytes[0] = best_first_byte_smallest_bitarray;
+                               pre_XOR_nonces();
+                               prepare_bf_test_nonces(nonces, best_first_bytes[0]);
+                               key_found = brute_force();
+                               free(candidates->states[ODD_STATE]);
+                               free(candidates->states[EVEN_STATE]);
+                               free_candidates_memory(candidates);
+                               candidates = NULL;
+                       } else {
+                               pre_XOR_nonces();
+                               prepare_bf_test_nonces(nonces, best_first_bytes[0]);
+                               for (uint8_t j = 0; j < NUM_SUMS && !key_found; j++) {
+                                       float expected_brute_force = nonces[best_first_bytes[0]].expected_num_brute_force;
+                                       sprintf(progress_text, "(%d. guess: Sum(a8) = %" PRIu16 ")", j+1, sums[nonces[best_first_bytes[0]].sum_a8_guess[j].sum_a8_idx]);
+                                       hardnested_print_progress(num_acquired_nonces, progress_text, expected_brute_force, 0); 
+                                       if (sums[nonces[best_first_bytes[0]].sum_a8_guess[j].sum_a8_idx] != real_sum_a8) {
+                                               sprintf(progress_text, "(Estimated Sum(a8) is WRONG! Correct Sum(a8) = %" PRIu16 ")", real_sum_a8);
+                                               hardnested_print_progress(num_acquired_nonces, progress_text, expected_brute_force, 0);
+                                       }
+                                       // printf("Estimated remaining states: %" PRIu64 " (2^%1.1f)\n", nonces[best_first_bytes[0]].sum_a8_guess[j].num_states, log(nonces[best_first_bytes[0]].sum_a8_guess[j].num_states)/log(2.0));
+                                       generate_candidates(first_byte_Sum, nonces[best_first_bytes[0]].sum_a8_guess[j].sum_a8_idx);
+                                       // printf("Time for generating key candidates list: %1.0f sec (%1.1f sec CPU)\n", difftime(time(NULL), start_time), (float)(msclock() - start_clock)/1000.0);
+                                       key_found = brute_force();
+                                       free_statelist_cache();
+                                       free_candidates_memory(candidates);
+                                       candidates = NULL;
+                                       if (!key_found) {
+                                               // update the statistics
+                                               nonces[best_first_bytes[0]].sum_a8_guess[j].prob = 0;
+                                               nonces[best_first_bytes[0]].sum_a8_guess[j].num_states = 0;
+                                               // and calculate new expected number of brute forces
+                                               update_expected_brute_force(best_first_bytes[0]);
+                                       }
+                               }
+                       }
+                       #ifdef DEBUG_KEY_ELIMINATION
+                       fprintf(fstats, "%1.1f;%1.0f;%d;%s\n", log(num_keys_tested)/log(2.0), (float)num_keys_tested/brute_force_per_second, key_found, failstr);
+                       #else
+                       fprintf(fstats, "%1.0f;%d\n", log(num_keys_tested)/log(2.0), (float)num_keys_tested/brute_force_per_second, key_found);
+                       #endif
+                       
+                       free_nonces_memory();
+                       free_bitarray(all_bitflips_bitarray[ODD_STATE]);
+                       free_bitarray(all_bitflips_bitarray[EVEN_STATE]);
+                       free_sum_bitarrays();
+                       free_part_sum_bitarrays();
+               }
+               fclose(fstats);
+       } else {
+               start_time = msclock();
+               print_progress_header();
+               sprintf(progress_text, "Brute force benchmark: %1.0f million (2^%1.1f) keys/s", brute_force_per_second/1000000, log(brute_force_per_second)/log(2.0));
+               hardnested_print_progress(0, progress_text, (float)(1LL<<47), 0);
+               init_bitflip_bitarrays();
+               init_part_sum_bitarrays();
+               init_sum_bitarrays();
+               init_allbitflips_array();
+               init_nonce_memory();
+               update_reduction_rate(0.0, true);
+
+               if (nonce_file_read) {          // use pre-acquired data from file nonces.bin
+                       if (read_nonce_file() != 0) {
+                               return 3;
+                       }
+                       hardnested_stage = CHECK_1ST_BYTES | CHECK_2ND_BYTES;
+                       update_nonce_data(false);
+                       float brute_force;
+                       shrink_key_space(&brute_force);
+               } else {                                        // acquire nonces.
+                       uint16_t is_OK = acquire_nonces(blockNo, keyType, key, trgBlockNo, trgKeyType, nonce_file_write, slow);
+                       if (is_OK != 0) {
+                               return is_OK;
+                       }
+               }
+
+               if (trgkey != NULL) {
+                       known_target_key = bytes_to_num(trgkey, 6);
+                       set_test_state(best_first_bytes[0]);
+               } else {
+                       known_target_key = -1;
+               }
+               
+               Tests();
+
+               free_bitflip_bitarrays();
+               bool key_found = false;
+               num_keys_tested = 0;
+               uint32_t num_odd = nonces[best_first_byte_smallest_bitarray].num_states_bitarray[ODD_STATE];
+               uint32_t num_even = nonces[best_first_byte_smallest_bitarray].num_states_bitarray[EVEN_STATE];
+               float expected_brute_force1 = (float)num_odd * num_even / 2.0;
+               float expected_brute_force2 = nonces[best_first_bytes[0]].expected_num_brute_force;
+               if (expected_brute_force1 < expected_brute_force2) {
+                       hardnested_print_progress(num_acquired_nonces, "(Ignoring Sum(a8) properties)", expected_brute_force1, 0);
+                       set_test_state(best_first_byte_smallest_bitarray);
+                       add_bitflip_candidates(best_first_byte_smallest_bitarray);
+                       Tests2();
+                       maximum_states = 0;
+                       for (statelist_t *sl = candidates; sl != NULL; sl = sl->next) {
+                               maximum_states += (uint64_t)sl->len[ODD_STATE] * sl->len[EVEN_STATE];
+                       }
+                       printf("Number of remaining possible keys: %" PRIu64 " (2^%1.1f)\n", maximum_states, log(maximum_states)/log(2.0));
+                       best_first_bytes[0] = best_first_byte_smallest_bitarray;
+                       pre_XOR_nonces();
+                       prepare_bf_test_nonces(nonces, best_first_bytes[0]);
+                       key_found = brute_force();
+                       free(candidates->states[ODD_STATE]);
+                       free(candidates->states[EVEN_STATE]);
+                       free_candidates_memory(candidates);
+                       candidates = NULL;
+               } else {
+                       pre_XOR_nonces();
+                       prepare_bf_test_nonces(nonces, best_first_bytes[0]);
+                       for (uint8_t j = 0; j < NUM_SUMS && !key_found; j++) {
+                               float expected_brute_force = nonces[best_first_bytes[0]].expected_num_brute_force;
+                               sprintf(progress_text, "(%d. guess: Sum(a8) = %" PRIu16 ")", j+1, sums[nonces[best_first_bytes[0]].sum_a8_guess[j].sum_a8_idx]);
+                               hardnested_print_progress(num_acquired_nonces, progress_text, expected_brute_force, 0); 
+                               if (trgkey != NULL && sums[nonces[best_first_bytes[0]].sum_a8_guess[j].sum_a8_idx] != real_sum_a8) {
+                                       sprintf(progress_text, "(Estimated Sum(a8) is WRONG! Correct Sum(a8) = %" PRIu16 ")", real_sum_a8);
+                                       hardnested_print_progress(num_acquired_nonces, progress_text, expected_brute_force, 0);
+                               }
+                               // printf("Estimated remaining states: %" PRIu64 " (2^%1.1f)\n", nonces[best_first_bytes[0]].sum_a8_guess[j].num_states, log(nonces[best_first_bytes[0]].sum_a8_guess[j].num_states)/log(2.0));
+                               generate_candidates(first_byte_Sum, nonces[best_first_bytes[0]].sum_a8_guess[j].sum_a8_idx);
+                               // printf("Time for generating key candidates list: %1.0f sec (%1.1f sec CPU)\n", difftime(time(NULL), start_time), (float)(msclock() - start_clock)/1000.0);
+                               key_found = brute_force();
+                               free_statelist_cache();
+                               free_candidates_memory(candidates);
+                               candidates = NULL;
+                               if (!key_found) {
+                                       // update the statistics
+                                       nonces[best_first_bytes[0]].sum_a8_guess[j].prob = 0;
+                                       nonces[best_first_bytes[0]].sum_a8_guess[j].num_states = 0;
+                                       // and calculate new expected number of brute forces
+                                       update_expected_brute_force(best_first_bytes[0]);
+                               }
+
+                       }
+               }
+               
+               free_nonces_memory();
+               free_bitarray(all_bitflips_bitarray[ODD_STATE]);
+               free_bitarray(all_bitflips_bitarray[EVEN_STATE]);
+               free_sum_bitarrays();
+               free_part_sum_bitarrays();
+       }
+
+       return 0;
+}
diff --git a/client/cmdhfmfhard.h b/client/cmdhfmfhard.h
new file mode 100644 (file)
index 0000000..bcbc3db
--- /dev/null
@@ -0,0 +1,48 @@
+//-----------------------------------------------------------------------------
+// Copyright (C) 2015 piwi
+//
+// This code is licensed to you under the terms of the GNU GPL, version 2 or,
+// at your option, any later version. See the LICENSE.txt file for the text of
+// the license.
+//-----------------------------------------------------------------------------
+// hf mf hardnested command
+//-----------------------------------------------------------------------------
+
+#ifndef CMDHFMFHARD_H__
+#define CMDHFMFHARD_H__
+
+#include <stdint.h>
+#include <stdbool.h>
+
+#define NUM_SUMS                                               19              // number of possible sum property values
+
+typedef struct guess_sum_a8 {
+       float prob;
+       uint64_t num_states;
+       uint8_t sum_a8_idx;
+} guess_sum_a8_t;
+
+typedef struct noncelistentry {
+       uint32_t nonce_enc;
+       uint8_t par_enc;
+       void *next;
+} noncelistentry_t;
+
+typedef struct noncelist {
+       uint16_t num;
+       uint16_t Sum;
+       guess_sum_a8_t sum_a8_guess[NUM_SUMS];
+       bool sum_a8_guess_dirty;
+       float expected_num_brute_force;
+       uint8_t BitFlips[0x400];
+       uint32_t *states_bitarray[2];
+       uint32_t num_states_bitarray[2];
+       bool all_bitflips_dirty[2];
+       noncelistentry_t *first;
+} noncelist_t;
+
+int mfnestedhard(uint8_t blockNo, uint8_t keyType, uint8_t *key, uint8_t trgBlockNo, uint8_t trgKeyType, uint8_t *trgkey, bool nonce_file_read, bool nonce_file_write, bool slow, int tests);
+void hardnested_print_progress(uint32_t nonces, char *activity, float brute_force, uint64_t min_diff_print_time);
+
+#endif
+
diff --git a/client/hardnested/bf_bench_data.bin b/client/hardnested/bf_bench_data.bin
new file mode 100644 (file)
index 0000000..30734da
Binary files /dev/null and b/client/hardnested/bf_bench_data.bin differ
diff --git a/client/hardnested/hardnested_bf_core.c b/client/hardnested/hardnested_bf_core.c
new file mode 100644 (file)
index 0000000..39a936d
--- /dev/null
@@ -0,0 +1,573 @@
+//-----------------------------------------------------------------------------
+// Copyright (C) 2016, 2017 by piwi
+//
+// This code is licensed to you under the terms of the GNU GPL, version 2 or,
+// at your option, any later version. See the LICENSE.txt file for the text of
+// the license.
+//-----------------------------------------------------------------------------
+// Implements a card only attack based on crypto text (encrypted nonces
+// received during a nested authentication) only. Unlike other card only
+// attacks this doesn't rely on implementation errors but only on the
+// inherent weaknesses of the crypto1 cypher. Described in
+//   Carlo Meijer, Roel Verdult, "Ciphertext-only Cryptanalysis on Hardened
+//   Mifare Classic Cards" in Proceedings of the 22nd ACM SIGSAC Conference on 
+//   Computer and Communications Security, 2015
+//-----------------------------------------------------------------------------
+//
+// brute forcing is based on @aczids bitsliced brute forcer
+// https://github.com/aczid/crypto1_bs with some modifications. Mainly:
+// - don't rollback. Start with 2nd byte of nonce instead
+// - reuse results of filter subfunctions
+// - reuse results of previous nonces if some first bits are identical
+// 
+//-----------------------------------------------------------------------------
+// aczid's Copyright notice:
+//
+// Bit-sliced Crypto-1 brute-forcing implementation
+// Builds on the data structures returned by CraptEV1 craptev1_get_space(nonces, threshold, uid)
+/*
+Copyright (c) 2015-2016 Aram Verstegen
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "hardnested_bf_core.h"
+
+#include <stdint.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <malloc.h>
+#include <string.h>
+#include "crapto1/crapto1.h"
+#include "parity.h"
+
+// bitslice type
+// while AVX supports 256 bit vector floating point operations, we need integer operations for boolean logic
+// same for AVX2 and 512 bit vectors
+// using larger vectors works but seems to generate more register pressure
+#if defined(__AVX512F__)
+#define MAX_BITSLICES 512
+#elif defined(__AVX2__)
+#define MAX_BITSLICES 256
+#elif defined(__AVX__)
+#define MAX_BITSLICES 128
+#elif defined(__SSE2__)
+#define MAX_BITSLICES 128
+#else // MMX or SSE
+#define MAX_BITSLICES 64
+#endif
+
+#define VECTOR_SIZE (MAX_BITSLICES/8)
+typedef unsigned int __attribute__((aligned(VECTOR_SIZE))) __attribute__((vector_size(VECTOR_SIZE))) bitslice_value_t;
+typedef union {
+        bitslice_value_t value;
+        uint64_t bytes64[MAX_BITSLICES/64];
+        uint8_t bytes[MAX_BITSLICES/8];
+} bitslice_t;
+
+// filter function (f20)
+// sourced from ``Wirelessly Pickpocketing a Mifare Classic Card'' by Flavio Garcia, Peter van Rossum, Roel Verdult and Ronny Wichers Schreur
+#define f20a(a,b,c,d) (((a|b)^(a&d))^(c&((a^b)|d)))
+#define f20b(a,b,c,d) (((a&b)|c)^((a^b)&(c|d)))
+#define f20c(a,b,c,d,e) ((a|((b|e)&(d^e)))^((a^(b&d))&((c^d)|(b&e))))
+
+// bit indexing
+#define get_bit(n, word) (((word) >> (n)) & 1)
+#define get_vector_bit(slice, value) get_bit((slice)&0x3f, value.bytes64[(slice)>>6])
+
+// size of crypto-1 state
+#define STATE_SIZE 48
+// size of nonce to be decrypted
+#define KEYSTREAM_SIZE 24
+
+// endianness conversion
+#define rev32(word) ((((word) & 0xff) << 24) | ((((word) >> 8) & 0xff) << 16) | ((((word) >> 16) & 0xff) << 8) | ((((word) >> 24) & 0xff)))
+
+// this needs to be compiled several times for each instruction set. 
+// For each instruction set, define a dedicated function name:
+#if defined (__AVX512F__)
+#define BITSLICE_TEST_NONCES bitslice_test_nonces_AVX512
+#define CRACK_STATES_BITSLICED crack_states_bitsliced_AVX512
+#elif defined (__AVX2__)
+#define BITSLICE_TEST_NONCES bitslice_test_nonces_AVX2
+#define CRACK_STATES_BITSLICED crack_states_bitsliced_AVX2
+#elif defined (__AVX__)
+#define BITSLICE_TEST_NONCES bitslice_test_nonces_AVX
+#define CRACK_STATES_BITSLICED crack_states_bitsliced_AVX
+#elif defined (__SSE2__)
+#define BITSLICE_TEST_NONCES bitslice_test_nonces_SSE2
+#define CRACK_STATES_BITSLICED crack_states_bitsliced_SSE2
+#elif defined (__MMX__) 
+#define BITSLICE_TEST_NONCES bitslice_test_nonces_MMX
+#define CRACK_STATES_BITSLICED crack_states_bitsliced_MMX
+#endif
+
+// typedefs and declaration of functions:
+typedef const uint64_t crack_states_bitsliced_t(uint32_t, uint8_t*, statelist_t*, uint32_t*, uint64_t*, uint32_t, uint8_t*, noncelist_t*);
+crack_states_bitsliced_t crack_states_bitsliced_AVX512;
+crack_states_bitsliced_t crack_states_bitsliced_AVX2;
+crack_states_bitsliced_t crack_states_bitsliced_AVX;
+crack_states_bitsliced_t crack_states_bitsliced_SSE2;
+crack_states_bitsliced_t crack_states_bitsliced_MMX;
+crack_states_bitsliced_t crack_states_bitsliced_dispatch;
+
+typedef void bitslice_test_nonces_t(uint32_t, uint32_t*, uint8_t*);
+bitslice_test_nonces_t bitslice_test_nonces_AVX512;
+bitslice_test_nonces_t bitslice_test_nonces_AVX2;
+bitslice_test_nonces_t bitslice_test_nonces_AVX;
+bitslice_test_nonces_t bitslice_test_nonces_SSE2;
+bitslice_test_nonces_t bitslice_test_nonces_MMX;
+bitslice_test_nonces_t bitslice_test_nonces_dispatch;
+
+#ifdef _WIN32
+#define malloc_bitslice(x) __builtin_assume_aligned(_aligned_malloc((x), MAX_BITSLICES/8), MAX_BITSLICES/8)
+#define free_bitslice(x) _aligned_free(x)
+#else
+#define malloc_bitslice(x) memalign(MAX_BITSLICES/8, (x))
+#define free_bitslice(x) free(x)
+#endif
+
+#if defined (__MMX__)          // (including more sophisticated instruction sets)
+typedef enum {
+       EVEN_STATE = 0,
+       ODD_STATE = 1
+} odd_even_t;
+
+
+// arrays of bitsliced states with identical values in all slices
+static bitslice_t bitsliced_encrypted_nonces[256][KEYSTREAM_SIZE];
+static bitslice_t bitsliced_encrypted_parity_bits[256][4];
+// 1 and 0 vectors
+static bitslice_t bs_ones;
+static bitslice_t bs_zeroes;
+
+
+void BITSLICE_TEST_NONCES(uint32_t nonces_to_bruteforce, uint32_t *bf_test_nonce, uint8_t *bf_test_nonce_par) {
+
+       // initialize 1 and 0 vectors
+    memset(bs_ones.bytes, 0xff, VECTOR_SIZE);
+    memset(bs_zeroes.bytes, 0x00, VECTOR_SIZE);
+
+       // bitslice nonces' 2nd to 4th byte
+       for (uint32_t i = 0; i < nonces_to_bruteforce; i++) {
+               for(uint32_t bit_idx = 0; bit_idx < KEYSTREAM_SIZE; bit_idx++){
+                       bool bit = get_bit(KEYSTREAM_SIZE-1-bit_idx, rev32(bf_test_nonce[i] << 8));
+                       if(bit){
+                               bitsliced_encrypted_nonces[i][bit_idx].value = bs_ones.value;
+                       } else {
+                               bitsliced_encrypted_nonces[i][bit_idx].value = bs_zeroes.value;
+                       }
+               }
+       }
+       // bitslice nonces' parity (4 bits)
+       for (uint32_t i = 0; i < nonces_to_bruteforce; i++) {
+               for(uint32_t bit_idx = 0; bit_idx < 4; bit_idx++){
+                       bool bit = get_bit(4-1-bit_idx, bf_test_nonce_par[i]);
+                       if(bit){
+                               bitsliced_encrypted_parity_bits[i][bit_idx].value = bs_ones.value;
+                       } else {
+                               bitsliced_encrypted_parity_bits[i][bit_idx].value = bs_zeroes.value;
+                       }
+               }
+       }
+
+}
+
+
+const uint64_t CRACK_STATES_BITSLICED(uint32_t cuid, uint8_t *best_first_bytes, statelist_t *p, uint32_t *keys_found, uint64_t *num_keys_tested, uint32_t nonces_to_bruteforce, uint8_t *bf_test_nonce_2nd_byte, noncelist_t *nonces){
+
+    // Unlike aczid's implementation this doesn't roll back at all when performing bitsliced bruteforce.
+       // We know that the best first byte is already shifted in. Testing with the remaining three bytes of 
+       // the nonces is sufficient to eliminate most of them. The small rest is tested with a simple unsliced
+       // brute forcing (including roll back).
+
+       bitslice_t states[KEYSTREAM_SIZE+STATE_SIZE];
+       bitslice_t * restrict state_p;
+    uint64_t key = -1;
+    uint64_t bucket_states_tested = 0;
+    uint32_t bucket_size[(p->len[EVEN_STATE] - 1)/MAX_BITSLICES + 1];
+    uint32_t bitsliced_blocks = 0;
+    uint32_t const *restrict p_even_end = p->states[EVEN_STATE] + p->len[EVEN_STATE];
+#if defined (DEBUG_BRUTE_FORCE)
+       uint32_t elimination_step = 0;
+       #define MAX_ELIMINATION_STEP    32
+       uint64_t keys_eliminated[MAX_ELIMINATION_STEP] = {0};
+#endif 
+#ifdef DEBUG_KEY_ELIMINATION
+       bool bucket_contains_test_key[(p->len[EVEN_STATE] - 1)/MAX_BITSLICES + 1];
+#endif
+
+       // constant ones/zeroes
+       bitslice_t bs_ones;
+    memset(bs_ones.bytes, 0xff, VECTOR_SIZE);
+       bitslice_t bs_zeroes;
+    memset(bs_zeroes.bytes, 0x00, VECTOR_SIZE);
+       
+    // bitslice all the even states
+    bitslice_t **restrict bitsliced_even_states = (bitslice_t **)malloc(((p->len[EVEN_STATE] - 1)/MAX_BITSLICES + 1) * sizeof(bitslice_t *));
+       if (bitsliced_even_states == NULL) {
+               printf("Out of memory error in brute_force. Aborting...");
+               exit(4);
+       }
+    bitslice_value_t *restrict bitsliced_even_feedback = malloc_bitslice(((p->len[EVEN_STATE] - 1)/MAX_BITSLICES + 1) * sizeof(bitslice_value_t));
+       if (bitsliced_even_feedback == NULL) {
+               printf("Out of memory error in brute_force. Aborting...");
+               exit(4);
+       }
+    for(uint32_t *restrict p_even = p->states[EVEN_STATE]; p_even < p_even_end; p_even += MAX_BITSLICES){
+        bitslice_t *restrict lstate_p = malloc_bitslice(STATE_SIZE/2*sizeof(bitslice_t));
+               if (lstate_p == NULL) {
+                       printf("Out of memory error in brute_force. Aborting... \n");
+                       exit(4);
+               }
+        memset(lstate_p, 0x00, STATE_SIZE/2*sizeof(bitslice_t)); // zero even bits
+        // bitslice even half-states
+        const uint32_t max_slices = (p_even_end-p_even) < MAX_BITSLICES ? p_even_end-p_even : MAX_BITSLICES;
+        bucket_size[bitsliced_blocks] = max_slices;
+#ifdef DEBUG_KEY_ELIMINATION
+               bucket_contains_test_key[bitsliced_blocks] = false;
+#endif
+               uint32_t slice_idx;
+        for(slice_idx = 0; slice_idx < max_slices; ++slice_idx){
+            uint32_t e = *(p_even+slice_idx);
+#ifdef DEBUG_KEY_ELIMINATION
+                       if (known_target_key != -1 && e == test_state[EVEN_STATE]) {
+                               bucket_contains_test_key[bitsliced_blocks] = true;
+                               // printf("bucket %d contains test key even state\n", bitsliced_blocks);
+                               // printf("in slice %d\n", slice_idx);
+                       }
+#endif
+            for(uint32_t bit_idx = 0; bit_idx < STATE_SIZE/2; bit_idx++, e >>= 1){
+                // set even bits
+                if(e&1){
+                    lstate_p[bit_idx].bytes64[slice_idx>>6] |= 1ull << (slice_idx & 0x3f);
+                }
+            }
+        }
+               // padding with last even state
+               for ( ; slice_idx < MAX_BITSLICES; ++slice_idx) {
+            uint32_t e = *(p_even_end-1);
+            for(uint32_t bit_idx = 0; bit_idx < STATE_SIZE/2; bit_idx++, e >>= 1){
+                // set even bits
+                if(e&1){
+                    lstate_p[bit_idx].bytes64[slice_idx>>6] |= 1ull << (slice_idx & 0x3f);
+                }
+            }
+               }                       
+        bitsliced_even_states[bitsliced_blocks] = lstate_p;
+               // bitsliced_even_feedback[bitsliced_blocks] = bs_ones;
+               bitsliced_even_feedback[bitsliced_blocks] = lstate_p[(47- 0)/2].value ^ 
+                                                    lstate_p[(47-10)/2].value ^ lstate_p[(47-12)/2].value ^ lstate_p[(47-14)/2].value ^
+                                                    lstate_p[(47-24)/2].value ^ lstate_p[(47-42)/2].value;
+               bitsliced_blocks++;
+    }
+    // bitslice every odd state to every block of even states
+    for(uint32_t const *restrict p_odd = p->states[ODD_STATE]; p_odd < p->states[ODD_STATE] + p->len[ODD_STATE]; ++p_odd){
+        // early abort
+        if(*keys_found){
+            goto out;
+        }
+               
+               // set odd state bits and pre-compute first keystream bit vector. This is the same for all blocks of even states
+               
+               state_p = &states[KEYSTREAM_SIZE];
+               uint32_t o = *p_odd;
+
+        // pre-compute the odd feedback bit
+        bool odd_feedback_bit = evenparity32(o&0x29ce5c);
+        const bitslice_value_t odd_feedback = odd_feedback_bit ? bs_ones.value : bs_zeroes.value;
+
+               // set odd state bits
+               for (uint32_t state_idx = 0; state_idx < STATE_SIZE; o >>= 1, state_idx += 2) {
+                       if (o & 1){
+                               state_p[state_idx] = bs_ones;
+                       } else {
+                               state_p[state_idx] = bs_zeroes;
+                       }
+               }
+               
+               bitslice_value_t crypto1_bs_f20b_2[16];
+               bitslice_value_t crypto1_bs_f20b_3[8];
+
+               crypto1_bs_f20b_2[0] = f20b(state_p[47-25].value, state_p[47-27].value, state_p[47-29].value, state_p[47-31].value);
+               crypto1_bs_f20b_3[0] = f20b(state_p[47-41].value, state_p[47-43].value, state_p[47-45].value, state_p[47-47].value);
+               
+               bitslice_value_t ksb[8];
+               ksb[0] = f20c(f20a(state_p[47- 9].value, state_p[47-11].value, state_p[47-13].value, state_p[47-15].value),
+                             f20b(state_p[47-17].value, state_p[47-19].value, state_p[47-21].value, state_p[47-23].value),
+                             crypto1_bs_f20b_2[0],
+                             f20a(state_p[47-33].value, state_p[47-35].value, state_p[47-37].value, state_p[47-39].value),
+                             crypto1_bs_f20b_3[0]);
+
+               uint32_t *restrict p_even = p->states[EVEN_STATE];
+        for (uint32_t block_idx = 0; block_idx < bitsliced_blocks; ++block_idx, p_even += MAX_BITSLICES) {
+
+#ifdef DEBUG_KEY_ELIMINATION
+                       // if (known_target_key != -1 && bucket_contains_test_key[block_idx] && *p_odd == test_state[ODD_STATE]) {
+                               // printf("Now testing known target key.\n");
+                               // printf("block_idx = %d/%d\n", block_idx, bitsliced_blocks);
+                       // }
+#endif
+            // add the even state bits
+                       const bitslice_t const *restrict bitsliced_even_state = bitsliced_even_states[block_idx];
+                       for(uint32_t state_idx = 1; state_idx < STATE_SIZE; state_idx += 2) {
+                               state_p[state_idx] = bitsliced_even_state[state_idx/2];
+                       }
+
+                       // pre-compute first feedback bit vector. This is the same for all nonces
+                       bitslice_value_t fbb[8];
+            fbb[0] = odd_feedback ^ bitsliced_even_feedback[block_idx]; 
+
+            // vector to contain test results (1 = passed, 0 = failed)
+            bitslice_t results = bs_ones;
+                       
+                       // parity_bits
+                       bitslice_value_t par[8];
+                       par[0] = bs_zeroes.value;
+                       uint32_t next_common_bits = 0;
+
+            for(uint32_t tests = 0; tests < nonces_to_bruteforce; ++tests){
+                               // common bits with preceding test nonce
+                               uint32_t common_bits = next_common_bits; //tests ? trailing_zeros(bf_test_nonce_2nd_byte[tests] ^ bf_test_nonce_2nd_byte[tests-1]) : 0;
+                               next_common_bits = tests < nonces_to_bruteforce - 1 ? trailing_zeros(bf_test_nonce_2nd_byte[tests] ^ bf_test_nonce_2nd_byte[tests+1]) : 0;
+                uint32_t parity_bit_idx = 1;                                                   // start checking with the parity of second nonce byte
+                bitslice_value_t fb_bits = fbb[common_bits];           // start with precomputed feedback bits from previous nonce
+                bitslice_value_t ks_bits = ksb[common_bits];           // dito for first keystream bits
+                bitslice_value_t parity_bit_vector = par[common_bits]; // dito for first parity vector
+                               // bitslice_value_t fb_bits = fbb[0];           // start with precomputed feedback bits from previous nonce
+                               // bitslice_value_t ks_bits = ksb[0];           // dito for first keystream bits
+                               // bitslice_value_t parity_bit_vector = par[0]; // dito for first parity vector
+                               state_p -= common_bits;                                                         // and reuse the already calculated state bits
+                // highest bit is transmitted/received first. We start with Bit 23 (highest bit of second nonce byte),
+                               // or the highest bit which differs from the previous nonce
+                for (int32_t ks_idx = KEYSTREAM_SIZE-1-common_bits; ks_idx >= 0; --ks_idx) {
+
+                    // decrypt nonce bits
+                    const bitslice_value_t encrypted_nonce_bit_vector = bitsliced_encrypted_nonces[tests][ks_idx].value;
+                    const bitslice_value_t decrypted_nonce_bit_vector = encrypted_nonce_bit_vector ^ ks_bits;
+
+                    // compute real parity bits on the fly
+                    parity_bit_vector ^= decrypted_nonce_bit_vector;
+
+                    // update state
+                                       state_p--;
+                    state_p[0].value = fb_bits ^ decrypted_nonce_bit_vector;
+
+                                       // update crypto1 subfunctions
+                                       bitslice_value_t f20a_1, f20b_1, f20b_2, f20a_2, f20b_3;
+                                       f20a_2 = f20a(state_p[47-33].value, state_p[47-35].value, state_p[47-37].value, state_p[47-39].value);
+                                       f20b_3 = f20b(state_p[47-41].value, state_p[47-43].value, state_p[47-45].value, state_p[47-47].value);
+                                       if (ks_idx > KEYSTREAM_SIZE - 8) {
+                                               f20a_1 = f20a(state_p[47- 9].value, state_p[47-11].value, state_p[47-13].value, state_p[47-15].value);
+                                               f20b_1 = f20b(state_p[47-17].value, state_p[47-19].value, state_p[47-21].value, state_p[47-23].value);
+                                               f20b_2 = f20b(state_p[47-25].value, state_p[47-27].value, state_p[47-29].value, state_p[47-31].value);
+                                               crypto1_bs_f20b_2[KEYSTREAM_SIZE - ks_idx] = f20b_2;
+                                               crypto1_bs_f20b_3[KEYSTREAM_SIZE - ks_idx] = f20b_3;
+                                       } else if (ks_idx > KEYSTREAM_SIZE - 16) {
+                                               f20a_1 = f20a(state_p[47- 9].value, state_p[47-11].value, state_p[47-13].value, state_p[47-15].value);
+                                               f20b_1 = crypto1_bs_f20b_2[KEYSTREAM_SIZE - ks_idx - 8];
+                                               f20b_2 = f20b(state_p[47-25].value, state_p[47-27].value, state_p[47-29].value, state_p[47-31].value);
+                                               crypto1_bs_f20b_2[KEYSTREAM_SIZE - ks_idx] = f20b_2; 
+                                       } else if (ks_idx > KEYSTREAM_SIZE - 24){
+                                               f20a_1 = f20a(state_p[47- 9].value, state_p[47-11].value, state_p[47-13].value, state_p[47-15].value);
+                                               f20b_1 = crypto1_bs_f20b_2[KEYSTREAM_SIZE - ks_idx - 8];
+                                               f20b_2 = crypto1_bs_f20b_3[KEYSTREAM_SIZE - ks_idx - 16];
+                                       } else {
+                                               f20a_1 = f20a(state_p[47- 9].value, state_p[47-11].value, state_p[47-13].value, state_p[47-15].value);
+                                               f20b_1 = f20b(state_p[47-17].value, state_p[47-19].value, state_p[47-21].value, state_p[47-23].value);
+                                               f20b_2 = f20b(state_p[47-25].value, state_p[47-27].value, state_p[47-29].value, state_p[47-31].value);
+                                       }                                               
+                                       // update keystream bit
+                                       ks_bits = f20c(f20a_1, f20b_1, f20b_2, f20a_2, f20b_3);
+
+                    // for each completed byte:
+                    if ((ks_idx & 0x07) == 0) {
+                        // get encrypted parity bits
+                        const bitslice_value_t encrypted_parity_bit_vector = bitsliced_encrypted_parity_bits[tests][parity_bit_idx++].value;
+
+                        // decrypt parity bits
+                        const bitslice_value_t decrypted_parity_bit_vector = encrypted_parity_bit_vector ^ ks_bits;
+
+                        // compare actual parity bits with decrypted parity bits and take count in results vector
+                        results.value &= ~parity_bit_vector ^ decrypted_parity_bit_vector;
+
+                        // make sure we still have a match in our set
+                        // if(memcmp(&results, &bs_zeroes, sizeof(bitslice_t)) == 0){
+
+                        // this is much faster on my gcc, because somehow a memcmp needlessly spills/fills all the xmm registers to/from the stack - ???
+                        // the short-circuiting also helps
+                        if(results.bytes64[0] == 0
+#if MAX_BITSLICES > 64
+                           && results.bytes64[1] == 0
+#endif
+#if MAX_BITSLICES > 128
+                           && results.bytes64[2] == 0
+                           && results.bytes64[3] == 0
+#endif
+                          ) {
+#if defined (DEBUG_BRUTE_FORCE)                                                  
+                                                       if (elimination_step < MAX_ELIMINATION_STEP) {
+                                                               keys_eliminated[elimination_step] += MAX_BITSLICES;
+                                                       }
+#endif
+#ifdef DEBUG_KEY_ELIMINATION
+                                                       if (known_target_key != -1 && bucket_contains_test_key[block_idx] && *p_odd == test_state[ODD_STATE]) {
+                                                               printf("Known target key eliminated in brute_force.\n");
+                                                               printf("block_idx = %d/%d, nonce = %d/%d\n", block_idx, bitsliced_blocks, tests, nonces_to_bruteforce);
+                                                       }
+#endif
+                                                       goto stop_tests;
+                                               }
+                                               // prepare for next nonce byte
+#if defined (DEBUG_BRUTE_FORCE)                                                          
+                                               elimination_step++;
+#endif
+                                               parity_bit_vector = bs_zeroes.value;
+                                       }                                               
+                                       // update feedback bit vector
+                                       if (ks_idx != 0) {
+                                               fb_bits = 
+                                                                 (state_p[47- 0].value ^ state_p[47- 5].value ^ state_p[47- 9].value ^
+                                                                  state_p[47-10].value ^ state_p[47-12].value ^ state_p[47-14].value ^
+                                                                  state_p[47-15].value ^ state_p[47-17].value ^ state_p[47-19].value ^
+                                                                  state_p[47-24].value ^ state_p[47-25].value ^ state_p[47-27].value ^
+                                                                  state_p[47-29].value ^ state_p[47-35].value ^ state_p[47-39].value ^
+                                                                  state_p[47-41].value ^ state_p[47-42].value ^ state_p[47-43].value);
+                                       }
+                                       // remember feedback and keystream vectors for later use
+                                       uint8_t bit = KEYSTREAM_SIZE - ks_idx;
+                                       if (bit <= next_common_bits) {  // if needed and not yet stored
+                                               fbb[bit] = fb_bits;
+                                               ksb[bit] = ks_bits;
+                                               par[bit] = parity_bit_vector;
+                                       }
+                }
+                               // prepare for next nonce. Revert to initial state
+                               state_p = &states[KEYSTREAM_SIZE];
+            }
+
+            // all nonce tests were successful: we've found a possible key in this block!
+                       uint32_t *p_even_test = p_even;
+            for (uint32_t results_word = 0; results_word < MAX_BITSLICES / 64; ++results_word) {
+                               uint64_t results64 = results.bytes64[results_word];
+                               for (uint32_t results_bit = 0; results_bit < 64; results_bit++) {
+                                       if (results64 & 0x01) {
+                                               if (verify_key(cuid, nonces, best_first_bytes, *p_odd, *p_even_test)) {
+                                                       struct Crypto1State pcs;
+                                                       pcs.odd = *p_odd;
+                                                       pcs.even = *p_even_test;
+                                                       lfsr_rollback_byte(&pcs, (cuid >> 24) ^ best_first_bytes[0], true);
+                                                       crypto1_get_lfsr(&pcs, &key);
+                                                       bucket_states_tested += 64 * results_word + results_bit;
+                                                       goto out;
+                                               }
+#ifdef DEBUG_KEY_ELIMINATION
+                                               if (known_target_key != -1 && *p_even_test == test_state[EVEN_STATE] && *p_odd == test_state[ODD_STATE]) {
+                                                       printf("Known target key eliminated in brute_force verification.\n");
+                                                       printf("block_idx = %d/%d\n", block_idx, bitsliced_blocks);
+                                               }
+#endif
+                                       }
+#ifdef DEBUG_KEY_ELIMINATION
+                                       if (known_target_key != -1 && *p_even_test == test_state[EVEN_STATE] && *p_odd == test_state[ODD_STATE]) {
+                                               printf("Known target key eliminated in brute_force (results_bit == 0).\n");
+                                               printf("block_idx = %d/%d\n", block_idx, bitsliced_blocks);
+                                       }
+#endif
+                                       results64 >>= 1;
+                                       p_even_test++;
+                                       if (p_even_test == p_even_end) {
+                                               goto stop_tests;
+                                       }
+                               }
+            }
+stop_tests:
+#if defined (DEBUG_BRUTE_FORCE)                                                          
+                       elimination_step = 0;
+#endif                 
+            bucket_states_tested += bucket_size[block_idx];
+            // prepare to set new states
+                       state_p = &states[KEYSTREAM_SIZE];
+            continue;
+        }
+    }
+out:
+    for(uint32_t block_idx = 0; block_idx < bitsliced_blocks; ++block_idx){
+        free_bitslice(bitsliced_even_states[block_idx]);
+    }
+       free(bitsliced_even_states);
+       free_bitslice(bitsliced_even_feedback);
+    __sync_fetch_and_add(num_keys_tested, bucket_states_tested);
+       
+#if defined (DEBUG_BRUTE_FORCE)        
+       for (uint32_t i = 0; i < MAX_ELIMINATION_STEP; i++) {
+               printf("Eliminated after %2u test_bytes: %5.2f%%\n", i+1, (float)keys_eliminated[i] / bucket_states_tested * 100);
+       }
+#endif 
+    return key;
+}
+#endif
+
+
+#ifndef __MMX__
+
+// pointers to functions:
+crack_states_bitsliced_t *crack_states_bitsliced_function_p = &crack_states_bitsliced_dispatch;
+bitslice_test_nonces_t *bitslice_test_nonces_function_p = &bitslice_test_nonces_dispatch;
+
+// determine the available instruction set at runtime and call the correct function
+const uint64_t crack_states_bitsliced_dispatch(uint32_t cuid, uint8_t *best_first_bytes, statelist_t *p, uint32_t *keys_found, uint64_t *num_keys_tested, uint32_t nonces_to_bruteforce, uint8_t *bf_test_nonce_2nd_byte, noncelist_t *nonces) {
+       if (__builtin_cpu_supports("avx512f")) crack_states_bitsliced_function_p = &crack_states_bitsliced_AVX512;
+       else if (__builtin_cpu_supports("avx2")) crack_states_bitsliced_function_p = &crack_states_bitsliced_AVX2;
+       else if (__builtin_cpu_supports("avx")) crack_states_bitsliced_function_p = &crack_states_bitsliced_AVX;
+       else if (__builtin_cpu_supports("sse2")) crack_states_bitsliced_function_p = &crack_states_bitsliced_SSE2;
+       else if (__builtin_cpu_supports("mmx")) crack_states_bitsliced_function_p = &crack_states_bitsliced_MMX;
+    else {
+        printf("\nFatal: you need at least a CPU with MMX instruction set support. Aborting...\n");
+        exit(5);
+    }
+    // call the most optimized function for this CPU
+    return (*crack_states_bitsliced_function_p)(cuid, best_first_bytes, p, keys_found, num_keys_tested, nonces_to_bruteforce, bf_test_nonce_2nd_byte, nonces);
+}
+
+void bitslice_test_nonces_dispatch(uint32_t nonces_to_bruteforce, uint32_t *bf_test_nonce, uint8_t *bf_test_nonce_par) {
+       if (__builtin_cpu_supports("avx512f")) bitslice_test_nonces_function_p = &bitslice_test_nonces_AVX512;
+       else if (__builtin_cpu_supports("avx2")) bitslice_test_nonces_function_p = &bitslice_test_nonces_AVX2;
+       else if (__builtin_cpu_supports("avx")) bitslice_test_nonces_function_p = &bitslice_test_nonces_AVX;
+       else if (__builtin_cpu_supports("sse2")) bitslice_test_nonces_function_p = &bitslice_test_nonces_SSE2;
+       else if (__builtin_cpu_supports("mmx")) bitslice_test_nonces_function_p = &bitslice_test_nonces_MMX;
+    else {
+        printf("\nFatal: you need at least a CPU with MMX instruction set support. Aborting...\n");
+        exit(5);
+    }
+    // call the most optimized function for this CPU
+    (*bitslice_test_nonces_function_p)(nonces_to_bruteforce, bf_test_nonce, bf_test_nonce_par);
+}
+
+// Entries to dispatched function calls
+const uint64_t crack_states_bitsliced(uint32_t cuid, uint8_t *best_first_bytes, statelist_t *p, uint32_t *keys_found, uint64_t *num_keys_tested, uint32_t nonces_to_bruteforce, uint8_t *bf_test_nonce_2nd_byte, noncelist_t *nonces) {
+    return (*crack_states_bitsliced_function_p)(cuid, best_first_bytes, p, keys_found, num_keys_tested, nonces_to_bruteforce, bf_test_nonce_2nd_byte, nonces);
+}
+
+void bitslice_test_nonces(uint32_t nonces_to_bruteforce, uint32_t *bf_test_nonce, uint8_t *bf_test_nonce_par) {
+    (*bitslice_test_nonces_function_p)(nonces_to_bruteforce, bf_test_nonce, bf_test_nonce_par);
+}
+
+#endif
diff --git a/client/hardnested/hardnested_bf_core.h b/client/hardnested/hardnested_bf_core.h
new file mode 100644 (file)
index 0000000..7a44599
--- /dev/null
@@ -0,0 +1,58 @@
+//-----------------------------------------------------------------------------
+// Copyright (C) 2016, 2017 by piwi
+//
+// This code is licensed to you under the terms of the GNU GPL, version 2 or,
+// at your option, any later version. See the LICENSE.txt file for the text of
+// the license.
+//-----------------------------------------------------------------------------
+// Implements a card only attack based on crypto text (encrypted nonces
+// received during a nested authentication) only. Unlike other card only
+// attacks this doesn't rely on implementation errors but only on the
+// inherent weaknesses of the crypto1 cypher. Described in
+//   Carlo Meijer, Roel Verdult, "Ciphertext-only Cryptanalysis on Hardened
+//   Mifare Classic Cards" in Proceedings of the 22nd ACM SIGSAC Conference on 
+//   Computer and Communications Security, 2015
+//-----------------------------------------------------------------------------
+//
+// brute forcing is based on @aczids bitsliced brute forcer
+// https://github.com/aczid/crypto1_bs with some modifications. Mainly:
+// - don't rollback. Start with 2nd byte of nonce instead
+// - reuse results of filter subfunctions
+// - reuse results of previous nonces if some first bits are identical
+// 
+//-----------------------------------------------------------------------------
+// aczid's Copyright notice:
+//
+// Bit-sliced Crypto-1 brute-forcing implementation
+// Builds on the data structures returned by CraptEV1 craptev1_get_space(nonces, threshold, uid)
+/*
+Copyright (c) 2015-2016 Aram Verstegen
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HARDNESTED_BF_CORE_H__
+#define HARDNESTED_BF_CORE_H__
+
+#include "hardnested_bruteforce.h"                     // statelist_t
+
+extern const uint64_t crack_states_bitsliced(uint32_t cuid, uint8_t *best_first_bytes, statelist_t *p, uint32_t *keys_found, uint64_t *num_keys_tested, uint32_t nonces_to_bruteforce, uint8_t *bf_test_nonces_2nd_byte, noncelist_t *nonces);
+extern void bitslice_test_nonces(uint32_t nonces_to_bruteforce, uint32_t *bf_test_nonces, uint8_t *bf_test_nonce_par);
+
+#endif
diff --git a/client/hardnested/hardnested_bitarray_core.c b/client/hardnested/hardnested_bitarray_core.c
new file mode 100644 (file)
index 0000000..d2fdb89
--- /dev/null
@@ -0,0 +1,545 @@
+//-----------------------------------------------------------------------------
+// Copyright (C) 2016, 2017 by piwi
+//
+// This code is licensed to you under the terms of the GNU GPL, version 2 or,
+// at your option, any later version. See the LICENSE.txt file for the text of
+// the license.ch b
+//-----------------------------------------------------------------------------
+// Implements a card only attack based on crypto text (encrypted nonces
+// received during a nested authentication) only. Unlike other card only
+// attacks this doesn't rely on implementation errors but only on the
+// inherent weaknesses of the crypto1 cypher. Described in
+//   Carlo Meijer, Roel Verdult, "Ciphertext-only Cryptanalysis on Hardened
+//   Mifare Classic Cards" in Proceedings of the 22nd ACM SIGSAC Conference on 
+//   Computer and Communications Security, 2015
+//-----------------------------------------------------------------------------
+// some helper functions which can benefit from SIMD instructions or other special instructions
+//
+
+#include "hardnested_bitarray_core.h"
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <malloc.h>
+
+// #include <stdint.h>
+// #include <stdbool.h>
+// #include <stdlib.h>
+// #include <stdio.h>
+// #include <malloc.h>
+// #include <string.h>
+// #include "crapto1/crapto1.h"
+// #include "parity.h"
+
+
+// this needs to be compiled several times for each instruction set. 
+// For each instruction set, define a dedicated function name:
+#if defined (__AVX512F__)
+#define MALLOC_BITARRAY malloc_bitarray_AVX512
+#define FREE_BITARRAY free_bitarray_AVX512
+#define BITCOUNT bitcount_AVX512
+#define COUNT_STATES count_states_AVX512
+#define BITARRAY_AND bitarray_AND_AVX512
+#define BITARRAY_LOW20_AND bitarray_low20_AND_AVX512
+#define COUNT_BITARRAY_AND count_bitarray_AND_AVX512
+#define COUNT_BITARRAY_LOW20_AND count_bitarray_low20_AND_AVX512
+#define BITARRAY_AND4 bitarray_AND4_AVX512
+#define BITARRAY_OR bitarray_OR_AVX512
+#define COUNT_BITARRAY_AND2 count_bitarray_AND2_AVX512
+#define COUNT_BITARRAY_AND3 count_bitarray_AND3_AVX512
+#define COUNT_BITARRAY_AND4 count_bitarray_AND4_AVX512
+#elif defined (__AVX2__)
+#define MALLOC_BITARRAY malloc_bitarray_AVX2
+#define FREE_BITARRAY free_bitarray_AVX2
+#define BITCOUNT bitcount_AVX2
+#define COUNT_STATES count_states_AVX2
+#define BITARRAY_AND bitarray_AND_AVX2
+#define BITARRAY_LOW20_AND bitarray_low20_AND_AVX2
+#define COUNT_BITARRAY_AND count_bitarray_AND_AVX2
+#define COUNT_BITARRAY_LOW20_AND count_bitarray_low20_AND_AVX2
+#define BITARRAY_AND4 bitarray_AND4_AVX2
+#define BITARRAY_OR bitarray_OR_AVX2
+#define COUNT_BITARRAY_AND2 count_bitarray_AND2_AVX2
+#define COUNT_BITARRAY_AND3 count_bitarray_AND3_AVX2
+#define COUNT_BITARRAY_AND4 count_bitarray_AND4_AVX2
+#elif defined (__AVX__)
+#define MALLOC_BITARRAY malloc_bitarray_AVX
+#define FREE_BITARRAY free_bitarray_AVX
+#define BITCOUNT bitcount_AVX
+#define COUNT_STATES count_states_AVX
+#define BITARRAY_AND bitarray_AND_AVX
+#define BITARRAY_LOW20_AND bitarray_low20_AND_AVX
+#define COUNT_BITARRAY_AND count_bitarray_AND_AVX
+#define COUNT_BITARRAY_LOW20_AND count_bitarray_low20_AND_AVX
+#define BITARRAY_AND4 bitarray_AND4_AVX
+#define BITARRAY_OR bitarray_OR_AVX
+#define COUNT_BITARRAY_AND2 count_bitarray_AND2_AVX
+#define COUNT_BITARRAY_AND3 count_bitarray_AND3_AVX
+#define COUNT_BITARRAY_AND4 count_bitarray_AND4_AVX
+#elif defined (__SSE2__)
+#define MALLOC_BITARRAY malloc_bitarray_SSE2
+#define FREE_BITARRAY free_bitarray_SSE2
+#define BITCOUNT bitcount_SSE2
+#define COUNT_STATES count_states_SSE2
+#define BITARRAY_AND bitarray_AND_SSE2
+#define BITARRAY_LOW20_AND bitarray_low20_AND_SSE2
+#define COUNT_BITARRAY_AND count_bitarray_AND_SSE2
+#define COUNT_BITARRAY_LOW20_AND count_bitarray_low20_AND_SSE2
+#define BITARRAY_AND4 bitarray_AND4_SSE2
+#define BITARRAY_OR bitarray_OR_SSE2
+#define COUNT_BITARRAY_AND2 count_bitarray_AND2_SSE2
+#define COUNT_BITARRAY_AND3 count_bitarray_AND3_SSE2
+#define COUNT_BITARRAY_AND4 count_bitarray_AND4_SSE2
+#elif defined (__MMX__) 
+#define MALLOC_BITARRAY malloc_bitarray_MMX
+#define FREE_BITARRAY free_bitarray_MMX
+#define BITCOUNT bitcount_MMX
+#define COUNT_STATES count_states_MMX
+#define BITARRAY_AND bitarray_AND_MMX
+#define BITARRAY_LOW20_AND bitarray_low20_AND_MMX
+#define COUNT_BITARRAY_AND count_bitarray_AND_MMX
+#define COUNT_BITARRAY_LOW20_AND count_bitarray_low20_AND_MMX
+#define BITARRAY_AND4 bitarray_AND4_MMX
+#define BITARRAY_OR bitarray_OR_MMX
+#define COUNT_BITARRAY_AND2 count_bitarray_AND2_MMX
+#define COUNT_BITARRAY_AND3 count_bitarray_AND3_MMX
+#define COUNT_BITARRAY_AND4 count_bitarray_AND4_MMX
+#endif
+
+
+// typedefs and declaration of functions:
+typedef uint32_t* malloc_bitarray_t(uint32_t);
+malloc_bitarray_t malloc_bitarray_AVX512, malloc_bitarray_AVX2, malloc_bitarray_AVX, malloc_bitarray_SSE2, malloc_bitarray_MMX, malloc_bitarray_dispatch;
+typedef void free_bitarray_t(uint32_t*);
+free_bitarray_t free_bitarray_AVX512, free_bitarray_AVX2, free_bitarray_AVX, free_bitarray_SSE2, free_bitarray_MMX, free_bitarray_dispatch;
+typedef uint32_t bitcount_t(uint32_t);
+bitcount_t bitcount_AVX512, bitcount_AVX2, bitcount_AVX, bitcount_SSE2, bitcount_MMX, bitcount_dispatch;
+typedef uint32_t count_states_t(uint32_t*);
+count_states_t count_states_AVX512, count_states_AVX2, count_states_AVX, count_states_SSE2, count_states_MMX, count_states_dispatch;
+typedef void bitarray_AND_t(uint32_t[], uint32_t[]);
+bitarray_AND_t bitarray_AND_AVX512, bitarray_AND_AVX2, bitarray_AND_AVX, bitarray_AND_SSE2, bitarray_AND_MMX, bitarray_AND_dispatch;
+typedef void bitarray_low20_AND_t(uint32_t*, uint32_t*);
+bitarray_low20_AND_t bitarray_low20_AND_AVX512, bitarray_low20_AND_AVX2, bitarray_low20_AND_AVX, bitarray_low20_AND_SSE2, bitarray_low20_AND_MMX, bitarray_low20_AND_dispatch;
+typedef uint32_t count_bitarray_AND_t(uint32_t*, uint32_t*);
+count_bitarray_AND_t count_bitarray_AND_AVX512, count_bitarray_AND_AVX2, count_bitarray_AND_AVX, count_bitarray_AND_SSE2, count_bitarray_AND_MMX, count_bitarray_AND_dispatch;
+typedef uint32_t count_bitarray_low20_AND_t(uint32_t*, uint32_t*);
+count_bitarray_low20_AND_t count_bitarray_low20_AND_AVX512, count_bitarray_low20_AND_AVX2, count_bitarray_low20_AND_AVX, count_bitarray_low20_AND_SSE2, count_bitarray_low20_AND_MMX, count_bitarray_low20_AND_dispatch;
+typedef void bitarray_AND4_t(uint32_t*, uint32_t*, uint32_t*, uint32_t*);
+bitarray_AND4_t bitarray_AND4_AVX512, bitarray_AND4_AVX2, bitarray_AND4_AVX, bitarray_AND4_SSE2, bitarray_AND4_MMX, bitarray_AND4_dispatch;
+typedef void bitarray_OR_t(uint32_t[], uint32_t[]);
+bitarray_OR_t bitarray_OR_AVX512, bitarray_OR_AVX2, bitarray_OR_AVX, bitarray_OR_SSE2, bitarray_OR_MMX, bitarray_OR_dispatch;
+typedef uint32_t count_bitarray_AND2_t(uint32_t*, uint32_t*);
+count_bitarray_AND2_t count_bitarray_AND2_AVX512, count_bitarray_AND2_AVX2, count_bitarray_AND2_AVX, count_bitarray_AND2_SSE2, count_bitarray_AND2_MMX, count_bitarray_AND2_dispatch;
+typedef uint32_t count_bitarray_AND3_t(uint32_t*, uint32_t*, uint32_t*);
+count_bitarray_AND3_t count_bitarray_AND3_AVX512, count_bitarray_AND3_AVX2, count_bitarray_AND3_AVX, count_bitarray_AND3_SSE2, count_bitarray_AND3_MMX, count_bitarray_AND3_dispatch;
+typedef uint32_t count_bitarray_AND4_t(uint32_t*, uint32_t*, uint32_t*, uint32_t*);
+count_bitarray_AND4_t count_bitarray_AND4_AVX512, count_bitarray_AND4_AVX2, count_bitarray_AND4_AVX, count_bitarray_AND4_SSE2, count_bitarray_AND4_MMX, count_bitarray_AND4_dispatch;
+
+
+inline uint32_t *MALLOC_BITARRAY(uint32_t x)
+{
+#ifdef _WIN32
+       return __builtin_assume_aligned(_aligned_malloc((x), __BIGGEST_ALIGNMENT__), __BIGGEST_ALIGNMENT__);
+#else
+       return __builtin_assume_aligned(memalign(__BIGGEST_ALIGNMENT__, (x)), __BIGGEST_ALIGNMENT__);
+#endif
+}
+
+
+inline void FREE_BITARRAY(uint32_t *x)
+{
+#ifdef _WIN32
+       _aligned_free(x);
+#else
+       free(x);
+#endif
+}
+
+       
+inline uint32_t BITCOUNT(uint32_t a)
+{
+       return __builtin_popcountl(a);
+}
+
+
+inline uint32_t COUNT_STATES(uint32_t *A)
+{
+       uint32_t count = 0;
+       for (uint32_t i = 0; i < (1<<19); i++) {
+               count += BITCOUNT(A[i]);
+       }
+       return count;
+}
+
+
+inline void BITARRAY_AND(uint32_t *restrict A, uint32_t *restrict B)
+{
+       A = __builtin_assume_aligned(A, __BIGGEST_ALIGNMENT__);
+       B = __builtin_assume_aligned(B, __BIGGEST_ALIGNMENT__);
+       for (uint32_t i = 0; i < (1<<19); i++) {
+               A[i] &= B[i];
+       }
+}
+
+
+inline void BITARRAY_LOW20_AND(uint32_t *restrict A, uint32_t *restrict B)
+{
+       uint16_t *a = (uint16_t *)__builtin_assume_aligned(A, __BIGGEST_ALIGNMENT__);
+       uint16_t *b = (uint16_t *)__builtin_assume_aligned(B, __BIGGEST_ALIGNMENT__);
+       
+       for (uint32_t i = 0; i < (1<<20); i++) {
+               if (!b[i]) {
+                       a[i] = 0;
+               }
+       }       
+}
+
+
+inline uint32_t COUNT_BITARRAY_AND(uint32_t *restrict A, uint32_t *restrict B)
+{
+       A = __builtin_assume_aligned(A, __BIGGEST_ALIGNMENT__);
+       B = __builtin_assume_aligned(B, __BIGGEST_ALIGNMENT__);
+       uint32_t count = 0;
+       for (uint32_t i = 0; i < (1<<19); i++) {
+               A[i] &= B[i];
+               count += BITCOUNT(A[i]);
+       }
+       return count;
+}
+
+
+inline uint32_t COUNT_BITARRAY_LOW20_AND(uint32_t *restrict A, uint32_t *restrict B)
+{
+       uint16_t *a = (uint16_t *)__builtin_assume_aligned(A, __BIGGEST_ALIGNMENT__);
+       uint16_t *b = (uint16_t *)__builtin_assume_aligned(B, __BIGGEST_ALIGNMENT__);
+       uint32_t count = 0;
+       
+       for (uint32_t i = 0; i < (1<<20); i++) {
+               if (!b[i]) {
+                       a[i] = 0;
+               }
+               count += BITCOUNT(a[i]);
+       }
+       return count;   
+}
+
+
+inline void BITARRAY_AND4(uint32_t *restrict A, uint32_t *restrict B, uint32_t *restrict C, uint32_t *restrict D)
+{
+       A = __builtin_assume_aligned(A, __BIGGEST_ALIGNMENT__);
+       B = __builtin_assume_aligned(B, __BIGGEST_ALIGNMENT__);
+       C = __builtin_assume_aligned(C, __BIGGEST_ALIGNMENT__);
+       D = __builtin_assume_aligned(D, __BIGGEST_ALIGNMENT__);
+       for (uint32_t i = 0; i < (1<<19); i++) {
+               A[i] = B[i] & C[i] & D[i];
+       }
+}
+
+
+inline void BITARRAY_OR(uint32_t *restrict A, uint32_t *restrict B)
+{
+       A = __builtin_assume_aligned(A, __BIGGEST_ALIGNMENT__);
+       B = __builtin_assume_aligned(B, __BIGGEST_ALIGNMENT__);
+       for (uint32_t i = 0; i < (1<<19); i++) {
+               A[i] |= B[i];
+       }
+}
+
+
+inline uint32_t COUNT_BITARRAY_AND2(uint32_t *restrict A, uint32_t *restrict B)
+{
+       A = __builtin_assume_aligned(A, __BIGGEST_ALIGNMENT__);
+       B = __builtin_assume_aligned(B, __BIGGEST_ALIGNMENT__);
+       uint32_t count = 0;
+       for (uint32_t i = 0; i < (1<<19); i++) {
+               count += BITCOUNT(A[i] & B[i]);
+       }
+       return count;
+}
+
+
+inline uint32_t COUNT_BITARRAY_AND3(uint32_t *restrict A, uint32_t *restrict B, uint32_t *restrict C)
+{
+       A = __builtin_assume_aligned(A, __BIGGEST_ALIGNMENT__);
+       B = __builtin_assume_aligned(B, __BIGGEST_ALIGNMENT__);
+       C = __builtin_assume_aligned(C, __BIGGEST_ALIGNMENT__);
+       uint32_t count = 0;
+       for (uint32_t i = 0; i < (1<<19); i++) {
+               count += BITCOUNT(A[i] & B[i] & C[i]);
+       }
+       return count;
+}
+
+
+inline uint32_t COUNT_BITARRAY_AND4(uint32_t *restrict A, uint32_t *restrict B, uint32_t *restrict C, uint32_t *restrict D)
+{
+       A = __builtin_assume_aligned(A, __BIGGEST_ALIGNMENT__);
+       B = __builtin_assume_aligned(B, __BIGGEST_ALIGNMENT__);
+       C = __builtin_assume_aligned(C, __BIGGEST_ALIGNMENT__);
+       D = __builtin_assume_aligned(D, __BIGGEST_ALIGNMENT__);
+       uint32_t count = 0;
+       for (uint32_t i = 0; i < (1<<19); i++) {
+               count += BITCOUNT(A[i] & B[i] & C[i] & D[i]);
+       }
+       return count;
+}
+
+#ifndef __MMX__
+
+// pointers to functions:
+malloc_bitarray_t *malloc_bitarray_function_p = &malloc_bitarray_dispatch;
+free_bitarray_t *free_bitarray_function_p = &free_bitarray_dispatch;
+bitcount_t *bitcount_function_p = &bitcount_dispatch;
+count_states_t *count_states_function_p = &count_states_dispatch;
+bitarray_AND_t *bitarray_AND_function_p = &bitarray_AND_dispatch;
+bitarray_low20_AND_t *bitarray_low20_AND_function_p = &bitarray_low20_AND_dispatch;
+count_bitarray_AND_t *count_bitarray_AND_function_p = &count_bitarray_AND_dispatch;
+count_bitarray_low20_AND_t *count_bitarray_low20_AND_function_p = &count_bitarray_low20_AND_dispatch;
+bitarray_AND4_t *bitarray_AND4_function_p = &bitarray_AND4_dispatch;
+bitarray_OR_t *bitarray_OR_function_p = &bitarray_OR_dispatch;
+count_bitarray_AND2_t *count_bitarray_AND2_function_p = &count_bitarray_AND2_dispatch;
+count_bitarray_AND3_t *count_bitarray_AND3_function_p = &count_bitarray_AND3_dispatch;
+count_bitarray_AND4_t *count_bitarray_AND4_function_p = &count_bitarray_AND4_dispatch;
+
+// determine the available instruction set at runtime and call the correct function
+uint32_t *malloc_bitarray_dispatch(uint32_t x) {
+       if (__builtin_cpu_supports("avx512f")) malloc_bitarray_function_p = &malloc_bitarray_AVX512;
+       else if (__builtin_cpu_supports("avx2")) malloc_bitarray_function_p = &malloc_bitarray_AVX2;
+       else if (__builtin_cpu_supports("avx")) malloc_bitarray_function_p = &malloc_bitarray_AVX;
+       else if (__builtin_cpu_supports("sse2")) malloc_bitarray_function_p = &malloc_bitarray_SSE2;
+       else if (__builtin_cpu_supports("mmx")) malloc_bitarray_function_p = &malloc_bitarray_MMX;
+    else {
+        printf("\nFatal: you need at least a CPU with MMX instruction set support. Aborting...\n");
+        exit(5);
+    }
+    // call the most optimized function for this CPU
+    return (*malloc_bitarray_function_p)(x);
+}
+
+void free_bitarray_dispatch(uint32_t *x) {
+       if (__builtin_cpu_supports("avx512f")) free_bitarray_function_p = &free_bitarray_AVX512;
+       else if (__builtin_cpu_supports("avx2")) free_bitarray_function_p = &free_bitarray_AVX2;
+       else if (__builtin_cpu_supports("avx")) free_bitarray_function_p = &free_bitarray_AVX;
+       else if (__builtin_cpu_supports("sse2")) free_bitarray_function_p = &free_bitarray_SSE2;
+       else if (__builtin_cpu_supports("mmx")) free_bitarray_function_p = &free_bitarray_MMX;
+    else {
+        printf("\nFatal: you need at least a CPU with MMX instruction set support. Aborting...\n");
+        exit(5);
+    }
+    // call the most optimized function for this CPU
+    (*free_bitarray_function_p)(x);
+}
+
+uint32_t bitcount_dispatch(uint32_t a) {
+       if (__builtin_cpu_supports("avx512f")) bitcount_function_p = &bitcount_AVX512;
+       else if (__builtin_cpu_supports("avx2")) bitcount_function_p = &bitcount_AVX2;
+       else if (__builtin_cpu_supports("avx")) bitcount_function_p = &bitcount_AVX;
+       else if (__builtin_cpu_supports("sse2")) bitcount_function_p = &bitcount_SSE2;
+       else if (__builtin_cpu_supports("mmx")) bitcount_function_p = &bitcount_MMX;
+    else {
+        printf("\nFatal: you need at least a CPU with MMX instruction set support. Aborting...\n");
+        exit(5);
+    }
+    // call the most optimized function for this CPU
+    return (*bitcount_function_p)(a);
+}
+
+uint32_t count_states_dispatch(uint32_t *bitarray) {
+       if (__builtin_cpu_supports("avx512f")) count_states_function_p = &count_states_AVX512;
+       else if (__builtin_cpu_supports("avx2")) count_states_function_p = &count_states_AVX2;
+       else if (__builtin_cpu_supports("avx")) count_states_function_p = &count_states_AVX;
+       else if (__builtin_cpu_supports("sse2")) count_states_function_p = &count_states_SSE2;
+       else if (__builtin_cpu_supports("mmx")) count_states_function_p = &count_states_MMX;
+    else {
+        printf("\nFatal: you need at least a CPU with MMX instruction set support. Aborting...\n");
+        exit(5);
+    }
+    // call the most optimized function for this CPU
+    return (*count_states_function_p)(bitarray);
+}
+
+void bitarray_AND_dispatch(uint32_t *A, uint32_t *B) {
+       if (__builtin_cpu_supports("avx512f")) bitarray_AND_function_p = &bitarray_AND_AVX512;
+       else if (__builtin_cpu_supports("avx2")) bitarray_AND_function_p = &bitarray_AND_AVX2;
+       else if (__builtin_cpu_supports("avx")) bitarray_AND_function_p = &bitarray_AND_AVX;
+       else if (__builtin_cpu_supports("sse2")) bitarray_AND_function_p = &bitarray_AND_SSE2;
+       else if (__builtin_cpu_supports("mmx")) bitarray_AND_function_p = &bitarray_AND_MMX;
+    else {
+        printf("\nFatal: you need at least a CPU with MMX instruction set support. Aborting...\n");
+        exit(5);
+    }
+    // call the most optimized function for this CPU
+    (*bitarray_AND_function_p)(A,B);
+}
+
+void bitarray_low20_AND_dispatch(uint32_t *A, uint32_t *B) {
+       if (__builtin_cpu_supports("avx512f")) bitarray_low20_AND_function_p = &bitarray_low20_AND_AVX512;
+       else if (__builtin_cpu_supports("avx2")) bitarray_low20_AND_function_p = &bitarray_low20_AND_AVX2;
+       else if (__builtin_cpu_supports("avx")) bitarray_low20_AND_function_p = &bitarray_low20_AND_AVX;
+       else if (__builtin_cpu_supports("sse2")) bitarray_low20_AND_function_p = &bitarray_low20_AND_SSE2;
+       else if (__builtin_cpu_supports("mmx")) bitarray_low20_AND_function_p = &bitarray_low20_AND_MMX;
+    else {
+        printf("\nFatal: you need at least a CPU with MMX instruction set support. Aborting...\n");
+        exit(5);
+    }
+    // call the most optimized function for this CPU
+    (*bitarray_low20_AND_function_p)(A, B);
+}
+
+uint32_t count_bitarray_AND_dispatch(uint32_t *A, uint32_t *B) {
+       if (__builtin_cpu_supports("avx512f")) count_bitarray_AND_function_p = &count_bitarray_AND_AVX512;
+       else if (__builtin_cpu_supports("avx2")) count_bitarray_AND_function_p = &count_bitarray_AND_AVX2;
+       else if (__builtin_cpu_supports("avx")) count_bitarray_AND_function_p = &count_bitarray_AND_AVX;
+       else if (__builtin_cpu_supports("sse2")) count_bitarray_AND_function_p = &count_bitarray_AND_SSE2;
+       else if (__builtin_cpu_supports("mmx")) count_bitarray_AND_function_p = &count_bitarray_AND_MMX;
+    else {
+        printf("\nFatal: you need at least a CPU with MMX instruction set support. Aborting...\n");
+        exit(5);
+    }
+    // call the most optimized function for this CPU
+    return (*count_bitarray_AND_function_p)(A, B);
+}
+
+uint32_t count_bitarray_low20_AND_dispatch(uint32_t *A, uint32_t *B) {
+       if (__builtin_cpu_supports("avx512f")) count_bitarray_low20_AND_function_p = &count_bitarray_low20_AND_AVX512;
+       else if (__builtin_cpu_supports("avx2")) count_bitarray_low20_AND_function_p = &count_bitarray_low20_AND_AVX2;
+       else if (__builtin_cpu_supports("avx")) count_bitarray_low20_AND_function_p = &count_bitarray_low20_AND_AVX;
+       else if (__builtin_cpu_supports("sse2")) count_bitarray_low20_AND_function_p = &count_bitarray_low20_AND_SSE2;
+       else if (__builtin_cpu_supports("mmx")) count_bitarray_low20_AND_function_p = &count_bitarray_low20_AND_MMX;
+    else {
+        printf("\nFatal: you need at least a CPU with MMX instruction set support. Aborting...\n");
+        exit(5);
+    }
+    // call the most optimized function for this CPU
+    return (*count_bitarray_low20_AND_function_p)(A, B);
+}
+
+void bitarray_AND4_dispatch(uint32_t *A, uint32_t *B, uint32_t *C, uint32_t *D) {
+       if (__builtin_cpu_supports("avx512f")) bitarray_AND4_function_p = &bitarray_AND4_AVX512;
+       else if (__builtin_cpu_supports("avx2")) bitarray_AND4_function_p = &bitarray_AND4_AVX2;
+       else if (__builtin_cpu_supports("avx")) bitarray_AND4_function_p = &bitarray_AND4_AVX;
+       else if (__builtin_cpu_supports("sse2")) bitarray_AND4_function_p = &bitarray_AND4_SSE2;
+       else if (__builtin_cpu_supports("mmx")) bitarray_AND4_function_p = &bitarray_AND4_MMX;
+    else {
+        printf("\nFatal: you need at least a CPU with MMX instruction set support. Aborting...\n");
+        exit(5);
+    }
+    // call the most optimized function for this CPU
+    (*bitarray_AND4_function_p)(A, B, C, D);
+}
+
+void bitarray_OR_dispatch(uint32_t *A, uint32_t *B) {
+       if (__builtin_cpu_supports("avx512f")) bitarray_OR_function_p = &bitarray_OR_AVX512;
+       else if (__builtin_cpu_supports("avx2")) bitarray_OR_function_p = &bitarray_OR_AVX2;
+       else if (__builtin_cpu_supports("avx")) bitarray_OR_function_p = &bitarray_OR_AVX;
+       else if (__builtin_cpu_supports("sse2")) bitarray_OR_function_p = &bitarray_OR_SSE2;
+       else if (__builtin_cpu_supports("mmx")) bitarray_OR_function_p = &bitarray_OR_MMX;
+    else {
+        printf("\nFatal: you need at least a CPU with MMX instruction set support. Aborting...\n");
+        exit(5);
+    }
+    // call the most optimized function for this CPU
+    (*bitarray_OR_function_p)(A,B);
+}
+
+uint32_t count_bitarray_AND2_dispatch(uint32_t *A, uint32_t *B) {
+       if (__builtin_cpu_supports("avx512f")) count_bitarray_AND2_function_p = &count_bitarray_AND2_AVX512;
+       else if (__builtin_cpu_supports("avx2")) count_bitarray_AND2_function_p = &count_bitarray_AND2_AVX2;
+       else if (__builtin_cpu_supports("avx")) count_bitarray_AND2_function_p = &count_bitarray_AND2_AVX;
+       else if (__builtin_cpu_supports("sse2")) count_bitarray_AND2_function_p = &count_bitarray_AND2_SSE2;
+       else if (__builtin_cpu_supports("mmx")) count_bitarray_AND2_function_p = &count_bitarray_AND2_MMX;
+    else {
+        printf("\nFatal: you need at least a CPU with MMX instruction set support. Aborting...\n");
+        exit(5);
+    }
+    // call the most optimized function for this CPU
+    return (*count_bitarray_AND2_function_p)(A, B);
+}
+
+uint32_t count_bitarray_AND3_dispatch(uint32_t *A, uint32_t *B, uint32_t *C) {
+       if (__builtin_cpu_supports("avx512f")) count_bitarray_AND3_function_p = &count_bitarray_AND3_AVX512;
+       else if (__builtin_cpu_supports("avx2")) count_bitarray_AND3_function_p = &count_bitarray_AND3_AVX2;
+       else if (__builtin_cpu_supports("avx")) count_bitarray_AND3_function_p = &count_bitarray_AND3_AVX;
+       else if (__builtin_cpu_supports("sse2")) count_bitarray_AND3_function_p = &count_bitarray_AND3_SSE2;
+       else if (__builtin_cpu_supports("mmx")) count_bitarray_AND3_function_p = &count_bitarray_AND3_MMX;
+    else {
+        printf("\nFatal: you need at least a CPU with MMX instruction set support. Aborting...\n");
+        exit(5);
+    }
+    // call the most optimized function for this CPU
+    return (*count_bitarray_AND3_function_p)(A, B, C);
+}
+
+uint32_t count_bitarray_AND4_dispatch(uint32_t *A, uint32_t *B, uint32_t *C, uint32_t *D) {
+       if (__builtin_cpu_supports("avx512f")) count_bitarray_AND4_function_p = &count_bitarray_AND4_AVX512;
+       else if (__builtin_cpu_supports("avx2")) count_bitarray_AND4_function_p = &count_bitarray_AND4_AVX2;
+       else if (__builtin_cpu_supports("avx")) count_bitarray_AND4_function_p = &count_bitarray_AND4_AVX;
+       else if (__builtin_cpu_supports("sse2")) count_bitarray_AND4_function_p = &count_bitarray_AND4_SSE2;
+       else if (__builtin_cpu_supports("mmx")) count_bitarray_AND4_function_p = &count_bitarray_AND4_MMX;
+    else {
+        printf("\nFatal: you need at least a CPU with MMX instruction set support. Aborting...\n");
+        exit(5);
+    }
+    // call the most optimized function for this CPU
+    return (*count_bitarray_AND4_function_p)(A, B, C, D);
+}
+
+
+///////////////////////////////////////////////77
+// Entries to dispatched function calls
+
+uint32_t *malloc_bitarray(uint32_t x) {
+    return (*malloc_bitarray_function_p)(x);
+}
+
+void free_bitarray(uint32_t *x) {
+    (*free_bitarray_function_p)(x);
+}
+
+uint32_t bitcount(uint32_t a) {
+    return (*bitcount_function_p)(a);
+}
+
+uint32_t count_states(uint32_t *bitarray) {
+    return (*count_states_function_p)(bitarray);
+}
+
+void bitarray_AND(uint32_t *A, uint32_t *B) {
+    (*bitarray_AND_function_p)(A, B);
+}
+
+void bitarray_low20_AND(uint32_t *A, uint32_t *B) {
+    (*bitarray_low20_AND_function_p)(A, B);
+}
+
+uint32_t count_bitarray_AND(uint32_t *A, uint32_t *B) {
+    return (*count_bitarray_AND_function_p)(A, B);
+}
+
+uint32_t count_bitarray_low20_AND(uint32_t *A, uint32_t *B) {
+    return (*count_bitarray_low20_AND_function_p)(A, B);
+}
+
+void bitarray_AND4(uint32_t *A, uint32_t *B, uint32_t *C, uint32_t *D) {
+    (*bitarray_AND4_function_p)(A, B, C, D);
+}
+
+void bitarray_OR(uint32_t *A, uint32_t *B) {
+    (*bitarray_OR_function_p)(A, B);
+}
+
+uint32_t count_bitarray_AND2(uint32_t *A, uint32_t *B) {
+    return (*count_bitarray_AND2_function_p)(A, B);
+}
+
+uint32_t count_bitarray_AND3(uint32_t *A, uint32_t *B, uint32_t *C) {
+    return (*count_bitarray_AND3_function_p)(A, B, C);
+}
+
+uint32_t count_bitarray_AND4(uint32_t *A, uint32_t *B, uint32_t *C, uint32_t *D) {
+    return (*count_bitarray_AND4_function_p)(A, B, C, D);
+}
+
+#endif
+
diff --git a/client/hardnested/hardnested_bitarray_core.h b/client/hardnested/hardnested_bitarray_core.h
new file mode 100644 (file)
index 0000000..0d92d6b
--- /dev/null
@@ -0,0 +1,69 @@
+//-----------------------------------------------------------------------------
+// Copyright (C) 2016, 2017 by piwi
+//
+// This code is licensed to you under the terms of the GNU GPL, version 2 or,
+// at your option, any later version. See the LICENSE.txt file for the text of
+// the license.
+//-----------------------------------------------------------------------------
+// Implements a card only attack based on crypto text (encrypted nonces
+// received during a nested authentication) only. Unlike other card only
+// attacks this doesn't rely on implementation errors but only on the
+// inherent weaknesses of the crypto1 cypher. Described in
+//   Carlo Meijer, Roel Verdult, "Ciphertext-only Cryptanalysis on Hardened
+//   Mifare Classic Cards" in Proceedings of the 22nd ACM SIGSAC Conference on 
+//   Computer and Communications Security, 2015
+//-----------------------------------------------------------------------------
+//
+// brute forcing is based on @aczids bitsliced brute forcer
+// https://github.com/aczid/crypto1_bs with some modifications. Mainly:
+// - don't rollback. Start with 2nd byte of nonce instead
+// - reuse results of filter subfunctions
+// - reuse results of previous nonces if some first bits are identical
+// 
+//-----------------------------------------------------------------------------
+// aczid's Copyright notice:
+//
+// Bit-sliced Crypto-1 brute-forcing implementation
+// Builds on the data structures returned by CraptEV1 craptev1_get_space(nonces, threshold, uid)
+/*
+Copyright (c) 2015-2016 Aram Verstegen
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HARDNESTED_BITARRAY_CORE_H__
+#define HARDNESTED_BITARRAY_CORE_H__
+
+#include <stdint.h>
+
+extern uint32_t *malloc_bitarray(uint32_t x);
+extern void free_bitarray(uint32_t *x);
+extern uint32_t bitcount(uint32_t a);
+extern uint32_t count_states(uint32_t *A);
+extern void bitarray_AND(uint32_t *A, uint32_t *B);
+extern void bitarray_low20_AND(uint32_t *A, uint32_t *B);
+extern uint32_t count_bitarray_AND(uint32_t *A, uint32_t *B);
+extern uint32_t count_bitarray_low20_AND(uint32_t *A, uint32_t *B);
+extern void bitarray_AND4(uint32_t *A, uint32_t *B, uint32_t *C, uint32_t *D);
+extern void bitarray_OR(uint32_t *A, uint32_t *B);
+extern uint32_t count_bitarray_AND2(uint32_t *A, uint32_t *B);
+extern uint32_t count_bitarray_AND3(uint32_t *A, uint32_t *B, uint32_t *C);
+extern uint32_t count_bitarray_AND4(uint32_t *A, uint32_t *B, uint32_t *C, uint32_t *D);
+
+#endif
diff --git a/client/hardnested/hardnested_bruteforce.c b/client/hardnested/hardnested_bruteforce.c
new file mode 100644 (file)
index 0000000..3218c1a
--- /dev/null
@@ -0,0 +1,471 @@
+//-----------------------------------------------------------------------------
+// Copyright (C) 2016, 2017 by piwi
+//
+// This code is licensed to you under the terms of the GNU GPL, version 2 or,
+// at your option, any later version. See the LICENSE.txt file for the text of
+// the license.
+//-----------------------------------------------------------------------------
+// Implements a card only attack based on crypto text (encrypted nonces
+// received during a nested authentication) only. Unlike other card only
+// attacks this doesn't rely on implementation errors but only on the
+// inherent weaknesses of the crypto1 cypher. Described in
+//   Carlo Meijer, Roel Verdult, "Ciphertext-only Cryptanalysis on Hardened
+//   Mifare Classic Cards" in Proceedings of the 22nd ACM SIGSAC Conference on 
+//   Computer and Communications Security, 2015
+//-----------------------------------------------------------------------------
+//
+// brute forcing is based on @aczids bitsliced brute forcer
+// https://github.com/aczid/crypto1_bs with some modifications. Mainly:
+// - don't rollback. Start with 2nd byte of nonce instead
+// - reuse results of filter subfunctions
+// - reuse results of previous nonces if some first bits are identical
+// 
+//-----------------------------------------------------------------------------
+// aczid's Copyright notice:
+//
+// Bit-sliced Crypto-1 brute-forcing implementation
+// Builds on the data structures returned by CraptEV1 craptev1_get_space(nonces, threshold, uid)
+/*
+Copyright (c) 2015-2016 Aram Verstegen
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "hardnested_bruteforce.h"
+
+#include <inttypes.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <pthread.h>
+#include <string.h>
+#include <malloc.h>
+#include "proxmark3.h"
+#include "cmdhfmfhard.h"
+#include "hardnested_bf_core.h"
+#include "ui.h"
+#include "util.h"
+#include "crapto1/crapto1.h"
+#include "parity.h"
+
+#define NUM_BRUTE_FORCE_THREADS                        (num_CPUs())
+#define DEFAULT_BRUTE_FORCE_RATE               (120000000.0)           // if benchmark doesn't succeed
+#define TEST_BENCH_SIZE                                        (6000)                          // number of odd and even states for brute force benchmark
+#define TEST_BENCH_FILENAME                            "hardnested/bf_bench_data.bin"
+//#define WRITE_BENCH_FILE
+
+// debugging options
+#define DEBUG_KEY_ELIMINATION
+// #define DEBUG_BRUTE_FORCE
+
+typedef enum {
+       EVEN_STATE = 0,
+       ODD_STATE = 1
+} odd_even_t;
+
+static uint32_t nonces_to_bruteforce = 0;
+static uint32_t bf_test_nonce[256];
+static uint8_t bf_test_nonce_2nd_byte[256];
+static uint8_t bf_test_nonce_par[256];
+static uint32_t bucket_count = 0;
+static statelist_t* buckets[128];
+static uint32_t keys_found = 0;
+static uint64_t num_keys_tested;
+
+
+uint8_t trailing_zeros(uint8_t byte) 
+{
+       static const uint8_t trailing_zeros_LUT[256] = {
+               8, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+               4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+               5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+               4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+               6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+               4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+               5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+               4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+               7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+               4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+               5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+               4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+               6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+               4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+               5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+               4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
+       };
+
+       return trailing_zeros_LUT[byte];
+}
+
+
+bool verify_key(uint32_t cuid, noncelist_t *nonces, uint8_t *best_first_bytes, uint32_t odd, uint32_t even)
+{
+       struct Crypto1State pcs;
+       for (uint16_t test_first_byte = 1; test_first_byte < 256; test_first_byte++) {
+               noncelistentry_t *test_nonce = nonces[best_first_bytes[test_first_byte]].first;
+               while (test_nonce != NULL) {
+                       pcs.odd = odd;
+                       pcs.even = even;
+                       lfsr_rollback_byte(&pcs, (cuid >> 24) ^ best_first_bytes[0], true);
+                       for (int8_t byte_pos = 3; byte_pos >= 0; byte_pos--) {
+                               uint8_t test_par_enc_bit = (test_nonce->par_enc >> byte_pos) & 0x01;                    // the encoded parity bit
+                               uint8_t test_byte_enc = (test_nonce->nonce_enc >> (8*byte_pos)) & 0xff;                 // the encoded nonce byte
+                               uint8_t test_byte_dec = crypto1_byte(&pcs, test_byte_enc /* ^ (cuid >> (8*byte_pos)) */, true) ^ test_byte_enc;         // decode the nonce byte        
+                               uint8_t ks_par = filter(pcs.odd);                                                                                               // the keystream bit to encode/decode the parity bit
+                               uint8_t test_par_enc2 = ks_par ^ evenparity8(test_byte_dec);                                    // determine the decoded byte's parity and encode it
+                               if (test_par_enc_bit != test_par_enc2) {
+                                       return false;
+                               }
+                       }
+                       test_nonce = test_nonce->next;
+               }
+       }
+       return true;
+}
+
+
+static void* crack_states_thread(void* x){
+
+       struct arg {
+               bool silent;
+               int thread_ID;
+               uint32_t cuid;
+               uint32_t num_acquired_nonces;
+               uint64_t maximum_states;
+               noncelist_t *nonces;
+               uint8_t* best_first_bytes;
+       } *thread_arg;
+
+       thread_arg = (struct arg *)x;
+    const int thread_id = thread_arg->thread_ID;
+    uint32_t current_bucket = thread_id;
+    while(current_bucket < bucket_count){
+        statelist_t *bucket = buckets[current_bucket];
+        if(bucket){
+#if defined (DEBUG_BRUTE_FORCE)        
+                       printf("Thread %u starts working on bucket %u\n", thread_id, current_bucket);
+#endif                 
+            const uint64_t key = crack_states_bitsliced(thread_arg->cuid, thread_arg->best_first_bytes, bucket, &keys_found, &num_keys_tested, nonces_to_bruteforce, bf_test_nonce_2nd_byte, thread_arg->nonces);
+            if(key != -1){
+                __sync_fetch_and_add(&keys_found, 1);
+                               char progress_text[80];
+                               sprintf(progress_text, "Brute force phase completed. Key found: %012" PRIx64, key);
+                               hardnested_print_progress(thread_arg->num_acquired_nonces, progress_text, 0.0, 0);
+                break;
+            } else if(keys_found){
+                break;
+            } else {
+                               if (!thread_arg->silent) {
+                                       char progress_text[80];
+                                       sprintf(progress_text, "Brute force phase: %6.02f%%", 100.0*(float)num_keys_tested/(float)(thread_arg->maximum_states));
+                                       float remaining_bruteforce = thread_arg->nonces[thread_arg->best_first_bytes[0]].expected_num_brute_force - (float)num_keys_tested/2;
+                                       hardnested_print_progress(thread_arg->num_acquired_nonces, progress_text, remaining_bruteforce, 5000);
+                               }
+            }
+        }
+        current_bucket += NUM_BRUTE_FORCE_THREADS;
+    }
+    return NULL;
+}
+
+
+void prepare_bf_test_nonces(noncelist_t *nonces, uint8_t best_first_byte)
+{
+       // we do bitsliced brute forcing with best_first_bytes[0] only.
+       // Extract the corresponding 2nd bytes
+       noncelistentry_t *test_nonce = nonces[best_first_byte].first;
+       uint32_t i = 0;
+       while (test_nonce != NULL) {
+               bf_test_nonce[i] = test_nonce->nonce_enc;
+               bf_test_nonce_par[i] = test_nonce->par_enc;
+               bf_test_nonce_2nd_byte[i] = (test_nonce->nonce_enc >> 16) & 0xff;
+               test_nonce = test_nonce->next;
+               i++;
+       }
+       nonces_to_bruteforce = i;
+
+       // printf("Nonces to bruteforce: %d\n", nonces_to_bruteforce);
+       // printf("Common bits of first 4 2nd nonce bytes (before sorting): %u %u %u\n",
+                       // trailing_zeros(bf_test_nonce_2nd_byte[1] ^ bf_test_nonce_2nd_byte[0]),
+                       // trailing_zeros(bf_test_nonce_2nd_byte[2] ^ bf_test_nonce_2nd_byte[1]),
+                       // trailing_zeros(bf_test_nonce_2nd_byte[3] ^ bf_test_nonce_2nd_byte[2]));
+       
+       uint8_t best_4[4] = {0};
+       int sum_best = -1;
+       for (uint16_t n1 = 0; n1 < nonces_to_bruteforce; n1++) {
+               for (uint16_t n2 = 0; n2 < nonces_to_bruteforce; n2++) {
+                       if (n2 != n1) {
+                               for (uint16_t n3 = 0; n3 < nonces_to_bruteforce; n3++) {
+                                       if ((n3 != n2 && n3 != n1) || nonces_to_bruteforce < 3 
+                                           // && trailing_zeros(bf_test_nonce_2nd_byte[n1] ^ bf_test_nonce_2nd_byte[n2]) 
+                                                   // > trailing_zeros(bf_test_nonce_2nd_byte[n2] ^ bf_test_nonce_2nd_byte[n3])
+                                                       ) {
+                                               for (uint16_t n4 = 0; n4 < nonces_to_bruteforce; n4++) {
+                                                       if ((n4 != n3 && n4 != n2 && n4 != n1) || nonces_to_bruteforce < 4
+                                                               // && trailing_zeros(bf_test_nonce_2nd_byte[n2] ^ bf_test_nonce_2nd_byte[n3]) 
+                                                               // > trailing_zeros(bf_test_nonce_2nd_byte[n3] ^ bf_test_nonce_2nd_byte[n4])
+                                                               ) {
+                                                               int sum = nonces_to_bruteforce > 1 ? trailing_zeros(bf_test_nonce_2nd_byte[n1] ^ bf_test_nonce_2nd_byte[n2]) : 0.0
+                                                                         + nonces_to_bruteforce > 2 ? trailing_zeros(bf_test_nonce_2nd_byte[n2] ^ bf_test_nonce_2nd_byte[n3]) : 0.0
+                                                                                 + nonces_to_bruteforce > 3 ? trailing_zeros(bf_test_nonce_2nd_byte[n3] ^ bf_test_nonce_2nd_byte[n4]) : 0.0;
+                                                               if (sum > sum_best) {
+                                                                       sum_best = sum;
+                                                                       best_4[0] = n1;
+                                                                       best_4[1] = n2;
+                                                                       best_4[2] = n3;
+                                                                       best_4[3] = n4;
+                                                               }
+                                                       }
+                                               }
+                                       }
+                               }
+                       }
+               }
+       }
+       
+       uint32_t bf_test_nonce_temp[4];
+       uint8_t bf_test_nonce_par_temp[4];
+       uint8_t bf_test_nonce_2nd_byte_temp[4];
+       for (uint8_t i = 0; i < 4 && i < nonces_to_bruteforce; i++) {
+               bf_test_nonce_temp[i] = bf_test_nonce[best_4[i]]; 
+               
+               bf_test_nonce_par_temp[i] = bf_test_nonce_par[best_4[i]];
+               bf_test_nonce_2nd_byte_temp[i] = bf_test_nonce_2nd_byte[best_4[i]];
+       }
+       for (uint8_t i = 0; i < 4 && i < nonces_to_bruteforce; i++) {
+               bf_test_nonce[i] = bf_test_nonce_temp[i];
+               bf_test_nonce_par[i] = bf_test_nonce_par_temp[i];
+               bf_test_nonce_2nd_byte[i] = bf_test_nonce_2nd_byte_temp[i];
+       }
+}
+
+
+#if defined (WRITE_BENCH_FILE)
+static void write_benchfile(statelist_t *candidates) {
+
+       printf("Writing brute force benchmark data...");
+       FILE *benchfile = fopen(TEST_BENCH_FILENAME, "wb");
+       fwrite(&nonces_to_bruteforce, 1, sizeof(nonces_to_bruteforce), benchfile);
+       for (uint32_t i = 0; i < nonces_to_bruteforce; i++) {
+               fwrite(&(bf_test_nonce[i]), 1, sizeof(bf_test_nonce[i]), benchfile);
+               fwrite(&(bf_test_nonce_par[i]), 1, sizeof(bf_test_nonce_par[i]), benchfile);
+       }
+       uint32_t num_states = MIN(candidates->len[EVEN_STATE], TEST_BENCH_SIZE);
+       fwrite(&num_states, 1, sizeof(num_states), benchfile);
+       for (uint32_t i = 0; i < num_states; i++) {
+               fwrite(&(candidates->states[EVEN_STATE][i]), 1, sizeof(uint32_t), benchfile);
+       }
+       num_states = MIN(candidates->len[ODD_STATE], TEST_BENCH_SIZE);
+       fwrite(&num_states, 1, sizeof(num_states), benchfile);
+       for (uint32_t i = 0; i < num_states; i++) {
+               fwrite(&(candidates->states[ODD_STATE][i]), 1, sizeof(uint32_t), benchfile);
+       }
+       fclose(benchfile);
+       printf("done.\n");
+}
+#endif
+
+
+bool brute_force_bs(float *bf_rate, statelist_t *candidates, uint32_t cuid, uint32_t num_acquired_nonces, uint64_t maximum_states, noncelist_t *nonces, uint8_t *best_first_bytes)
+{
+#if defined (WRITE_BENCH_FILE)
+       write_benchfile(candidates);
+#endif
+       bool silent = (bf_rate != NULL);
+       
+       // if (!silent) {
+               // PrintAndLog("Brute force phase starting.");
+               // PrintAndLog("Using %u-bit bitslices", MAX_BITSLICES);
+       // }
+       
+       keys_found = 0;
+       num_keys_tested = 0;
+
+       bitslice_test_nonces(nonces_to_bruteforce, bf_test_nonce, bf_test_nonce_par);
+       
+       // count number of states to go
+       bucket_count = 0;
+       for (statelist_t *p = candidates; p != NULL; p = p->next) {
+               if (p->states[ODD_STATE] != NULL && p->states[EVEN_STATE] != NULL) {
+                       buckets[bucket_count] = p;
+                       bucket_count++;
+               }
+       }
+
+       uint64_t start_time = msclock();
+       // enumerate states using all hardware threads, each thread handles one bucket
+       // if (!silent) {
+               // PrintAndLog("Starting %u cracking threads to search %u buckets containing a total of %" PRIu64" states...\n", NUM_BRUTE_FORCE_THREADS, bucket_count, maximum_states);
+               // printf("Common bits of first 4 2nd nonce bytes: %u %u %u\n",
+                       // trailing_zeros(bf_test_nonce_2nd_byte[1] ^ bf_test_nonce_2nd_byte[0]),
+                       // trailing_zeros(bf_test_nonce_2nd_byte[2] ^ bf_test_nonce_2nd_byte[1]),
+                       // trailing_zeros(bf_test_nonce_2nd_byte[3] ^ bf_test_nonce_2nd_byte[2]));
+       // }
+
+       pthread_t threads[NUM_BRUTE_FORCE_THREADS];
+       struct args {
+               bool silent;
+               int thread_ID;
+               uint32_t cuid;
+               uint32_t num_acquired_nonces;
+               uint64_t maximum_states;
+               noncelist_t *nonces;
+               uint8_t *best_first_bytes;
+       } thread_args[NUM_BRUTE_FORCE_THREADS];
+       
+       for(uint32_t i = 0; i < NUM_BRUTE_FORCE_THREADS; i++){
+               thread_args[i].thread_ID = i;
+               thread_args[i].silent = silent;
+               thread_args[i].cuid = cuid;
+               thread_args[i].num_acquired_nonces = num_acquired_nonces;
+               thread_args[i].maximum_states = maximum_states;
+               thread_args[i].nonces = nonces;
+               thread_args[i].best_first_bytes = best_first_bytes;
+               pthread_create(&threads[i], NULL, crack_states_thread, (void*)&thread_args[i]);
+       }
+       for(uint32_t i = 0; i < NUM_BRUTE_FORCE_THREADS; i++){
+               pthread_join(threads[i], 0);
+       }
+
+       uint64_t elapsed_time = msclock() - start_time;
+
+       // if (!silent) {
+               // printf("Brute force completed after testing %" PRIu64" (2^%1.1f) keys in %1.1f seconds at a rate of %1.0f (2^%1.1f) keys per second.\n", 
+                       // num_keys_tested,
+                       // log(num_keys_tested) / log(2.0),
+                       // (float)elapsed_time/1000.0,
+                       // (float)num_keys_tested / ((float)elapsed_time / 1000.0), 
+                       // log((float)num_keys_tested / ((float)elapsed_time/1000.0)) / log(2.0));
+       // }
+
+       if (bf_rate != NULL) {
+               *bf_rate = (float)num_keys_tested / ((float)elapsed_time / 1000.0);
+       }
+       
+       return (keys_found != 0);
+}
+
+
+static bool read_bench_data(statelist_t *test_candidates) {
+
+       size_t bytes_read = 0;
+       uint32_t temp = 0;
+       uint32_t num_states = 0;
+       uint32_t states_read = 0;
+       
+       char bench_file_path[strlen(get_my_executable_directory()) + strlen(TEST_BENCH_FILENAME) + 1];
+       strcpy(bench_file_path, get_my_executable_directory());
+       strcat(bench_file_path, TEST_BENCH_FILENAME);
+
+       FILE *benchfile = fopen(bench_file_path, "rb");
+       if (benchfile == NULL) {
+               return false;
+       }
+       bytes_read = fread(&nonces_to_bruteforce, 1, sizeof(nonces_to_bruteforce), benchfile);
+       if (bytes_read != sizeof(nonces_to_bruteforce)) { 
+               fclose(benchfile);
+               return false;
+       }
+       for (uint16_t i = 0; i < nonces_to_bruteforce && i < 256; i++) {
+               bytes_read = fread(&bf_test_nonce[i], 1, sizeof(uint32_t), benchfile);
+               if (bytes_read != sizeof(uint32_t)) {
+                       fclose(benchfile);
+                       return false;
+               }
+               bf_test_nonce_2nd_byte[i] = (bf_test_nonce[i] >> 16) & 0xff;
+               bytes_read = fread(&bf_test_nonce_par[i], 1, sizeof(uint8_t), benchfile);
+               if (bytes_read != sizeof(uint8_t)) {
+                       fclose(benchfile);
+                       return false;
+               }
+       }
+       bytes_read = fread(&num_states, 1, sizeof(uint32_t), benchfile); 
+       if (bytes_read != sizeof(uint32_t)) {
+               fclose(benchfile);
+               return false;
+       }
+       for (states_read = 0; states_read < MIN(num_states, TEST_BENCH_SIZE); states_read++) {
+               bytes_read = fread(test_candidates->states[EVEN_STATE] + states_read, 1, sizeof(uint32_t), benchfile);
+               if (bytes_read != sizeof(uint32_t)) {
+                       fclose(benchfile);
+                       return false;
+               }
+       }
+       for (uint32_t i = states_read; i < TEST_BENCH_SIZE; i++) {
+               test_candidates->states[EVEN_STATE][i] = test_candidates->states[EVEN_STATE][i-states_read];
+       }
+       for (uint32_t i = states_read; i < num_states; i++) {
+               bytes_read = fread(&temp, 1, sizeof(uint32_t), benchfile);
+               if (bytes_read != sizeof(uint32_t)) {
+                       fclose(benchfile);
+                       return false;
+               }
+       }
+       for (states_read = 0; states_read < MIN(num_states, TEST_BENCH_SIZE); states_read++) {
+               bytes_read = fread(test_candidates->states[ODD_STATE] + states_read, 1, sizeof(uint32_t), benchfile);
+               if (bytes_read != sizeof(uint32_t)) {
+                       fclose(benchfile);
+                       return false;
+               }
+       }
+       for (uint32_t i = states_read; i < TEST_BENCH_SIZE; i++) {
+               test_candidates->states[ODD_STATE][i] = test_candidates->states[ODD_STATE][i-states_read];
+       }
+               
+       fclose(benchfile);
+       return true;    
+}
+
+
+float brute_force_benchmark()
+{
+       statelist_t test_candidates[NUM_BRUTE_FORCE_THREADS];
+
+       test_candidates[0].states[ODD_STATE] = malloc((TEST_BENCH_SIZE+1) * sizeof(uint32_t));
+       test_candidates[0].states[EVEN_STATE] = malloc((TEST_BENCH_SIZE+1) * sizeof(uint32_t));
+       for (uint8_t i = 0; i < NUM_BRUTE_FORCE_THREADS - 1; i++){
+               test_candidates[i].next = test_candidates + i + 1;
+               test_candidates[i+1].states[ODD_STATE] = test_candidates[0].states[ODD_STATE];
+               test_candidates[i+1].states[EVEN_STATE] = test_candidates[0].states[EVEN_STATE];
+       } 
+       test_candidates[NUM_BRUTE_FORCE_THREADS-1].next = NULL;
+
+       if (!read_bench_data(test_candidates)) {
+               PrintAndLog("Couldn't read benchmark data. Assuming brute force rate of %1.0f states per second", DEFAULT_BRUTE_FORCE_RATE);
+               return DEFAULT_BRUTE_FORCE_RATE;
+       }
+
+       for (uint8_t i = 0; i < NUM_BRUTE_FORCE_THREADS; i++) {
+               test_candidates[i].len[ODD_STATE] = TEST_BENCH_SIZE;
+               test_candidates[i].len[EVEN_STATE] = TEST_BENCH_SIZE;
+               test_candidates[i].states[ODD_STATE][TEST_BENCH_SIZE] = -1;
+               test_candidates[i].states[EVEN_STATE][TEST_BENCH_SIZE] = -1;
+       }
+       
+       uint64_t maximum_states = TEST_BENCH_SIZE*TEST_BENCH_SIZE*(uint64_t)NUM_BRUTE_FORCE_THREADS;
+
+       float bf_rate;
+       brute_force_bs(&bf_rate, test_candidates, 0, 0, maximum_states, NULL, 0);
+       
+       free(test_candidates[0].states[ODD_STATE]);
+       free(test_candidates[0].states[EVEN_STATE]);
+
+       return bf_rate;
+}
+
+
diff --git a/client/hardnested/hardnested_bruteforce.h b/client/hardnested/hardnested_bruteforce.h
new file mode 100644 (file)
index 0000000..d4f6348
--- /dev/null
@@ -0,0 +1,36 @@
+//-----------------------------------------------------------------------------
+// Copyright (C) 2016, 2017 by piwi
+//
+// This code is licensed to you under the terms of the GNU GPL, version 2 or,
+// at your option, any later version. See the LICENSE.txt file for the text of
+// the license.
+//-----------------------------------------------------------------------------
+// Implements a card only attack based on crypto text (encrypted nonces
+// received during a nested authentication) only. Unlike other card only
+// attacks this doesn't rely on implementation errors but only on the
+// inherent weaknesses of the crypto1 cypher. Described in
+//   Carlo Meijer, Roel Verdult, "Ciphertext-only Cryptanalysis on Hardened
+//   Mifare Classic Cards" in Proceedings of the 22nd ACM SIGSAC Conference on 
+//   Computer and Communications Security, 2015
+//-----------------------------------------------------------------------------
+
+#ifndef HARDNESTED_BRUTEFORCE_H__
+#define HARDNESTED_BRUTEFORCE_H__
+
+#include <stdint.h>
+#include <stdbool.h>
+#include "cmdhfmfhard.h"
+
+typedef struct {
+       uint32_t *states[2];
+       uint32_t len[2];
+       void* next;
+} statelist_t;
+
+extern void prepare_bf_test_nonces(noncelist_t *nonces, uint8_t best_first_byte);
+extern bool brute_force_bs(float *bf_rate, statelist_t *candidates, uint32_t cuid, uint32_t num_acquired_nonces, uint64_t maximum_states, noncelist_t *nonces, uint8_t *best_first_bytes);
+extern float brute_force_benchmark();
+extern uint8_t trailing_zeros(uint8_t byte); 
+extern bool verify_key(uint32_t cuid, noncelist_t *nonces, uint8_t *best_first_bytes, uint32_t odd, uint32_t even);
+
+#endif
diff --git a/client/hardnested/hardnested_tables.c b/client/hardnested/hardnested_tables.c
new file mode 100644 (file)
index 0000000..732b386
--- /dev/null
@@ -0,0 +1,591 @@
+//-----------------------------------------------------------------------------
+// Copyright (C) 2015, 2016 by piwi
+//
+// This code is licensed to you under the terms of the GNU GPL, version 2 or,
+// at your option, any later version. See the LICENSE.txt file for the text of
+// the license.
+//-----------------------------------------------------------------------------
+// Implements a card only attack based on crypto text (encrypted nonces
+// received during a nested authentication) only. Unlike other card only
+// attacks this doesn't rely on implementation errors but only on the
+// inherent weaknesses of the crypto1 cypher. Described in
+//   Carlo Meijer, Roel Verdult, "Ciphertext-only Cryptanalysis on Hardened
+//   Mifare Classic Cards" in Proceedings of the 22nd ACM SIGSAC Conference on 
+//   Computer and Communications Security, 2015
+//-----------------------------------------------------------------------------
+//
+// This program calculates tables with possible states for a given
+// bitflip property.
+//
+//-----------------------------------------------------------------------------
+
+#include <inttypes.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <time.h>
+#include "crapto1/crapto1.h"
+#include "parity.h"
+
+
+#define NUM_PART_SUMS          9
+#define BITFLIP_2ND_BYTE       0x0200
+
+typedef enum {
+       EVEN_STATE = 0,
+       ODD_STATE = 1
+} odd_even_t;
+
+
+static uint16_t PartialSumProperty(uint32_t state, odd_even_t odd_even)
+{ 
+       uint16_t sum = 0;
+       for (uint16_t j = 0; j < 16; j++) {
+               uint32_t st = state;
+               uint16_t part_sum = 0;
+               if (odd_even == ODD_STATE) {
+                       for (uint16_t i = 0; i < 5; i++) {
+                               part_sum ^= filter(st);
+                               st = (st << 1) | ((j >> (3-i)) & 0x01) ;
+                       }
+                       part_sum ^= 1;          // XOR 1 cancelled out for the other 8 bits
+               } else {
+                       for (uint16_t i = 0; i < 4; i++) {
+                               st = (st << 1) | ((j >> (3-i)) & 0x01) ;
+                               part_sum ^= filter(st);
+                       }
+               }
+               sum += part_sum;
+       }
+       return sum;
+}
+
+