X-Git-Url: https://git.zerfleddert.de/cgi-bin/gitweb.cgi/proxmark3-svn/blobdiff_plain/a531720ae6e8c9882c96ba4968e5c7ce98e7fb4c..b4a6775b5e9ee1c50047da597a3cc66ce752ba4f:/client/cmdhfmfhard.c?ds=sidebyside

diff --git a/client/cmdhfmfhard.c b/client/cmdhfmfhard.c
index 6e1ebc85..acdea715 100644
--- a/client/cmdhfmfhard.c
+++ b/client/cmdhfmfhard.c
@@ -1,6 +1,6 @@
 //-----------------------------------------------------------------------------
 // Copyright (C) 2015 piwi
-//
+// fiddled with 2016 Azcid (hardnested bitsliced Bruteforce imp)
 // This code is licensed to you under the terms of the GNU GPL, version 2 or,
 // at your option, any later version. See the LICENSE.txt file for the text of
 // the license.
@@ -14,24 +14,32 @@
 //   Computer and Communications Security, 2015
 //-----------------------------------------------------------------------------
 
-#include <stdio.h>
 #include <stdlib.h> 
+#include <stdio.h>
 #include <string.h>
 #include <pthread.h>
+#include <locale.h>
 #include <math.h>
 #include "proxmark3.h"
 #include "cmdmain.h"
 #include "ui.h"
 #include "util.h"
 #include "nonce2key/crapto1.h"
+#include "nonce2key/crypto1_bs.h"
 #include "parity.h"
-
-// uint32_t test_state_odd = 0;
-// uint32_t test_state_even = 0;
+#ifdef __WIN32
+	#include <windows.h>
+#endif
+// <malloc.h> is not available on Apple/macOS, where the allocator is declared elsewhere (e.g. <stdlib.h>).
+#ifndef __APPLE__
+	#include <malloc.h>
+#endif
+#include <assert.h>
 
 #define CONFIDENCE_THRESHOLD	0.95		// Collect nonces until we are certain enough that the following brute force is successfull
-#define GOOD_BYTES_REQUIRED		20
+#define GOOD_BYTES_REQUIRED		13          // default 28, could be smaller == faster
 
+#define END_OF_LIST_MARKER		0xFFFFFFFF
 
 static const float p_K[257] = {		// the probability that a random nonce has a Sum Property == K 
 	0.0290, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 
@@ -67,7 +75,6 @@ static const float p_K[257] = {		// the probability that a random nonce has a Su
 	0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
 	0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
 	0.0290 };
-
 		
 typedef struct noncelistentry {
 	uint32_t nonce_enc;
@@ -86,17 +93,18 @@ typedef struct noncelist {
 	float score1, score2;
 } noncelist_t;
 
-
-static uint32_t cuid;
+static size_t nonces_to_bruteforce = 0;
+static noncelistentry_t *brute_force_nonces[256];
+static uint32_t cuid = 0;
 static noncelist_t nonces[256];
+static uint8_t best_first_bytes[256];
 static uint16_t first_byte_Sum = 0;
 static uint16_t first_byte_num = 0;
 static uint16_t num_good_first_bytes = 0;
 static uint64_t maximum_states = 0;
 static uint64_t known_target_key;
-
-#define MAX_BEST_BYTES 256
-static uint8_t best_first_bytes[MAX_BEST_BYTES];
+static bool write_stats = false;
+static FILE *fstats = NULL;
 
 
 typedef enum {
@@ -122,10 +130,8 @@ typedef struct {
 
 static partial_indexed_statelist_t partial_statelist[17];
 static partial_indexed_statelist_t statelist_bitflip;
-
 static statelist_t *candidates = NULL;
 
-
 static int add_nonce(uint32_t nonce_enc, uint8_t par_enc) 
 {
 	uint8_t first_byte = nonce_enc >> 24;
@@ -168,6 +174,11 @@ static int add_nonce(uint32_t nonce_enc, uint8_t par_enc)
 	p2->nonce_enc = nonce_enc;
 	p2->par_enc = par_enc;
 
+    if(nonces_to_bruteforce < 256){
+        brute_force_nonces[nonces_to_bruteforce] = p2;
+        nonces_to_bruteforce++;
+    }
+
 	nonces[first_byte].num++;
 	nonces[first_byte].Sum += evenparity32((nonce_enc & 0x00ff0000) | (par_enc & 0x04));
 	nonces[first_byte].updated = true;   // indicates that we need to recalculate the Sum(a8) probability for this first byte
@@ -175,6 +186,37 @@ static int add_nonce(uint32_t nonce_enc, uint8_t par_enc)
 	return (1);				// new nonce added
 }
 
+static void init_nonce_memory(void)	// resets the global counters and all 256 per-first-byte nonce records
+{
+	first_byte_num = 0;
+	first_byte_Sum = 0;
+	num_good_first_bytes = 0;
+	for (noncelist_t *entry = nonces; entry < nonces + 256; entry++) {
+		entry->first = NULL;		// the list head is only dropped here, nothing is freed
+		entry->num = 0;
+		entry->Sum = 0;
+		entry->Sum8_guess = 0;
+		entry->Sum8_prob = 0.0;
+		entry->updated = true;
+	}
+}
+
+static void free_nonce_list(noncelistentry_t *p)
+{
+	// walk the singly linked list, saving each successor before its predecessor is released
+	while (p != NULL) {
+		noncelistentry_t *successor = p->next;
+		free(p);
+		p = successor;
+	}
+}
+
+static void free_nonces_memory(void)	// frees the nonce list attached to each of the 256 first bytes
+{
+	for (noncelist_t *entry = nonces; entry < nonces + 256; entry++) {
+		free_nonce_list(entry->first);
+	}
+}
 
 static uint16_t PartialSumProperty(uint32_t state, odd_even_t odd_even)
 { 
@@ -199,14 +241,12 @@ static uint16_t PartialSumProperty(uint32_t state, odd_even_t odd_even)
 	return sum;
 }
 
-
-static uint16_t SumProperty(struct Crypto1State *s)
-{
-	uint16_t sum_odd = PartialSumProperty(s->odd, ODD_STATE);
-	uint16_t sum_even = PartialSumProperty(s->even, EVEN_STATE);
-	return (sum_odd*(16-sum_even) + (16-sum_odd)*sum_even);
-}
-
+// static uint16_t SumProperty(struct Crypto1State *s)
+// {
+	// uint16_t sum_odd = PartialSumProperty(s->odd, ODD_STATE);
+	// uint16_t sum_even = PartialSumProperty(s->even, EVEN_STATE);
+	// return (sum_odd*(16-sum_even) + (16-sum_odd)*sum_even);
+// }
 
 static double p_hypergeometric(uint16_t N, uint16_t K, uint16_t n, uint16_t k) 
 {
@@ -245,29 +285,24 @@ static double p_hypergeometric(uint16_t N, uint16_t K, uint16_t n, uint16_t k)
 		}
 	}
 }
-	
-	
+
 static float sum_probability(uint16_t K, uint16_t n, uint16_t k)
 {
 	const uint16_t N = 256;
-	
-	
 
-		if (k > K || p_K[K] == 0.0) return 0.0;
+	if (k > K || p_K[K] == 0.0) return 0.0;
 
-		double p_T_is_k_when_S_is_K = p_hypergeometric(N, K, n, k);
-		double p_S_is_K = p_K[K];
-		double p_T_is_k = 0;
-		for (uint16_t i = 0; i <= 256; i++) {
-			if (p_K[i] != 0.0) {
-				p_T_is_k += p_K[i] * p_hypergeometric(N, i, n, k);
-			}
+	double p_T_is_k_when_S_is_K = p_hypergeometric(N, K, n, k);
+	double p_S_is_K = p_K[K];
+	double p_T_is_k = 0;
+	for (uint16_t i = 0; i <= 256; i++) {
+		if (p_K[i] != 0.0) {
+			p_T_is_k += p_K[i] * p_hypergeometric(N, i, n, k);
 		}
-		return(p_T_is_k_when_S_is_K * p_S_is_K / p_T_is_k);
+	}
+	return(p_T_is_k_when_S_is_K * p_S_is_K / p_T_is_k);
 }
 
-		
-
 	
 static inline uint_fast8_t common_bits(uint_fast8_t bytes_diff) 
 {
@@ -293,16 +328,15 @@ static inline uint_fast8_t common_bits(uint_fast8_t bytes_diff)
 	return common_bits_LUT[bytes_diff];
 }
 
-
 static void Tests()
 {
-	printf("Tests: Partial Statelist sizes\n");
-	for (uint16_t i = 0; i <= 16; i+=2) {
-		printf("Partial State List Odd [%2d] has %8d entries\n", i, partial_statelist[i].len[ODD_STATE]);
-	}
-	for (uint16_t i = 0; i <= 16; i+=2) {
-		printf("Partial State List Even	[%2d] has %8d entries\n", i, partial_statelist[i].len[EVEN_STATE]);
-	}
+	// printf("Tests: Partial Statelist sizes\n");
+	// for (uint16_t i = 0; i <= 16; i+=2) {
+		// printf("Partial State List Odd [%2d] has %8d entries\n", i, partial_statelist[i].len[ODD_STATE]);
+	// }
+	// for (uint16_t i = 0; i <= 16; i+=2) {
+		// printf("Partial State List Even	[%2d] has %8d entries\n", i, partial_statelist[i].len[EVEN_STATE]);
+	// }
 	
  	// #define NUM_STATISTICS 100000
 	// uint32_t statistics_odd[17];
@@ -375,69 +409,67 @@ static void Tests()
 	// printf("p_hypergeometric(256, 1, 1, 1) = %0.8f\n", p_hypergeometric(256, 1, 1, 1));
 	// printf("p_hypergeometric(256, 1, 1, 0) = %0.8f\n", p_hypergeometric(256, 1, 1, 0));
 	
-	struct Crypto1State *pcs;
-	pcs = crypto1_create(0xffffffffffff);
-	printf("\nTests: for key = 0xffffffffffff:\nSum(a0) = %d\nodd_state =  0x%06x\neven_state = 0x%06x\n", 
-		SumProperty(pcs), pcs->odd & 0x00ffffff, pcs->even & 0x00ffffff);
-	crypto1_byte(pcs, (cuid >> 24) ^ best_first_bytes[0], true);
-	printf("After adding best first byte 0x%02x:\nSum(a8) = %d\nodd_state =  0x%06x\neven_state = 0x%06x\n",
-		best_first_bytes[0],
-		SumProperty(pcs),
-		pcs->odd & 0x00ffffff, pcs->even & 0x00ffffff);
-	//test_state_odd = pcs->odd & 0x00ffffff;
-	//test_state_even = pcs->even & 0x00ffffff;
-	crypto1_destroy(pcs);
-	pcs = crypto1_create(0xa0a1a2a3a4a5);
-	printf("Tests: for key = 0xa0a1a2a3a4a5:\nSum(a0) = %d\nodd_state =  0x%06x\neven_state = 0x%06x\n",
-		SumProperty(pcs), pcs->odd & 0x00ffffff, pcs->even & 0x00ffffff);
-	crypto1_byte(pcs, (cuid >> 24) ^ best_first_bytes[0], true);
-	printf("After adding best first byte 0x%02x:\nSum(a8) = %d\nodd_state =  0x%06x\neven_state = 0x%06x\n",
-		best_first_bytes[0],
-		SumProperty(pcs),
-		pcs->odd & 0x00ffffff, pcs->even & 0x00ffffff);
-	// test_state_odd = pcs->odd & 0x00ffffff;
-	// test_state_even = pcs->even & 0x00ffffff;
-	crypto1_destroy(pcs);
-	pcs = crypto1_create(0xa6b9aa97b955);
-	printf("Tests: for key = 0xa6b9aa97b955:\nSum(a0) = %d\nodd_state =  0x%06x\neven_state = 0x%06x\n",
-		SumProperty(pcs), pcs->odd & 0x00ffffff, pcs->even & 0x00ffffff);
-	crypto1_byte(pcs, (cuid >> 24) ^ best_first_bytes[0], true);
-	printf("After adding best first byte 0x%02x:\nSum(a8) = %d\nodd_state =  0x%06x\neven_state = 0x%06x\n",
-		best_first_bytes[0],
-		SumProperty(pcs),
-		pcs->odd & 0x00ffffff, pcs->even & 0x00ffffff);
+	// struct Crypto1State *pcs;
+	// pcs = crypto1_create(0xffffffffffff);
+	// printf("\nTests: for key = 0xffffffffffff:\nSum(a0) = %d\nodd_state =  0x%06x\neven_state = 0x%06x\n", 
+		// SumProperty(pcs), pcs->odd & 0x00ffffff, pcs->even & 0x00ffffff);
+	// crypto1_byte(pcs, (cuid >> 24) ^ best_first_bytes[0], true);
+	// printf("After adding best first byte 0x%02x:\nSum(a8) = %d\nodd_state =  0x%06x\neven_state = 0x%06x\n",
+		// best_first_bytes[0],
+		// SumProperty(pcs),
+		// pcs->odd & 0x00ffffff, pcs->even & 0x00ffffff);
+	// //test_state_odd = pcs->odd & 0x00ffffff;
+	// //test_state_even = pcs->even & 0x00ffffff;
+	// crypto1_destroy(pcs);
+	// pcs = crypto1_create(0xa0a1a2a3a4a5);
+	// printf("Tests: for key = 0xa0a1a2a3a4a5:\nSum(a0) = %d\nodd_state =  0x%06x\neven_state = 0x%06x\n",
+		// SumProperty(pcs), pcs->odd & 0x00ffffff, pcs->even & 0x00ffffff);
+	// crypto1_byte(pcs, (cuid >> 24) ^ best_first_bytes[0], true);
+	// printf("After adding best first byte 0x%02x:\nSum(a8) = %d\nodd_state =  0x%06x\neven_state = 0x%06x\n",
+		// best_first_bytes[0],
+		// SumProperty(pcs),
+		// pcs->odd & 0x00ffffff, pcs->even & 0x00ffffff);
+	// //test_state_odd = pcs->odd & 0x00ffffff;
+	// //test_state_even = pcs->even & 0x00ffffff;
+	// crypto1_destroy(pcs);
+	// pcs = crypto1_create(0xa6b9aa97b955);
+	// printf("Tests: for key = 0xa6b9aa97b955:\nSum(a0) = %d\nodd_state =  0x%06x\neven_state = 0x%06x\n",
+		// SumProperty(pcs), pcs->odd & 0x00ffffff, pcs->even & 0x00ffffff);
+	// crypto1_byte(pcs, (cuid >> 24) ^ best_first_bytes[0], true);
+	// printf("After adding best first byte 0x%02x:\nSum(a8) = %d\nodd_state =  0x%06x\neven_state = 0x%06x\n",
+		// best_first_bytes[0],
+		// SumProperty(pcs),
+		// pcs->odd & 0x00ffffff, pcs->even & 0x00ffffff);
 	//test_state_odd = pcs->odd & 0x00ffffff;
 	//test_state_even = pcs->even & 0x00ffffff;
-	crypto1_destroy(pcs);
-
+	// crypto1_destroy(pcs);
 
 	
-	printf("\nTests: number of states with BitFlipProperty: %d, (= %1.3f%% of total states)\n", statelist_bitflip.len[0], 100.0 * statelist_bitflip.len[0] / (1<<20));
+	// printf("\nTests: number of states with BitFlipProperty: %d, (= %1.3f%% of total states)\n", statelist_bitflip.len[0], 100.0 * statelist_bitflip.len[0] / (1<<20));
 
-	printf("\nTests: Actual BitFlipProperties odd/even:\n");
-	for (uint16_t i = 0; i < 256; i++) {
-		printf("[%02x]:%c%c ", i, nonces[i].BitFlip[ODD_STATE]?'o':' ', nonces[i].BitFlip[EVEN_STATE]?'e':' ');
-		if (i % 8 == 7) {
-			printf("\n");
-		}
-	}
+	// printf("\nTests: Actual BitFlipProperties odd/even:\n");
+	// for (uint16_t i = 0; i < 256; i++) {
+		// printf("[%02x]:%c  ", i, nonces[i].BitFlip[ODD_STATE]?'o':nonces[i].BitFlip[EVEN_STATE]?'e':' ');
+		// if (i % 8 == 7) {
+			// printf("\n");
+		// }
+	// }
 	
-	printf("\nTests: Best %d first bytes:\n", MAX_BEST_BYTES);
-	for (uint16_t i = 0; i < MAX_BEST_BYTES; i++) {
-		uint8_t best_byte = best_first_bytes[i];
-		printf("#%03d Byte: %02x, n = %2d, k = %2d, Sum(a8): %3d, Confidence: %2.1f%%, Bitflip: %c%c\n", 
-		//printf("#%03d Byte: %02x, n = %2d, k = %2d, Sum(a8): %3d, Confidence: %2.1f%%, Bitflip: %c%c, score1: %f, score2: %f\n", 
-			i, best_byte, 
-			nonces[best_byte].num,
-			nonces[best_byte].Sum,
-			nonces[best_byte].Sum8_guess,
-			nonces[best_byte].Sum8_prob * 100,
-			nonces[best_byte].BitFlip[ODD_STATE]?'o':' ', 
-			nonces[best_byte].BitFlip[EVEN_STATE]?'e':' '
-			//nonces[best_byte].score1,
-			//nonces[best_byte].score2
-			);
-	}
+	// printf("\nTests: Sorted First Bytes:\n");
+	// for (uint16_t i = 0; i < 256; i++) {
+		// uint8_t best_byte = best_first_bytes[i];
+		// printf("#%03d Byte: %02x, n = %3d, k = %3d, Sum(a8): %3d, Confidence: %5.1f%%, Bitflip: %c\n", 
+		// //printf("#%03d Byte: %02x, n = %3d, k = %3d, Sum(a8): %3d, Confidence: %5.1f%%, Bitflip: %c, score1: %1.5f, score2: %1.0f\n", 
+			// i, best_byte, 
+			// nonces[best_byte].num,
+			// nonces[best_byte].Sum,
+			// nonces[best_byte].Sum8_guess,
+			// nonces[best_byte].Sum8_prob * 100,
+			// nonces[best_byte].BitFlip[ODD_STATE]?'o':nonces[best_byte].BitFlip[EVEN_STATE]?'e':' '
+			// //nonces[best_byte].score1,
+			// //nonces[best_byte].score2
+			// );
+	// }
 	
 	// printf("\nTests: parity performance\n");
 	// time_t time1p = clock();
@@ -457,29 +489,28 @@ static void Tests()
 
 }
 
-
 static void sort_best_first_bytes(void)
 {
-	// first, sort based on probability for correct guess	
+	// sort based on probability for correct guess	
 	for (uint16_t i = 0; i < 256; i++ ) {
 		uint16_t j = 0;
 		float prob1 = nonces[i].Sum8_prob;
 		float prob2 = nonces[best_first_bytes[0]].Sum8_prob;
-		while (prob1 < prob2 && j < MAX_BEST_BYTES-1) {
+		while (prob1 < prob2 && j < i) {
 			prob2 = nonces[best_first_bytes[++j]].Sum8_prob;
 		}
-		if (prob1 >= prob2) {
-			for (uint16_t k = MAX_BEST_BYTES-1; k > j; k--) {
+		if (j < i) {
+			for (uint16_t k = i; k > j; k--) {
 				best_first_bytes[k] = best_first_bytes[k-1];
 			}
+		}
 			best_first_bytes[j] = i;
 		}
-	}
 
-	// determine, how many are above the CONFIDENCE_THRESHOLD
+	// determine how many are above the CONFIDENCE_THRESHOLD
 	uint16_t num_good_nonces = 0;
-	for (uint16_t i = 0; i < MAX_BEST_BYTES; i++) {
-		if (nonces[best_first_bytes[i]].Sum8_prob > CONFIDENCE_THRESHOLD) {
+	for (uint16_t i = 0; i < 256; i++) {
+		if (nonces[best_first_bytes[i]].Sum8_prob >= CONFIDENCE_THRESHOLD) {
 			++num_good_nonces;
 		}
 	}
@@ -543,12 +574,8 @@ static void sort_best_first_bytes(void)
 	
 }
 
-
 static uint16_t estimate_second_byte_sum(void) 
 {
-	for (uint16_t i = 0; i < MAX_BEST_BYTES; i++) {
-		best_first_bytes[i] = 0;
-	}
 	
 	for (uint16_t first_byte = 0; first_byte < 256; first_byte++) {
 		float Sum8_prob = 0.0;
@@ -570,8 +597,8 @@ static uint16_t estimate_second_byte_sum(void)
 	sort_best_first_bytes();
 
 	uint16_t num_good_nonces = 0;
-	for (uint16_t i = 0; i < MAX_BEST_BYTES; i++) {
-		if (nonces[best_first_bytes[i]].Sum8_prob > CONFIDENCE_THRESHOLD) {
+	for (uint16_t i = 0; i < 256; i++) {
+		if (nonces[best_first_bytes[i]].Sum8_prob >= CONFIDENCE_THRESHOLD) {
 			++num_good_nonces;
 		}
 	}
@@ -579,15 +606,14 @@ static uint16_t estimate_second_byte_sum(void)
 	return num_good_nonces;
 }	
 
-
 static int read_nonce_file(void)
 {
 	FILE *fnonces = NULL;
-	uint8_t trgBlockNo;
-	uint8_t trgKeyType;
+	uint8_t trgBlockNo = 0;
+	uint8_t trgKeyType = 0;
 	uint8_t read_buf[9];
-	uint32_t nt_enc1, nt_enc2;
-	uint8_t par_enc;
+	uint32_t nt_enc1 = 0, nt_enc2 = 0;
+	uint8_t par_enc = 0;
 	int total_num_nonces = 0;
 	
 	if ((fnonces = fopen("nonces.bin","rb")) == NULL) { 
@@ -596,7 +622,8 @@ static int read_nonce_file(void)
 	}
 
 	PrintAndLog("Reading nonces from file nonces.bin...");
-	if (fread(read_buf, 1, 6, fnonces) == 0) {
+	size_t bytes_read = fread(read_buf, 1, 6, fnonces);
+	if ( bytes_read == 0) {
 		PrintAndLog("File reading error.");
 		fclose(fnonces);
 		return 1;
@@ -617,15 +644,15 @@ static int read_nonce_file(void)
 	}
 	fclose(fnonces);
 	PrintAndLog("Read %d nonces from file. cuid=%08x, Block=%d, Keytype=%c", total_num_nonces, cuid, trgBlockNo, trgKeyType==0?'A':'B');
-
 	return 0;
 }
 
-
 static void Check_for_FilterFlipProperties(void)
 {
 	printf("Checking for Filter Flip Properties...\n");
 
+	uint16_t num_bitflips = 0;
+	
 	for (uint16_t i = 0; i < 256; i++) {
 		nonces[i].BitFlip[ODD_STATE] = false;
 		nonces[i].BitFlip[EVEN_STATE] = false;
@@ -638,12 +665,92 @@ static void Check_for_FilterFlipProperties(void)
 		
 		if (parity1 == parity2_odd) {				// has Bit Flip Property for odd bits
 			nonces[i].BitFlip[ODD_STATE] = true;
+			num_bitflips++;
 		} else if (parity1 == parity2_even) {		// has Bit Flip Property for even bits
 			nonces[i].BitFlip[EVEN_STATE] = true;
+			num_bitflips++;
 		}
 	}
+	
+	if (write_stats) {
+		fprintf(fstats, "%d;", num_bitflips);
+	}
 }
 
+// Simulates one card nonce: encrypts a random 32 bit nonce with a Crypto1 state initialised from test_key, returning it in *nt_enc and one encrypted parity bit per byte in *par_enc.
+static void simulate_MFplus_RNG(uint32_t test_cuid, uint64_t test_key, uint32_t *nt_enc, uint8_t *par_enc)
+{
+	struct Crypto1State sim_cs = {0, 0};
+	// init cryptostate with key:
+	for(int8_t i = 47; i > 0; i -= 2) {
+		sim_cs.odd  = sim_cs.odd  << 1 | BIT(test_key, (i - 1) ^ 7);
+		sim_cs.even = sim_cs.even << 1 | BIT(test_key, i ^ 7);
+	}
+	*par_enc = 0;
+	// widen each random byte before shifting: (int)0xff << 24 does not fit a 32 bit int, which is undefined behaviour
+	uint32_t nt = (uint32_t)(rand() & 0xff) << 24 | (uint32_t)(rand() & 0xff) << 16 | (uint32_t)(rand() & 0xff) << 8 | (uint32_t)(rand() & 0xff);
+	for (int8_t byte_pos = 3; byte_pos >= 0; byte_pos--) {
+		uint8_t nt_byte_dec = (nt >> (8*byte_pos)) & 0xff;
+		uint8_t nt_byte_enc = crypto1_byte(&sim_cs, nt_byte_dec ^ (test_cuid >> (8*byte_pos)), false) ^ nt_byte_dec; 	// encode the nonce byte
+		*nt_enc = (*nt_enc << 8) | nt_byte_enc;					// four 8 bit shifts replace all 32 bits of *nt_enc
+		uint8_t ks_par = filter(sim_cs.odd);											// the keystream bit to encode/decode the parity bit
+		uint8_t nt_byte_par_enc = ks_par ^ oddparity8(nt_byte_dec);						// determine the nt byte's parity and encode it
+		*par_enc = (*par_enc << 1) | nt_byte_par_enc;
+	}
+}
+
+// Picks a random cuid and target key, then adds simulated nonces until GOOD_BYTES_REQUIRED first bytes reach CONFIDENCE_THRESHOLD; statistics are appended to fstats when it is open.
+static void simulate_acquire_nonces(void)
+{
+	clock_t time1 = clock();
+	bool filter_flip_checked = false;
+	uint32_t total_num_nonces = 0;
+	uint32_t next_fivehundred = 500;
+	uint32_t total_added_nonces = 0;
+
+	// widen each random byte before shifting: (int)0xff << 24 does not fit a 32 bit int, which is undefined behaviour
+	cuid = (uint32_t)(rand() & 0xff) << 24 | (uint32_t)(rand() & 0xff) << 16 | (uint32_t)(rand() & 0xff) << 8 | (uint32_t)(rand() & 0xff);
+	known_target_key = ((uint64_t)rand() & 0xfff) << 36 | ((uint64_t)rand() & 0xfff) << 24 | ((uint64_t)rand() & 0xfff) << 12 | ((uint64_t)rand() & 0xfff);
+	printf("Simulating nonce acquisition for target key %012"llx", cuid %08x ...\n", known_target_key, cuid);
+	if (fstats != NULL) {			// fstats starts out NULL; never write through a NULL stream
+		fprintf(fstats, "%012"llx";%08x;", known_target_key, cuid);
+	}
+
+	do {
+		uint32_t nt_enc = 0;
+		uint8_t par_enc = 0;
+
+		simulate_MFplus_RNG(cuid, known_target_key, &nt_enc, &par_enc);
+		total_added_nonces += add_nonce(nt_enc, par_enc);
+		total_num_nonces++;
+
+		if (first_byte_num == 256 ) {
+			if (!filter_flip_checked) {
+				Check_for_FilterFlipProperties();
+				filter_flip_checked = true;
+			}
+			num_good_first_bytes = estimate_second_byte_sum();
+			if (total_num_nonces > next_fivehundred) {
+				next_fivehundred = (total_num_nonces/500+1) * 500;
+				printf("Acquired %5d nonces (%5d with distinct bytes 0 and 1). Number of bytes with probability for correctly guessed Sum(a8) > %1.1f%%: %d\n",
+					total_num_nonces,
+					total_added_nonces,
+					CONFIDENCE_THRESHOLD * 100.0,
+					num_good_first_bytes);
+			}
+		}
+	} while (num_good_first_bytes < GOOD_BYTES_REQUIRED);
+
+	time1 = clock() - time1;
+	if ( time1 > 0 ) {				// skip the rate report when no clock ticks elapsed (avoids dividing by zero)
+		PrintAndLog("Acquired a total of %d nonces in %1.1f seconds (%0.0f nonces/minute)", 
+			total_num_nonces,
+			((float)time1)/CLOCKS_PER_SEC,
+			total_num_nonces * 60.0 * CLOCKS_PER_SEC/(float)time1);
+	}
+	if (fstats != NULL) {
+		fprintf(fstats, "%d;%d;%d;%1.2f;", total_num_nonces, total_added_nonces, num_good_first_bytes, CONFIDENCE_THRESHOLD);
+	}
+}
 
 static int acquire_nonces(uint8_t blockNo, uint8_t keyType, uint8_t *key, uint8_t trgBlockNo, uint8_t trgKeyType, bool nonce_file_write, bool slow)
 {
@@ -710,7 +817,6 @@ static int acquire_nonces(uint8_t blockNo, uint8_t keyType, uint8_t *key, uint8_
 				//printf("Encrypted nonce: %08x, encrypted_parity: %02x\n", nt_enc2, par_enc & 0x0f);
 				total_added_nonces += add_nonce(nt_enc2, par_enc & 0x0f);
 				
-
 				if (nonce_file_write) {
 					fwrite(bufp, 1, 9, fnonces);
 				}
@@ -742,8 +848,14 @@ static int acquire_nonces(uint8_t blockNo, uint8_t keyType, uint8_t *key, uint8_
 		}
 
 		if (!initialize) {
-			if (!WaitForResponseTimeout(CMD_ACK, &resp, 3000)) return 1;
-			if (resp.arg[0]) return resp.arg[0];  // error during nested_hard
+			if (!WaitForResponseTimeout(CMD_ACK, &resp, 3000)) {
+				fclose(fnonces);
+				return 1;
+			}
+			if (resp.arg[0]) {
+				fclose(fnonces);
+				return resp.arg[0];  // error during nested_hard
+			}
 		}
 
 		initialize = false;
@@ -755,15 +867,17 @@ static int acquire_nonces(uint8_t blockNo, uint8_t keyType, uint8_t *key, uint8_
 		fclose(fnonces);
 	}
 	
-	PrintAndLog("Acquired a total of %d nonces in %1.1f seconds (%0.0f nonces/minute)", 
-		total_num_nonces, 
-		((float)clock()-time1)/CLOCKS_PER_SEC, 
-		total_num_nonces*60.0*CLOCKS_PER_SEC/((float)clock()-time1));
-	
+	time1 = clock() - time1;
+	if ( time1 > 0 ) {
+		PrintAndLog("Acquired a total of %d nonces in %1.1f seconds (%0.0f nonces/minute)", 
+			total_num_nonces, 
+			((float)time1)/CLOCKS_PER_SEC, 
+			total_num_nonces * 60.0 * CLOCKS_PER_SEC/(float)time1
+		);
+	}
 	return 0;
 }
 
-
 static int init_partial_statelists(void)
 {
 	const uint32_t sizes_odd[17] = { 126757, 0, 18387, 0, 74241, 0, 181737, 0, 248801, 0, 182033, 0, 73421, 0, 17607, 0, 125601 };
@@ -807,13 +921,12 @@ static int init_partial_statelists(void)
 		for (uint16_t i = 0; i <= 16; i += 2) {
 			uint32_t *p = partial_statelist[i].states[odd_even];
 			p += partial_statelist[i].len[odd_even];
-			*p = 0xffffffff;
+			*p = END_OF_LIST_MARKER;
 		}
 	}
 	
 	return 0;
 }	
-		
 
 static void init_BitFlip_statelist(void)
 {
@@ -834,10 +947,9 @@ static void init_BitFlip_statelist(void)
 	}
 	// set len and add End Of List marker
 	statelist_bitflip.len[0] = p - statelist_bitflip.states[0];
-	*p = 0xffffffff;
+	*p = END_OF_LIST_MARKER;
 	statelist_bitflip.states[0] = realloc(statelist_bitflip.states[0], sizeof(uint32_t) * (statelist_bitflip.len[0] + 1));
 }
-
 		
 static inline uint32_t *find_first_state(uint32_t state, uint32_t mask, partial_indexed_statelist_t *sl, odd_even_t odd_even)
 {
@@ -845,12 +957,11 @@ static inline uint32_t *find_first_state(uint32_t state, uint32_t mask, partial_
 
 	if (p == NULL) return NULL;
 	while (*p < (state & mask)) p++;
-	if (*p == 0xffffffff) return NULL;					// reached end of list, no match
+	if (*p == END_OF_LIST_MARKER) return NULL;					// reached end of list, no match
 	if ((*p & mask) == (state & mask)) return p;		// found a match.
 	return NULL;										// no match
 } 
 
-
 static inline bool /*__attribute__((always_inline))*/ invariant_holds(uint_fast8_t byte_diff, uint_fast32_t state1, uint_fast32_t state2, uint_fast8_t bit, uint_fast8_t state_bit)
 {
 	uint_fast8_t j_1_bit_mask = 0x01 << (bit-1);
@@ -862,7 +973,6 @@ static inline bool /*__attribute__((always_inline))*/ invariant_holds(uint_fast8
 	return !all_diff;
 }
 
-
 static inline bool /*__attribute__((always_inline))*/ invalid_state(uint_fast8_t byte_diff, uint_fast32_t state1, uint_fast32_t state2, uint_fast8_t bit, uint_fast8_t state_bit)
 {
 	uint_fast8_t j_bit_mask = 0x01 << bit;
@@ -873,7 +983,6 @@ static inline bool /*__attribute__((always_inline))*/ invalid_state(uint_fast8_t
 	return all_diff;
 }
 
-
 static inline bool remaining_bits_match(uint_fast8_t num_common_bits, uint_fast8_t byte_diff, uint_fast32_t state1, uint_fast32_t state2, odd_even_t odd_even)
 {
 	if (odd_even) {
@@ -904,7 +1013,6 @@ static inline bool remaining_bits_match(uint_fast8_t num_common_bits, uint_fast8
 	return true;					// valid state
 }
 
-
 static bool all_other_first_bytes_match(uint32_t state, odd_even_t odd_even) 
 {
 	for (uint16_t i = 1; i < num_good_first_bytes; i++) {
@@ -927,7 +1035,7 @@ static bool all_other_first_bytes_match(uint32_t state, odd_even_t odd_even)
 					uint16_t part_sum_a8 = (odd_even == ODD_STATE) ? r : s;
 					uint32_t *p = find_first_state(state, mask, &partial_statelist[part_sum_a8], odd_even);
 					if (p != NULL) {
-						while ((state & mask) == (*p & mask) && (*p != 0xffffffff)) {
+						while ((state & mask) == (*p & mask) && (*p != END_OF_LIST_MARKER)) {
 							if (remaining_bits_match(j, bytes_diff, state, (state&0x00fffff0) | *p, odd_even)) {
 								found_match = true;
 								// if ((odd_even == ODD_STATE && state == test_state_odd)
@@ -968,7 +1076,6 @@ static bool all_other_first_bytes_match(uint32_t state, odd_even_t odd_even)
 	return true;
 }
 
-
 static bool all_bit_flips_match(uint32_t state, odd_even_t odd_even) 
 {
 	for (uint16_t i = 0; i < 256; i++) {
@@ -986,7 +1093,7 @@ static bool all_bit_flips_match(uint32_t state, odd_even_t odd_even)
 			bool found_match = false;
 			uint32_t *p = find_first_state(state, mask, &statelist_bitflip, 0);
 			if (p != NULL) {
-				while ((state & mask) == (*p & mask) && (*p != 0xffffffff)) {
+				while ((state & mask) == (*p & mask) && (*p != END_OF_LIST_MARKER)) {
 					if (remaining_bits_match(j, bytes_diff, state, (state&0x00fffff0) | *p, odd_even)) {
 						found_match = true;
 						// if ((odd_even == ODD_STATE && state == test_state_odd)
@@ -1025,16 +1132,13 @@ static bool all_bit_flips_match(uint32_t state, odd_even_t odd_even)
 	return true;
 }
 
-
 static struct sl_cache_entry {
 	uint32_t *sl;
 	uint32_t len;
 	} sl_cache[17][17][2];
 
-
 static void init_statelist_cache(void)
 {
-
 	for (uint16_t i = 0; i < 17; i+=2) {
 		for (uint16_t j = 0; j < 17; j+=2) {
 			for (uint16_t k = 0; k < 2; k++) {
@@ -1045,7 +1149,6 @@ static void init_statelist_cache(void)
 	}		
 }
 
-
 static int add_matching_states(statelist_t *candidates, uint16_t part_sum_a0, uint16_t part_sum_a8, odd_even_t odd_even)
 {
 	uint32_t worstcase_size = 1<<20;
@@ -1063,11 +1166,11 @@ static int add_matching_states(statelist_t *candidates, uint16_t part_sum_a0, ui
 		return 4;
 	}
 	uint32_t *add_p = candidates->states[odd_even]; 
-	for (uint32_t *p1 = partial_statelist[part_sum_a0].states[odd_even]; *p1 != 0xffffffff; p1++) {
+	for (uint32_t *p1 = partial_statelist[part_sum_a0].states[odd_even]; *p1 != END_OF_LIST_MARKER; p1++) {
 		uint32_t search_mask = 0x000ffff0;
 		uint32_t *p2 = find_first_state((*p1 << 4), search_mask, &partial_statelist[part_sum_a8], odd_even);
 		if (p2 != NULL) {
-			while (((*p1 << 4) & search_mask) == (*p2 & search_mask) && *p2 != 0xffffffff) {
+			while (((*p1 << 4) & search_mask) == (*p2 & search_mask) && *p2 != END_OF_LIST_MARKER) {
 				if ((nonces[best_first_bytes[0]].BitFlip[odd_even] && find_first_state((*p1 << 4) | *p2, 0x000fffff, &statelist_bitflip, 0))
 					|| !nonces[best_first_bytes[0]].BitFlip[odd_even]) {
 				if (all_other_first_bytes_match((*p1 << 4) | *p2, odd_even)) {
@@ -1082,7 +1185,7 @@ static int add_matching_states(statelist_t *candidates, uint16_t part_sum_a0, ui
 	}
 
 	// set end of list marker and len
-	*add_p = 0xffffffff; 
+	*add_p = END_OF_LIST_MARKER; 
 	candidates->len[odd_even] = add_p - candidates->states[odd_even];
 
 	candidates->states[odd_even] = realloc(candidates->states[odd_even], sizeof(uint32_t) * (candidates->len[odd_even] + 1));
@@ -1093,7 +1196,6 @@ static int add_matching_states(statelist_t *candidates, uint16_t part_sum_a0, ui
 	return 0;
 }
 
-
 static statelist_t *add_more_candidates(statelist_t *current_candidates)
 {
 	statelist_t *new_candidates = NULL;
@@ -1113,7 +1215,6 @@ static statelist_t *add_more_candidates(statelist_t *current_candidates)
 	return new_candidates;
 }
 
-
 static void TestIfKeyExists(uint64_t key)
 {
 	struct Crypto1State *pcs;
@@ -1130,14 +1231,14 @@ static void TestIfKeyExists(uint64_t key)
 		bool found_even = false;
 		uint32_t *p_odd = p->states[ODD_STATE];
 		uint32_t *p_even = p->states[EVEN_STATE];
-		while (*p_odd != 0xffffffff) {
+		while (*p_odd != END_OF_LIST_MARKER) {
 			if ((*p_odd & 0x00ffffff) == state_odd) {
 				found_odd = true;
 				break;
 			}
 			p_odd++;
 		}
-		while (*p_even != 0xffffffff) {
+		while (*p_even != END_OF_LIST_MARKER) {
 			if ((*p_even & 0x00ffffff) == state_even) {
 				found_even = true;
 			}
@@ -1145,20 +1246,27 @@ static void TestIfKeyExists(uint64_t key)
 		}
 		count += (p_odd - p->states[ODD_STATE]) * (p_even - p->states[EVEN_STATE]);
 		if (found_odd && found_even) {
-			PrintAndLog("Key Found after testing %lld (2^%1.1f) out of %lld (2^%1.1f) keys. A brute force would have taken approx %lld minutes.", 
-				count, log(count)/log(2), 
-				maximum_states, log(maximum_states)/log(2),
-				(count>>22)/60);
+			PrintAndLog("Key Found after testing %lld (2^%1.1f) out of %lld (2^%1.1f) keys. ", 
+				count,
+				log(count)/log(2), 
+				maximum_states,
+				log(maximum_states)/log(2)
+				);
+			if (write_stats) {
+				fprintf(fstats, "1\n");
+			}
 			crypto1_destroy(pcs);
 			return;
 		}
 	}
 
 	printf("Key NOT found!\n");
+	if (write_stats) {
+		fprintf(fstats, "0\n");
+	}
 	crypto1_destroy(pcs);
 }
 
-	
 static void generate_candidates(uint16_t sum_a0, uint16_t sum_a8)
 {
 	printf("Generating crypto1 state candidates... \n");
@@ -1173,7 +1281,7 @@ static void generate_candidates(uint16_t sum_a0, uint16_t sum_a8)
 			}
 		}
 	}
-	printf("Number of possible keys with Sum(a0) = %d: %lld (2^%1.1f)\n", sum_a0, maximum_states, log(maximum_states)/log(2.0));
+	printf("Number of possible keys with Sum(a0) = %d: %"PRIu64" (2^%1.1f)\n", sum_a0, maximum_states, log(maximum_states)/log(2.0));
 	
 	init_statelist_cache();
 	
@@ -1196,7 +1304,7 @@ static void generate_candidates(uint16_t sum_a0, uint16_t sum_a8)
 								} else {
 									current_candidates->len[EVEN_STATE] = 0;
 									uint32_t *p = current_candidates->states[EVEN_STATE] = malloc(sizeof(uint32_t));
-									*p = 0xffffffff;
+									*p = END_OF_LIST_MARKER;
 								}
 							} else {
 								add_matching_states(current_candidates, q, s, EVEN_STATE);
@@ -1205,11 +1313,11 @@ static void generate_candidates(uint16_t sum_a0, uint16_t sum_a8)
 								} else {
 									current_candidates->len[ODD_STATE] = 0;
 									uint32_t *p = current_candidates->states[ODD_STATE] = malloc(sizeof(uint32_t));
-									*p = 0xffffffff;
+									*p = END_OF_LIST_MARKER;
 								}
 							}
-							printf("Odd  state candidates: %6d (2^%0.1f)\n", current_candidates->len[ODD_STATE], log(current_candidates->len[ODD_STATE])/log(2)); 
-							printf("Even state candidates: %6d (2^%0.1f)\n", current_candidates->len[EVEN_STATE], log(current_candidates->len[EVEN_STATE])/log(2)); 
+							//printf("Odd  state candidates: %6d (2^%0.1f)\n", current_candidates->len[ODD_STATE], log(current_candidates->len[ODD_STATE])/log(2)); 
+							//printf("Even state candidates: %6d (2^%0.1f)\n", current_candidates->len[EVEN_STATE], log(current_candidates->len[EVEN_STATE])/log(2)); 
 						}
 					}
 				}
@@ -1222,87 +1330,458 @@ static void generate_candidates(uint16_t sum_a0, uint16_t sum_a8)
 	for (statelist_t *sl = candidates; sl != NULL; sl = sl->next) {
 		maximum_states += (uint64_t)sl->len[ODD_STATE] * sl->len[EVEN_STATE];
 	}
-	printf("Number of remaining possible keys: %lld (2^%1.1f)\n", maximum_states, log(maximum_states)/log(2.0));
+	printf("Number of remaining possible keys: %"PRIu64" (2^%1.1f)\n", maximum_states, log(maximum_states)/log(2.0));
+	if (write_stats) {
+		if (maximum_states != 0) {
+			fprintf(fstats, "%1.1f;", log(maximum_states)/log(2.0));
+		} else {
+			fprintf(fstats, "%1.1f;", 0.0);
+		}
+	}
+}
+
+static void	free_candidates_memory(statelist_t *sl)
+{
+	// Release every node of the list headed by sl; only the nodes themselves are freed here.
+	while (sl != NULL) {
+		statelist_t *following = sl->next;	// remember the successor before the node goes away
+		free(sl);
+		sl = following;
+	}
+}
 
+static void free_statelist_cache(void)
+{
+	// release the list held in both slots of every even/even cell (indices 0,2,..,16) of sl_cache
+	for (uint16_t row = 0; row < 17; row += 2) {
+		for (uint16_t col = 0; col < 17; col += 2) {
+			free(sl_cache[row][col][0].sl);
+			free(sl_cache[row][col][1].sl);
+		}
+	}
 }
 
+uint64_t foundkey = 0;	// receives the key reported by a successful cracking thread; reset by brute_force()
+size_t keys_found = 0;	// incremented by each thread that finds a key; also polled by the workers as an early-abort flag
+size_t bucket_count = 0;	// number of entries of buckets[] filled in by brute_force()
+statelist_t* buckets[128];	// candidate state lists distributed over the cracking threads
+size_t total_states_tested = 0;	// running total, updated with __sync_fetch_and_add by crack_states_bitsliced()
+size_t thread_count = 4;	// default worker count; overwritten from sysconf() on non-Windows builds
+
+// these bitsliced states will hold identical states in all slices
+bitslice_t bitsliced_rollback_byte[ROLLBACK_SIZE];
+
+// arrays of bitsliced states with identical values in all slices
+bitslice_t bitsliced_encrypted_nonces[NONCE_TESTS][STATE_SIZE];
+bitslice_t bitsliced_encrypted_parity_bits[NONCE_TESTS][ROLLBACK_SIZE];
+// EXACT_COUNT makes crack_states_bitsliced() count tested states block by block instead of using len[EVEN]*len[ODD]
+#define EXACT_COUNT
+
+static const uint64_t crack_states_bitsliced(statelist_t *p){
+    // Tests every odd/even half-state combination of p against the bitsliced nonces; returns the first key passing all NONCE_TESTS tests, or (uint64_t)-1 if none does or an allocation fails. Adds the number of states tested to total_states_tested.
+    // approach (rolling back the half-states before combining them was suggested/explained to me by bla): first we pre-bitslice all the even state bits and roll them back, then bitslice the odd bits and combine the two in the inner loop
+    uint64_t key = -1;
+	uint8_t bSize = sizeof(bitslice_t);
+
+#ifdef EXACT_COUNT
+    size_t bucket_states_tested = 0;
+    size_t bucket_size[p->len[EVEN_STATE]/MAX_BITSLICES + 1];	// +1: the last block may be partial, and a VLA must never have size 0
+#else
+    const size_t bucket_states_tested = (p->len[EVEN_STATE])*(p->len[ODD_STATE]);
+#endif
+
+    bitslice_t *bitsliced_even_states[p->len[EVEN_STATE]/MAX_BITSLICES + 1];	// one slot per (possibly partial) block of even states
+    size_t bitsliced_blocks = 0;
+    uint32_t const * restrict even_end = p->states[EVEN_STATE]+p->len[EVEN_STATE];
+	
+    // bitslice all the even states
+    for(uint32_t * restrict p_even = p->states[EVEN_STATE]; p_even < even_end; p_even += MAX_BITSLICES){
+
+#ifdef __WIN32
+	#ifdef __MINGW32__
+		bitslice_t * restrict lstate_p = __mingw_aligned_malloc((STATE_SIZE+ROLLBACK_SIZE) * bSize, bSize);
+	#else		
+		bitslice_t * restrict lstate_p = _aligned_malloc((STATE_SIZE+ROLLBACK_SIZE) * bSize, bSize);
+	#endif
+#else
+	#ifdef __APPLE__
+		bitslice_t * restrict lstate_p = malloc((STATE_SIZE+ROLLBACK_SIZE) * bSize);
+	#else
+		bitslice_t * restrict lstate_p = memalign(bSize, (STATE_SIZE+ROLLBACK_SIZE) * bSize);
+	#endif
+#endif
+
+		if ( !lstate_p )	{
+			// allocation failed: leave through the common exit so the blocks built so far are released
+			goto out;
+		}
+				
+		memset(lstate_p+1, 0x0, (STATE_SIZE-1)*sizeof(bitslice_t)); // zero even bits
+		
+		// bitslice even half-states
+        const size_t max_slices = (even_end-p_even) < MAX_BITSLICES ? even_end-p_even : MAX_BITSLICES;
+#ifdef EXACT_COUNT
+        bucket_size[bitsliced_blocks] = max_slices;
+#endif
+        for(size_t slice_idx = 0; slice_idx < max_slices; ++slice_idx){
+            uint32_t e = *(p_even+slice_idx);
+            for(size_t bit_idx = 1; bit_idx < STATE_SIZE; bit_idx+=2, e >>= 1){
+                // set even bits
+                if(e&1){
+                    lstate_p[bit_idx].bytes64[slice_idx>>6] |= 1ull << (slice_idx&63);
+                }
+            }
+        }
+        // compute the rollback bits
+        for(size_t rollback = 0; rollback < ROLLBACK_SIZE; ++rollback){
+            // inlined crypto1_bs_lfsr_rollback
+            const bitslice_value_t feedout = lstate_p[0].value;
+            ++lstate_p;
+            const bitslice_value_t ks_bits = crypto1_bs_f20(lstate_p);
+            const bitslice_value_t feedback = (feedout ^ ks_bits     ^ lstate_p[47- 5].value ^ lstate_p[47- 9].value ^
+                                               lstate_p[47-10].value ^ lstate_p[47-12].value ^ lstate_p[47-14].value ^
+                                               lstate_p[47-15].value ^ lstate_p[47-17].value ^ lstate_p[47-19].value ^
+                                               lstate_p[47-24].value ^ lstate_p[47-25].value ^ lstate_p[47-27].value ^
+                                               lstate_p[47-29].value ^ lstate_p[47-35].value ^ lstate_p[47-39].value ^
+                                               lstate_p[47-41].value ^ lstate_p[47-42].value ^ lstate_p[47-43].value);
+            lstate_p[47].value = feedback ^ bitsliced_rollback_byte[rollback].value;
+        }
+        bitsliced_even_states[bitsliced_blocks++] = lstate_p;	// stored pointer is ROLLBACK_SIZE past the allocation start (undone before freeing)
+    }
+
+    // bitslice every odd state to every block of even half-states with half-finished rollback
+    for(uint32_t const * restrict p_odd = p->states[ODD_STATE]; p_odd < p->states[ODD_STATE]+p->len[ODD_STATE]; ++p_odd){
+        // early abort
+        if(keys_found){
+            goto out;
+        }
+
+        // set the odd bits and compute rollback
+        uint64_t o = (uint64_t) *p_odd;
+        lfsr_rollback_byte((struct Crypto1State*) &o, 0, 1);
+        // pre-compute part of the odd feedback bits (minus rollback)
+        bool odd_feedback_bit = parity(o&0x9ce5c);
+
+        crypto1_bs_rewind_a0();
+        // set odd bits
+        for(size_t state_idx = 0; state_idx < STATE_SIZE-ROLLBACK_SIZE; o >>= 1, state_idx+=2){
+            if(o & 1){
+                state_p[state_idx] = bs_ones;
+            } else {
+                state_p[state_idx] = bs_zeroes;
+            }
+        }
+        const bitslice_value_t odd_feedback = odd_feedback_bit ? bs_ones.value : bs_zeroes.value;
+
+        for(size_t block_idx = 0; block_idx < bitsliced_blocks; ++block_idx){
+            const bitslice_t const * restrict bitsliced_even_state = bitsliced_even_states[block_idx];
+            size_t state_idx;
+            // set even bits
+            for(state_idx = 0; state_idx < STATE_SIZE-ROLLBACK_SIZE; state_idx+=2){
+                state_p[1+state_idx] = bitsliced_even_state[1+state_idx];
+            }
+            // set rollback bits
+            uint64_t lo = o;
+            for(; state_idx < STATE_SIZE; lo >>= 1, state_idx+=2){
+                // set the odd bits and take in the odd rollback bits from the even states
+                if(lo & 1){
+                    state_p[state_idx].value = ~bitsliced_even_state[state_idx].value;
+                } else {
+                    state_p[state_idx] = bitsliced_even_state[state_idx];
+                }
+
+                // set the even bits and take in the even rollback bits from the odd states
+                if((lo >> 32) & 1){
+                    state_p[1+state_idx].value = ~bitsliced_even_state[1+state_idx].value;
+                } else {
+                    state_p[1+state_idx] = bitsliced_even_state[1+state_idx];
+                }
+            }
+
+#ifdef EXACT_COUNT
+            bucket_states_tested += bucket_size[block_idx];
+#endif
+            // pre-compute first keystream and feedback bit vectors
+            const bitslice_value_t ksb = crypto1_bs_f20(state_p);
+            const bitslice_value_t fbb = (odd_feedback         ^ state_p[47- 0].value ^ state_p[47- 5].value ^ // take in the even and rollback bits
+                                          state_p[47-10].value ^ state_p[47-12].value ^ state_p[47-14].value ^
+                                          state_p[47-24].value ^ state_p[47-42].value);
+
+            // vector to contain test results (1 = passed, 0 = failed)
+            bitslice_t results = bs_ones;
+
+            for(size_t tests = 0; tests < NONCE_TESTS; ++tests){
+                size_t parity_bit_idx = 0;
+                bitslice_value_t fb_bits = fbb;
+                bitslice_value_t ks_bits = ksb;
+                state_p = &states[KEYSTREAM_SIZE-1];
+                bitslice_value_t parity_bit_vector = bs_zeroes.value;
+
+                // highest bit is transmitted/received first
+                for(int32_t ks_idx = KEYSTREAM_SIZE-1; ks_idx >= 0; --ks_idx, --state_p){
+                    // decrypt nonce bits
+                    const bitslice_value_t encrypted_nonce_bit_vector = bitsliced_encrypted_nonces[tests][ks_idx].value;
+                    const bitslice_value_t decrypted_nonce_bit_vector = (encrypted_nonce_bit_vector ^ ks_bits);
+
+                    // compute real parity bits on the fly
+                    parity_bit_vector ^= decrypted_nonce_bit_vector;
+
+                    // update state
+                    state_p[0].value = (fb_bits ^ decrypted_nonce_bit_vector);
+
+                    // compute next keystream bit
+                    ks_bits = crypto1_bs_f20(state_p);
+
+                    // for each byte:
+                    if((ks_idx&7) == 0){
+                        // get encrypted parity bits
+                        const bitslice_value_t encrypted_parity_bit_vector = bitsliced_encrypted_parity_bits[tests][parity_bit_idx++].value;
+
+                        // decrypt parity bits
+                        const bitslice_value_t decrypted_parity_bit_vector = (encrypted_parity_bit_vector ^ ks_bits);
+
+                        // compare actual parity bits with decrypted parity bits and take count in results vector
+                        results.value &= (parity_bit_vector ^ decrypted_parity_bit_vector);
+
+                        // make sure we still have a match in our set
+                        // if(memcmp(&results, &bs_zeroes, sizeof(bitslice_t)) == 0){
+
+                        // this is much faster on my gcc, because somehow a memcmp needlessly spills/fills all the xmm registers to/from the stack - ???
+                        // the short-circuiting also helps
+                        if(results.bytes64[0] == 0
+#if MAX_BITSLICES > 64
+                           && results.bytes64[1] == 0
+#endif
+#if MAX_BITSLICES > 128
+                           && results.bytes64[2] == 0
+                           && results.bytes64[3] == 0
+#endif
+                          ){
+                            goto stop_tests;
+                        }
+                        // this is about as fast but less portable (requires -std=gnu99)
+                        // asm goto ("ptest %1, %0\n\t"
+                        //           "jz %l2" :: "xm" (results.value), "xm" (bs_ones.value) : "cc" : stop_tests);
+                        parity_bit_vector = bs_zeroes.value;
+                    }
+                    // compute next feedback bit vector
+                    fb_bits = (state_p[47- 0].value ^ state_p[47- 5].value ^ state_p[47- 9].value ^
+                               state_p[47-10].value ^ state_p[47-12].value ^ state_p[47-14].value ^
+                               state_p[47-15].value ^ state_p[47-17].value ^ state_p[47-19].value ^
+                               state_p[47-24].value ^ state_p[47-25].value ^ state_p[47-27].value ^
+                               state_p[47-29].value ^ state_p[47-35].value ^ state_p[47-39].value ^
+                               state_p[47-41].value ^ state_p[47-42].value ^ state_p[47-43].value);
+                }
+            }
+            // all nonce tests were successful: we've found the key in this block!
+            state_t keys[MAX_BITSLICES];
+            crypto1_bs_convert_states(&states[KEYSTREAM_SIZE], keys);
+            for(size_t results_idx = 0; results_idx < MAX_BITSLICES; ++results_idx){
+                if(get_vector_bit(results_idx, results)){
+                    key = keys[results_idx].value;
+                    goto out;
+                }
+            }
+stop_tests:
+            // prepare to set new states
+            crypto1_bs_rewind_a0();
+            continue;
+        }
+    }
+
+out:
+    for(size_t block_idx = 0; block_idx < bitsliced_blocks; ++block_idx){
+		
+#ifdef __WIN32
+	#ifdef __MINGW32__
+		__mingw_aligned_free(bitsliced_even_states[block_idx]-ROLLBACK_SIZE);
+	#else
+		_aligned_free(bitsliced_even_states[block_idx]-ROLLBACK_SIZE);		
+	#endif
+#else
+		free(bitsliced_even_states[block_idx]-ROLLBACK_SIZE);
+#endif		
+		
+    }
+    __sync_fetch_and_add(&total_states_tested, bucket_states_tested);
+    return key;
+}
+
+static void* crack_states_thread(void* x){
+    const size_t thread_id = (size_t)x;	// worker index, passed by value inside the pointer argument
+    size_t current_bucket = thread_id;	// this worker handles buckets thread_id, thread_id+thread_count, ...
+    while(current_bucket < bucket_count){
+        statelist_t * bucket = buckets[current_bucket];
+		if(bucket){
+            const uint64_t key = crack_states_bitsliced(bucket);
+            if(key != -1){
+                __sync_fetch_and_add(&keys_found, 1);
+				__sync_bool_compare_and_swap(&foundkey, 0, key);	// publish the first key only; adding would merge two keys into garbage if several threads succeed
+                break;
+            } else if(keys_found){
+                break;
+            } else {				
+                printf(".");
+				fflush(stdout);
+            }
+        }
+        current_bucket += thread_count;
+    }
+    return NULL;	// the result is reported through keys_found/foundkey, not the thread return value
+}
 
 static void brute_force(void)
 {
 	if (known_target_key != -1) {
 		PrintAndLog("Looking for known target key in remaining key space...");
 		TestIfKeyExists(known_target_key);
-		return;
 	} else {
-		PrintAndLog("Brute Force phase is not implemented.");
-		return;
+        PrintAndLog("Brute force phase starting.");
+        time_t start, end;
+        time(&start);
+        keys_found = 0;
+		foundkey = 0;
+		// prepare the bitsliced inputs shared (read-only) by all cracking threads
+        crypto1_bs_init();
+
+        PrintAndLog("Using %u-bit bitslices", MAX_BITSLICES);
+        PrintAndLog("Bitslicing best_first_byte^uid[3] (rollback byte): %02x...", best_first_bytes[0]^(cuid>>24));
+        // convert to 32 bit little-endian (shift as uint32_t so a byte >= 0x80 does not overflow a signed int)
+		crypto1_bs_bitslice_value32(((uint32_t)best_first_bytes[0]<<24)^cuid, bitsliced_rollback_byte, 8);
+			
+        PrintAndLog("Bitslicing nonces...");
+        for(size_t tests = 0; tests < NONCE_TESTS; tests++){
+            uint32_t test_nonce = brute_force_nonces[tests]->nonce_enc;
+            uint8_t test_parity = brute_force_nonces[tests]->par_enc;
+            // pre-xor the uid into the decrypted nonces, and also pre-xor the cuid parity into the encrypted parity bits - otherwise an extra xor is required in the decryption routine
+            crypto1_bs_bitslice_value32(cuid^test_nonce, bitsliced_encrypted_nonces[tests], 32);
+            // convert to 32 bit little-endian
+            crypto1_bs_bitslice_value32(rev32( ~(test_parity ^ ~(parity(cuid>>24 & 0xff)<<3 | parity(cuid>>16 & 0xff)<<2 | parity(cuid>>8 & 0xff)<<1 | parity(cuid&0xff)))), bitsliced_encrypted_parity_bits[tests], 4);
+		}
+        total_states_tested = 0;
+
+        // count number of states to go
+        bucket_count = 0;
+        for (statelist_t *p = candidates; p != NULL; p = p->next) {
+            if (bucket_count >= sizeof(buckets)/sizeof(buckets[0])) break;	// never write past buckets[]; surplus lists are not searched
+            buckets[bucket_count++] = p;
+        }
+
+#ifndef __WIN32
+        long ncpu = sysconf(_SC_NPROCESSORS_CONF);
+		// sysconf() returns -1 on error: test the signed value before it is stored in the unsigned counter
+		thread_count = (ncpu < 1) ? 1 : (size_t)ncpu;
+#endif  /* _WIN32 */
+
+        pthread_t threads[thread_count];
+		
+        // enumerate states using all hardware threads, each thread handles one bucket
+        PrintAndLog("Starting %u cracking threads to search %u buckets containing a total of %"PRIu64" states...", (unsigned)thread_count, (unsigned)bucket_count, maximum_states);
+		
+        for(size_t i = 0; i < thread_count; i++){
+            pthread_create(&threads[i], NULL, crack_states_thread, (void*) i);
+        }
+        for(size_t i = 0; i < thread_count; i++){
+            pthread_join(threads[i], 0);
+        }
+
+        time(&end);		
+        double elapsed_time = difftime(end, start);
+
+        if(keys_found){
+			PrintAndLog("Success! Tested %"PRIu64" states, found %u keys after %.f seconds", (uint64_t)total_states_tested, (unsigned)keys_found, elapsed_time);
+			PrintAndLog("\nFound key: %012"PRIx64"\n", foundkey);
+        } else {
+			PrintAndLog("Fail! Tested %"PRIu64" states, in %.f seconds", (uint64_t)total_states_tested, elapsed_time);
+		}
+        // reset this counter for the next call
+        nonces_to_bruteforce = 0;
 	}
-	
-
 }
 
-
-int mfnestedhard(uint8_t blockNo, uint8_t keyType, uint8_t *key, uint8_t trgBlockNo, uint8_t trgKeyType, uint8_t *trgkey, bool nonce_file_read, bool nonce_file_write, bool slow) 
+int mfnestedhard(uint8_t blockNo, uint8_t keyType, uint8_t *key, uint8_t trgBlockNo, uint8_t trgKeyType, uint8_t *trgkey, bool nonce_file_read, bool nonce_file_write, bool slow, int tests) 
 {
+	// tests != 0 selects statistics mode (that many simulated runs, results appended to hardnested_stats.txt); otherwise nonces are read from file or acquired. Returns 0, 3 on file errors, or the acquire_nonces() error code.
+	time_t t;
+	srand((unsigned) time(&t));	// initialize Random number generator
+	
 	if (trgkey != NULL) {
 		known_target_key = bytes_to_num(trgkey, 6);
 	} else {
 		known_target_key = -1;
 	}
 	
-	// initialize the list of nonces
-	for (uint16_t i = 0; i < 256; i++) {
-		nonces[i].num = 0;
-		nonces[i].Sum = 0;
-		nonces[i].Sum8_guess = 0;
-		nonces[i].Sum8_prob = 0.0;
-		nonces[i].updated = true;
-		nonces[i].first = NULL;
-	}
-	first_byte_num = 0;
-	first_byte_Sum = 0;
-	num_good_first_bytes = 0;
-
 	init_partial_statelists();
 	init_BitFlip_statelist();
+	write_stats = false;
 	
-	if (nonce_file_read) {  	// use pre-acquired data from file nonces.bin
-		if (read_nonce_file() != 0) {
+	if (tests) {
+		// set the correct locale for the stats printing
+		setlocale(LC_ALL, "");
+		write_stats = true;
+		if ((fstats = fopen("hardnested_stats.txt","a")) == NULL) { 
+			PrintAndLog("Could not create/open file hardnested_stats.txt");
 			return 3;
 		}
-		Check_for_FilterFlipProperties();
-		num_good_first_bytes = MIN(estimate_second_byte_sum(), GOOD_BYTES_REQUIRED);
-	} else {					// acquire nonces.
-		uint16_t is_OK = acquire_nonces(blockNo, keyType, key, trgBlockNo, trgKeyType, nonce_file_write, slow);
-		if (is_OK != 0) {
-			return is_OK;
+		for (int i = 0; i < tests; i++) {	// signed index: a negative count runs zero iterations instead of wrapping to ~2^32
+			init_nonce_memory();
+			simulate_acquire_nonces();
+			Tests();
+			printf("Sum(a0) = %d\n", first_byte_Sum);
+			fprintf(fstats, "%d;", first_byte_Sum);
+			generate_candidates(first_byte_Sum, nonces[best_first_bytes[0]].Sum8_guess);
+			brute_force();
+			free_nonces_memory();
+			free_statelist_cache();
+			free_candidates_memory(candidates);
+			candidates = NULL;
+		}
+		fclose(fstats);
+	} else {
+		init_nonce_memory();
+		if (nonce_file_read) {  	// use pre-acquired data from file nonces.bin
+			if (read_nonce_file() != 0) {
+				return 3;	// NOTE(review): returns without the free_nonces_memory() call used on the success path - confirm whether init_nonce_memory() allocates
+			}
+			Check_for_FilterFlipProperties();
+			num_good_first_bytes = MIN(estimate_second_byte_sum(), GOOD_BYTES_REQUIRED);
+		} else {					// acquire nonces.
+			uint16_t is_OK = acquire_nonces(blockNo, keyType, key, trgBlockNo, trgKeyType, nonce_file_write, slow);
+			if (is_OK != 0) {
+				return is_OK;
+			}
 		}
-	}
-
-
-	Tests();
 
-	PrintAndLog("");
-	PrintAndLog("Sum(a0) = %d", first_byte_Sum);
-	// PrintAndLog("Best 10 first bytes: %02x, %02x, %02x, %02x, %02x, %02x, %02x, %02x, %02x, %02x",
-		// best_first_bytes[0],
-		// best_first_bytes[1],
-		// best_first_bytes[2],
-		// best_first_bytes[3],
-		// best_first_bytes[4],
-		// best_first_bytes[5],
-		// best_first_bytes[6],
-		// best_first_bytes[7],
-		// best_first_bytes[8],
-		// best_first_bytes[9]  );
-	PrintAndLog("Number of first bytes with confidence > %2.1f%%: %d", CONFIDENCE_THRESHOLD*100.0, num_good_first_bytes);
-
-	time_t start_time = clock();
-	generate_candidates(first_byte_Sum, nonces[best_first_bytes[0]].Sum8_guess);
-	PrintAndLog("Time for generating key candidates list: %1.0f seconds", (float)(clock() - start_time)/CLOCKS_PER_SEC);
-	
-	brute_force();
+		//Tests();
+
+		//PrintAndLog("");
+		//PrintAndLog("Sum(a0) = %d", first_byte_Sum);
+		// PrintAndLog("Best 10 first bytes: %02x, %02x, %02x, %02x, %02x, %02x, %02x, %02x, %02x, %02x",
+			// best_first_bytes[0],
+			// best_first_bytes[1],
+			// best_first_bytes[2],
+			// best_first_bytes[3],
+			// best_first_bytes[4],
+			// best_first_bytes[5],
+			// best_first_bytes[6],
+			// best_first_bytes[7],
+			// best_first_bytes[8],
+			// best_first_bytes[9]  );
+		PrintAndLog("Number of first bytes with confidence > %2.1f%%: %d", CONFIDENCE_THRESHOLD*100.0, num_good_first_bytes);
+
+		clock_t time1 = clock();
+		generate_candidates(first_byte_Sum, nonces[best_first_bytes[0]].Sum8_guess);
+		time1 = clock() - time1;
+		if ( time1 > 0 )
+			PrintAndLog("Time for generating key candidates list: %1.0f seconds", ((float)time1)/CLOCKS_PER_SEC);
 	
+		brute_force();
+		
+		free_nonces_memory();
+		free_statelist_cache();
+		free_candidates_memory(candidates);
+		candidates = NULL;
+	}	
 	return 0;
 }