]> git.zerfleddert.de Git - proxmark3-svn/blame - client/hardnested/hardnested_bitarray_core.c
Merge pull request #443 from doegox/flush
[proxmark3-svn] / client / hardnested / hardnested_bitarray_core.c
CommitLineData
c48c4d78 1//-----------------------------------------------------------------------------
2// Copyright (C) 2016, 2017 by piwi
3//
4// This code is licensed to you under the terms of the GNU GPL, version 2 or,
5// at your option, any later version. See the LICENSE.txt file for the text of
6// the license.
7//-----------------------------------------------------------------------------
8// Implements a card only attack based on crypto text (encrypted nonces
9// received during a nested authentication) only. Unlike other card only
10// attacks this doesn't rely on implementation errors but only on the
11// inherent weaknesses of the crypto1 cypher. Described in
12// Carlo Meijer, Roel Verdult, "Ciphertext-only Cryptanalysis on Hardened
13// Mifare Classic Cards" in Proceedings of the 22nd ACM SIGSAC Conference on
14// Computer and Communications Security, 2015
15//-----------------------------------------------------------------------------
16// some helper functions which can benefit from SIMD instructions or other special instructions
17//
18
19#include "hardnested_bitarray_core.h"
20
21#include <stdint.h>
22#include <stdio.h>
23#include <stdlib.h>
c3d117a8 24#ifndef __APPLE__
c48c4d78 25#include <malloc.h>
c3d117a8 26#endif
c48c4d78 27
// This file is compiled several times, once per instruction set.
// Every build tags its function names with the instruction set it targets:
#if defined (__AVX512F__)
#define BITARRAY_ISA_SUFFIX AVX512
#elif defined (__AVX2__)
#define BITARRAY_ISA_SUFFIX AVX2
#elif defined (__AVX__)
#define BITARRAY_ISA_SUFFIX AVX
#elif defined (__SSE2__)
#define BITARRAY_ISA_SUFFIX SSE2
#elif defined (__MMX__)
#define BITARRAY_ISA_SUFFIX MMX
#else
#define BITARRAY_ISA_SUFFIX NOSIMD
#endif

// Two-step pasting: BITARRAY_ISA_SUFFIX has to be expanded before ## glues
// it onto the base name.
#define BITARRAY_PASTE_(base, suffix) base ## _ ## suffix
#define BITARRAY_PASTE(base, suffix) BITARRAY_PASTE_(base, suffix)
#define BITARRAY_VARIANT(base) BITARRAY_PASTE(base, BITARRAY_ISA_SUFFIX)

// e.g. MALLOC_BITARRAY becomes malloc_bitarray_AVX2 in the AVX2 build
#define MALLOC_BITARRAY BITARRAY_VARIANT(malloc_bitarray)
#define FREE_BITARRAY BITARRAY_VARIANT(free_bitarray)
#define BITCOUNT BITARRAY_VARIANT(bitcount)
#define COUNT_STATES BITARRAY_VARIANT(count_states)
#define BITARRAY_AND BITARRAY_VARIANT(bitarray_AND)
#define BITARRAY_LOW20_AND BITARRAY_VARIANT(bitarray_low20_AND)
#define COUNT_BITARRAY_AND BITARRAY_VARIANT(count_bitarray_AND)
#define COUNT_BITARRAY_LOW20_AND BITARRAY_VARIANT(count_bitarray_low20_AND)
#define BITARRAY_AND4 BITARRAY_VARIANT(bitarray_AND4)
#define BITARRAY_OR BITARRAY_VARIANT(bitarray_OR)
#define COUNT_BITARRAY_AND2 BITARRAY_VARIANT(count_bitarray_AND2)
#define COUNT_BITARRAY_AND3 BITARRAY_VARIANT(count_bitarray_AND3)
#define COUNT_BITARRAY_AND4 BITARRAY_VARIANT(count_bitarray_AND4)
115
116
// Function types of the helpers, followed by the declaration of every
// instruction-set variant and of the runtime dispatcher of each helper.
#define DECLARE_BITARRAY_VARIANTS(fn_type, base) \
	fn_type base##_AVX512, base##_AVX2, base##_AVX, base##_SSE2, base##_MMX, base##_NOSIMD, base##_dispatch

typedef uint32_t *malloc_bitarray_t(uint32_t);
DECLARE_BITARRAY_VARIANTS(malloc_bitarray_t, malloc_bitarray);

typedef void free_bitarray_t(uint32_t *);
DECLARE_BITARRAY_VARIANTS(free_bitarray_t, free_bitarray);

typedef uint32_t bitcount_t(uint32_t);
DECLARE_BITARRAY_VARIANTS(bitcount_t, bitcount);

typedef uint32_t count_states_t(uint32_t *);
DECLARE_BITARRAY_VARIANTS(count_states_t, count_states);

typedef void bitarray_AND_t(uint32_t *, uint32_t *);
DECLARE_BITARRAY_VARIANTS(bitarray_AND_t, bitarray_AND);

typedef void bitarray_low20_AND_t(uint32_t *, uint32_t *);
DECLARE_BITARRAY_VARIANTS(bitarray_low20_AND_t, bitarray_low20_AND);

typedef uint32_t count_bitarray_AND_t(uint32_t *, uint32_t *);
DECLARE_BITARRAY_VARIANTS(count_bitarray_AND_t, count_bitarray_AND);

typedef uint32_t count_bitarray_low20_AND_t(uint32_t *, uint32_t *);
DECLARE_BITARRAY_VARIANTS(count_bitarray_low20_AND_t, count_bitarray_low20_AND);

typedef void bitarray_AND4_t(uint32_t *, uint32_t *, uint32_t *, uint32_t *);
DECLARE_BITARRAY_VARIANTS(bitarray_AND4_t, bitarray_AND4);

typedef void bitarray_OR_t(uint32_t *, uint32_t *);
DECLARE_BITARRAY_VARIANTS(bitarray_OR_t, bitarray_OR);

typedef uint32_t count_bitarray_AND2_t(uint32_t *, uint32_t *);
DECLARE_BITARRAY_VARIANTS(count_bitarray_AND2_t, count_bitarray_AND2);

typedef uint32_t count_bitarray_AND3_t(uint32_t *, uint32_t *, uint32_t *);
DECLARE_BITARRAY_VARIANTS(count_bitarray_AND3_t, count_bitarray_AND3);

typedef uint32_t count_bitarray_AND4_t(uint32_t *, uint32_t *, uint32_t *, uint32_t *);
DECLARE_BITARRAY_VARIANTS(count_bitarray_AND4_t, count_bitarray_AND4);

#undef DECLARE_BITARRAY_VARIANTS
c48c4d78 144
145
// Allocate x bytes aligned to __BIGGEST_ALIGNMENT__ and tell the compiler
// about that alignment. The Apple path returns NULL when posix_memalign()
// reports an error; the other paths hand back the allocator's result as is.
inline uint32_t *MALLOC_BITARRAY(uint32_t x)
{
	void *block;

#if defined (_WIN32)
	block = _aligned_malloc(x, __BIGGEST_ALIGNMENT__);
#elif defined (__APPLE__)
	if (posix_memalign(&block, __BIGGEST_ALIGNMENT__, x) != 0) {
		return NULL;
	}
#else
	block = memalign(__BIGGEST_ALIGNMENT__, x);
#endif

	return __builtin_assume_aligned(block, __BIGGEST_ALIGNMENT__);
}
161
162
// Release memory x. Windows builds use _aligned_free(), all other builds
// use free().
inline void FREE_BITARRAY(uint32_t *x)
{
#if defined (_WIN32)
	_aligned_free(x);
#else
	free(x);
#endif
}
171
172
// Return the number of bits set in a.
inline uint32_t BITCOUNT(uint32_t a)
{
	const unsigned long word = a;	// the builtin operates on unsigned long
	return (uint32_t)__builtin_popcountl(word);
}
177
178
// Return the total number of bits set in the 2^19 words starting at A.
inline uint32_t COUNT_STATES(uint32_t *A)
{
	const uint32_t num_words = UINT32_C(1) << 19;
	uint32_t total = 0;

	for (uint32_t w = 0; w < num_words; ++w) {
		total += BITCOUNT(A[w]);
	}
	return total;
}
187
188
// A &= B, word by word, over 2^19 words.
// Both pointers are promised to the compiler to be aligned to
// __BIGGEST_ALIGNMENT__.
inline void BITARRAY_AND(uint32_t *restrict A, uint32_t *restrict B)
{
	const uint32_t num_words = UINT32_C(1) << 19;

	A = __builtin_assume_aligned(A, __BIGGEST_ALIGNMENT__);
	B = __builtin_assume_aligned(B, __BIGGEST_ALIGNMENT__);

	for (uint32_t w = 0; w < num_words; ++w) {
		A[w] = A[w] & B[w];
	}
}
197
198
// For each of the 2^19 words, clear every 16-bit half of A whose
// corresponding 16-bit half of B is zero; halves of A whose counterpart in B
// is non-zero are left untouched. B is only read.
//
// The halves are selected with masks on the 32-bit words rather than by
// reading the buffers through uint16_t pointers: accessing uint32_t data via
// uint16_t lvalues breaks the C aliasing rules. The masks give the same
// result regardless of byte order because both halves are treated alike.
inline void BITARRAY_LOW20_AND(uint32_t *restrict A, uint32_t *restrict B)
{
	A = __builtin_assume_aligned(A, __BIGGEST_ALIGNMENT__);
	B = __builtin_assume_aligned(B, __BIGGEST_ALIGNMENT__);

	for (uint32_t i = 0; i < (UINT32_C(1) << 19); i++) {
		const uint32_t b = B[i];
		// all-ones for each half of b that contains at least one set bit
		const uint32_t keep = ((b & UINT32_C(0x0000FFFF)) ? UINT32_C(0x0000FFFF) : 0)
		                    | ((b & UINT32_C(0xFFFF0000)) ? UINT32_C(0xFFFF0000) : 0);
		A[i] &= keep;
	}
}
210
211
// A &= B over 2^19 words; returns the number of bits left set in A.
// Both pointers are promised to the compiler to be aligned to
// __BIGGEST_ALIGNMENT__.
inline uint32_t COUNT_BITARRAY_AND(uint32_t *restrict A, uint32_t *restrict B)
{
	const uint32_t num_words = UINT32_C(1) << 19;
	uint32_t total = 0;

	A = __builtin_assume_aligned(A, __BIGGEST_ALIGNMENT__);
	B = __builtin_assume_aligned(B, __BIGGEST_ALIGNMENT__);

	for (uint32_t w = 0; w < num_words; ++w) {
		const uint32_t merged = A[w] & B[w];
		A[w] = merged;
		total += BITCOUNT(merged);
	}
	return total;
}
223
224
// For each of the 2^19 words, clear every 16-bit half of A whose
// corresponding 16-bit half of B is zero, and return the number of bits that
// remain set in A afterwards. B is only read.
//
// The halves are selected with masks on the 32-bit words rather than by
// reading the buffers through uint16_t pointers: accessing uint32_t data via
// uint16_t lvalues breaks the C aliasing rules. Masking gives the same
// array contents and the same count regardless of byte order.
inline uint32_t COUNT_BITARRAY_LOW20_AND(uint32_t *restrict A, uint32_t *restrict B)
{
	A = __builtin_assume_aligned(A, __BIGGEST_ALIGNMENT__);
	B = __builtin_assume_aligned(B, __BIGGEST_ALIGNMENT__);
	uint32_t count = 0;

	for (uint32_t i = 0; i < (UINT32_C(1) << 19); i++) {
		const uint32_t b = B[i];
		// all-ones for each half of b that contains at least one set bit
		const uint32_t keep = ((b & UINT32_C(0x0000FFFF)) ? UINT32_C(0x0000FFFF) : 0)
		                    | ((b & UINT32_C(0xFFFF0000)) ? UINT32_C(0xFFFF0000) : 0);
		A[i] &= keep;
		// counting the whole word equals summing the counts of its two halves
		count += BITCOUNT(A[i]);
	}
	return count;
}
239
240
// A = B & C & D, word by word, over 2^19 words. The previous contents of A
// are overwritten. All pointers are promised to the compiler to be aligned
// to __BIGGEST_ALIGNMENT__.
inline void BITARRAY_AND4(uint32_t *restrict A, uint32_t *restrict B, uint32_t *restrict C, uint32_t *restrict D)
{
	const uint32_t num_words = UINT32_C(1) << 19;

	A = __builtin_assume_aligned(A, __BIGGEST_ALIGNMENT__);
	B = __builtin_assume_aligned(B, __BIGGEST_ALIGNMENT__);
	C = __builtin_assume_aligned(C, __BIGGEST_ALIGNMENT__);
	D = __builtin_assume_aligned(D, __BIGGEST_ALIGNMENT__);

	for (uint32_t w = 0; w < num_words; ++w) {
		const uint32_t bc = B[w] & C[w];
		A[w] = bc & D[w];
	}
}
251
252
// A |= B, word by word, over 2^19 words.
// Both pointers are promised to the compiler to be aligned to
// __BIGGEST_ALIGNMENT__.
inline void BITARRAY_OR(uint32_t *restrict A, uint32_t *restrict B)
{
	const uint32_t num_words = UINT32_C(1) << 19;

	A = __builtin_assume_aligned(A, __BIGGEST_ALIGNMENT__);
	B = __builtin_assume_aligned(B, __BIGGEST_ALIGNMENT__);

	for (uint32_t w = 0; w < num_words; ++w) {
		A[w] = A[w] | B[w];
	}
}
261
262
// Return the number of bits set in A & B over 2^19 words.
// Neither array is modified.
inline uint32_t COUNT_BITARRAY_AND2(uint32_t *restrict A, uint32_t *restrict B)
{
	const uint32_t num_words = UINT32_C(1) << 19;
	uint32_t total = 0;

	A = __builtin_assume_aligned(A, __BIGGEST_ALIGNMENT__);
	B = __builtin_assume_aligned(B, __BIGGEST_ALIGNMENT__);

	for (uint32_t w = 0; w < num_words; ++w) {
		const uint32_t common = A[w] & B[w];
		total += BITCOUNT(common);
	}
	return total;
}
273
274
// Return the number of bits set in A & B & C over 2^19 words.
// None of the arrays is modified.
inline uint32_t COUNT_BITARRAY_AND3(uint32_t *restrict A, uint32_t *restrict B, uint32_t *restrict C)
{
	const uint32_t num_words = UINT32_C(1) << 19;
	uint32_t total = 0;

	A = __builtin_assume_aligned(A, __BIGGEST_ALIGNMENT__);
	B = __builtin_assume_aligned(B, __BIGGEST_ALIGNMENT__);
	C = __builtin_assume_aligned(C, __BIGGEST_ALIGNMENT__);

	for (uint32_t w = 0; w < num_words; ++w) {
		const uint32_t common = A[w] & B[w] & C[w];
		total += BITCOUNT(common);
	}
	return total;
}
286
287
// Return the number of bits set in A & B & C & D over 2^19 words.
// None of the arrays is modified.
inline uint32_t COUNT_BITARRAY_AND4(uint32_t *restrict A, uint32_t *restrict B, uint32_t *restrict C, uint32_t *restrict D)
{
	const uint32_t num_words = UINT32_C(1) << 19;
	uint32_t total = 0;

	A = __builtin_assume_aligned(A, __BIGGEST_ALIGNMENT__);
	B = __builtin_assume_aligned(B, __BIGGEST_ALIGNMENT__);
	C = __builtin_assume_aligned(C, __BIGGEST_ALIGNMENT__);
	D = __builtin_assume_aligned(D, __BIGGEST_ALIGNMENT__);

	for (uint32_t w = 0; w < num_words; ++w) {
		const uint32_t common = A[w] & B[w] & C[w] & D[w];
		total += BITCOUNT(common);
	}
	return total;
}
300
af7a1f70 301
c48c4d78 302#ifndef __MMX__
303
304// pointers to functions:
305malloc_bitarray_t *malloc_bitarray_function_p = &malloc_bitarray_dispatch;
306free_bitarray_t *free_bitarray_function_p = &free_bitarray_dispatch;
307bitcount_t *bitcount_function_p = &bitcount_dispatch;
308count_states_t *count_states_function_p = &count_states_dispatch;
309bitarray_AND_t *bitarray_AND_function_p = &bitarray_AND_dispatch;
310bitarray_low20_AND_t *bitarray_low20_AND_function_p = &bitarray_low20_AND_dispatch;
311count_bitarray_AND_t *count_bitarray_AND_function_p = &count_bitarray_AND_dispatch;
312count_bitarray_low20_AND_t *count_bitarray_low20_AND_function_p = &count_bitarray_low20_AND_dispatch;
313bitarray_AND4_t *bitarray_AND4_function_p = &bitarray_AND4_dispatch;
314bitarray_OR_t *bitarray_OR_function_p = &bitarray_OR_dispatch;
315count_bitarray_AND2_t *count_bitarray_AND2_function_p = &count_bitarray_AND2_dispatch;
316count_bitarray_AND3_t *count_bitarray_AND3_function_p = &count_bitarray_AND3_dispatch;
317count_bitarray_AND4_t *count_bitarray_AND4_function_p = &count_bitarray_AND4_dispatch;
318
319// determine the available instruction set at runtime and call the correct function
320uint32_t *malloc_bitarray_dispatch(uint32_t x) {
af7a1f70 321#if defined (__i386__) || defined (__x86_64__)
de1e68d3 322 #if !defined(__APPLE__) || (defined(__APPLE__) && (__clang_major__ > 8 || __clang_major__ == 8 && __clang_minor__ >= 1))
087c8bf3 323 #if (__GNUC__ >= 5) && (__GNUC__ > 5 || __GNUC_MINOR__ > 2)
c48c4d78 324 if (__builtin_cpu_supports("avx512f")) malloc_bitarray_function_p = &malloc_bitarray_AVX512;
325 else if (__builtin_cpu_supports("avx2")) malloc_bitarray_function_p = &malloc_bitarray_AVX2;
087c8bf3 326 #else
f950ce1c 327 if (__builtin_cpu_supports("avx2")) malloc_bitarray_function_p = &malloc_bitarray_AVX2;
087c8bf3 328 #endif
c48c4d78 329 else if (__builtin_cpu_supports("avx")) malloc_bitarray_function_p = &malloc_bitarray_AVX;
330 else if (__builtin_cpu_supports("sse2")) malloc_bitarray_function_p = &malloc_bitarray_SSE2;
331 else if (__builtin_cpu_supports("mmx")) malloc_bitarray_function_p = &malloc_bitarray_MMX;
087c8bf3 332 else
333 #endif
af7a1f70 334#endif
335 malloc_bitarray_function_p = &malloc_bitarray_NOSIMD;
336
c48c4d78 337 // call the most optimized function for this CPU
338 return (*malloc_bitarray_function_p)(x);
339}
340
341void free_bitarray_dispatch(uint32_t *x) {
087c8bf3 342#if defined (__i386__) || defined (__x86_64__)
de1e68d3 343 #if !defined(__APPLE__) || (defined(__APPLE__) && (__clang_major__ > 8 || __clang_major__ == 8 && __clang_minor__ >= 1))
087c8bf3 344 #if (__GNUC__ >= 5) && (__GNUC__ > 5 || __GNUC_MINOR__ > 2)
c48c4d78 345 if (__builtin_cpu_supports("avx512f")) free_bitarray_function_p = &free_bitarray_AVX512;
346 else if (__builtin_cpu_supports("avx2")) free_bitarray_function_p = &free_bitarray_AVX2;
087c8bf3 347 #else
f950ce1c 348 if (__builtin_cpu_supports("avx2")) free_bitarray_function_p = &free_bitarray_AVX2;
087c8bf3 349 #endif
c48c4d78 350 else if (__builtin_cpu_supports("avx")) free_bitarray_function_p = &free_bitarray_AVX;
351 else if (__builtin_cpu_supports("sse2")) free_bitarray_function_p = &free_bitarray_SSE2;
352 else if (__builtin_cpu_supports("mmx")) free_bitarray_function_p = &free_bitarray_MMX;
087c8bf3 353 else
354 #endif
af7a1f70 355#endif
356 free_bitarray_function_p = &free_bitarray_NOSIMD;
357
c48c4d78 358 // call the most optimized function for this CPU
359 (*free_bitarray_function_p)(x);
360}
361
362uint32_t bitcount_dispatch(uint32_t a) {
af7a1f70 363#if defined (__i386__) || defined (__x86_64__)
de1e68d3 364 #if !defined(__APPLE__) || (defined(__APPLE__) && (__clang_major__ > 8 || __clang_major__ == 8 && __clang_minor__ >= 1))
087c8bf3 365 #if (__GNUC__ >= 5) && (__GNUC__ > 5 || __GNUC_MINOR__ > 2)
c48c4d78 366 if (__builtin_cpu_supports("avx512f")) bitcount_function_p = &bitcount_AVX512;
367 else if (__builtin_cpu_supports("avx2")) bitcount_function_p = &bitcount_AVX2;
087c8bf3 368 #else
f950ce1c 369 if (__builtin_cpu_supports("avx2")) bitcount_function_p = &bitcount_AVX2;
087c8bf3 370 #endif
c48c4d78 371 else if (__builtin_cpu_supports("avx")) bitcount_function_p = &bitcount_AVX;
372 else if (__builtin_cpu_supports("sse2")) bitcount_function_p = &bitcount_SSE2;
373 else if (__builtin_cpu_supports("mmx")) bitcount_function_p = &bitcount_MMX;
087c8bf3 374 else
375 #endif
af7a1f70 376#endif
377 bitcount_function_p = &bitcount_NOSIMD;
378
c48c4d78 379 // call the most optimized function for this CPU
380 return (*bitcount_function_p)(a);
381}
382
383uint32_t count_states_dispatch(uint32_t *bitarray) {
af7a1f70 384#if defined (__i386__) || defined (__x86_64__)
de1e68d3 385 #if !defined(__APPLE__) || (defined(__APPLE__) && (__clang_major__ > 8 || __clang_major__ == 8 && __clang_minor__ >= 1))
087c8bf3 386 #if (__GNUC__ >= 5) && (__GNUC__ > 5 || __GNUC_MINOR__ > 2)
c48c4d78 387 if (__builtin_cpu_supports("avx512f")) count_states_function_p = &count_states_AVX512;
388 else if (__builtin_cpu_supports("avx2")) count_states_function_p = &count_states_AVX2;
087c8bf3 389 #else
f950ce1c 390 if (__builtin_cpu_supports("avx2")) count_states_function_p = &count_states_AVX2;
087c8bf3 391 #endif
c48c4d78 392 else if (__builtin_cpu_supports("avx")) count_states_function_p = &count_states_AVX;
393 else if (__builtin_cpu_supports("sse2")) count_states_function_p = &count_states_SSE2;
394 else if (__builtin_cpu_supports("mmx")) count_states_function_p = &count_states_MMX;
087c8bf3 395 else
396 #endif
af7a1f70 397#endif
398 count_states_function_p = &count_states_NOSIMD;
399
c48c4d78 400 // call the most optimized function for this CPU
401 return (*count_states_function_p)(bitarray);
402}
403
404void bitarray_AND_dispatch(uint32_t *A, uint32_t *B) {
af7a1f70 405#if defined (__i386__) || defined (__x86_64__)
de1e68d3 406 #if !defined(__APPLE__) || (defined(__APPLE__) && (__clang_major__ > 8 || __clang_major__ == 8 && __clang_minor__ >= 1))
087c8bf3 407 #if (__GNUC__ >= 5) && (__GNUC__ > 5 || __GNUC_MINOR__ > 2)
c48c4d78 408 if (__builtin_cpu_supports("avx512f")) bitarray_AND_function_p = &bitarray_AND_AVX512;
409 else if (__builtin_cpu_supports("avx2")) bitarray_AND_function_p = &bitarray_AND_AVX2;
087c8bf3 410 #else
f950ce1c 411 if (__builtin_cpu_supports("avx2")) bitarray_AND_function_p = &bitarray_AND_AVX2;
087c8bf3 412 #endif
c48c4d78 413 else if (__builtin_cpu_supports("avx")) bitarray_AND_function_p = &bitarray_AND_AVX;
414 else if (__builtin_cpu_supports("sse2")) bitarray_AND_function_p = &bitarray_AND_SSE2;
415 else if (__builtin_cpu_supports("mmx")) bitarray_AND_function_p = &bitarray_AND_MMX;
087c8bf3 416 else
417 #endif
af7a1f70 418#endif
419 bitarray_AND_function_p = &bitarray_AND_NOSIMD;
420
c48c4d78 421 // call the most optimized function for this CPU
422 (*bitarray_AND_function_p)(A,B);
423}
424
425void bitarray_low20_AND_dispatch(uint32_t *A, uint32_t *B) {
087c8bf3 426#if defined (__i386__) || defined (__x86_64__)
de1e68d3 427 #if !defined(__APPLE__) || (defined(__APPLE__) && (__clang_major__ > 8 || __clang_major__ == 8 && __clang_minor__ >= 1))
087c8bf3 428 #if (__GNUC__ >= 5) && (__GNUC__ > 5 || __GNUC_MINOR__ > 2)
c48c4d78 429 if (__builtin_cpu_supports("avx512f")) bitarray_low20_AND_function_p = &bitarray_low20_AND_AVX512;
430 else if (__builtin_cpu_supports("avx2")) bitarray_low20_AND_function_p = &bitarray_low20_AND_AVX2;
087c8bf3 431 #else
f950ce1c 432 if (__builtin_cpu_supports("avx2")) bitarray_low20_AND_function_p = &bitarray_low20_AND_AVX2;
087c8bf3 433 #endif
c48c4d78 434 else if (__builtin_cpu_supports("avx")) bitarray_low20_AND_function_p = &bitarray_low20_AND_AVX;
435 else if (__builtin_cpu_supports("sse2")) bitarray_low20_AND_function_p = &bitarray_low20_AND_SSE2;
436 else if (__builtin_cpu_supports("mmx")) bitarray_low20_AND_function_p = &bitarray_low20_AND_MMX;
087c8bf3 437 else
438 #endif
af7a1f70 439#endif
440 bitarray_low20_AND_function_p = &bitarray_low20_AND_NOSIMD;
441
c48c4d78 442 // call the most optimized function for this CPU
443 (*bitarray_low20_AND_function_p)(A, B);
444}
445
446uint32_t count_bitarray_AND_dispatch(uint32_t *A, uint32_t *B) {
af7a1f70 447#if defined (__i386__) || defined (__x86_64__)
de1e68d3 448 #if !defined(__APPLE__) || (defined(__APPLE__) && (__clang_major__ > 8 || __clang_major__ == 8 && __clang_minor__ >= 1))
087c8bf3 449 #if (__GNUC__ >= 5) && (__GNUC__ > 5 || __GNUC_MINOR__ > 2)
c48c4d78 450 if (__builtin_cpu_supports("avx512f")) count_bitarray_AND_function_p = &count_bitarray_AND_AVX512;
451 else if (__builtin_cpu_supports("avx2")) count_bitarray_AND_function_p = &count_bitarray_AND_AVX2;
087c8bf3 452 #else
f950ce1c 453 if (__builtin_cpu_supports("avx2")) count_bitarray_AND_function_p = &count_bitarray_AND_AVX2;
087c8bf3 454 #endif
c48c4d78 455 else if (__builtin_cpu_supports("avx")) count_bitarray_AND_function_p = &count_bitarray_AND_AVX;
456 else if (__builtin_cpu_supports("sse2")) count_bitarray_AND_function_p = &count_bitarray_AND_SSE2;
457 else if (__builtin_cpu_supports("mmx")) count_bitarray_AND_function_p = &count_bitarray_AND_MMX;
087c8bf3 458 else
459 #endif
af7a1f70 460#endif
461 count_bitarray_AND_function_p = &count_bitarray_AND_NOSIMD;
462
c48c4d78 463 // call the most optimized function for this CPU
464 return (*count_bitarray_AND_function_p)(A, B);
465}
466
467uint32_t count_bitarray_low20_AND_dispatch(uint32_t *A, uint32_t *B) {
af7a1f70 468#if defined (__i386__) || defined (__x86_64__)
de1e68d3 469 #if !defined(__APPLE__) || (defined(__APPLE__) && (__clang_major__ > 8 || __clang_major__ == 8 && __clang_minor__ >= 1))
087c8bf3 470 #if (__GNUC__ >= 5) && (__GNUC__ > 5 || __GNUC_MINOR__ > 2)
c48c4d78 471 if (__builtin_cpu_supports("avx512f")) count_bitarray_low20_AND_function_p = &count_bitarray_low20_AND_AVX512;
472 else if (__builtin_cpu_supports("avx2")) count_bitarray_low20_AND_function_p = &count_bitarray_low20_AND_AVX2;
087c8bf3 473 #else
f950ce1c 474 if (__builtin_cpu_supports("avx2")) count_bitarray_low20_AND_function_p = &count_bitarray_low20_AND_AVX2;
087c8bf3 475 #endif
c48c4d78 476 else if (__builtin_cpu_supports("avx")) count_bitarray_low20_AND_function_p = &count_bitarray_low20_AND_AVX;
477 else if (__builtin_cpu_supports("sse2")) count_bitarray_low20_AND_function_p = &count_bitarray_low20_AND_SSE2;
478 else if (__builtin_cpu_supports("mmx")) count_bitarray_low20_AND_function_p = &count_bitarray_low20_AND_MMX;
087c8bf3 479 else
480 #endif
af7a1f70 481#endif
482 count_bitarray_low20_AND_function_p = &count_bitarray_low20_AND_NOSIMD;
483
c48c4d78 484 // call the most optimized function for this CPU
485 return (*count_bitarray_low20_AND_function_p)(A, B);
486}
487
488void bitarray_AND4_dispatch(uint32_t *A, uint32_t *B, uint32_t *C, uint32_t *D) {
af7a1f70 489#if defined (__i386__) || defined (__x86_64__)
de1e68d3 490 #if !defined(__APPLE__) || (defined(__APPLE__) && (__clang_major__ > 8 || __clang_major__ == 8 && __clang_minor__ >= 1))
087c8bf3 491 #if (__GNUC__ >= 5) && (__GNUC__ > 5 || __GNUC_MINOR__ > 2)
c48c4d78 492 if (__builtin_cpu_supports("avx512f")) bitarray_AND4_function_p = &bitarray_AND4_AVX512;
493 else if (__builtin_cpu_supports("avx2")) bitarray_AND4_function_p = &bitarray_AND4_AVX2;
087c8bf3 494 #else
f950ce1c 495 if (__builtin_cpu_supports("avx2")) bitarray_AND4_function_p = &bitarray_AND4_AVX2;
087c8bf3 496 #endif
c48c4d78 497 else if (__builtin_cpu_supports("avx")) bitarray_AND4_function_p = &bitarray_AND4_AVX;
498 else if (__builtin_cpu_supports("sse2")) bitarray_AND4_function_p = &bitarray_AND4_SSE2;
499 else if (__builtin_cpu_supports("mmx")) bitarray_AND4_function_p = &bitarray_AND4_MMX;
087c8bf3 500 else
501 #endif
af7a1f70 502#endif
503 bitarray_AND4_function_p = &bitarray_AND4_NOSIMD;
504
c48c4d78 505 // call the most optimized function for this CPU
506 (*bitarray_AND4_function_p)(A, B, C, D);
507}
508
509void bitarray_OR_dispatch(uint32_t *A, uint32_t *B) {
af7a1f70 510#if defined (__i386__) || defined (__x86_64__)
de1e68d3 511 #if !defined(__APPLE__) || (defined(__APPLE__) && (__clang_major__ > 8 || __clang_major__ == 8 && __clang_minor__ >= 1))
087c8bf3 512 #if (__GNUC__ >= 5) && (__GNUC__ > 5 || __GNUC_MINOR__ > 2)
c48c4d78 513 if (__builtin_cpu_supports("avx512f")) bitarray_OR_function_p = &bitarray_OR_AVX512;
514 else if (__builtin_cpu_supports("avx2")) bitarray_OR_function_p = &bitarray_OR_AVX2;
087c8bf3 515 #else
f950ce1c 516 if (__builtin_cpu_supports("avx2")) bitarray_OR_function_p = &bitarray_OR_AVX2;
087c8bf3 517 #endif
c48c4d78 518 else if (__builtin_cpu_supports("avx")) bitarray_OR_function_p = &bitarray_OR_AVX;
519 else if (__builtin_cpu_supports("sse2")) bitarray_OR_function_p = &bitarray_OR_SSE2;
520 else if (__builtin_cpu_supports("mmx")) bitarray_OR_function_p = &bitarray_OR_MMX;
087c8bf3 521 else
522 #endif
af7a1f70 523#endif
524 bitarray_OR_function_p = &bitarray_OR_NOSIMD;
525
c48c4d78 526 // call the most optimized function for this CPU
527 (*bitarray_OR_function_p)(A,B);
528}
529
530uint32_t count_bitarray_AND2_dispatch(uint32_t *A, uint32_t *B) {
af7a1f70 531#if defined (__i386__) || defined (__x86_64__)
de1e68d3 532 #if !defined(__APPLE__) || (defined(__APPLE__) && (__clang_major__ > 8 || __clang_major__ == 8 && __clang_minor__ >= 1))
087c8bf3 533 #if (__GNUC__ >= 5) && (__GNUC__ > 5 || __GNUC_MINOR__ > 2)
c48c4d78 534 if (__builtin_cpu_supports("avx512f")) count_bitarray_AND2_function_p = &count_bitarray_AND2_AVX512;
535 else if (__builtin_cpu_supports("avx2")) count_bitarray_AND2_function_p = &count_bitarray_AND2_AVX2;
087c8bf3 536 #else
f950ce1c 537 if (__builtin_cpu_supports("avx2")) count_bitarray_AND2_function_p = &count_bitarray_AND2_AVX2;
087c8bf3 538 #endif
c48c4d78 539 else if (__builtin_cpu_supports("avx")) count_bitarray_AND2_function_p = &count_bitarray_AND2_AVX;
540 else if (__builtin_cpu_supports("sse2")) count_bitarray_AND2_function_p = &count_bitarray_AND2_SSE2;
541 else if (__builtin_cpu_supports("mmx")) count_bitarray_AND2_function_p = &count_bitarray_AND2_MMX;
087c8bf3 542 else
543 #endif
af7a1f70 544#endif
545 count_bitarray_AND2_function_p = &count_bitarray_AND2_NOSIMD;
546
c48c4d78 547 // call the most optimized function for this CPU
548 return (*count_bitarray_AND2_function_p)(A, B);
549}
550
551uint32_t count_bitarray_AND3_dispatch(uint32_t *A, uint32_t *B, uint32_t *C) {
af7a1f70 552#if defined (__i386__) || defined (__x86_64__)
de1e68d3 553 #if !defined(__APPLE__) || (defined(__APPLE__) && (__clang_major__ > 8 || __clang_major__ == 8 && __clang_minor__ >= 1))
087c8bf3 554 #if (__GNUC__ >= 5) && (__GNUC__ > 5 || __GNUC_MINOR__ > 2)
c48c4d78 555 if (__builtin_cpu_supports("avx512f")) count_bitarray_AND3_function_p = &count_bitarray_AND3_AVX512;
556 else if (__builtin_cpu_supports("avx2")) count_bitarray_AND3_function_p = &count_bitarray_AND3_AVX2;
087c8bf3 557 #else
f950ce1c 558 if (__builtin_cpu_supports("avx2")) count_bitarray_AND3_function_p = &count_bitarray_AND3_AVX2;
087c8bf3 559 #endif
c48c4d78 560 else if (__builtin_cpu_supports("avx")) count_bitarray_AND3_function_p = &count_bitarray_AND3_AVX;
561 else if (__builtin_cpu_supports("sse2")) count_bitarray_AND3_function_p = &count_bitarray_AND3_SSE2;
562 else if (__builtin_cpu_supports("mmx")) count_bitarray_AND3_function_p = &count_bitarray_AND3_MMX;
087c8bf3 563 else
564 #endif
af7a1f70 565#endif
566 count_bitarray_AND3_function_p = &count_bitarray_AND3_NOSIMD;
567
c48c4d78 568 // call the most optimized function for this CPU
569 return (*count_bitarray_AND3_function_p)(A, B, C);
570}
571
572uint32_t count_bitarray_AND4_dispatch(uint32_t *A, uint32_t *B, uint32_t *C, uint32_t *D) {
af7a1f70 573#if defined (__i386__) || defined (__x86_64__)
de1e68d3 574 #if !defined(__APPLE__) || (defined(__APPLE__) && (__clang_major__ > 8 || __clang_major__ == 8 && __clang_minor__ >= 1))
087c8bf3 575 #if (__GNUC__ >= 5) && (__GNUC__ > 5 || __GNUC_MINOR__ > 2)
c48c4d78 576 if (__builtin_cpu_supports("avx512f")) count_bitarray_AND4_function_p = &count_bitarray_AND4_AVX512;
577 else if (__builtin_cpu_supports("avx2")) count_bitarray_AND4_function_p = &count_bitarray_AND4_AVX2;
087c8bf3 578 #else
f950ce1c 579 if (__builtin_cpu_supports("avx2")) count_bitarray_AND4_function_p = &count_bitarray_AND4_AVX2;
087c8bf3 580 #endif
c48c4d78 581 else if (__builtin_cpu_supports("avx")) count_bitarray_AND4_function_p = &count_bitarray_AND4_AVX;
582 else if (__builtin_cpu_supports("sse2")) count_bitarray_AND4_function_p = &count_bitarray_AND4_SSE2;
583 else if (__builtin_cpu_supports("mmx")) count_bitarray_AND4_function_p = &count_bitarray_AND4_MMX;
087c8bf3 584 else
585 #endif
af7a1f70 586#endif
587 count_bitarray_AND4_function_p = &count_bitarray_AND4_NOSIMD;
588
c48c4d78 589 // call the most optimized function for this CPU
590 return (*count_bitarray_AND4_function_p)(A, B, C, D);
591}
592
593
594/////////////////////////////////////////////////
595// Entries to dispatched function calls
596
// Public entry: forwards to whatever malloc_bitarray_function_p refers to.
uint32_t *malloc_bitarray(uint32_t x) {
	return malloc_bitarray_function_p(x);
}
600
// Public entry: forwards to whatever free_bitarray_function_p refers to.
void free_bitarray(uint32_t *x) {
	free_bitarray_function_p(x);
}
604
// Public entry: forwards to whatever bitcount_function_p refers to.
uint32_t bitcount(uint32_t a) {
	return bitcount_function_p(a);
}
608
// Public entry: forwards to whatever count_states_function_p refers to.
uint32_t count_states(uint32_t *bitarray) {
	return count_states_function_p(bitarray);
}
612
// Public entry: forwards to whatever bitarray_AND_function_p refers to.
void bitarray_AND(uint32_t *A, uint32_t *B) {
	bitarray_AND_function_p(A, B);
}
616
// Public entry: forwards to whatever bitarray_low20_AND_function_p refers to.
void bitarray_low20_AND(uint32_t *A, uint32_t *B) {
	bitarray_low20_AND_function_p(A, B);
}
620
// Public entry: forwards to whatever count_bitarray_AND_function_p refers to.
uint32_t count_bitarray_AND(uint32_t *A, uint32_t *B) {
	return count_bitarray_AND_function_p(A, B);
}
624
// Public entry: forwards to whatever count_bitarray_low20_AND_function_p
// refers to.
uint32_t count_bitarray_low20_AND(uint32_t *A, uint32_t *B) {
	return count_bitarray_low20_AND_function_p(A, B);
}
628
// Public entry: forwards to whatever bitarray_AND4_function_p refers to.
void bitarray_AND4(uint32_t *A, uint32_t *B, uint32_t *C, uint32_t *D) {
	bitarray_AND4_function_p(A, B, C, D);
}
632
// Public entry: forwards to whatever bitarray_OR_function_p refers to.
void bitarray_OR(uint32_t *A, uint32_t *B) {
	bitarray_OR_function_p(A, B);
}
636
// Public entry: forwards to whatever count_bitarray_AND2_function_p refers to.
uint32_t count_bitarray_AND2(uint32_t *A, uint32_t *B) {
	return count_bitarray_AND2_function_p(A, B);
}
640
// Public entry: forwards to whatever count_bitarray_AND3_function_p refers to.
uint32_t count_bitarray_AND3(uint32_t *A, uint32_t *B, uint32_t *C) {
	return count_bitarray_AND3_function_p(A, B, C);
}
644
// Public entry: forwards to whatever count_bitarray_AND4_function_p refers to.
uint32_t count_bitarray_AND4(uint32_t *A, uint32_t *B, uint32_t *C, uint32_t *D) {
	return count_bitarray_AND4_function_p(A, B, C, D);
}
648
649#endif
650
Impressum, Datenschutz