proxmark3/client/hardnested/hardnested_bitarray_core.c
pwpiwi c48c4d7856 New: implementing hf mf hardnested
This implements the attack described in
	Carlo Meijer, Roel Verdult, "Ciphertext-only Cryptanalysis on Hardened
	Mifare Classic Cards" in Proceedings of the 22nd ACM SIGSAC Conference on
	Computer and Communications Security, 2015
It uses precomputed tables for many bitflip properties (not only two as in the paper)
and is therefore quite efficient. To prevent failing it doesn't do
differential analysis with several nonce bytes' Sum(a8) properties (each of them
may be wrongly guessed) - instead it concentrates on one nonce byte and tries all
Sum(a8) property guesses sequentially (ordered by probability). The brute force phase
makes use of aczid's bit sliced brute forcer (https://github.com/aczid/crypto1_bs).
Includes runtime CPU-detection to leverage modern (and old) SIMD instructions
with a single executable.
2017-05-31 07:30:56 +02:00

545 lines
23 KiB
C

//-----------------------------------------------------------------------------
// Copyright (C) 2016, 2017 by piwi
//
// This code is licensed to you under the terms of the GNU GPL, version 2 or,
// at your option, any later version. See the LICENSE.txt file for the text of
// the license.ch b
//-----------------------------------------------------------------------------
// Implements a card only attack based on crypto text (encrypted nonces
// received during a nested authentication) only. Unlike other card only
// attacks this doesn't rely on implementation errors but only on the
// inherent weaknesses of the crypto1 cypher. Described in
// Carlo Meijer, Roel Verdult, "Ciphertext-only Cryptanalysis on Hardened
// Mifare Classic Cards" in Proceedings of the 22nd ACM SIGSAC Conference on
// Computer and Communications Security, 2015
//-----------------------------------------------------------------------------
// some helper functions which can benefit from SIMD instructions or other special instructions
//
#include "hardnested_bitarray_core.h"
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
// #include <stdint.h>
// #include <stdbool.h>
// #include <stdlib.h>
// #include <stdio.h>
// #include <malloc.h>
// #include <string.h>
// #include "crapto1/crapto1.h"
// #include "parity.h"
// this needs to be compiled several times for each instruction set.
// For each instruction set, define a dedicated function name:
#if defined (__AVX512F__)
#define MALLOC_BITARRAY malloc_bitarray_AVX512
#define FREE_BITARRAY free_bitarray_AVX512
#define BITCOUNT bitcount_AVX512
#define COUNT_STATES count_states_AVX512
#define BITARRAY_AND bitarray_AND_AVX512
#define BITARRAY_LOW20_AND bitarray_low20_AND_AVX512
#define COUNT_BITARRAY_AND count_bitarray_AND_AVX512
#define COUNT_BITARRAY_LOW20_AND count_bitarray_low20_AND_AVX512
#define BITARRAY_AND4 bitarray_AND4_AVX512
#define BITARRAY_OR bitarray_OR_AVX512
#define COUNT_BITARRAY_AND2 count_bitarray_AND2_AVX512
#define COUNT_BITARRAY_AND3 count_bitarray_AND3_AVX512
#define COUNT_BITARRAY_AND4 count_bitarray_AND4_AVX512
#elif defined (__AVX2__)
#define MALLOC_BITARRAY malloc_bitarray_AVX2
#define FREE_BITARRAY free_bitarray_AVX2
#define BITCOUNT bitcount_AVX2
#define COUNT_STATES count_states_AVX2
#define BITARRAY_AND bitarray_AND_AVX2
#define BITARRAY_LOW20_AND bitarray_low20_AND_AVX2
#define COUNT_BITARRAY_AND count_bitarray_AND_AVX2
#define COUNT_BITARRAY_LOW20_AND count_bitarray_low20_AND_AVX2
#define BITARRAY_AND4 bitarray_AND4_AVX2
#define BITARRAY_OR bitarray_OR_AVX2
#define COUNT_BITARRAY_AND2 count_bitarray_AND2_AVX2
#define COUNT_BITARRAY_AND3 count_bitarray_AND3_AVX2
#define COUNT_BITARRAY_AND4 count_bitarray_AND4_AVX2
#elif defined (__AVX__)
#define MALLOC_BITARRAY malloc_bitarray_AVX
#define FREE_BITARRAY free_bitarray_AVX
#define BITCOUNT bitcount_AVX
#define COUNT_STATES count_states_AVX
#define BITARRAY_AND bitarray_AND_AVX
#define BITARRAY_LOW20_AND bitarray_low20_AND_AVX
#define COUNT_BITARRAY_AND count_bitarray_AND_AVX
#define COUNT_BITARRAY_LOW20_AND count_bitarray_low20_AND_AVX
#define BITARRAY_AND4 bitarray_AND4_AVX
#define BITARRAY_OR bitarray_OR_AVX
#define COUNT_BITARRAY_AND2 count_bitarray_AND2_AVX
#define COUNT_BITARRAY_AND3 count_bitarray_AND3_AVX
#define COUNT_BITARRAY_AND4 count_bitarray_AND4_AVX
#elif defined (__SSE2__)
#define MALLOC_BITARRAY malloc_bitarray_SSE2
#define FREE_BITARRAY free_bitarray_SSE2
#define BITCOUNT bitcount_SSE2
#define COUNT_STATES count_states_SSE2
#define BITARRAY_AND bitarray_AND_SSE2
#define BITARRAY_LOW20_AND bitarray_low20_AND_SSE2
#define COUNT_BITARRAY_AND count_bitarray_AND_SSE2
#define COUNT_BITARRAY_LOW20_AND count_bitarray_low20_AND_SSE2
#define BITARRAY_AND4 bitarray_AND4_SSE2
#define BITARRAY_OR bitarray_OR_SSE2
#define COUNT_BITARRAY_AND2 count_bitarray_AND2_SSE2
#define COUNT_BITARRAY_AND3 count_bitarray_AND3_SSE2
#define COUNT_BITARRAY_AND4 count_bitarray_AND4_SSE2
#elif defined (__MMX__)
#define MALLOC_BITARRAY malloc_bitarray_MMX
#define FREE_BITARRAY free_bitarray_MMX
#define BITCOUNT bitcount_MMX
#define COUNT_STATES count_states_MMX
#define BITARRAY_AND bitarray_AND_MMX
#define BITARRAY_LOW20_AND bitarray_low20_AND_MMX
#define COUNT_BITARRAY_AND count_bitarray_AND_MMX
#define COUNT_BITARRAY_LOW20_AND count_bitarray_low20_AND_MMX
#define BITARRAY_AND4 bitarray_AND4_MMX
#define BITARRAY_OR bitarray_OR_MMX
#define COUNT_BITARRAY_AND2 count_bitarray_AND2_MMX
#define COUNT_BITARRAY_AND3 count_bitarray_AND3_MMX
#define COUNT_BITARRAY_AND4 count_bitarray_AND4_MMX
#endif
// typedefs and declaration of functions:
typedef uint32_t* malloc_bitarray_t(uint32_t);
malloc_bitarray_t malloc_bitarray_AVX512, malloc_bitarray_AVX2, malloc_bitarray_AVX, malloc_bitarray_SSE2, malloc_bitarray_MMX, malloc_bitarray_dispatch;
typedef void free_bitarray_t(uint32_t*);
free_bitarray_t free_bitarray_AVX512, free_bitarray_AVX2, free_bitarray_AVX, free_bitarray_SSE2, free_bitarray_MMX, free_bitarray_dispatch;
typedef uint32_t bitcount_t(uint32_t);
bitcount_t bitcount_AVX512, bitcount_AVX2, bitcount_AVX, bitcount_SSE2, bitcount_MMX, bitcount_dispatch;
typedef uint32_t count_states_t(uint32_t*);
count_states_t count_states_AVX512, count_states_AVX2, count_states_AVX, count_states_SSE2, count_states_MMX, count_states_dispatch;
typedef void bitarray_AND_t(uint32_t[], uint32_t[]);
bitarray_AND_t bitarray_AND_AVX512, bitarray_AND_AVX2, bitarray_AND_AVX, bitarray_AND_SSE2, bitarray_AND_MMX, bitarray_AND_dispatch;
typedef void bitarray_low20_AND_t(uint32_t*, uint32_t*);
bitarray_low20_AND_t bitarray_low20_AND_AVX512, bitarray_low20_AND_AVX2, bitarray_low20_AND_AVX, bitarray_low20_AND_SSE2, bitarray_low20_AND_MMX, bitarray_low20_AND_dispatch;
typedef uint32_t count_bitarray_AND_t(uint32_t*, uint32_t*);
count_bitarray_AND_t count_bitarray_AND_AVX512, count_bitarray_AND_AVX2, count_bitarray_AND_AVX, count_bitarray_AND_SSE2, count_bitarray_AND_MMX, count_bitarray_AND_dispatch;
typedef uint32_t count_bitarray_low20_AND_t(uint32_t*, uint32_t*);
count_bitarray_low20_AND_t count_bitarray_low20_AND_AVX512, count_bitarray_low20_AND_AVX2, count_bitarray_low20_AND_AVX, count_bitarray_low20_AND_SSE2, count_bitarray_low20_AND_MMX, count_bitarray_low20_AND_dispatch;
typedef void bitarray_AND4_t(uint32_t*, uint32_t*, uint32_t*, uint32_t*);
bitarray_AND4_t bitarray_AND4_AVX512, bitarray_AND4_AVX2, bitarray_AND4_AVX, bitarray_AND4_SSE2, bitarray_AND4_MMX, bitarray_AND4_dispatch;
typedef void bitarray_OR_t(uint32_t[], uint32_t[]);
bitarray_OR_t bitarray_OR_AVX512, bitarray_OR_AVX2, bitarray_OR_AVX, bitarray_OR_SSE2, bitarray_OR_MMX, bitarray_OR_dispatch;
typedef uint32_t count_bitarray_AND2_t(uint32_t*, uint32_t*);
count_bitarray_AND2_t count_bitarray_AND2_AVX512, count_bitarray_AND2_AVX2, count_bitarray_AND2_AVX, count_bitarray_AND2_SSE2, count_bitarray_AND2_MMX, count_bitarray_AND2_dispatch;
typedef uint32_t count_bitarray_AND3_t(uint32_t*, uint32_t*, uint32_t*);
count_bitarray_AND3_t count_bitarray_AND3_AVX512, count_bitarray_AND3_AVX2, count_bitarray_AND3_AVX, count_bitarray_AND3_SSE2, count_bitarray_AND3_MMX, count_bitarray_AND3_dispatch;
typedef uint32_t count_bitarray_AND4_t(uint32_t*, uint32_t*, uint32_t*, uint32_t*);
count_bitarray_AND4_t count_bitarray_AND4_AVX512, count_bitarray_AND4_AVX2, count_bitarray_AND4_AVX, count_bitarray_AND4_SSE2, count_bitarray_AND4_MMX, count_bitarray_AND4_dispatch;
inline uint32_t *MALLOC_BITARRAY(uint32_t x)
{
#ifdef _WIN32
return __builtin_assume_aligned(_aligned_malloc((x), __BIGGEST_ALIGNMENT__), __BIGGEST_ALIGNMENT__);
#else
return __builtin_assume_aligned(memalign(__BIGGEST_ALIGNMENT__, (x)), __BIGGEST_ALIGNMENT__);
#endif
}
inline void FREE_BITARRAY(uint32_t *x)
{
#ifdef _WIN32
_aligned_free(x);
#else
free(x);
#endif
}
inline uint32_t BITCOUNT(uint32_t a)
{
return __builtin_popcountl(a);
}
inline uint32_t COUNT_STATES(uint32_t *A)
{
uint32_t count = 0;
for (uint32_t i = 0; i < (1<<19); i++) {
count += BITCOUNT(A[i]);
}
return count;
}
inline void BITARRAY_AND(uint32_t *restrict A, uint32_t *restrict B)
{
A = __builtin_assume_aligned(A, __BIGGEST_ALIGNMENT__);
B = __builtin_assume_aligned(B, __BIGGEST_ALIGNMENT__);
for (uint32_t i = 0; i < (1<<19); i++) {
A[i] &= B[i];
}
}
inline void BITARRAY_LOW20_AND(uint32_t *restrict A, uint32_t *restrict B)
{
uint16_t *a = (uint16_t *)__builtin_assume_aligned(A, __BIGGEST_ALIGNMENT__);
uint16_t *b = (uint16_t *)__builtin_assume_aligned(B, __BIGGEST_ALIGNMENT__);
for (uint32_t i = 0; i < (1<<20); i++) {
if (!b[i]) {
a[i] = 0;
}
}
}
inline uint32_t COUNT_BITARRAY_AND(uint32_t *restrict A, uint32_t *restrict B)
{
A = __builtin_assume_aligned(A, __BIGGEST_ALIGNMENT__);
B = __builtin_assume_aligned(B, __BIGGEST_ALIGNMENT__);
uint32_t count = 0;
for (uint32_t i = 0; i < (1<<19); i++) {
A[i] &= B[i];
count += BITCOUNT(A[i]);
}
return count;
}
inline uint32_t COUNT_BITARRAY_LOW20_AND(uint32_t *restrict A, uint32_t *restrict B)
{
uint16_t *a = (uint16_t *)__builtin_assume_aligned(A, __BIGGEST_ALIGNMENT__);
uint16_t *b = (uint16_t *)__builtin_assume_aligned(B, __BIGGEST_ALIGNMENT__);
uint32_t count = 0;
for (uint32_t i = 0; i < (1<<20); i++) {
if (!b[i]) {
a[i] = 0;
}
count += BITCOUNT(a[i]);
}
return count;
}
inline void BITARRAY_AND4(uint32_t *restrict A, uint32_t *restrict B, uint32_t *restrict C, uint32_t *restrict D)
{
A = __builtin_assume_aligned(A, __BIGGEST_ALIGNMENT__);
B = __builtin_assume_aligned(B, __BIGGEST_ALIGNMENT__);
C = __builtin_assume_aligned(C, __BIGGEST_ALIGNMENT__);
D = __builtin_assume_aligned(D, __BIGGEST_ALIGNMENT__);
for (uint32_t i = 0; i < (1<<19); i++) {
A[i] = B[i] & C[i] & D[i];
}
}
inline void BITARRAY_OR(uint32_t *restrict A, uint32_t *restrict B)
{
A = __builtin_assume_aligned(A, __BIGGEST_ALIGNMENT__);
B = __builtin_assume_aligned(B, __BIGGEST_ALIGNMENT__);
for (uint32_t i = 0; i < (1<<19); i++) {
A[i] |= B[i];
}
}
inline uint32_t COUNT_BITARRAY_AND2(uint32_t *restrict A, uint32_t *restrict B)
{
A = __builtin_assume_aligned(A, __BIGGEST_ALIGNMENT__);
B = __builtin_assume_aligned(B, __BIGGEST_ALIGNMENT__);
uint32_t count = 0;
for (uint32_t i = 0; i < (1<<19); i++) {
count += BITCOUNT(A[i] & B[i]);
}
return count;
}
inline uint32_t COUNT_BITARRAY_AND3(uint32_t *restrict A, uint32_t *restrict B, uint32_t *restrict C)
{
A = __builtin_assume_aligned(A, __BIGGEST_ALIGNMENT__);
B = __builtin_assume_aligned(B, __BIGGEST_ALIGNMENT__);
C = __builtin_assume_aligned(C, __BIGGEST_ALIGNMENT__);
uint32_t count = 0;
for (uint32_t i = 0; i < (1<<19); i++) {
count += BITCOUNT(A[i] & B[i] & C[i]);
}
return count;
}
inline uint32_t COUNT_BITARRAY_AND4(uint32_t *restrict A, uint32_t *restrict B, uint32_t *restrict C, uint32_t *restrict D)
{
A = __builtin_assume_aligned(A, __BIGGEST_ALIGNMENT__);
B = __builtin_assume_aligned(B, __BIGGEST_ALIGNMENT__);
C = __builtin_assume_aligned(C, __BIGGEST_ALIGNMENT__);
D = __builtin_assume_aligned(D, __BIGGEST_ALIGNMENT__);
uint32_t count = 0;
for (uint32_t i = 0; i < (1<<19); i++) {
count += BITCOUNT(A[i] & B[i] & C[i] & D[i]);
}
return count;
}
#ifndef __MMX__
// pointers to functions:
malloc_bitarray_t *malloc_bitarray_function_p = &malloc_bitarray_dispatch;
free_bitarray_t *free_bitarray_function_p = &free_bitarray_dispatch;
bitcount_t *bitcount_function_p = &bitcount_dispatch;
count_states_t *count_states_function_p = &count_states_dispatch;
bitarray_AND_t *bitarray_AND_function_p = &bitarray_AND_dispatch;
bitarray_low20_AND_t *bitarray_low20_AND_function_p = &bitarray_low20_AND_dispatch;
count_bitarray_AND_t *count_bitarray_AND_function_p = &count_bitarray_AND_dispatch;
count_bitarray_low20_AND_t *count_bitarray_low20_AND_function_p = &count_bitarray_low20_AND_dispatch;
bitarray_AND4_t *bitarray_AND4_function_p = &bitarray_AND4_dispatch;
bitarray_OR_t *bitarray_OR_function_p = &bitarray_OR_dispatch;
count_bitarray_AND2_t *count_bitarray_AND2_function_p = &count_bitarray_AND2_dispatch;
count_bitarray_AND3_t *count_bitarray_AND3_function_p = &count_bitarray_AND3_dispatch;
count_bitarray_AND4_t *count_bitarray_AND4_function_p = &count_bitarray_AND4_dispatch;
// determine the available instruction set at runtime and call the correct function
uint32_t *malloc_bitarray_dispatch(uint32_t x) {
if (__builtin_cpu_supports("avx512f")) malloc_bitarray_function_p = &malloc_bitarray_AVX512;
else if (__builtin_cpu_supports("avx2")) malloc_bitarray_function_p = &malloc_bitarray_AVX2;
else if (__builtin_cpu_supports("avx")) malloc_bitarray_function_p = &malloc_bitarray_AVX;
else if (__builtin_cpu_supports("sse2")) malloc_bitarray_function_p = &malloc_bitarray_SSE2;
else if (__builtin_cpu_supports("mmx")) malloc_bitarray_function_p = &malloc_bitarray_MMX;
else {
printf("\nFatal: you need at least a CPU with MMX instruction set support. Aborting...\n");
exit(5);
}
// call the most optimized function for this CPU
return (*malloc_bitarray_function_p)(x);
}
void free_bitarray_dispatch(uint32_t *x) {
if (__builtin_cpu_supports("avx512f")) free_bitarray_function_p = &free_bitarray_AVX512;
else if (__builtin_cpu_supports("avx2")) free_bitarray_function_p = &free_bitarray_AVX2;
else if (__builtin_cpu_supports("avx")) free_bitarray_function_p = &free_bitarray_AVX;
else if (__builtin_cpu_supports("sse2")) free_bitarray_function_p = &free_bitarray_SSE2;
else if (__builtin_cpu_supports("mmx")) free_bitarray_function_p = &free_bitarray_MMX;
else {
printf("\nFatal: you need at least a CPU with MMX instruction set support. Aborting...\n");
exit(5);
}
// call the most optimized function for this CPU
(*free_bitarray_function_p)(x);
}
uint32_t bitcount_dispatch(uint32_t a) {
if (__builtin_cpu_supports("avx512f")) bitcount_function_p = &bitcount_AVX512;
else if (__builtin_cpu_supports("avx2")) bitcount_function_p = &bitcount_AVX2;
else if (__builtin_cpu_supports("avx")) bitcount_function_p = &bitcount_AVX;
else if (__builtin_cpu_supports("sse2")) bitcount_function_p = &bitcount_SSE2;
else if (__builtin_cpu_supports("mmx")) bitcount_function_p = &bitcount_MMX;
else {
printf("\nFatal: you need at least a CPU with MMX instruction set support. Aborting...\n");
exit(5);
}
// call the most optimized function for this CPU
return (*bitcount_function_p)(a);
}
uint32_t count_states_dispatch(uint32_t *bitarray) {
if (__builtin_cpu_supports("avx512f")) count_states_function_p = &count_states_AVX512;
else if (__builtin_cpu_supports("avx2")) count_states_function_p = &count_states_AVX2;
else if (__builtin_cpu_supports("avx")) count_states_function_p = &count_states_AVX;
else if (__builtin_cpu_supports("sse2")) count_states_function_p = &count_states_SSE2;
else if (__builtin_cpu_supports("mmx")) count_states_function_p = &count_states_MMX;
else {
printf("\nFatal: you need at least a CPU with MMX instruction set support. Aborting...\n");
exit(5);
}
// call the most optimized function for this CPU
return (*count_states_function_p)(bitarray);
}
void bitarray_AND_dispatch(uint32_t *A, uint32_t *B) {
if (__builtin_cpu_supports("avx512f")) bitarray_AND_function_p = &bitarray_AND_AVX512;
else if (__builtin_cpu_supports("avx2")) bitarray_AND_function_p = &bitarray_AND_AVX2;
else if (__builtin_cpu_supports("avx")) bitarray_AND_function_p = &bitarray_AND_AVX;
else if (__builtin_cpu_supports("sse2")) bitarray_AND_function_p = &bitarray_AND_SSE2;
else if (__builtin_cpu_supports("mmx")) bitarray_AND_function_p = &bitarray_AND_MMX;
else {
printf("\nFatal: you need at least a CPU with MMX instruction set support. Aborting...\n");
exit(5);
}
// call the most optimized function for this CPU
(*bitarray_AND_function_p)(A,B);
}
void bitarray_low20_AND_dispatch(uint32_t *A, uint32_t *B) {
if (__builtin_cpu_supports("avx512f")) bitarray_low20_AND_function_p = &bitarray_low20_AND_AVX512;
else if (__builtin_cpu_supports("avx2")) bitarray_low20_AND_function_p = &bitarray_low20_AND_AVX2;
else if (__builtin_cpu_supports("avx")) bitarray_low20_AND_function_p = &bitarray_low20_AND_AVX;
else if (__builtin_cpu_supports("sse2")) bitarray_low20_AND_function_p = &bitarray_low20_AND_SSE2;
else if (__builtin_cpu_supports("mmx")) bitarray_low20_AND_function_p = &bitarray_low20_AND_MMX;
else {
printf("\nFatal: you need at least a CPU with MMX instruction set support. Aborting...\n");
exit(5);
}
// call the most optimized function for this CPU
(*bitarray_low20_AND_function_p)(A, B);
}
uint32_t count_bitarray_AND_dispatch(uint32_t *A, uint32_t *B) {
if (__builtin_cpu_supports("avx512f")) count_bitarray_AND_function_p = &count_bitarray_AND_AVX512;
else if (__builtin_cpu_supports("avx2")) count_bitarray_AND_function_p = &count_bitarray_AND_AVX2;
else if (__builtin_cpu_supports("avx")) count_bitarray_AND_function_p = &count_bitarray_AND_AVX;
else if (__builtin_cpu_supports("sse2")) count_bitarray_AND_function_p = &count_bitarray_AND_SSE2;
else if (__builtin_cpu_supports("mmx")) count_bitarray_AND_function_p = &count_bitarray_AND_MMX;
else {
printf("\nFatal: you need at least a CPU with MMX instruction set support. Aborting...\n");
exit(5);
}
// call the most optimized function for this CPU
return (*count_bitarray_AND_function_p)(A, B);
}
uint32_t count_bitarray_low20_AND_dispatch(uint32_t *A, uint32_t *B) {
if (__builtin_cpu_supports("avx512f")) count_bitarray_low20_AND_function_p = &count_bitarray_low20_AND_AVX512;
else if (__builtin_cpu_supports("avx2")) count_bitarray_low20_AND_function_p = &count_bitarray_low20_AND_AVX2;
else if (__builtin_cpu_supports("avx")) count_bitarray_low20_AND_function_p = &count_bitarray_low20_AND_AVX;
else if (__builtin_cpu_supports("sse2")) count_bitarray_low20_AND_function_p = &count_bitarray_low20_AND_SSE2;
else if (__builtin_cpu_supports("mmx")) count_bitarray_low20_AND_function_p = &count_bitarray_low20_AND_MMX;
else {
printf("\nFatal: you need at least a CPU with MMX instruction set support. Aborting...\n");
exit(5);
}
// call the most optimized function for this CPU
return (*count_bitarray_low20_AND_function_p)(A, B);
}
void bitarray_AND4_dispatch(uint32_t *A, uint32_t *B, uint32_t *C, uint32_t *D) {
if (__builtin_cpu_supports("avx512f")) bitarray_AND4_function_p = &bitarray_AND4_AVX512;
else if (__builtin_cpu_supports("avx2")) bitarray_AND4_function_p = &bitarray_AND4_AVX2;
else if (__builtin_cpu_supports("avx")) bitarray_AND4_function_p = &bitarray_AND4_AVX;
else if (__builtin_cpu_supports("sse2")) bitarray_AND4_function_p = &bitarray_AND4_SSE2;
else if (__builtin_cpu_supports("mmx")) bitarray_AND4_function_p = &bitarray_AND4_MMX;
else {
printf("\nFatal: you need at least a CPU with MMX instruction set support. Aborting...\n");
exit(5);
}
// call the most optimized function for this CPU
(*bitarray_AND4_function_p)(A, B, C, D);
}
void bitarray_OR_dispatch(uint32_t *A, uint32_t *B) {
if (__builtin_cpu_supports("avx512f")) bitarray_OR_function_p = &bitarray_OR_AVX512;
else if (__builtin_cpu_supports("avx2")) bitarray_OR_function_p = &bitarray_OR_AVX2;
else if (__builtin_cpu_supports("avx")) bitarray_OR_function_p = &bitarray_OR_AVX;
else if (__builtin_cpu_supports("sse2")) bitarray_OR_function_p = &bitarray_OR_SSE2;
else if (__builtin_cpu_supports("mmx")) bitarray_OR_function_p = &bitarray_OR_MMX;
else {
printf("\nFatal: you need at least a CPU with MMX instruction set support. Aborting...\n");
exit(5);
}
// call the most optimized function for this CPU
(*bitarray_OR_function_p)(A,B);
}
uint32_t count_bitarray_AND2_dispatch(uint32_t *A, uint32_t *B) {
if (__builtin_cpu_supports("avx512f")) count_bitarray_AND2_function_p = &count_bitarray_AND2_AVX512;
else if (__builtin_cpu_supports("avx2")) count_bitarray_AND2_function_p = &count_bitarray_AND2_AVX2;
else if (__builtin_cpu_supports("avx")) count_bitarray_AND2_function_p = &count_bitarray_AND2_AVX;
else if (__builtin_cpu_supports("sse2")) count_bitarray_AND2_function_p = &count_bitarray_AND2_SSE2;
else if (__builtin_cpu_supports("mmx")) count_bitarray_AND2_function_p = &count_bitarray_AND2_MMX;
else {
printf("\nFatal: you need at least a CPU with MMX instruction set support. Aborting...\n");
exit(5);
}
// call the most optimized function for this CPU
return (*count_bitarray_AND2_function_p)(A, B);
}
uint32_t count_bitarray_AND3_dispatch(uint32_t *A, uint32_t *B, uint32_t *C) {
if (__builtin_cpu_supports("avx512f")) count_bitarray_AND3_function_p = &count_bitarray_AND3_AVX512;
else if (__builtin_cpu_supports("avx2")) count_bitarray_AND3_function_p = &count_bitarray_AND3_AVX2;
else if (__builtin_cpu_supports("avx")) count_bitarray_AND3_function_p = &count_bitarray_AND3_AVX;
else if (__builtin_cpu_supports("sse2")) count_bitarray_AND3_function_p = &count_bitarray_AND3_SSE2;
else if (__builtin_cpu_supports("mmx")) count_bitarray_AND3_function_p = &count_bitarray_AND3_MMX;
else {
printf("\nFatal: you need at least a CPU with MMX instruction set support. Aborting...\n");
exit(5);
}
// call the most optimized function for this CPU
return (*count_bitarray_AND3_function_p)(A, B, C);
}
uint32_t count_bitarray_AND4_dispatch(uint32_t *A, uint32_t *B, uint32_t *C, uint32_t *D) {
if (__builtin_cpu_supports("avx512f")) count_bitarray_AND4_function_p = &count_bitarray_AND4_AVX512;
else if (__builtin_cpu_supports("avx2")) count_bitarray_AND4_function_p = &count_bitarray_AND4_AVX2;
else if (__builtin_cpu_supports("avx")) count_bitarray_AND4_function_p = &count_bitarray_AND4_AVX;
else if (__builtin_cpu_supports("sse2")) count_bitarray_AND4_function_p = &count_bitarray_AND4_SSE2;
else if (__builtin_cpu_supports("mmx")) count_bitarray_AND4_function_p = &count_bitarray_AND4_MMX;
else {
printf("\nFatal: you need at least a CPU with MMX instruction set support. Aborting...\n");
exit(5);
}
// call the most optimized function for this CPU
return (*count_bitarray_AND4_function_p)(A, B, C, D);
}
///////////////////////////////////////////////77
// Entries to dispatched function calls
uint32_t *malloc_bitarray(uint32_t x) {
return (*malloc_bitarray_function_p)(x);
}
void free_bitarray(uint32_t *x) {
(*free_bitarray_function_p)(x);
}
uint32_t bitcount(uint32_t a) {
return (*bitcount_function_p)(a);
}
uint32_t count_states(uint32_t *bitarray) {
return (*count_states_function_p)(bitarray);
}
void bitarray_AND(uint32_t *A, uint32_t *B) {
(*bitarray_AND_function_p)(A, B);
}
void bitarray_low20_AND(uint32_t *A, uint32_t *B) {
(*bitarray_low20_AND_function_p)(A, B);
}
uint32_t count_bitarray_AND(uint32_t *A, uint32_t *B) {
return (*count_bitarray_AND_function_p)(A, B);
}
uint32_t count_bitarray_low20_AND(uint32_t *A, uint32_t *B) {
return (*count_bitarray_low20_AND_function_p)(A, B);
}
void bitarray_AND4(uint32_t *A, uint32_t *B, uint32_t *C, uint32_t *D) {
(*bitarray_AND4_function_p)(A, B, C, D);
}
void bitarray_OR(uint32_t *A, uint32_t *B) {
(*bitarray_OR_function_p)(A, B);
}
uint32_t count_bitarray_AND2(uint32_t *A, uint32_t *B) {
return (*count_bitarray_AND2_function_p)(A, B);
}
uint32_t count_bitarray_AND3(uint32_t *A, uint32_t *B, uint32_t *C) {
return (*count_bitarray_AND3_function_p)(A, B, C);
}
uint32_t count_bitarray_AND4(uint32_t *A, uint32_t *B, uint32_t *C, uint32_t *D) {
return (*count_bitarray_AND4_function_p)(A, B, C, D);
}
#endif