Patch for libecb/ecb.h, revision 1.180 -> 1.192 (unified diff).
The hunks below extend the copyright years, bump ECB_VERSION to 0x00010009,
include <string.h> near the top of the header (for memcpy), exclude MinGW from
the _WIN32 fallback typedefs and define ECB_64BIT_NATIVE.

--- libecb/ecb.h 2020/01/20 13:13:56 1.180
+++ libecb/ecb.h 2021/06/21 23:59:58 1.192
@@ -1,7 +1,7 @@
 /*
  * libecb - http://software.schmorp.de/pkg/libecb
  *
- * Copyright (©) 2009-2015 Marc Alexander Lehmann
+ * Copyright (©) 2009-2015,2018-2021 Marc Alexander Lehmann
  * Copyright (©) 2011 Emanuele Giaquinta
  * All rights reserved.
  *
@@ -42,9 +42,11 @@
 #define ECB_H
 
 /* 16 bits major, 16 bits minor */
-#define ECB_VERSION 0x00010008
+#define ECB_VERSION 0x00010009
 
-#ifdef _WIN32
+#include <string.h> /* for memcpy */
+
+#if defined (_WIN32) && !defined (__MINGW32__)
   typedef signed char int8_t;
   typedef unsigned char uint8_t;
   typedef signed char int_fast8_t;
@@ -104,6 +106,12 @@
   #endif
 #endif
 
+#if ECB_PTRSIZE >= 8 || ECB_AMD64_X32
+  #define ECB_64BIT_NATIVE 1
+#else
+  #define ECB_64BIT_NATIVE 0
+#endif
+
 /* many compilers define _GNUC_ to some versions but then only implement
  * what their idiot authors think are the "more important" extensions,
  * causing enormous grief in return for some better fake benchmark numbers.
@@ -242,6 +250,7 @@
       #define ECB_MEMORY_FENCE __atomic_thread_fence (__ATOMIC_SEQ_CST)
       #define ECB_MEMORY_FENCE_ACQUIRE __atomic_thread_fence (__ATOMIC_ACQUIRE)
       #define ECB_MEMORY_FENCE_RELEASE __atomic_thread_fence (__ATOMIC_RELEASE)
+      #undef ECB_MEMORY_FENCE_RELAXED
       #define ECB_MEMORY_FENCE_RELAXED __atomic_thread_fence (__ATOMIC_RELAXED)
 
     #elif ECB_CLANG_EXTENSION(c_atomic)
@@ -249,6 +258,7 @@
       #define ECB_MEMORY_FENCE __c11_atomic_thread_fence (__ATOMIC_SEQ_CST)
       #define ECB_MEMORY_FENCE_ACQUIRE __c11_atomic_thread_fence (__ATOMIC_ACQUIRE)
       #define ECB_MEMORY_FENCE_RELEASE __c11_atomic_thread_fence (__ATOMIC_RELEASE)
+      #undef ECB_MEMORY_FENCE_RELAXED
       #define ECB_MEMORY_FENCE_RELAXED __c11_atomic_thread_fence (__ATOMIC_RELAXED)
 
     #elif ECB_GCC_VERSION(4,4) || defined __INTEL_COMPILER || defined __clang__
@@ -610,6 +620,44 @@
 ecb_inline ecb_const uint64_t ecb_rotl64 (uint64_t x, unsigned int count) { return (x >> (64 - count)) | (x << count); }
 ecb_inline ecb_const uint64_t ecb_rotr64 (uint64_t x, unsigned int count) { return (x << (64 - count)) | (x >> count); }
 
+#if ECB_CPP
+
+inline uint8_t ecb_ctz (uint8_t v) { return ecb_ctz32 (v); }
+inline uint16_t ecb_ctz (uint16_t v) { return ecb_ctz32 (v); }
+inline uint32_t ecb_ctz (uint32_t v) { return ecb_ctz32 (v); }
+inline uint64_t ecb_ctz (uint64_t v) { return ecb_ctz64 (v); }
+
+inline bool ecb_is_pot (uint8_t v) { return ecb_is_pot32 (v); }
+inline bool ecb_is_pot (uint16_t v) { return ecb_is_pot32 (v); }
+inline bool ecb_is_pot (uint32_t v) { return ecb_is_pot32 (v); }
+inline bool ecb_is_pot (uint64_t v) { return ecb_is_pot64 (v); }
+
+inline int ecb_ld (uint8_t v) { return ecb_ld32 (v); }
+inline int ecb_ld (uint16_t v) { return ecb_ld32 (v); }
+inline int ecb_ld (uint32_t v) { return ecb_ld32 (v); }
+inline int ecb_ld (uint64_t v) { return ecb_ld64 (v); }
+
+inline int ecb_popcount (uint8_t v) { return ecb_popcount32 (v); }
+inline int ecb_popcount (uint16_t v) { return ecb_popcount32 (v); }
+inline int ecb_popcount (uint32_t v) { return ecb_popcount32 (v); }
+inline int ecb_popcount (uint64_t v) { return ecb_popcount64 (v); }
+
+inline uint8_t ecb_bitrev (uint8_t v) { return ecb_bitrev8 (v); }
+inline uint16_t ecb_bitrev (uint16_t v) { return ecb_bitrev16 (v); }
+inline uint32_t ecb_bitrev (uint32_t v) { return ecb_bitrev32 (v); }
+
+inline uint8_t ecb_rotl (uint8_t v, unsigned int count) { return ecb_rotl8 (v, count); }
+inline uint16_t ecb_rotl (uint16_t v, unsigned int count) { return ecb_rotl16 (v, count); }
+inline uint32_t ecb_rotl (uint32_t v, unsigned int count) { return ecb_rotl32 (v, count); }
+inline uint64_t ecb_rotl (uint64_t v, unsigned int count) { return ecb_rotl64 (v, count); }
+
+inline uint8_t ecb_rotr (uint8_t v, unsigned int count) { return ecb_rotr8 (v, count); }
+inline uint16_t ecb_rotr (uint16_t v, unsigned int count) { return ecb_rotr16 (v, count); }
+inline uint32_t ecb_rotr (uint32_t v, unsigned int count) { return ecb_rotr32 (v, count); }
+inline uint64_t ecb_rotr (uint64_t v, unsigned int count) { return ecb_rotr64 (v, count); }
+
+#endif
+
 #if ECB_GCC_VERSION(4,3) || (ECB_CLANG_BUILTIN(__builtin_bswap32) && ECB_CLANG_BUILTIN(__builtin_bswap64))
   #if ECB_GCC_VERSION(4,8) || ECB_CLANG_BUILTIN(__builtin_bswap16)
     #define ecb_bswap16(x) __builtin_bswap16 (x)
@@ -733,7 +781,7 @@
 ecb_inline void ecb_poke_le_u32_u (void *ptr, uint_fast32_t v) { ecb_poke_u32_u (ptr, ecb_host_to_le_u32 (v)); }
 ecb_inline void ecb_poke_le_u64_u (void *ptr, uint_fast64_t v) { ecb_poke_u64_u (ptr, ecb_host_to_le_u64 (v)); }
 
-#ifdef __cplusplus
+#if ECB_CPP
 
 inline uint8_t ecb_bswap (uint8_t v) { return v; }
 inline uint16_t ecb_bswap (uint16_t v) { return ecb_bswap16 (v); }
@@ -745,7 +793,7 @@
 template<typename T> inline T ecb_peek (const void *ptr) { return *(const T *)ptr; }
 template<typename T> inline T ecb_peek_be (const void *ptr) { return ecb_be_to_host (ecb_peek<T> (ptr)); }
 template<typename T> inline T ecb_peek_le (const void *ptr) { return ecb_le_to_host (ecb_peek<T> (ptr)); }
-template<typename T> inline T ecb_peek_u (const void *ptr) { T v; std::memcpy (&v, ptr, sizeof (v)); return v; }
+template<typename T> inline T ecb_peek_u (const void *ptr) { T v; memcpy (&v, ptr, sizeof (v)); return v; }
 template<typename T> inline T ecb_peek_be_u (const void *ptr) { return ecb_be_to_host (ecb_peek_u<T> (ptr)); }
 template<typename T> inline T ecb_peek_le_u (const void *ptr) { return ecb_le_to_host (ecb_peek_u<T> (ptr)); }
 
@@ -754,15 +802,17 @@
 template<typename T> inline void ecb_poke (void *ptr, T v) { *(T *)ptr = v; }
 template<typename T> inline void ecb_poke_be (void *ptr, T v) { return ecb_poke (ptr, ecb_host_to_be (v)); }
 template<typename T> inline void ecb_poke_le (void *ptr, T v) { return ecb_poke (ptr, ecb_host_to_le (v)); }
-template<typename T> inline void ecb_poke_u (void *ptr, T v) { std::memcpy (ptr, &v, sizeof (v)); }
+template<typename T> inline void ecb_poke_u (void *ptr, T v) { memcpy (ptr, &v, sizeof (v)); }
 template<typename T> inline void ecb_poke_be_u (void *ptr, T v) { return ecb_poke_u (ptr, ecb_host_to_be (v)); }
 template<typename T> inline void ecb_poke_le_u (void *ptr, T v) { return ecb_poke_u (ptr, ecb_host_to_le (v)); }
 
 #endif
 
 /*****************************************************************************/
+/* division */
 
 #if ECB_GCC_VERSION(3,0) || ECB_C99
+  /* C99 tightened the definition of %, so we can use a more efficient version */
   #define ecb_mod(m,n) ((m) % (n) + ((m) % (n) < 0 ? (n) : 0))
 #else
   #define ecb_mod(m,n) ((m) < 0 ? ((n) - 1 - ((-1 - (m)) % (n))) : ((m) % (n)))
@@ -784,6 +834,9 @@
   #define ecb_div_ru(val,div) ((val) < 0 ? - ((-(val) ) / (div)) : ((val) + (div) - 1) / (div))
 #endif
 
+/*****************************************************************************/
+/* array length */
+
 #if ecb_cplusplus_does_not_suck
   /* does not work for local types (http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2657.htm) */
   template<typename T, int N>
@@ -796,6 +849,7 @@
 #endif
 
 /*****************************************************************************/
+/* IEEE 754-2008 half float conversions */
 
 ecb_function_ ecb_const uint32_t ecb_binary16_to_binary32 (uint32_t x);
 ecb_function_ ecb_const uint32_t
@@ -834,7 +888,7 @@
 ecb_binary32_to_binary16 (uint32_t x)
 {
   unsigned int s = (x >> 16) & 0x00008000; /* sign bit, the easy part */
-  unsigned int e = ((x >> 23) & 0x000000ff) - (127 - 15); /* the desired exponent */
+  int e = ((x >> 23) & 0x000000ff) - (127 - 15); /* the desired exponent */
   unsigned int m = x & 0x007fffff;
 
   x &= 0x7fffffff;
@@ -894,6 +948,215 @@
 }
 
 /*******************************************************************************/
+/* fast integer to ascii */
+
+// simply return a mask with "bits" bits set
+#define ecb_i2a_mask(type,bits) ((((type)1) << (bits)) - 1)
+
+// output a single digit. maskvalue is 10**digitidx
+#define ecb_i2a_digit(type,bits,digitmask,maskvalue,digitidx) \
+  if (digitmask >= maskvalue) /* constant, used to decide how many digits to generate */ \
+    { \
+      char digit = x >> (bits - digitidx); /* calculate the topmost digit */ \
+      *ptr = digit + '0'; /* output it */ \
+      nz = (digitmask == maskvalue) || nz || digit; /* first term == always output last digit */ \
+      ptr += nz; /* output digit only if non-zero digit seen */ \
+      x = (x & ecb_i2a_mask (type, bits - digitidx)) * 5; /* *10, but shift decimal point right */ \
+    }
+
+// convert integer to fixed point format and multiply out digits, highest first
+// requires magic constants: max. digits and number of bits after the decimal point
+#define ecb_i2a_def(suffix,ptr,v,type,bits,digitmask,lz) \
+ecb_inline char *ecb_i2a_ ## suffix (char *ptr, uint32_t u) \
+{ \
+  char nz = lz; /* non-zero digit seen? */ \
+  /* convert to x.bits fixed-point */ \
+  type x = u * ((ecb_i2a_mask (type, bits) + digitmask) / digitmask); \
+  /* output up to 10 digits */ \
+  ecb_i2a_digit (type,bits,digitmask, 1, 0); \
+  ecb_i2a_digit (type,bits,digitmask, 10, 1); \
+  ecb_i2a_digit (type,bits,digitmask, 100, 2); \
+  ecb_i2a_digit (type,bits,digitmask, 1000, 3); \
+  ecb_i2a_digit (type,bits,digitmask, 10000, 4); \
+  ecb_i2a_digit (type,bits,digitmask, 100000, 5); \
+  ecb_i2a_digit (type,bits,digitmask, 1000000, 6); \
+  ecb_i2a_digit (type,bits,digitmask, 10000000, 7); \
+  ecb_i2a_digit (type,bits,digitmask, 100000000, 8); \
+  ecb_i2a_digit (type,bits,digitmask, 1000000000, 9); \
+  return ptr; \
+}
+
+// predefined versions of the above, for various digits
+// ecb_i2a_xN = almost N digits, limit defined by macro
+// ecb_i2a_N = up to N digits, leading zeroes suppressed
+// ecb_i2a_0N = exactly N digits, including leading zeroes
+
+// non-leading-zero versions, limited range
+#define ECB_I2A_MAX_X5 59074 // limit for ecb_i2a_x5
+#define ECB_I2A_MAX_X10 2932500665 // limit for ecb_i2a_x10
+ecb_i2a_def ( x5, ptr, v, uint32_t, 26, 10000, 0)
+ecb_i2a_def (x10, ptr, v, uint64_t, 60, 1000000000, 0)
+
+// non-leading zero versions, all digits, 4 and 9 are optimal for 32/64 bit
+ecb_i2a_def ( 2, ptr, v, uint32_t, 10, 10, 0)
+ecb_i2a_def ( 3, ptr, v, uint32_t, 12, 100, 0)
+ecb_i2a_def ( 4, ptr, v, uint32_t, 26, 1000, 0)
+ecb_i2a_def ( 5, ptr, v, uint64_t, 30, 10000, 0)
+ecb_i2a_def ( 6, ptr, v, uint64_t, 36, 100000, 0)
+ecb_i2a_def ( 7, ptr, v, uint64_t, 44, 1000000, 0)
+ecb_i2a_def ( 8, ptr, v, uint64_t, 50, 10000000, 0)
+ecb_i2a_def ( 9, ptr, v, uint64_t, 56, 100000000, 0)
+
+// leading-zero versions, all digits, 04 and 09 are optimal for 32/64 bit
+ecb_i2a_def (02, ptr, v, uint32_t, 10, 10, 1)
+ecb_i2a_def (03, ptr, v, uint32_t, 12, 100, 1)
+ecb_i2a_def (04, ptr, v, uint32_t, 26, 1000, 1)
+ecb_i2a_def (05, ptr, v, uint64_t, 30, 10000, 1)
+ecb_i2a_def (06, ptr, v, uint64_t, 36, 100000, 1)
+ecb_i2a_def (07, ptr, v, uint64_t, 44, 1000000, 1)
+ecb_i2a_def (08, ptr, v, uint64_t, 50, 10000000, 1)
+ecb_i2a_def (09, ptr, v, uint64_t, 56, 100000000, 1)
+
+#define ECB_I2A_I32_DIGITS 11 // sign + 10 digits
+#define ECB_I2A_U32_DIGITS 10 // 10 digits
+#define ECB_I2A_I64_DIGITS 20 // sign + 19 digits
+#define ECB_I2A_U64_DIGITS 21 // 20 digits suffice; 21 kept as a safe upper bound
+#define ECB_I2A_DIGITS 21 // largest of the above
+
+ecb_inline char *
+ecb_i2a_u32 (char *ptr, uint32_t u)
+{
+  #if ECB_64BIT_NATIVE
+    if (ecb_expect_true (u <= ECB_I2A_MAX_X10))
+      ptr = ecb_i2a_x10 (ptr, u);
+    else // x10 almost, but not fully, covers 32 bit
+      {
+        uint32_t u1 = u % 1000000000;
+        uint32_t u2 = u / 1000000000;
+
+        *ptr++ = u2 + '0';
+        ptr = ecb_i2a_09 (ptr, u1);
+      }
+  #else
+    if (ecb_expect_true (u <= ECB_I2A_MAX_X5))
+      ptr = ecb_i2a_x5 (ptr, u); // must keep the advanced pointer, it is the return value
+    else if (ecb_expect_true (u <= ECB_I2A_MAX_X5 * 10000))
+      {
+        uint32_t u1 = u % 10000;
+        uint32_t u2 = u / 10000;
+
+        ptr = ecb_i2a_x5 (ptr, u2);
+        ptr = ecb_i2a_04 (ptr, u1);
+      }
+    else
+      {
+        uint32_t u1 = u % 10000;
+        uint32_t ua = u / 10000;
+        uint32_t u2 = ua % 10000;
+        uint32_t u3 = ua / 10000;
+
+        ptr = ecb_i2a_2 (ptr, u3);
+        ptr = ecb_i2a_04 (ptr, u2);
+        ptr = ecb_i2a_04 (ptr, u1);
+      }
+  #endif
+
+  return ptr;
+}
+
+ecb_inline char *
+ecb_i2a_i32 (char *ptr, int32_t v)
+{
+  *ptr = '-'; ptr += v < 0;
+  uint32_t u = v < 0 ? -(uint32_t)v : v;
+
+  #if ECB_64BIT_NATIVE
+    ptr = ecb_i2a_x10 (ptr, u); // x10 fully covers 31 bit
+  #else
+    ptr = ecb_i2a_u32 (ptr, u);
+  #endif
+
+  return ptr;
+}
+
+ecb_inline char *
+ecb_i2a_u64 (char *ptr, uint64_t u)
+{
+  #if ECB_64BIT_NATIVE
+    if (ecb_expect_true (u <= ECB_I2A_MAX_X10))
+      ptr = ecb_i2a_x10 (ptr, u);
+    else if (ecb_expect_false (u <= ECB_I2A_MAX_X10 * 1000000000))
+      {
+        uint64_t u1 = u % 1000000000;
+        uint64_t u2 = u / 1000000000;
+
+        ptr = ecb_i2a_x10 (ptr, u2);
+        ptr = ecb_i2a_09 (ptr, u1);
+      }
+    else
+      {
+        uint64_t u1 = u % 1000000000;
+        uint64_t ua = u / 1000000000;
+        uint64_t u2 = ua % 1000000000;
+        uint64_t u3 = ua / 1000000000;
+
+        ptr = ecb_i2a_2 (ptr, u3);
+        ptr = ecb_i2a_09 (ptr, u2);
+        ptr = ecb_i2a_09 (ptr, u1);
+      }
+  #else
+    if (ecb_expect_true (u <= ECB_I2A_MAX_X5))
+      ptr = ecb_i2a_x5 (ptr, u);
+    else
+      {
+        uint64_t u1 = u % 10000;
+        uint64_t u2 = u / 10000;
+
+        ptr = ecb_i2a_u64 (ptr, u2);
+        ptr = ecb_i2a_04 (ptr, u1);
+      }
+  #endif
+
+  return ptr;
+}
+
+ecb_inline char *
+ecb_i2a_i64 (char *ptr, int64_t v)
+{
+  *ptr = '-'; ptr += v < 0;
+  uint64_t u = v < 0 ? -(uint64_t)v : v;
+
+  #if ECB_64BIT_NATIVE
+    if (ecb_expect_true (u <= ECB_I2A_MAX_X10))
+      ptr = ecb_i2a_x10 (ptr, u);
+    else if (ecb_expect_false (u <= ECB_I2A_MAX_X10 * 1000000000))
+      {
+        uint64_t u1 = u % 1000000000;
+        uint64_t u2 = u / 1000000000;
+
+        ptr = ecb_i2a_x10 (ptr, u2);
+        ptr = ecb_i2a_09 (ptr, u1);
+      }
+    else
+      {
+        uint64_t u1 = u % 1000000000;
+        uint64_t ua = u / 1000000000;
+        uint64_t u2 = ua % 1000000000;
+        uint64_t u3 = ua / 1000000000;
+
+        // 2**63 is 19 digits, so the top is exactly one digit
+        *ptr++ = u3 + '0';
+        ptr = ecb_i2a_09 (ptr, u2);
+        ptr = ecb_i2a_09 (ptr, u1);
+      }
+  #else
+    ptr = ecb_i2a_u64 (ptr, u);
+  #endif
+
+  return ptr;
+}
+
+/*******************************************************************************/
 /* floating point stuff, can be disabled by defining ECB_NO_LIBM */
 
 /* basically, everything uses "ieee pure-endian" floating point numbers */
@@ -914,7 +1177,6 @@
   || (defined __arm__ && (defined __ARM_EABI__ || defined __EABI__ || defined __VFP_FP__ || defined _WIN32_WCE || defined __ANDROID__)) \
   || defined __aarch64__
   #define ECB_STDFP 1
-  #include <string.h> /* for memcpy */
 #else
   #define ECB_STDFP 0
 #endif