--- libecb/ecb.h	2020/01/20 13:13:56	1.180
+++ libecb/ecb.h	2021/06/21 23:59:58	1.192
@@ -1,7 +1,7 @@
 /*
  * libecb - http://software.schmorp.de/pkg/libecb
  *
- * Copyright (©) 2009-2015 Marc Alexander Lehmann <libecb@schmorp.de>
+ * Copyright (©) 2009-2015,2018-2021 Marc Alexander Lehmann <libecb@schmorp.de>
  * Copyright (©) 2011 Emanuele Giaquinta
  * All rights reserved.
  *
@@ -42,9 +42,11 @@
 #define ECB_H
 
 /* 16 bits major, 16 bits minor */
-#define ECB_VERSION 0x00010008
+#define ECB_VERSION 0x00010009
 
-#ifdef _WIN32
+#include <string.h> /* for memcpy */
+
+#if defined (_WIN32) && !defined (__MINGW32__)
   typedef   signed char   int8_t;
   typedef unsigned char  uint8_t;
   typedef   signed char   int_fast8_t;
@@ -104,6 +106,12 @@
   #endif
 #endif
 
+#if ECB_PTRSIZE >= 8 || ECB_AMD64_X32
+  #define ECB_64BIT_NATIVE 1 /* the ecb_i2a_* functions then take their uint64_t-based code paths */
+#else
+  #define ECB_64BIT_NATIVE 0 /* the ecb_i2a_* functions then work in chunks of at most five digits */
+#endif
+
 /* many compilers define _GNUC_ to some versions but then only implement
  * what their idiot authors think are the "more important" extensions,
  * causing enormous grief in return for some better fake benchmark numbers.
@@ -242,6 +250,7 @@
     #define ECB_MEMORY_FENCE         __atomic_thread_fence (__ATOMIC_SEQ_CST)
     #define ECB_MEMORY_FENCE_ACQUIRE __atomic_thread_fence (__ATOMIC_ACQUIRE)
     #define ECB_MEMORY_FENCE_RELEASE __atomic_thread_fence (__ATOMIC_RELEASE)
+    #undef ECB_MEMORY_FENCE_RELAXED
     #define ECB_MEMORY_FENCE_RELAXED __atomic_thread_fence (__ATOMIC_RELAXED)
 
   #elif ECB_CLANG_EXTENSION(c_atomic)
@@ -249,6 +258,7 @@
     #define ECB_MEMORY_FENCE         __c11_atomic_thread_fence (__ATOMIC_SEQ_CST)
     #define ECB_MEMORY_FENCE_ACQUIRE __c11_atomic_thread_fence (__ATOMIC_ACQUIRE)
     #define ECB_MEMORY_FENCE_RELEASE __c11_atomic_thread_fence (__ATOMIC_RELEASE)
+    #undef ECB_MEMORY_FENCE_RELAXED
     #define ECB_MEMORY_FENCE_RELAXED __c11_atomic_thread_fence (__ATOMIC_RELAXED)
 
   #elif ECB_GCC_VERSION(4,4) || defined __INTEL_COMPILER || defined __clang__
@@ -610,6 +620,44 @@
 ecb_inline ecb_const uint64_t ecb_rotl64 (uint64_t x, unsigned int count) { return (x >> (64 - count)) | (x << count); }
 ecb_inline ecb_const uint64_t ecb_rotr64 (uint64_t x, unsigned int count) { return (x << (64 - count)) | (x >> count); }
 
+#if ECB_CPP
+/* C++ only: overloads that select the sized ecb_*8/16/32/64 helper from the argument type (there is no 64 bit ecb_bitrev) */
+inline uint8_t  ecb_ctz (uint8_t  v) { return ecb_ctz32 (v); }
+inline uint16_t ecb_ctz (uint16_t v) { return ecb_ctz32 (v); }
+inline uint32_t ecb_ctz (uint32_t v) { return ecb_ctz32 (v); }
+inline uint64_t ecb_ctz (uint64_t v) { return ecb_ctz64 (v); }
+
+inline bool ecb_is_pot (uint8_t  v) { return ecb_is_pot32 (v); }
+inline bool ecb_is_pot (uint16_t v) { return ecb_is_pot32 (v); }
+inline bool ecb_is_pot (uint32_t v) { return ecb_is_pot32 (v); }
+inline bool ecb_is_pot (uint64_t v) { return ecb_is_pot64 (v); }
+
+inline int ecb_ld (uint8_t  v) { return ecb_ld32 (v); }
+inline int ecb_ld (uint16_t v) { return ecb_ld32 (v); }
+inline int ecb_ld (uint32_t v) { return ecb_ld32 (v); }
+inline int ecb_ld (uint64_t v) { return ecb_ld64 (v); }
+
+inline int ecb_popcount (uint8_t  v) { return ecb_popcount32 (v); }
+inline int ecb_popcount (uint16_t v) { return ecb_popcount32 (v); }
+inline int ecb_popcount (uint32_t v) { return ecb_popcount32 (v); }
+inline int ecb_popcount (uint64_t v) { return ecb_popcount64 (v); }
+
+inline uint8_t  ecb_bitrev (uint8_t  v) { return ecb_bitrev8  (v); }
+inline uint16_t ecb_bitrev (uint16_t v) { return ecb_bitrev16 (v); }
+inline uint32_t ecb_bitrev (uint32_t v) { return ecb_bitrev32 (v); }
+
+inline uint8_t  ecb_rotl (uint8_t  v, unsigned int count) { return ecb_rotl8  (v, count); }
+inline uint16_t ecb_rotl (uint16_t v, unsigned int count) { return ecb_rotl16 (v, count); }
+inline uint32_t ecb_rotl (uint32_t v, unsigned int count) { return ecb_rotl32 (v, count); }
+inline uint64_t ecb_rotl (uint64_t v, unsigned int count) { return ecb_rotl64 (v, count); }
+
+inline uint8_t  ecb_rotr (uint8_t  v, unsigned int count) { return ecb_rotr8  (v, count); }
+inline uint16_t ecb_rotr (uint16_t v, unsigned int count) { return ecb_rotr16 (v, count); }
+inline uint32_t ecb_rotr (uint32_t v, unsigned int count) { return ecb_rotr32 (v, count); }
+inline uint64_t ecb_rotr (uint64_t v, unsigned int count) { return ecb_rotr64 (v, count); }
+
+#endif
+
 #if ECB_GCC_VERSION(4,3) || (ECB_CLANG_BUILTIN(__builtin_bswap32) && ECB_CLANG_BUILTIN(__builtin_bswap64))
   #if ECB_GCC_VERSION(4,8) || ECB_CLANG_BUILTIN(__builtin_bswap16)
   #define ecb_bswap16(x)  __builtin_bswap16 (x)
@@ -733,7 +781,7 @@
 ecb_inline void ecb_poke_le_u32_u (void *ptr, uint_fast32_t v) { ecb_poke_u32_u (ptr, ecb_host_to_le_u32 (v)); }
 ecb_inline void ecb_poke_le_u64_u (void *ptr, uint_fast64_t v) { ecb_poke_u64_u (ptr, ecb_host_to_le_u64 (v)); }
 
-#ifdef __cplusplus
+#if ECB_CPP
 
 inline uint8_t  ecb_bswap (uint8_t  v) { return v; }
 inline uint16_t ecb_bswap (uint16_t v) { return ecb_bswap16 (v); }
@@ -745,7 +793,7 @@
 template<typename T> inline T ecb_peek       (const void *ptr) { return *(const T *)ptr; }
 template<typename T> inline T ecb_peek_be    (const void *ptr) { return ecb_be_to_host (ecb_peek  <T> (ptr)); }
 template<typename T> inline T ecb_peek_le    (const void *ptr) { return ecb_le_to_host (ecb_peek  <T> (ptr)); }
-template<typename T> inline T ecb_peek_u     (const void *ptr) { T v; std::memcpy (&v, ptr, sizeof (v)); return v; }
+template<typename T> inline T ecb_peek_u     (const void *ptr) { T v; memcpy (&v, ptr, sizeof (v)); return v; }
 template<typename T> inline T ecb_peek_be_u  (const void *ptr) { return ecb_be_to_host (ecb_peek_u<T> (ptr)); }
 template<typename T> inline T ecb_peek_le_u  (const void *ptr) { return ecb_le_to_host (ecb_peek_u<T> (ptr)); }
 
@@ -754,15 +802,17 @@
 template<typename T> inline void ecb_poke      (void *ptr, T v) { *(T *)ptr = v; }
 template<typename T> inline void ecb_poke_be   (void *ptr, T v) { return ecb_poke  <T> (ptr, ecb_host_to_be (v)); }
 template<typename T> inline void ecb_poke_le   (void *ptr, T v) { return ecb_poke  <T> (ptr, ecb_host_to_le (v)); }
-template<typename T> inline void ecb_poke_u    (void *ptr, T v) { std::memcpy (ptr, &v, sizeof (v)); }
+template<typename T> inline void ecb_poke_u    (void *ptr, T v) { memcpy (ptr, &v, sizeof (v)); }
 template<typename T> inline void ecb_poke_be_u (void *ptr, T v) { return ecb_poke_u<T> (ptr, ecb_host_to_be (v)); }
 template<typename T> inline void ecb_poke_le_u (void *ptr, T v) { return ecb_poke_u<T> (ptr, ecb_host_to_le (v)); }
 
 #endif
 
 /*****************************************************************************/
+/* division */
 
 #if ECB_GCC_VERSION(3,0) || ECB_C99
+  /* C99 tightened the definition of %, so we can use a more efficient version */
   #define ecb_mod(m,n) ((m) % (n) + ((m) % (n) < 0 ? (n) : 0))
 #else
   #define ecb_mod(m,n) ((m) < 0 ? ((n) - 1 - ((-1 - (m)) % (n))) : ((m) % (n)))
@@ -784,6 +834,9 @@
   #define ecb_div_ru(val,div) ((val) < 0 ? - ((-(val)            ) / (div)) : ((val) + (div) - 1) / (div))
 #endif
 
+/*****************************************************************************/
+/* array length */
+
 #if ecb_cplusplus_does_not_suck
   /* does not work for local types (http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2657.htm) */
   template<typename T, int N>
@@ -796,6 +849,7 @@
 #endif
 
 /*****************************************************************************/
+/* IEEE 754-2008 half float conversions */
 
 ecb_function_ ecb_const uint32_t ecb_binary16_to_binary32 (uint32_t x);
 ecb_function_ ecb_const uint32_t
@@ -834,7 +888,7 @@
 ecb_binary32_to_binary16 (uint32_t x)
 {
   unsigned int s =  (x >> 16) & 0x00008000; /* sign bit, the easy part */
-  unsigned int e = ((x >> 23) & 0x000000ff) - (127 - 15); /* the desired exponent */
+  int          e = ((x >> 23) & 0x000000ff) - (127 - 15); /* the desired exponent */
   unsigned int m =   x        & 0x007fffff;
 
   x &= 0x7fffffff;
@@ -894,6 +948,215 @@
 }
 
 /*******************************************************************************/
+/* fast integer to ascii */
+
+// simply return a mask of type "type" with the lowest "bits" bits set
+#define ecb_i2a_mask(type,bits) ((((type)1) << (bits)) - 1)
+
+// output a single digit. maskvalue is 10**digitidx. uses ptr, x and nz from the enclosing scope
+#define ecb_i2a_digit(type,bits,digitmask,maskvalue,digitidx) \
+  if (digitmask >= maskvalue) /* constant, used to decide how many digits to generate */ \
+    { \
+      char digit = x >> (bits - digitidx); /* calculate the topmost digit */ \
+      *ptr = digit + '0'; /* output it */ \
+      nz = (digitmask == maskvalue) || nz || digit; /* first term == always output last digit */ \
+      ptr += nz; /* output digit only if non-zero digit seen */ \
+      x = (x & ecb_i2a_mask (type, bits - digitidx)) * 5; /* *10, but shift decimal point right */ \
+    }
+
+// convert integer to fixed point format and multiply out digits, highest first
+// requires magic constants: max. digits and number of bits after the decimal point ("v" is unused, the value argument is named u)
+#define ecb_i2a_def(suffix,ptr,v,type,bits,digitmask,lz) \
+ecb_inline char *ecb_i2a_ ## suffix (char *ptr, uint32_t u) \
+{ \
+  char nz = lz; /* non-zero digit seen? */ \
+  /* convert to x.bits fixed-point */ \
+  type x = u * ((ecb_i2a_mask (type, bits) + digitmask) / digitmask); \
+  /* output up to 10 digits, steps with maskvalue > digitmask are skipped by their constant condition */ \
+  ecb_i2a_digit (type,bits,digitmask,          1, 0); \
+  ecb_i2a_digit (type,bits,digitmask,         10, 1); \
+  ecb_i2a_digit (type,bits,digitmask,        100, 2); \
+  ecb_i2a_digit (type,bits,digitmask,       1000, 3); \
+  ecb_i2a_digit (type,bits,digitmask,      10000, 4); \
+  ecb_i2a_digit (type,bits,digitmask,     100000, 5); \
+  ecb_i2a_digit (type,bits,digitmask,    1000000, 6); \
+  ecb_i2a_digit (type,bits,digitmask,   10000000, 7); \
+  ecb_i2a_digit (type,bits,digitmask,  100000000, 8); \
+  ecb_i2a_digit (type,bits,digitmask, 1000000000, 9); \
+  return ptr; \
+}
+
+// predefined versions of the above, for various digits
+// ecb_i2a_xN = almost N digits, callers below only pass values <= ECB_I2A_MAX_XN
+// ecb_i2a_N = up to N digits, leading zeroes suppressed
+// ecb_i2a_0N = exactly N digits, including leading zeroes
+
+// non-leading-zero versions, limited range
+#define ECB_I2A_MAX_X5       59074 // limit for ecb_i2a_x5
+#define ECB_I2A_MAX_X10 2932500665 // limit for ecb_i2a_x10
+ecb_i2a_def ( x5, ptr, v, uint32_t, 26,      10000, 0)
+ecb_i2a_def (x10, ptr, v, uint64_t, 60, 1000000000, 0)
+
+// non-leading zero versions, all digits, 4 and 9 are optimal for 32/64 bit
+ecb_i2a_def ( 2, ptr, v, uint32_t, 10,         10, 0)
+ecb_i2a_def ( 3, ptr, v, uint32_t, 12,        100, 0)
+ecb_i2a_def ( 4, ptr, v, uint32_t, 26,       1000, 0)
+ecb_i2a_def ( 5, ptr, v, uint64_t, 30,      10000, 0)
+ecb_i2a_def ( 6, ptr, v, uint64_t, 36,     100000, 0)
+ecb_i2a_def ( 7, ptr, v, uint64_t, 44,    1000000, 0)
+ecb_i2a_def ( 8, ptr, v, uint64_t, 50,   10000000, 0)
+ecb_i2a_def ( 9, ptr, v, uint64_t, 56,  100000000, 0)
+
+// leading-zero versions, all digits, 04 and 09 are optimal for 32/64 bit
+ecb_i2a_def (02, ptr, v, uint32_t, 10,         10, 1)
+ecb_i2a_def (03, ptr, v, uint32_t, 12,        100, 1)
+ecb_i2a_def (04, ptr, v, uint32_t, 26,       1000, 1)
+ecb_i2a_def (05, ptr, v, uint64_t, 30,      10000, 1)
+ecb_i2a_def (06, ptr, v, uint64_t, 36,     100000, 1)
+ecb_i2a_def (07, ptr, v, uint64_t, 44,    1000000, 1)
+ecb_i2a_def (08, ptr, v, uint64_t, 50,   10000000, 1)
+ecb_i2a_def (09, ptr, v, uint64_t, 56,  100000000, 1)
+
+#define ECB_I2A_I32_DIGITS 11 /* "-2147483648": sign plus 10 digits */
+#define ECB_I2A_U32_DIGITS 10 /* "4294967295" */
+#define ECB_I2A_I64_DIGITS 20 /* "-9223372036854775808": sign plus 19 digits */
+#define ECB_I2A_U64_DIGITS 21 /* 2**64-1 has 20 digits, so this leaves one to spare */
+#define ECB_I2A_DIGITS     21 /* the largest of the above */
+
+ecb_inline char *
+ecb_i2a_u32 (char *ptr, uint32_t u)
+{ /* writes u in decimal at ptr (no leading zeroes, no NUL added), returns the position after the last digit */
+  #if ECB_64BIT_NATIVE /* a single uint64_t-based conversion covers most of the range */
+    if (ecb_expect_true (u <= ECB_I2A_MAX_X10))
+      ptr = ecb_i2a_x10 (ptr, u);
+    else // x10 almost, but not fully, covers 32 bit
+      {
+        uint32_t u1 = u % 1000000000;
+        uint32_t u2 = u / 1000000000;
+
+        *ptr++ = u2 + '0'; /* u > ECB_I2A_MAX_X10 here, so u2 is 2..4, a single digit */
+        ptr = ecb_i2a_09 (ptr, u1);
+      }
+  #else /* assemble the result from uint32_t-based pieces of at most five digits */
+    if (ecb_expect_true (u <= ECB_I2A_MAX_X5))
+      ptr = ecb_i2a_x5 (ptr, u); /* keep the advanced pointer, it is the return value */
+    else if (ecb_expect_true (u <= ECB_I2A_MAX_X5 * 10000))
+      {
+        uint32_t u1 = u % 10000;
+        uint32_t u2 = u / 10000;
+
+        ptr = ecb_i2a_x5 (ptr, u2);
+        ptr = ecb_i2a_04 (ptr, u1);
+      }
+    else
+      {
+        uint32_t u1 = u  % 10000;
+        uint32_t ua = u  / 10000;
+        uint32_t u2 = ua % 10000;
+        uint32_t u3 = ua / 10000;
+
+        ptr = ecb_i2a_2  (ptr, u3); /* u3 = u / 10**8, at most 42 */
+        ptr = ecb_i2a_04 (ptr, u2);
+        ptr = ecb_i2a_04 (ptr, u1);
+      }
+  #endif
+
+  return ptr;
+}
+
+ecb_inline char *
+ecb_i2a_i32 (char *ptr, int32_t v)
+{ /* writes v in decimal at ptr, with a leading '-' if negative; returns the position after the last digit */
+  *ptr = '-'; ptr += v < 0; /* the sign is always stored, but only kept (ptr advanced) for negative v */
+  uint32_t u = v < 0 ? -(uint32_t)v : v; /* magnitude, negated in unsigned arithmetic */
+
+  #if ECB_64BIT_NATIVE
+    ptr = ecb_i2a_x10 (ptr, u); // x10 fully covers 31 bit
+  #else
+    ptr = ecb_i2a_u32 (ptr, u);
+  #endif
+
+  return ptr;
+}
+
+ecb_inline char *
+ecb_i2a_u64 (char *ptr, uint64_t u)
+{ /* writes u in decimal at ptr (no leading zeroes), returns the position after the last digit */
+  #if ECB_64BIT_NATIVE /* split into groups of nine digits, each converted with uint64_t arithmetic */
+    if (ecb_expect_true (u <= ECB_I2A_MAX_X10))
+      ptr = ecb_i2a_x10 (ptr, u);
+    else if (ecb_expect_false (u <= ECB_I2A_MAX_X10 * 1000000000))
+      {
+        uint64_t u1 = u % 1000000000;
+        uint64_t u2 = u / 1000000000; /* <= ECB_I2A_MAX_X10 because of the branch condition */
+
+        ptr = ecb_i2a_x10 (ptr, u2);
+        ptr = ecb_i2a_09  (ptr, u1);
+      }
+    else
+      {
+        uint64_t u1 = u  % 1000000000;
+        uint64_t ua = u  / 1000000000;
+        uint64_t u2 = ua % 1000000000;
+        uint64_t u3 = ua / 1000000000; /* u / 10**18, at most 18 */
+
+        ptr = ecb_i2a_2  (ptr, u3);
+        ptr = ecb_i2a_09 (ptr, u2);
+        ptr = ecb_i2a_09 (ptr, u1);
+      }
+  #else /* split off the lowest four digits and convert the remaining upper part first, recursively */
+    if (ecb_expect_true (u <= ECB_I2A_MAX_X5))
+      ptr = ecb_i2a_x5 (ptr, u);
+    else
+      {
+        uint64_t u1 = u % 10000;
+        uint64_t u2 = u / 10000;
+
+        ptr = ecb_i2a_u64 (ptr, u2);
+        ptr = ecb_i2a_04 (ptr, u1);
+      }
+  #endif
+
+  return ptr;
+}
+
+ecb_inline char *
+ecb_i2a_i64 (char *ptr, int64_t v)
+{ /* writes v in decimal at ptr, with a leading '-' if negative; returns the position after the last digit */
+  *ptr = '-'; ptr += v < 0; /* the sign is always stored, but only kept (ptr advanced) for negative v */
+  uint64_t u = v < 0 ? -(uint64_t)v : v; /* magnitude, negated in unsigned arithmetic */
+
+  #if ECB_64BIT_NATIVE
+    if (ecb_expect_true (u <= ECB_I2A_MAX_X10))
+      ptr = ecb_i2a_x10 (ptr, u);
+    else if (ecb_expect_false (u <= ECB_I2A_MAX_X10 * 1000000000))
+      {
+        uint64_t u1 = u % 1000000000;
+        uint64_t u2 = u / 1000000000;
+
+        ptr = ecb_i2a_x10 (ptr, u2);
+        ptr = ecb_i2a_09  (ptr, u1);
+      }
+    else
+      {
+        uint64_t u1 = u  % 1000000000;
+        uint64_t ua = u  / 1000000000;
+        uint64_t u2 = ua % 1000000000;
+        uint64_t u3 = ua / 1000000000;
+
+        // u is at most 2**63, which has 19 digits, so the top is exactly one digit
+        *ptr++ = u3 + '0';
+        ptr = ecb_i2a_09 (ptr, u2);
+        ptr = ecb_i2a_09 (ptr, u1);
+      }
+  #else
+    ptr = ecb_i2a_u64 (ptr, u);
+  #endif
+
+  return ptr;
+}
+
+/*******************************************************************************/
 /* floating point stuff, can be disabled by defining ECB_NO_LIBM */
 
 /* basically, everything uses "ieee pure-endian" floating point numbers */
@@ -914,7 +1177,6 @@
     || (defined __arm__ && (defined __ARM_EABI__ || defined __EABI__ || defined __VFP_FP__ || defined _WIN32_WCE || defined __ANDROID__)) \
     || defined __aarch64__
   #define ECB_STDFP 1
-  #include <string.h> /* for memcpy */
 #else
   #define ECB_STDFP 0
 #endif