1 |
elmex |
1.1 |
/* |
2 |
root |
1.3 |
* shstr.C |
3 |
elmex |
1.1 |
*/ |
4 |
|
|
|
5 |
root |
1.3 |
#include <cstring> |
6 |
|
|
#include <cstdlib> |
7 |
|
|
|
8 |
|
|
#include <tr1/unordered_set> |
9 |
elmex |
1.1 |
|
10 |
|
|
#include "shstr.h" |
11 |
|
|
|
12 |
root |
1.5 |
// NOTE: even with lots of stuff loaded, we do not usually have >>20000 strings. |
13 |
|
|
// maybe refcounting is just overhead? |
14 |
|
|
|
15 |
root |
1.3 |
struct hash |
16 |
|
|
{ |
17 |
|
|
std::size_t operator ()(const char *s) const |
18 |
|
|
{ |
19 |
elmex |
1.1 |
unsigned long hash = 0; |
20 |
root |
1.3 |
unsigned int i = 0; |
21 |
elmex |
1.1 |
|
22 |
root |
1.3 |
/* use the one-at-a-time hash function, which supposedly is |
23 |
|
|
* better than the djb2-like one used by perl5.005, but |
24 |
|
|
* certainly is better then the bug used here before. |
25 |
|
|
* see http://burtleburtle.net/bob/hash/doobs.html |
26 |
elmex |
1.1 |
*/ |
27 |
root |
1.3 |
while (*s) |
28 |
|
|
{ |
29 |
|
|
hash += *s++; |
30 |
|
|
hash += hash << 10; |
31 |
|
|
hash ^= hash >> 6; |
32 |
|
|
} |
33 |
|
|
|
34 |
|
|
hash += hash << 3; |
35 |
|
|
hash ^= hash >> 11; |
36 |
|
|
hash += hash << 15; |
37 |
|
|
|
38 |
|
|
return hash; |
39 |
|
|
} |
40 |
|
|
}; |
41 |
elmex |
1.1 |
|
42 |
root |
1.3 |
struct equal |
43 |
|
|
{ |
44 |
|
|
bool operator ()(const char *a, const char *b) const |
45 |
|
|
{ |
46 |
|
|
return !strcmp (a, b); |
47 |
|
|
} |
48 |
|
|
}; |
49 |
elmex |
1.1 |
|
50 |
root |
1.3 |
typedef std::tr1::unordered_set<const char *, hash, equal> HT; |
51 |
elmex |
1.1 |
|
52 |
root |
1.3 |
static HT ht; |
53 |
elmex |
1.1 |
|
54 |
|
|
const char * |
55 |
root |
1.3 |
shstr::find (const char *s) |
56 |
|
|
{ |
57 |
root |
1.4 |
if (!s) |
58 |
|
|
return s; |
59 |
|
|
|
60 |
root |
1.3 |
HT::iterator i = ht.find (s); |
61 |
elmex |
1.1 |
|
62 |
root |
1.3 |
return i != ht.end () |
63 |
root |
1.5 |
? *i |
64 |
root |
1.3 |
: 0; |
65 |
elmex |
1.1 |
} |
66 |
|
|
|
67 |
|
|
const char * |
68 |
root |
1.3 |
shstr::intern (const char *s) |
69 |
|
|
{ |
70 |
root |
1.4 |
if (!s) |
71 |
|
|
return s; |
72 |
elmex |
1.1 |
|
73 |
root |
1.4 |
if (const char *found = find (s)) |
74 |
root |
1.5 |
{ |
75 |
|
|
++refcnt (found); |
76 |
|
|
return found; |
77 |
|
|
} |
78 |
elmex |
1.1 |
|
79 |
root |
1.3 |
int len = strlen (s); |
80 |
elmex |
1.1 |
|
81 |
root |
1.5 |
const char *v = (const char *)(2 + (int *)malloc (sizeof (int) * 2 + len + 1)); |
82 |
elmex |
1.1 |
|
83 |
root |
1.5 |
length (v) = len; |
84 |
|
|
refcnt (v) = 1; |
85 |
elmex |
1.1 |
|
86 |
root |
1.5 |
memcpy ((char *)v, s, len + 1); |
87 |
elmex |
1.1 |
|
88 |
root |
1.5 |
ht.insert (v); |
89 |
elmex |
1.1 |
|
90 |
root |
1.5 |
return v; |
91 |
elmex |
1.1 |
} |
92 |
|
|
|
93 |
root |
1.5 |
// periodically test refcounts == 0 for a few strings |
94 |
|
|
// this is the ONLY thing that erases stuff from ht. keep it that way. |
95 |
elmex |
1.1 |
void |
96 |
root |
1.3 |
shstr::gc () |
97 |
|
|
{ |
98 |
root |
1.5 |
static const char *curpos; |
99 |
|
|
|
100 |
|
|
HT::iterator i = curpos ? ht.find (curpos) : ht.begin (); |
101 |
|
|
|
102 |
|
|
if (i == ht.end ()) |
103 |
|
|
i = ht.begin (); |
104 |
|
|
|
105 |
|
|
// go through all strings roughly once every 4 minutes |
106 |
root |
1.6 |
int n = ht.size () / 256 + 16; |
107 |
|
|
|
108 |
|
|
for (;;) |
109 |
root |
1.5 |
{ |
110 |
|
|
if (i == ht.end ()) |
111 |
|
|
{ |
112 |
|
|
curpos = 0; |
113 |
|
|
return; |
114 |
|
|
} |
115 |
root |
1.6 |
else if (!--n) |
116 |
|
|
break; |
117 |
|
|
else if (!refcnt (*i)) |
118 |
root |
1.5 |
{ |
119 |
|
|
HT::iterator o = i++; |
120 |
|
|
const char *s = *o; |
121 |
|
|
ht.erase (o); |
122 |
|
|
|
123 |
|
|
//printf ("GC %4d %3d %d >%s<%d\n", (int)ht.size (), n, shstr::refcnt (s), s, shstr::length (s)); |
124 |
|
|
free (-2 + (int *)s); |
125 |
|
|
} |
126 |
|
|
else |
127 |
|
|
++i; |
128 |
|
|
} |
129 |
|
|
|
130 |
|
|
curpos = *i; |
131 |
elmex |
1.1 |
} |
132 |
|
|
|
133 |
root |
1.5 |
//TODO: this should of course not be here |
134 |
elmex |
1.1 |
/* buf_overflow() - we don't want to exceed the buffer size of |
135 |
|
|
* buf1 by adding on buf2! Returns true if overflow will occur. |
136 |
|
|
*/ |
137 |
|
|
|
138 |
|
|
int |
139 |
|
|
buf_overflow (const char *buf1, const char *buf2, int bufsize) |
140 |
|
|
{ |
141 |
root |
1.3 |
int len1 = 0, len2 = 0; |
142 |
elmex |
1.1 |
|
143 |
|
|
if (buf1) |
144 |
root |
1.2 |
len1 = strlen (buf1); |
145 |
elmex |
1.1 |
if (buf2) |
146 |
root |
1.2 |
len2 = strlen (buf2); |
147 |
elmex |
1.1 |
if ((len1 + len2) >= bufsize) |
148 |
root |
1.2 |
return 1; |
149 |
elmex |
1.1 |
return 0; |
150 |
|
|
} |
151 |
|
|
|