1 |
root |
1.1 |
/* |
2 |
|
|
* Author: Marc A. Lehmann <xsthreadpool@schmorp.de> |
3 |
|
|
* License: public domain, or where this is not possible/at your option, |
4 |
|
|
* CC0 (https://creativecommons.org/publicdomain/zero/1.0/) |
5 |
|
|
*/ |
6 |
|
|
|
7 |
|
|
#ifndef PERL_MULTICORE_H |
8 |
|
|
#define PERL_MULTICORE_H |
9 |
|
|
|
10 |
|
|
/* |
11 |
|
|
|
12 |
|
|
=head1 NAME |
13 |
|
|
|
14 |
|
|
perlmulticore.h - the Perl Multicore Specification and Implementation |
15 |
|
|
|
16 |
|
|
=head1 SYNOPSIS |
17 |
|
|
|
18 |
|
|
#include "perlmulticore.h" |
19 |
|
|
|
20 |
|
|
// in your XS function: |
21 |
|
|
|
22 |
|
|
perlinterp_release (); |
23 |
|
|
do_the_C_thing (); |
24 |
|
|
perlinterp_acquire (); |
25 |
|
|
|
26 |
|
|
=head1 DESCRIPTION |
27 |
|
|
|
28 |
|
|
This header file implements a simple mechanism for XS modules to allow |
29 |
|
|
re-use of the perl interpreter for other threads while doing some lengthy |
30 |
|
|
operation, such as cryptography, SQL queries, disk I/O and so on. |
31 |
|
|
|
32 |
|
|
The design goals for this mechanism were to be simple to use, very |
33 |
|
|
efficient when not needed, low code and data size overhead and broad |
34 |
|
|
applicability. |
35 |
|
|
|
36 |
|
|
The newest version of this document can be found at |
37 |
|
|
L<http://pod.tst.eu/http://cvs.schmorp.de/Coro-Multicore/perlmulticore.h>. |
38 |
|
|
|
39 |
|
|
The newest version of the header file itself, which |
40 |
|
|
includes this documentation, can be downloaded from |
41 |
|
|
L<http://cvs.schmorp.de/Coro-Multicore/perlmulticore.h>. |
42 |
|
|
|
43 |
|
|
=head1 HOW DO I USE THIS IN MY MODULES? |
44 |
|
|
|
45 |
|
|
The usage is very simple - you include this header file in your XS module. Then, before you |
46 |
|
|
do your lengthy operation, you release the perl interpreter: |
47 |
|
|
|
48 |
|
|
perlinterp_release (); |
49 |
|
|
|
50 |
|
|
And when you are done with your computation, you acquire it again: |
51 |
|
|
|
52 |
|
|
perlinterp_acquire (); |
53 |
|
|
|
54 |
|
|
And that's it. This doesn't load any modules and consists of only a few |
55 |
|
|
machine instructions when no module to take advantage of it is loaded. |
56 |
|
|
|
57 |
|
|
Here is a simple example, an C<flock> wrapper implemented in XS. Unlike |
58 |
|
|
perl's built-in C<flock>, it allows other threads (for example, those |
59 |
|
|
provided by L<Coro>) to execute, instead of blocking the whole perl |
60 |
|
|
interpreter. For the sake of this example, it requires a file descriptor |
61 |
|
|
instead of a handle. |
62 |
|
|
|
63 |
|
|
#include "perlmulticore.h" // this header file |
64 |
|
|
|
65 |
|
|
// and in the XS portion |
66 |
|
|
int flock (int fd, int operation) |
67 |
|
|
CODE: |
68 |
|
|
perlinterp_release (); |
69 |
|
|
RETVAL = flock (fd, operation); |
70 |
|
|
perlinterp_acquire (); |
71 |
|
|
OUTPUT: |
72 |
|
|
RETVAL |
73 |
|
|
|
74 |
|
|
Another example would be to modify L<DBD::mysql> to allow other |
75 |
|
|
threads to execute while executing SQL queries. One way to do this |
76 |
|
|
is to find all C<mysql_st_internal_execute> and similar calls (such as |
77 |
|
|
C<mysql_st_internal_execute41>), and adorn them with release/acquire |
78 |
|
|
calls: |
79 |
|
|
|
80 |
|
|
{ |
81 |
|
|
perlinterp_release (); |
82 |
|
|
imp_sth->row_num= mysql_st_internal_execute(sth, ...); |
83 |
|
|
perlinterp_acquire (); |
84 |
|
|
} |
85 |
|
|
|
86 |
|
|
=head2 HOW ABOUT NOT-SO LONG WORK? |
87 |
|
|
|
88 |
|
|
Sometimes you don't know how long your code will take - in a compression |
89 |
|
|
library for example, compressing a few hundred kilobytes of data can take |
90 |
|
|
a while, while 50 Bytes will compress so fast that even attempting to do |
91 |
|
|
something else could be more costly than just doing it. |
92 |
|
|
|
93 |
|
|
This is a very hard problem to solve. The best you can do at the moment is |
94 |
|
|
to release the perl interpreter only when you think the work to be done |
95 |
|
|
justifies the expense. |
96 |
|
|
|
97 |
|
|
As a rule of thumb, if you expect to need more than a few thousand cycles, |
98 |
|
|
you should release the interpreter, else you shouldn't. When in doubt, |
99 |
|
|
release. |
100 |
|
|
|
101 |
|
|
For example, in a compression library, you might want to do this: |
102 |
|
|
|
103 |
|
|
if (bytes_to_be_compressed > 2000) perlinterp_release (); |
104 |
|
|
do_compress (...); |
105 |
|
|
if (bytes_to_be_compressed > 2000) perlinterp_acquire (); |
106 |
|
|
|
107 |
|
|
Make sure the if conditions are exactly the same and don't change, so you |
108 |
|
|
always call acquire when you release, and vice versa. |
109 |
|
|
|
110 |
|
|
When you don't have a handy indicator, you might still do something |
111 |
|
|
useful. For example, if you do some file locking with C<fcntl> and you |
112 |
|
|
expect the lock to be available immediately in most cases, you could try |
113 |
|
|
with C<F_SETLK> (which doesn't wait), and only release/wait/acquire when |
114 |
|
|
the lock couldn't be set: |
115 |
|
|
|
116 |
|
|
int res = fcntl (fd, F_SETLK, &flock); |
117 |
|
|
|
118 |
|
|
if (res) |
119 |
|
|
{ |
120 |
|
|
// error, assume lock is held by another process and do it the slow way |
121 |
|
|
perlinterp_release (); |
122 |
|
|
res = fcntl (fd, F_SETLKW, &flock); |
123 |
|
|
perlinterp_acquire (); |
124 |
|
|
} |
125 |
|
|
|
126 |
|
|
=head1 THE HARD AND FAST RULES |
127 |
|
|
|
128 |
|
|
As with everything, there are a number of rules to follow. |
129 |
|
|
|
130 |
|
|
=over 4 |
131 |
|
|
|
132 |
|
|
=item I<Never> touch any perl data structures after calling C<perlinterp_release>. |
133 |
|
|
|
134 |
|
|
Possibly the most important rule of them all, anything perl is |
135 |
|
|
completely off-limits after C<perlinterp_release>, until you call |
136 |
|
|
C<perlinterp_acquire>, after which you can access perl stuff again. |
137 |
|
|
|
138 |
|
|
That includes anything in the perl interpreter that you didn't prove to be |
139 |
|
|
safe, and didn't prove to be safe in older and future versions of perl: |
140 |
|
|
global variables, local perl scalars, even if you are sure nobody accesses |
141 |
|
|
them and you only try to "read" their value, and so on. |
142 |
|
|
|
143 |
|
|
If you need to access perl things, do it before releasing the |
144 |
|
|
interpreter with C<perlinterp_release>, or after acquiring it again with |
145 |
|
|
C<perlinterp_acquire>. |
146 |
|
|
|
147 |
|
|
=item I<Always> call C<perlinterp_release> and C<perlinterp_acquire> in pairs. |
148 |
|
|
|
149 |
|
|
For each C<perlinterp_release> call there must be a C<perlinterp_acquire> |
150 |
|
|
call. They don't have to be in the same function, and you can have |
151 |
|
|
multiple calls to them, as long as every C<perlinterp_release> call is |
152 |
|
|
followed by exactly one C<perlinterp_acquire> call. |
153 |
|
|
|
154 |
|
|
For example, this would be fine: |
155 |
|
|
|
156 |
|
|
perlinterp_release (); |
157 |
|
|
|
158 |
|
|
if (!function_that_fails_with_0_return_value ()) |
159 |
|
|
{ |
160 |
|
|
perlinterp_acquire (); |
161 |
|
|
croak ("error"); |
162 |
|
|
// croak doesn't return |
163 |
|
|
} |
164 |
|
|
|
165 |
|
|
perlinterp_acquire (); |
166 |
|
|
// do other stuff |
167 |
|
|
|
168 |
|
|
=item I<Never> nest calls to C<perlinterp_release> and C<perlinterp_acquire>. |
169 |
|
|
|
170 |
|
|
That simply means that after calling C<perlinterp_release>, you must |
171 |
|
|
call C<perlinterp_acquire> before calling C<perlinterp_release> |
172 |
|
|
again. Likewise, after C<perlinterp_acquire>, you can call |
173 |
|
|
C<perlinterp_release> but not another C<perlinterp_acquire>. |
174 |
|
|
|
175 |
|
|
=item I<Always> call C<perlinterp_release> first. |
176 |
|
|
|
177 |
|
|
Also simple: you I<must not> call C<perlinterp_acquire> without having |
178 |
|
|
called C<perlinterp_release> before. |
179 |
|
|
|
180 |
|
|
=item I<Never> underestimate threads. |
181 |
|
|
|
182 |
|
|
While it's easy to add parallel execution ability to your XS module, it |
183 |
|
|
doesn't mean it is safe. After you release the perl interpreter, it's |
184 |
|
|
perfectly possible that it will call your XS function in another thread, |
185 |
|
|
even while your original function still executes. In other words: your C |
186 |
|
|
code must be thread safe, and if you use any library, that library must be |
187 |
|
|
thread-safe, too. |
188 |
|
|
|
189 |
|
|
Always assume that the code between C<perlinterp_release> and |
190 |
|
|
C<perlinterp_acquire> is executed in parallel on multiple CPUs at the same |
191 |
|
|
time. If your code can't cope with that, you could consider using a mutex |
192 |
|
|
to only allow one such execution, which is still better than blocking |
193 |
|
|
everybody else from doing anything: |
194 |
|
|
|
195 |
|
|
static pthread_mutex_t my_mutex = PTHREAD_MUTEX_INITIALIZER; |
196 |
|
|
|
197 |
|
|
perlinterp_release (); |
198 |
|
|
pthread_mutex_lock (&my_mutex); |
199 |
|
|
do_your_non_thread_safe_thing (); |
200 |
|
|
pthread_mutex_unlock (&my_mutex); |
201 |
|
|
perlinterp_acquire (); |
202 |
|
|
|
203 |
|
|
=item I<Don't> get confused by having to release first. |
204 |
|
|
|
205 |
|
|
In many real world scenarios, you acquire a resource, do something, then |
206 |
|
|
release it again. Don't let this confuse you, with this, you already own |
207 |
|
|
the resource (the perl interpreter) so you have to I<release> first, and |
208 |
|
|
I<acquire> it again later, not the other way around. |
209 |
|
|
|
210 |
|
|
=back |
211 |
|
|
|
212 |
|
|
|
213 |
|
|
=head1 DESIGN PRINCIPLES |
214 |
|
|
|
215 |
|
|
This section discusses how the design goals were reached (you be the |
216 |
|
|
judge), how it is implemented, and what overheads this implies. |
217 |
|
|
|
218 |
|
|
=over 4 |
219 |
|
|
|
220 |
|
|
=item Simple to Use |
221 |
|
|
|
222 |
|
|
All you have to do is identify the place in your existing code where you |
223 |
|
|
stop touching perl stuff, do your actual work, and start touching perl |
224 |
|
|
stuff again. |
225 |
|
|
|
226 |
|
|
Then slap C<perlinterp_release ()> and C<perlinterp_acquire ()> around the |
227 |
|
|
actual work code. |
228 |
|
|
|
229 |
|
|
You have to include F<perlmulticore.h> and distribute it with your XS |
230 |
|
|
code, but all these things border on the trivial. |
231 |
|
|
|
232 |
|
|
=item Very Efficient |
233 |
|
|
|
234 |
|
|
The definition for C<perlinterp_release> and C<perlinterp_acquire> is very |
235 |
|
|
short: |
236 |
|
|
|
237 |
|
|
#define perlinterp_release() perl_multicore_api->pmapi_release () |
238 |
|
|
#define perlinterp_acquire() perl_multicore_api->pmapi_acquire () |
239 |
|
|
|
240 |
|
|
Both are macros that read a pointer from memory (perl_multicore_api), |
241 |
|
|
dereference a function pointer stored at that place, and call the |
242 |
|
|
function, which takes no arguments and returns nothing. |
243 |
|
|
|
244 |
|
|
The first call to C<perlinterp_release> will check for the presence |
245 |
|
|
of any supporting module, and if none is loaded, will create a dummy |
246 |
|
|
implementation where both C<pmapi_release> and C<pmapi_acquire> execute |
247 |
|
|
this function: |
248 |
|
|
|
249 |
|
|
static void perl_multicore_nop (void) { } |
250 |
|
|
|
251 |
|
|
So in the case of no magical module being loaded, all calls except the |
252 |
|
|
first are two memory accesses and a predictable function call of an empty |
253 |
|
|
function. |
254 |
|
|
|
255 |
|
|
Of course, the overhead is much higher when these functions actually |
256 |
|
|
implement anything useful, but you always get what you pay for. |
257 |
|
|
|
258 |
|
|
With L<Coro::Multicore>, every release/acquire involves two pthread |
259 |
|
|
switches, two coro thread switches, a bunch of syscalls, and sometimes |
260 |
|
|
interacting with the event loop. |
261 |
|
|
|
262 |
|
|
A dedicated thread pool such as the one L<IO::AIO> uses could reduce |
263 |
|
|
these overheads, and would also reduce the dependencies (L<AnyEvent> is a |
264 |
|
|
smaller and more portable dependency than L<Coro>), but it would require a |
265 |
|
|
lot more work on the side of the module author wanting to support it than |
266 |
|
|
this solution. |
267 |
|
|
|
268 |
|
|
=item Low Code and Data Size Overhead |
269 |
|
|
|
270 |
|
|
On a 64 bit system, F<perlmulticore.h> uses exactly C<8> octets (one |
271 |
|
|
pointer) of your data segment, to store the C<perl_multicore_api> |
272 |
|
|
pointer. In addition it creates a C<16> octet perl string to store the |
273 |
|
|
function pointers in, and stores it in a hash provided by perl for this |
274 |
|
|
purpose. |
275 |
|
|
|
276 |
|
|
This is pretty much the equivalent of executing this code: |
277 |
|
|
|
278 |
|
|
$existing_hash{perl_multicore_api} = "1234567812345678"; |
279 |
|
|
|
280 |
|
|
And that's it, which is, as I think, indeed very little. |
281 |
|
|
|
282 |
|
|
As for code size, on my amd64 system, every call to C<perlinterp_release> |
283 |
|
|
or C<perlinterp_acquire> results in a variation of the following 9-10 |
284 |
|
|
octet sequence: |
285 |
|
|
|
286 |
|
|
150> mov 0x200f23(%rip),%rax # <perl_multicore_api> |
287 |
|
|
157> callq *0x8(%rax) |
288 |
|
|
|
289 |
|
|
The biggest part is the initialisation code, which consists of 11 lines of |
290 |
|
|
typical XS code. On my system, all the code in F<perlmulticore.h> compiles |
291 |
|
|
to less than 160 octets of read-only data. |
292 |
|
|
|
293 |
|
|
=item Broad Applicability |
294 |
|
|
|
295 |
|
|
While there are alternative ways to achieve the goal of parallel execution |
296 |
|
|
with threads that might be more efficient, this mechanism was chosen |
297 |
|
|
because it is very simple to retrofit existing modules with it. |
298 |
|
|
|
299 |
|
|
The design goals for this mechanism were to be simple to use, very |
300 |
|
|
efficient when not needed, low code and data size overhead and broad |
301 |
|
|
applicability. |
302 |
|
|
|
303 |
|
|
=back |
304 |
|
|
|
305 |
|
|
|
306 |
|
|
=head1 DISABLING PERL MULTICORE AT COMPILE TIME |
307 |
|
|
|
308 |
|
|
You can disable the complete perl multicore API by defining the |
309 |
|
|
symbol C<PERL_MULTICORE_DISABLE> to C<1> (e.g. by specifying |
310 |
|
|
F<-DPERL_MULTICORE_DISABLE> as compiler argument). |
311 |
|
|
|
312 |
|
|
This will leave no traces of the API in the compiled code, suitable |
313 |
|
|
"empty" C<perlinterp_release> and C<perlinterp_acquire> definitions will be provided. |
314 |
|
|
|
315 |
|
|
This could be added to perl's C<CPPFLAGS> when configuring perl on |
316 |
|
|
platforms that do not support threading at all for example. |
317 |
|
|
|
318 |
|
|
|
319 |
|
|
=head1 AUTHOR |
320 |
|
|
|
321 |
|
|
Marc A. Lehmann <perlmulticore@schmorp.de> |
322 |
|
|
http://perlmulticore.schmorp.de/ |
323 |
|
|
|
324 |
|
|
=head1 LICENSE |
325 |
|
|
|
326 |
|
|
The F<perlmulticore.h> header file is put into the public |
327 |
|
|
domain. Where this is legally not possible, or at your |
328 |
|
|
option, it can be licensed under creativecommons CC0 |
329 |
|
|
license: L<https://creativecommons.org/publicdomain/zero/1.0/>. |
330 |
|
|
|
331 |
|
|
=cut |
332 |
|
|
|
333 |
|
|
*/ |
334 |
|
|
|
335 |
|
|
/* version of the multicore API implemented by this header */
#define PERL_MULTICORE_MAJOR 1 /* bumped on incompatible changes */
#define PERL_MULTICORE_MINOR 0 /* bumped on every change */
337 |
|
|
|
338 |
|
|
#if PERL_MULTICORE_DISABLE |
339 |
|
|
|
340 |
|
|
/* API disabled at compile time: both calls compile to nothing, but */
/* still behave as single statements (e.g. in an unbraced if/else) */
#define perlinterp_release() do { } while (0)
#define perlinterp_acquire() do { } while (0)
342 |
|
|
|
343 |
|
|
#else |
344 |
|
|
|
345 |
|
|
/* this struct is shared between all modules, and currently */
/* contains only the two function pointers for release/acquire */
struct perl_multicore_api
{
  void (*pmapi_release)(void); /* invoked by perlinterp_release () */
  void (*pmapi_acquire)(void); /* invoked by perlinterp_acquire () */
};
352 |
|
|
|
353 |
|
|
static void perl_multicore_init (void); |
354 |
|
|
|
355 |
|
|
const struct perl_multicore_api perl_multicore_api_init = { perl_multicore_init, abort }; |
356 |
|
|
|
357 |
|
|
static struct perl_multicore_api *perl_multicore_api |
358 |
|
|
= (struct perl_multicore_api *)&perl_multicore_api_init; |
359 |
|
|
|
360 |
|
|
/* both calls read the current api table pointer and call through */
/* the function pointer stored there */
#define perlinterp_release() perl_multicore_api->pmapi_release ()
#define perlinterp_acquire() perl_multicore_api->pmapi_acquire ()
362 |
|
|
|
363 |
|
|
/* this is the release/acquire implementation used as fallback */
/* when no existing api table is found - it does nothing at all */
static void
perl_multicore_nop (void)
{
}
368 |
|
|
|
369 |
|
|
/* this is the initial implementation of "release" - it initialises */ |
370 |
|
|
/* the api and then calls the real release function */ |
371 |
|
|
static void |
372 |
|
|
perl_multicore_init (void) |
373 |
|
|
{ |
374 |
|
|
dTHX; |
375 |
|
|
|
376 |
|
|
/* check for existing API struct in PL_modglobal */ |
377 |
|
|
SV **api_svp = hv_fetch (PL_modglobal, "perl_multicore_api", sizeof ("perl_multicore_api") - 1, 1); |
378 |
|
|
|
379 |
|
|
if (SvPOKp (*api_svp)) |
380 |
|
|
perl_multicore_api = (struct perl_multicore_api *)SvPVX (*api_svp); /* we have one, use the existing one */ |
381 |
|
|
else |
382 |
|
|
{ |
383 |
|
|
/* create a new one with a dummy nop implementation */ |
384 |
|
|
SV *api_sv = NEWSV (0, sizeof (*perl_multicore_api)); |
385 |
|
|
SvCUR_set (api_sv, sizeof (*perl_multicore_api)); |
386 |
|
|
SvPOK_only (api_sv); |
387 |
|
|
perl_multicore_api = (struct perl_multicore_api *)SvPVX (api_sv); |
388 |
|
|
perl_multicore_api->pmapi_release = |
389 |
|
|
perl_multicore_api->pmapi_acquire = perl_multicore_nop; |
390 |
|
|
*api_svp = api_sv; |
391 |
|
|
} |
392 |
|
|
|
393 |
|
|
/* call the real (or dummy) implementation now */ |
394 |
|
|
perlinterp_release (); |
395 |
|
|
} |
396 |
|
|
|
397 |
|
|
#endif |
398 |
|
|
|
399 |
|
|
#endif |