1 |
=head1 NAME |
2 |
|
3 |
AnyEvent::Watchdog - generic watchdog/program restarter |
4 |
|
5 |
=head1 SYNOPSIS |
6 |
|
7 |
# MUST be use'd as the very first thing in the main program |
8 |
use AnyEvent::Watchdog; |
9 |
|
10 |
=head1 DESCRIPTION |
11 |
|
12 |
This module implements a watchdog that can repeatedly fork the program and |
13 |
thus effectively restart it - as soon as the module is use'd, it will fork |
14 |
the program (if possible) and continue to run it normally in the child, |
15 |
while the parent becomes a supervisor. |
16 |
|
17 |
The child can then ask the supervisor to restart itself instead of |
18 |
exiting, or ask the supervisor to restart it gracefully or forcefully. |
19 |
|
20 |
B<NOTE:> This module B<< I<MUST> >> be used as the first thing in the main |
21 |
program. It will cause weird effects when used from another module, as |
22 |
perl does not expect to be forked inside C<BEGIN> blocks. |
23 |
|
24 |
=head1 RECIPES |
25 |
|
26 |
Use AnyEvent::Watchdog solely as a convenient on-demand-restarter:
27 |
|
28 |
use AnyEvent::Watchdog; |
29 |
|
30 |
   # and whenever you want to restart (e.g. to upgrade code):
31 |
AnyEvent::Watchdog::restart; |
32 |
|
33 |
Use AnyEvent::Watchdog to kill the program and exit when the event loop |
34 |
fails to run for more than two minutes: |
35 |
|
36 |
   use AnyEvent::Watchdog qw(heartbeat=120);
37 |
|
38 |
Use AnyEvent::Watchdog to automatically restart the program |
39 |
when it fails to handle events for longer than 5 minutes: |
40 |
|
41 |
use AnyEvent::Watchdog qw(autorestart heartbeat=300); |
42 |
|
43 |
=head1 VARIABLES/FUNCTIONS |
44 |
|
45 |
The module supports the following variables and functions: |
46 |
|
47 |
=over 4 |
48 |
|
49 |
=cut |
50 |
|
51 |
package AnyEvent::Watchdog; |
52 |
|
53 |
# load modules we will use later anyways |
54 |
use common::sense; |
55 |
|
56 |
use Carp (); |
57 |
|
58 |
our $VERSION = '0.9'; |
59 |
|
60 |
=item $AnyEvent::Watchdog::ENABLED |
61 |
|
62 |
This is true when the program is running under the regime of |
63 |
AnyEvent::Watchdog. Semi-obviously, you should I<NOT> C<use> or C<require> |
64 |
this module before looking at this variable, and neither should you try |
65 |
to load this module unless in the main program, rather use an idiom like |
66 |
this: |
67 |
|
68 |
$AnyEvent::Watchdog::ENABLED |
69 |
or die "watchdog not enabled..."; |
70 |
AnyEvent::Watchdog::restart (60); # MUST use () |
71 |
|
72 |
Note that if this variable is defined, but false, then AnyEvent::Watchdog |
73 |
is running, but you are in the watchdog process - you probably did |
74 |
something very wrong in this case. |
75 |
|
76 |
=cut |
77 |
|
78 |
# State shared between the supervisor (parent) and the supervised program
# (forked child).
our $ENABLED = 0;  # switched to 1 in the forked child that runs the program
our $PID;          # result of the most recent fork (the child's pid in the parent)
our $AUTORESTART;  # supervisor side: restart the program after it went away?
our $HEARTBEAT;    # holds the AE timer that emits the heartbeat bytes
our ($P, $C);      # socketpair ends: $P is read by the supervisor, $C is written by the program
83 |
|
84 |
# Wait for the supervisor end of the socketpair ($P) to become readable.
#   $_[0] - maximum number of seconds to wait
# Returns whatever select returns, i.e. a false value when the timeout
# elapsed without $P becoming readable.
sub poll($) {
   my $read_set = "";
   vec ($read_set, fileno $P, 1) = 1;

   CORE::select $read_set, undef, undef, $_[0]
}
88 |
|
89 |
# Supervisor loop, run in the parent after a successful fork.
#
# Reads single-byte commands the program writes to its end of the
# socketpair and acts on them:
#   0 - disable autorestart
#   1 - enable autorestart
#   2 - restart request, followed by one byte: timeout in seconds
#   3 - set heartbeat interval, followed by one byte: seconds (0 = off)
#   4 - heartbeat
# Anything else gets the program killed. When the command stream ends, the
# child is reaped; the function then either returns (so the caller can fork
# a new child) or terminates the supervisor, mirroring the child's fate.
sub server {
   my $exit_expected; # true once the upcoming exit is no surprise
   my $hb_timeout;    # allowed seconds of silence, false while no check is active

   $AUTORESTART = 0;

   # these signals must not take the supervisor down while it is watching
   local $SIG{TERM} = 'IGNORE';
   local $SIG{INT}  = 'IGNORE';
   local $SIG{HUP}  = 'IGNORE';

   while (1) {
      # with an active heartbeat, silence for too long is fatal for the child
      if ($hb_timeout && !poll ($hb_timeout)) {
         $exit_expected = 1;
         warn "AnyEvent::Watchdog: heartbeat failed. killing.\n";
         kill 9, $PID;
         last;
      }

      my $cmd;
      last unless sysread ($P, $cmd, 1);

      my $opcode = ord $cmd;

      if ($opcode == 0) {
         $AUTORESTART = 0;

      } elsif ($opcode == 1) {
         $AUTORESTART = 1;

      } elsif ($opcode == 2) {
         # restart request: the next byte tells how long the program may take
         my $timeout;
         last unless sysread ($P, $timeout, 1);
         $timeout = ord $timeout;

         unless (poll ($timeout)) {
            warn "AnyEvent::Watchdog: program attempted restart, but failed to do so within $timeout seconds. killing.\n";
            kill 9, $PID;
         }

         # a successful read here means the program wrote more data after
         # its restart request
         my $extra;
         if (sysread ($P, $extra, 1)) {
            warn "AnyEvent::Watchdog: unexpected program output. killing.\n";
            kill 9, $PID;
         }

         $exit_expected = 1;
         last;

      } elsif ($opcode == 3) {
         # new heartbeat interval, a value of zero switches the check off
         my $interval;
         last unless sysread ($P, $interval, 1);
         $hb_timeout = ord $interval;

      } elsif ($opcode == 4) {
         # heartbeat
         # TODO: should only reset heartbeat timeout with \005

      } else {
         warn "AnyEvent::Watchdog: unexpected program output. killing.\n";
         kill 9, $PID;
         last;
      }
   }

   waitpid $PID, 0;

   require POSIX;

   # number of the signal that terminated the child, false if it was not signalled
   my $termsig = POSIX::WIFSIGNALED ($?) && POSIX::WTERMSIG ($?);

   # a child that died from SIGINT or SIGTERM is never restarted
   if ($termsig == POSIX::SIGINT () || $termsig == POSIX::SIGTERM ()) {
      $AUTORESTART = 0;
      $exit_expected = 1;
   }

   warn "AnyEvent::Watchdog: program exited unexpectedly with status $?.\n"
      if !$exit_expected && $? >> 8;

   unless ($AUTORESTART) {
      # no restart: leave with the child's exit status ...
      POSIX::_exit ($? >> 8)
         unless $termsig;

      # ... or, if the child was signalled, send ourselves the same signal
      # with all handlers reset, and _exit if we are still running afterwards
      $SIG{$_} = 'DEFAULT' for keys %SIG;
      kill $termsig, $$;
      POSIX::_exit (127);
   }

   warn "AnyEvent::Watchdog: attempting automatic restart.\n";
}
182 |
|
183 |
# due to bugs in perl, try to remember file offsets for all fds, and restore
# them later (the parser otherwise exhausts the input files)
our %SEEKPOS;

# attempting this exec makes perlio flush its handles internally, so the
# seek offsets become correct; execution is expected to carry on below.
exec "."; # toi toi toi

# now record "all" fd positions, assuming 1023 is more than enough.
for my $fd (0 .. 1023) {
   open my $dup, "<&$fd"
      or next;

   # descriptors that cannot report a position are simply not remembered
   my $offset = sysseek $dup, 0, 1
      or next;

   $SEEKPOS{$fd} = $offset;
}

# fork until we end up in a child: the parent stays in this loop as the
# supervisor and comes around again whenever server returns.
while (1) {
   if ($^O !~ /mswin32/i) {
      require Socket;
      socketpair $P, $C, Socket::AF_UNIX (), Socket::SOCK_STREAM (), 0
         or Carp::croak "AnyEvent::Watchdog: unable to create restarter pipe: $!\n";
   } else {
      require AnyEvent::Util;
      ($P, $C) = AnyEvent::Util::portable_socketpair ()
         or Carp::croak "AnyEvent::Watchdog: unable to create restarter pipe: $!\n";
   }

   local $SIG{CHLD} = 'DEFAULT';

   $PID = fork;

   if ($PID) {
      # parent code: keep only the supervisor end and start watching
      close $C;
      server ();
   } elsif (defined $PID) {
      # child code
      $ENABLED = 1;

      # restore seek offsets
      for my $fd (keys %SEEKPOS) {
         open my $dup, "<&$fd"
            or next;
         sysseek $dup, $SEEKPOS{$fd}, 0;
      }

      # continue the program normally
      close $P;
      last;
   } else {
      # fork failed
      warn "AnyEvent::Watchdog: '$!', retrying in one second...\n";
      sleep 1;
   }
}
244 |
|
245 |
=item AnyEvent::Watchdog::restart [$timeout] |
246 |
|
247 |
Tells the supervisor to restart the process when it exits, or forcefully |
248 |
after C<$timeout> seconds (minimum 1, maximum 255, default 60). |
249 |
|
250 |
Calls C<exit 0> to exit the process cleanly. |
251 |
|
252 |
=cut |
253 |
|
254 |
# Ask the supervisor for a restart, then exit the program.
#   $_[0] - optional timeout in seconds; 60 when undefined, otherwise
#           limited to the range 1..255 as it travels in a single byte
# Does not return: ends the process via exit 0.
sub restart(;$) {
   my $timeout = defined $_[0] ? $_[0] : 60;

   if ($timeout < 1) {
      $timeout = 1;
   } elsif ($timeout > 255) {
      $timeout = 255;
   }

   # \x01 switches autorestart on, \x02 plus the timeout byte requests the restart
   my $request = "\x01\x02" . chr $timeout;
   syswrite $C, $request;

   exit 0;
}
264 |
|
265 |
=item AnyEvent::Watchdog::autorestart [$boolean] |
266 |
|
267 |
=item use AnyEvent::Watchdog qw(autorestart[=$boolean]) |
268 |
|
269 |
Enables or disables autorestart (initially disabled, default for |
270 |
C<$boolean> is to enable): By default, the supervisor will exit if the |
271 |
program exits or dies in any way. When enabling autorestart behaviour, |
272 |
then the supervisor will try to restart the program after it dies. |
273 |
|
274 |
Note that the supervisor will never autorestart when the child died with |
275 |
SIGINT or SIGTERM. |
276 |
|
277 |
=cut |
278 |
|
279 |
# Switch the supervisor's autorestart behaviour on or off.
#   $_[0] - optional boolean; calling without any argument enables
# Returns the result of the syswrite to the supervisor socket.
sub autorestart(;$) {
   my $enable = @_ ? $_[0] : 1;

   syswrite $C, ($enable ? "\x01" : "\x00");
}
282 |
|
283 |
=item AnyEvent::Watchdog::heartbeat [$interval] |
284 |
|
285 |
=item use AnyEvent::Watchdog qw(heartbeat[=$interval]) |
286 |
|
287 |
Tells the supervisor to automatically kill the program if it doesn't |
288 |
react for C<$interval> seconds (minimum 1, maximum 255, default 60), then
installs an AnyEvent timer that sends a regular heartbeat to the supervisor
twice as often.
291 |
|
292 |
Exit behaviour isn't changed, so if you want a restart instead of an exit, |
293 |
you have to call C<autorestart>. |
294 |
|
295 |
The heartbeat frequency can be changed as often as you want, an interval |
296 |
of C<0> disables the heartbeat check again. |
297 |
|
298 |
=cut |
299 |
|
300 |
# Configure the heartbeat check.
#   $_[0] - optional interval in seconds; 60 when undefined. An interval of
#           0 disables the check again, any other value is limited to the
#           range 1..255 as it travels in a single byte.
# While the check is active, an AE timer stored in $HEARTBEAT writes a
# heartbeat byte to the supervisor every half interval.
sub heartbeat(;$) {
   my ($interval) = @_;

   $interval = 60 unless defined $interval;

   if ($interval == 0) {
      # The supervisor treats an interval of zero as "no heartbeat check".
      # Previously 0 was clamped up to 1 here, so the check could never be
      # switched off again.
      syswrite $C, "\x03\x00";

      # dropping the timer stops the now pointless heartbeat bytes
      undef $HEARTBEAT;

      return;
   }

   $interval =   1 if $interval <   1;
   $interval = 255 if $interval > 255;

   # announce the interval before the first heartbeat is sent
   syswrite $C, "\x03" . chr $interval;

   require AE;
   $HEARTBEAT = AE::timer (0, $interval * 0.5, sub {
      syswrite $C, "\x04";
   });
}
314 |
|
315 |
# Handle "use AnyEvent::Watchdog qw(...)" arguments.
# Recognised arguments:
#   autorestart[=$boolean] - calls autorestart, the boolean defaults to 1
#   heartbeat[=$interval]  - calls heartbeat, which applies its own default
#                            when no interval was given
# Croaks on any other argument.
sub import {
   shift; # the class name is not needed

   for (@_) {
      if (/^autorestart(?:=(.*))?$/) {
         # Compute the argument first and call with explicit parentheses:
         # written as "autorestart defined $1 ? $1 : 1", a perl that parses
         # (;$)-prototyped subs as named unary operators reads the call as
         # "autorestart (defined $1) ? $1 : 1" and passes the wrong value.
         my $enable = defined $1 ? $1 : 1;
         autorestart ($enable);
      } elsif (/^heartbeat(?:=(.*))?$/) {
         # copy the capture so later matches cannot change it under the callee
         my $interval = $1;
         heartbeat ($interval);
      } else {
         Carp::croak "AnyEvent::Watchdog: '$_' is not a valid import argument";
      }
   }
}
328 |
|
329 |
=back |
330 |
|
331 |
=head1 SEE ALSO |
332 |
|
333 |
L<AnyEvent>. |
334 |
|
335 |
=head1 AUTHOR |
336 |
|
337 |
Marc Lehmann <schmorp@schmorp.de> |
338 |
http://home.schmorp.de/ |
339 |
|
340 |
=cut |
341 |
|
342 |
1 |
343 |
|