1 |
=head1 NAME |
2 |
|
3 |
Sys::FreezeThaw - stop and start all user processes on a machine |
4 |
|
5 |
=head1 SYNOPSIS |
6 |
|
7 |
   use Sys::FreezeThaw;

   Sys::FreezeThaw::freezethaw {
      # run code while system is frozen
   };

   my $token = Sys::FreezeThaw::freeze;
   ... do something ...
   Sys::FreezeThaw::thaw $token;
16 |
|
17 |
=head1 DESCRIPTION |
18 |
|
19 |
Operating Systems/Kernels currently supported: Linux-2.6/3.0 with F</proc>.
20 |
|
21 |
This module implements a very specific feature: stopping (freezing) and
thawing/continuing all userspace processes on the machine. It works by
sending SIGSTOP to all processes, parent-process first, so that the wait
syscall will not trigger on stopped children. Restarting is done in
reverse order.
26 |
|
27 |
Using the combined function Sys::FreezeThaw::freezethaw is recommended as
it will catch runtime errors, but stopping and restarting can be done via
separate function calls.
30 |
|
31 |
=head2 What could it possibly be useful for??
32 |
|
33 |
Possible uses include: doing atomic file system operations (such as |
34 |
replacing files while they are guaranteed not to be in use), or quieting |
35 |
down a system to investigate suspicious behaviour. |
36 |
|
37 |
=over 4 |
38 |
|
39 |
=cut |
40 |
|
41 |
package Sys::FreezeThaw;

use Carp;

# Module version string.
our $VERSION = '0.02';

# When true, freeze does not treat processes that refuse to stop as an
# error. False by default.
our $PARTIAL_OK = 0;
47 |
|
48 |
=item Sys::FreezeThaw::freezethaw { BLOCK } |
49 |
|
50 |
First tries to stop all processes. If successful, runs the given code block |
51 |
(or code reference), then restarts all processes again. As the system is |
52 |
basically frozen during the code block execution, it should be as fast as |
53 |
possible. |
54 |
|
55 |
Runtime errors will be caught with C<eval>. If an exception occurs it will |
56 |
be re-thrown after processes are restarted. If processes cannot be frozen |
57 |
or restarted, this function will throw an exception. |
58 |
|
59 |
Signal handlers for SIGINT, SIGTERM, SIGPIPE, SIGHUP, SIGALRM, SIGUSR1 and |
60 |
SIGUSR2 will be installed temporarily, so if you want to catch these, you |
61 |
have to do so yourself within the executed code block. |
62 |
|
63 |
Try to do as few things as possible. For example, outputting text might |
64 |
cause a deadlock, as the terminal emulator on the other side of STDOUT |
65 |
might be stopped, logging to syslog might not work and so on. |
66 |
|
67 |
The return value of the code block is ignored right now, and the function |
68 |
doesn't yet return anything sensible. |
69 |
|
70 |
=item $token = Sys::FreezeThaw::freeze |
71 |
|
72 |
Send SIGSTOP to all processes, and return a token that allows them to be |
73 |
thawed again. |
74 |
|
75 |
If an error occurs, an exception will be thrown and all stopped processes |
76 |
will automatically be thawed. |
77 |
|
78 |
=item Sys::FreezeThaw::thaw $token |
79 |
|
80 |
Take a token returned by Sys::FreezeThaw::freeze and send all processes |
81 |
a C<CONT> signal, in the order required for them not to receive child STOP |
82 |
notifications. |
83 |
|
84 |
=item $Sys::FreezeThaw::PARTIAL_OK |
85 |
|
86 |
A boolean that tells C<freeze> whether it is an error if a process cannot
be stopped. If false (the default), then C<freeze> will fail if there is
an unstoppable process. If it is true, then C<freeze> will pretend the
process stopped.
90 |
|
91 |
=cut |
92 |
|
93 |
# Crude stand-in for a real yield: block in a four-argument select with no
# file handles and a timeout of one millisecond.
sub yield {
    select(undef, undef, undef, 0.001);
}
97 |
|
98 |
# Upper bound on the number of rounds each stop/cont/poll loop may take.
# It shields against catastrophic events (or bugs :). On current Linux
# systems some processes can take an enormous amount of time to stop, but
# usually one or two rounds are enough.
use constant MAX_WAIT => 10;
104 |
|
105 |
# Return a reference to a list of "passes", each pass being a reference to
# a list of pids. The passes are topologically sorted parent-first: a pid
# always appears in a later pass than its parent, if the parent is listed
# at all.
#
# Skipped are the keys of %$exclude_pid, processes that are already
# stopped, zombies and dead processes, and everything that has neither a
# vsize nor an rss (kernel threads or other nasties).
#
# Dies if /proc cannot be opened.
sub enum_pids($) {
    my ($exclude_pid) = @_;

    opendir my $proc, "/proc"
        or die "/proc: $!";
    # only purely numeric directory entries are pids
    my @pid = sort { $b <=> $a }
              grep /^\d+$/,
              readdir $proc;
    closedir $proc;

    my %ppid;
    for my $pid (@pid) {
        next if exists $exclude_pid->{$pid};

        # the process may exit at any moment, so each step is allowed to fail
        open my $stat, "<", "/proc/$pid/stat"
            or next;
        my $line = <$stat>;
        close $stat;
        next unless defined $line;

        # The second field (the command name) is wrapped in parentheses and
        # may itself contain spaces and parentheses, which would shift all
        # later fields when splitting on whitespace. Cut everything up to
        # the last ")" first, so that the remainder starts with the state.
        $line =~ s/^.*\)\s+//s
            or next;
        my ($state, $ppid, $vsize, $rss) = (split /\s+/, $line)[0, 1, 20, 21];

        next if $state =~ /^[TZX]/i; # stopped, zombies, dead
        next unless $vsize || $rss;  # skip kernel threads or other nasties

        $ppid{$pid} = $ppid;
    }

    # now topologically sort by parent-id
    my @res;
    while (%ppid) {
        # a pid is ready as soon as its parent is not (or no longer) listed
        my @pass = grep { !exists $ppid{ $ppid{$_} } } keys %ppid;

        # An empty pass would mean a cycle in the parent links. Emit the
        # rest in one go rather than looping forever.
        @pass = keys %ppid unless @pass;

        delete @ppid{@pass};
        push @res, \@pass;
    }

    \@res
}
150 |
|
151 |
# Return true if the process with the given pid is not running: either its
# state in /proc/<pid>/stat is stopped/traced (T), zombie (Z) or dead (X),
# or the stat file cannot be opened or read because the process is gone.
sub process_stopped($) {
    my ($pid) = @_;

    open my $stat, "<", "/proc/$pid/stat"
        or return 1;
    my $line = <$stat>;
    close $stat;

    # the process can vanish between open and read; treat that exactly
    # like a failed open
    return 1 unless defined $line;

    # The command name field is wrapped in parentheses and may contain
    # spaces, so splitting on whitespace can pick up the wrong field. Cut
    # everything up to the last ")" so that the line starts with the state.
    $line =~ s/^.*\)\s+//s;

    return $line =~ /^[TZX]/i;
}
157 |
|
158 |
# Continue all processes recorded in a token returned by freeze.
#
# The token is a reference to a list of passes (each a reference to a list
# of pids). The passes are walked in reverse order; each pass is sent CONT
# and then polled for up to MAX_WAIT rounds until none of its processes is
# reported as stopped any more.
sub thaw($) {
    local $@;

    my ($passes) = @_;

    for my $pass (reverse @$passes) {
        my @waiting = @$pass;
        kill 'CONT', @waiting;

        # now wait till processes actually run again before the next round
        for my $round (1 .. MAX_WAIT) {
            @waiting = grep { process_stopped($_) } @waiting;
            last unless @waiting;

            yield();
        }
    }
}
176 |
|
177 |
# Send STOP to all processes except pid 1 and ourselves, parent-first, and
# return a token (a reference to the list of stopped passes) that can be
# handed to thaw.
#
# The process list is enumerated up to MAX_WAIT times, until no process
# that is still running is found. Within each enumeration, every pass is
# sent STOP and then polled for up to MAX_WAIT rounds.
#
# Dies with "unable to stop some processes: ..." if processes are still
# running after polling and $PARTIAL_OK is false. On any error, everything
# stopped so far is thawed again before the exception is re-thrown.
sub freeze(;$) {
    local $@;

    # This has to be a lexical array that is fresh for every call. Using an
    # undeclared @procs here would silently refer to a package global, so
    # that all tokens would alias each other and accumulate pids from
    # earlier freezes.
    my @procs;

    # never stop init or ourselves
    my %exclude = (1 => 1, $$ => 1);

    eval {
        for (1 .. MAX_WAIT) {
            my $passes = enum_pids(\%exclude);
            last unless @$passes;

            for my $pass (@$passes) {
                my @pids = @$pass;

                # record the pass before signalling it, so that it gets
                # thawed again should anything below fail
                push @procs, $pass;
                kill STOP => @pids;

                for (1 .. MAX_WAIT) {
                    @pids = grep { !process_stopped($_) } @pids;
                    last unless @pids;

                    # wait till processes are really stopped
                    yield();
                }

                die "unable to stop some processes: @pids" if @pids && !$PARTIAL_OK;
            }
        }
    };

    if ($@) {
        # thaw localises $@, so the error survives for the re-throw
        thaw(\@procs);
        die $@;
    }

    \@procs
}
212 |
|
213 |
# Freeze all processes, run the given code block, then thaw them again.
#
# While the block runs, HUP, INT, TERM, PIPE, ALRM, USR1 and USR2 are
# turned into exceptions. Any exception raised inside the block is
# re-thrown after the processes have been thawed. Returns the empty list;
# the return value of the block is discarded.
sub freezethaw(&) {
    my $block = shift;

    my $frozen = freeze();

    eval {
        # dying inside the eval makes sure the thaw below still runs
        my %abort = (
            HUP  => sub { die "ERROR: caught SIGHUP while system frozen" },
            INT  => sub { die "ERROR: caught SIGINT while system frozen" },
            TERM => sub { die "ERROR: caught SIGTERM while system frozen" },
            PIPE => sub { die "ERROR: caught SIGPIPE while system frozen" },
            ALRM => sub { die "ERROR: caught SIGALRM while system frozen" },
            USR1 => sub { die "ERROR: caught SIGUSR1 while system frozen" },
            USR2 => sub { die "ERROR: caught SIGUSR2 while system frozen" },
        );
        my @signals = keys %abort;
        local @SIG{@signals} = @abort{@signals};

        $block->();
    };

    thaw($frozen);

    die $@ if $@;

    ()
}
236 |
|
237 |
1; |
238 |
|
239 |
=back |
240 |
|
241 |
=head1 BUGS |
242 |
|
243 |
SIGCONT does not go unnoticed by processes. Some programs (such as irssi-text)
respond by flickering (IMHO a bug in irssi-text). Other programs might
have other problems, but actual problems should be rare. However, one
shouldn't overuse this module.
247 |
|
248 |
=head1 AUTHOR |
249 |
|
250 |
Marc Lehmann <schmorp@schmorp.de> |
251 |
http://home.schmorp.de/ |
252 |
|
253 |
=cut |
254 |
|