1 |
=head1 NAME |
2 |
|
3 |
Sys::FreezeThaw - stop and start all user processes on a machine |
4 |
|
5 |
=head1 SYNOPSIS |
6 |
|
7 |
use Sys::FreezeThaw; |
8 |
|
9 |
Sys::FreezeThaw::freezethaw { |
10 |
# run code while system is frozen |
11 |
}; |
12 |
|
13 |
my $token = Sys::FreezeThaw::freeze; |
14 |
... do something ... |
15 |
Sys::FreezeThaw::thaw $token; |
16 |
|
17 |
=head1 DESCRIPTION |
18 |
|
19 |
Operating Systems/Kernels current supported: Linux-2.6/3.0 with F</proc>. |
20 |
|
21 |
This module implements a very specific feature: stopping(freezing and |
22 |
thawing/continuing all userspace processes on the machine. It works by |
23 |
sending SIGSTOP to all processes, parent-process first, so that the wait |
24 |
syscall will not trigger on stopped children. Restarting is done in |
25 |
reverse order. |
26 |
|
27 |
Using the combined function Sys::FreezeThaw::freezethaw is recommended as |
28 |
it will catch runtime errors, but stopping and restarting can be dine via |
29 |
separate function calls. |
30 |
|
31 |
=head2 What could it possibly be sueful for?? |
32 |
|
33 |
Possible uses include: doing atomic file system operations (such as |
34 |
replacing files while they are guaranteed not to be in use), or quieting |
35 |
down a system to investigate suspicious behaviour. |
36 |
|
37 |
=over 4 |
38 |
|
39 |
=cut |
40 |
|
41 |
package Sys::FreezeThaw; |
42 |
|
43 |
use Carp; |
44 |
|
45 |
$VERSION = '0.02'; |
46 |
|
47 |
=item Sys::FreezeThaw::freezethaw { BLOCK } |
48 |
|
49 |
First tries to stop all processes. If successful, runs the given code block |
50 |
(or code reference), then restarts all processes again. As the system is |
51 |
basically frozen during the code block execution, it should be as fast as |
52 |
possible. |
53 |
|
54 |
Runtime errors will be caught with C<eval>. If an exception occurs it will |
55 |
be re-thrown after processes are restarted. If processes cannot be frozen |
56 |
or restarted, this function will throw an exception. |
57 |
|
58 |
Signal handlers for SIGPIPE, SIGHUP, SIGALRM, SIGUSR1 and SIGUSR2 will |
59 |
temporarily be installed, so if you want to catch these, you have to do so |
60 |
yourself within the executed code block. |
61 |
|
62 |
Try to do as few things as possible. For example, outputting text might |
63 |
cause a deadlock, as the terminal emulator on the other side of STDOUT |
64 |
might be stopped, etc. |
65 |
|
66 |
The return value of the code block is ignored right now, and the function |
67 |
doesn't yet return anything sensible. |
68 |
|
69 |
=item $token = Sys::FreezeThaw::freeze; |
70 |
|
71 |
Send SIGSTOP to all processes, and return a token that allows them to be |
72 |
thawed again. |
73 |
|
74 |
If an error occurs, an exception will be thrown and all stopped processes |
75 |
will automatically be thawed. |
76 |
|
77 |
=item Sys::FreezeThaw::thaw $token |
78 |
|
79 |
Take a token returned by Sys::FreezeThaw::freeze and send all processes |
80 |
a CONT signal, in the order required for them not to receive child STOP |
81 |
notifications. |
82 |
|
83 |
=cut |
84 |
|
85 |
# this is laughably broken, but... |
86 |
sub yield { |
87 |
select undef, undef, undef, 1/1000; |
88 |
} |
89 |
|
90 |
# the maximum number of iterations per stop/cont etc. loop |
91 |
# used to shield against catastrophic events (or bugs :) |
92 |
# on current linux systems it can take an enourmous amount of |
93 |
# time for some processes to stop, but usually it only takes |
94 |
# one or two iterations. |
95 |
sub MAX_WAIT() { 1000 } |
96 |
|
97 |
# return a list o fall pid's in the system, |
98 |
# topologically sorted parent-first |
99 |
# skips, keys %$exclude_pid, zombies and stopped processes |
100 |
sub enum_pids($) { |
101 |
my ($exclude_pid) = @_; |
102 |
|
103 |
opendir my $proc, "/proc" |
104 |
or die "/proc: $!"; |
105 |
my @pid = sort { $b <=> $a } |
106 |
grep /^\d+/, |
107 |
readdir $proc; |
108 |
closedir $proc; |
109 |
|
110 |
my %ppid; |
111 |
for (@pid) { |
112 |
next if exists $exclude_pid->{$_}; |
113 |
|
114 |
open my $stat, "<", "/proc/$_/stat" |
115 |
or next; |
116 |
my ($state, $ppid, $vsize, $rss) = (split /\s+/, scalar <$stat>)[2,3,22,23]; |
117 |
|
118 |
next if $state =~ /^[TZX]/i; # stopped, zombies, dead |
119 |
next unless $vsize || $rss; # skip kernel threads or other nasties |
120 |
|
121 |
$ppid{$_} = $ppid; |
122 |
} |
123 |
|
124 |
# now topologically sort by parent-id |
125 |
my @res; |
126 |
while (scalar %ppid) { |
127 |
my @pass; |
128 |
|
129 |
for my $pid (keys %ppid) { |
130 |
if (!exists $ppid{$ppid{$pid}}) { |
131 |
push @pass, $pid; |
132 |
} |
133 |
} |
134 |
|
135 |
delete $ppid{$_} for @pass; |
136 |
|
137 |
push @res, \@pass; |
138 |
} |
139 |
|
140 |
\@res |
141 |
} |
142 |
|
143 |
sub process_stopped($) { |
144 |
open my $stat, "</proc/$_[0]/stat" |
145 |
or return 1; |
146 |
|
147 |
return +(split /\s+/, <$stat>)[2] =~ /^[TZX]/i; |
148 |
} |
149 |
|
150 |
sub thaw($) { |
151 |
local $@; |
152 |
|
153 |
my $token = shift; |
154 |
|
155 |
for (reverse @$token) { |
156 |
my @pids = @$_; |
157 |
kill CONT => @pids; |
158 |
|
159 |
# now wait till processes actually run again before the next round |
160 |
for (1..MAX_WAIT) { |
161 |
@pids = grep process_stopped $_, @pids; |
162 |
last unless @pids; |
163 |
|
164 |
yield; |
165 |
} |
166 |
} |
167 |
} |
168 |
|
169 |
sub freeze() { |
170 |
local $@; |
171 |
|
172 |
my @procs; |
173 |
|
174 |
eval { |
175 |
for (1..MAX_WAIT) { |
176 |
my $passes = enum_pids { 1 => 1, $$ => 1 }; |
177 |
last unless @$passes; |
178 |
|
179 |
for (@$passes) { |
180 |
my @pids = @$_; |
181 |
push @procs, $_; |
182 |
kill STOP => @pids; |
183 |
|
184 |
for (1..MAX_WAIT) { |
185 |
@pids = grep !process_stopped $_, @pids; |
186 |
last unless @pids; |
187 |
|
188 |
# wait till processes are really stopped |
189 |
yield; |
190 |
} |
191 |
|
192 |
die "unable to stop some processes: @pids" if @pids; |
193 |
} |
194 |
} |
195 |
}; |
196 |
|
197 |
if ($@) { |
198 |
thaw \@procs; |
199 |
die $@; |
200 |
} |
201 |
|
202 |
\@procs |
203 |
} |
204 |
|
205 |
sub freezethaw(&) { |
206 |
my ($code) = @_; |
207 |
|
208 |
my $token = freeze; |
209 |
|
210 |
eval { |
211 |
local $SIG{HUP} = sub { die "ERROR: caught SIGHUP while system frozen" }; |
212 |
local $SIG{PIPE} = sub { die "ERROR: caught SIGPIPE while system frozen" }; |
213 |
local $SIG{ALRM} = sub { die "ERROR: caught SIGALRM while system frozen" }; |
214 |
local $SIG{USR1} = sub { die "ERROR: caught SIGUSR1 while system frozen" }; |
215 |
local $SIG{USR2} = sub { die "ERROR: caught SIGUSR2 while system frozen" }; |
216 |
|
217 |
$code->(); |
218 |
}; |
219 |
|
220 |
thaw $token; |
221 |
|
222 |
die $@ if $@; |
223 |
|
224 |
() |
225 |
} |
226 |
|
227 |
1; |
228 |
|
229 |
=back |
230 |
|
231 |
=head1 BUGS |
232 |
|
233 |
SIGCONT is not unnoticed by processes. Some programs (such as irssi-text) |
234 |
respond by flickering (IMHO a bug in irssi-text). Other programs might |
235 |
have other problems, but actual problems should be rare. However, one |
236 |
shouldn't overuse this module. |
237 |
|
238 |
=head1 AUTHOR |
239 |
|
240 |
Marc Lehmann <schmorp@schmorp.de> |
241 |
http://home.schmorp.de/ |
242 |
|
243 |
=cut |
244 |
|