| 1 |
=head1 NAME |
| 2 |
|
| 3 |
Sys::FreezeThaw - stop and start all user processes on a machine |
| 4 |
|
| 5 |
=head1 SYNOPSIS |
| 6 |
|
| 7 |
use Sys::FreezeThaw; |
| 8 |
|
| 9 |
Sys::FreezeThaw::freezethaw { |
| 10 |
# run code while system is frozen |
| 11 |
}; |
| 12 |
|
| 13 |
my $token = Sys::FreezeThaw::freeze; |
| 14 |
... do something ... |
| 15 |
Sys::FreezeThaw::thaw $token; |
| 16 |
|
| 17 |
=head1 DESCRIPTION |
| 18 |
|
| 19 |
Operating Systems/Kernels currently supported: Linux-2.6/3.0 with F</proc>.
| 20 |
|
| 21 |
This module implements a very specific feature: stopping (freezing) and
thawing (continuing) all userspace processes on the machine. It works by
sending SIGSTOP to all processes, parent-process first, so that the wait
syscall will not trigger on stopped children. Restarting is done in
reverse order.
| 26 |
|
| 27 |
Using the combined function Sys::FreezeThaw::freezethaw is recommended as
it will catch runtime errors, but stopping and restarting can be done via
separate function calls.
| 30 |
|
| 31 |
=head2 What could it possibly be useful for?
| 32 |
|
| 33 |
Possible uses include: doing atomic file system operations (such as |
| 34 |
replacing files while they are guaranteed not to be in use), or quieting |
| 35 |
down a system to investigate suspicious behaviour. |
| 36 |
|
| 37 |
=over 4 |
| 38 |
|
| 39 |
=cut |
| 40 |
|
| 41 |
package Sys::FreezeThaw;

use Carp;

# Module version string.
our $VERSION = '0.02';

# When true, freeze() tolerates processes that cannot be stopped instead of
# throwing an exception. Off by default.
our $PARTIAL_OK = 0;
| 47 |
|
| 48 |
=item Sys::FreezeThaw::freezethaw { BLOCK } |
| 49 |
|
| 50 |
First tries to stop all processes. If successful, runs the given code block |
| 51 |
(or code reference), then restarts all processes again. As the system is |
| 52 |
basically frozen during the code block execution, it should be as fast as |
| 53 |
possible. |
| 54 |
|
| 55 |
Runtime errors will be caught with C<eval>. If an exception occurs it will |
| 56 |
be re-thrown after processes are restarted. If processes cannot be frozen |
| 57 |
or restarted, this function will throw an exception. |
| 58 |
|
| 59 |
Signal handlers for SIGINT, SIGTERM, SIGPIPE, SIGHUP, SIGALRM, SIGUSR1 and |
| 60 |
SIGUSR2 will be installed temporarily, so if you want to catch these, you |
| 61 |
have to do so yourself within the executed code block. |
| 62 |
|
| 63 |
Try to do as few things as possible. For example, outputting text might |
| 64 |
cause a deadlock, as the terminal emulator on the other side of STDOUT |
| 65 |
might be stopped, logging to syslog might not work and so on. |
| 66 |
|
| 67 |
The return value of the code block is ignored right now, and the function |
| 68 |
doesn't yet return anything sensible. |
| 69 |
|
| 70 |
=item $token = Sys::FreezeThaw::freeze |
| 71 |
|
| 72 |
Send SIGSTOP to all processes, and return a token that allows them to be |
| 73 |
thawed again. |
| 74 |
|
| 75 |
If an error occurs, an exception will be thrown and all stopped processes |
| 76 |
will automatically be thawed. |
| 77 |
|
| 78 |
=item Sys::FreezeThaw::thaw $token |
| 79 |
|
| 80 |
Take a token returned by Sys::FreezeThaw::freeze and send all processes |
| 81 |
a C<CONT> signal, in the order required for them not to receive child STOP |
| 82 |
notifications. |
| 83 |
|
| 84 |
=item $Sys::FreezeThaw::PARTIAL_OK |
| 85 |
|
| 86 |
A boolean that tells C<freeze> whether it is an error if a process cannot
be stopped. If false (the default), then C<freeze> will fail if there is
an unstoppable process. If it is true, then C<freeze> will pretend the
process stopped.
| 90 |
|
| 91 |
=cut |
| 92 |
|
| 93 |
# Crude stand-in for a scheduler yield: block in a four-argument select()
# with no file handles and a one millisecond timeout, so that other
# processes get a chance to run before the caller polls again.
sub yield {
    select(undef, undef, undef, 0.001);
}
| 97 |
|
| 98 |
# Upper bound on the number of iterations of every stop/continue polling
# loop. It protects against catastrophic events (or bugs) that would
# otherwise make those loops spin forever. On current Linux systems some
# processes can take an enormous amount of time to stop, but usually one
# or two iterations are enough.
sub MAX_WAIT () { 10 }
| 104 |
|
| 105 |
# Scan /proc and return a reference to a list of "passes". Each pass is a
# reference to a list of pids, and the passes are topologically sorted
# parent-first: a pid always ends up in a later pass than its parent,
# provided the parent is part of the result as well.
#
# $exclude_pid is a hash reference; every pid that exists as a key in it is
# skipped. Also skipped are processes whose stat file cannot be opened or
# read (they exited in the meantime), processes in state T, Z or X
# (stopped, zombie, dead) and processes with neither vsize nor rss (kernel
# threads or other nasties).
#
# Dies if /proc cannot be opened.
sub enum_pids($) {
    my ($exclude_pid) = @_;

    opendir my $proc, "/proc"
        or die "/proc: $!";
    # only purely numeric directory entries are processes
    my @candidates = grep { /^\d+$/ } readdir $proc;
    closedir $proc;

    my %ppid;
    for my $pid (@candidates) {
        next if exists $exclude_pid->{$pid};

        open my $stat, "<", "/proc/$pid/stat"
            or next; # process is already gone
        my $line = <$stat>;
        close $stat;
        next unless defined $line; # process went away while reading

        # The second field is "(comm)", and comm may itself contain spaces
        # and parentheses. Cut everything up to and including the LAST
        # closing parenthesis so the remaining fields split reliably.
        $line =~ s/^.*\)\s+//s
            or next;

        # after the cut: state is field 0, ppid 1, vsize 20, rss 21
        my ($state, $parent, $vsize, $rss) = (split /\s+/, $line)[0, 1, 20, 21];

        next unless defined $state && defined $parent;
        next if $state =~ /^[TZX]/i; # stopped, zombies, dead
        next unless $vsize || $rss;  # skip kernel threads or other nasties

        $ppid{$pid} = $parent;
    }

    # now topologically sort by parent-id
    my @res;
    while (%ppid) {
        # a pid is ready as soon as its parent is no longer pending
        my @pass = grep { !exists $ppid{ $ppid{$_} } } keys %ppid;

        # An inconsistent snapshot (e.g. pid reuse while scanning) could
        # leave a parent cycle; emit the remainder as one final pass
        # instead of looping forever.
        @pass = keys %ppid unless @pass;

        delete @ppid{@pass};

        push @res, \@pass;
    }

    \@res
}
| 150 |
|
| 151 |
# Return true if the process with the given pid no longer needs to be
# waited for by the freeze logic: its state in /proc/PID/stat is T, Z or X
# (stopped, zombie, dead), or the stat file cannot be opened or read at
# all because the process no longer exists. Return false otherwise.
sub process_stopped($) {
    my ($pid) = @_;

    open my $stat, "<", "/proc/$pid/stat"
        or return 1;
    my $line = <$stat>;
    close $stat;

    # the process vanished between open and read: treat it like a failed open
    return 1 unless defined $line;

    # The command name in the second field may contain spaces and
    # parentheses, so strip everything through the LAST closing
    # parenthesis; the state letter is then the first character.
    $line =~ s/^.*\)\s+//s;

    return $line =~ /^[TZX]/i;
}
| 157 |
|
| 158 |
# Restart the processes recorded in a token returned by freeze.
#
# The token is a reference to a list of passes (each a reference to a list
# of pids). The passes are walked in reverse order and every pid in a pass
# is sent SIGCONT. Before moving on to the next pass, the pids are polled
# up to MAX_WAIT times until process_stopped no longer reports any of them
# as stopped. $@ is localised, so the caller's $@ is left untouched.
sub thaw($) {
    local $@;

    my ($token) = @_;

    for my $pass (reverse @$token) {
        my @pending = @$pass;
        kill CONT => @pending;

        # now wait till processes actually run again before the next round
        my $tries_left = MAX_WAIT();
        while ($tries_left-- > 0) {
            @pending = grep { process_stopped($_) } @pending;
            last unless @pending;

            yield();
        }
    }
}
| 176 |
|
| 177 |
# Send SIGSTOP to all processes returned by enum_pids (pid 1 and the
# calling process are excluded) and return a token for thaw: a reference
# to the list of passes that were stopped, in the order they were stopped.
#
# Enumeration is repeated up to MAX_WAIT times, until enum_pids finds no
# more processes to stop. After signalling a pass, its pids are polled up
# to MAX_WAIT times until process_stopped reports all of them as stopped.
#
# If some pids of a pass are still not stopped after polling and
# $PARTIAL_OK is false, or any other error occurs, everything stopped so
# far is thawed again and the exception is re-thrown.
sub freeze(;$) {
    # Must be a lexical array: a package global would be shared between
    # calls, so every token would alias the same array and accumulate the
    # pids of all earlier freezes.
    my @procs;
    my $error;

    {
        # keep the caller's $@ intact; the error is carried out in $error
        # so that re-throwing does not depend on how "local $@" unwinds
        local $@;

        eval {
            for (1 .. MAX_WAIT()) {
                my $passes = enum_pids({ 1 => 1, $$ => 1 });
                last unless @$passes;

                for my $pass (@$passes) {
                    my @pids = @$pass;
                    push @procs, $pass;
                    kill STOP => @pids;

                    for (1 .. MAX_WAIT()) {
                        @pids = grep { !process_stopped($_) } @pids;
                        last unless @pids;

                        # wait till processes are really stopped
                        yield();
                    }

                    die "unable to stop some processes: @pids" if @pids && !$PARTIAL_OK;
                }
            }
        };

        $error = $@;
    }

    if ($error) {
        # undo the partial freeze before reporting the failure
        thaw(\@procs);
        die $error;
    }

    \@procs
}
| 212 |
|
| 213 |
# Freeze all processes, run the given code block, then thaw them again.
#
# While the block runs, handlers for SIGHUP, SIGINT, SIGTERM, SIGPIPE,
# SIGALRM, SIGUSR1 and SIGUSR2 are installed that turn the signal into an
# exception. Any exception raised inside the block is caught, the
# processes are thawed, and the exception is then re-thrown. The return
# value of the block is ignored and an empty list is returned.
sub freezethaw(&) {
    my $block = shift;

    my $frozen = freeze();

    eval {
        # temporarily replace the handlers; they are restored when the
        # eval block is left
        local @SIG{qw(HUP INT TERM PIPE ALRM USR1 USR2)} = map {
            my $signame = $_;
            sub { die "ERROR: caught SIG$signame while system frozen" }
        } qw(HUP INT TERM PIPE ALRM USR1 USR2);

        $block->();
    };

    # thaw localises $@, so the error from the eval above survives the call
    thaw($frozen);

    die $@ if $@;

    ()
}
| 236 |
|
| 237 |
1; |
| 238 |
|
| 239 |
=back |
| 240 |
|
| 241 |
=head1 BUGS |
| 242 |
|
| 243 |
SIGCONT does not go unnoticed by processes. Some programs (such as irssi-text)
respond by flickering (IMHO a bug in irssi-text). Other programs might
have other problems, but actual problems should be rare. However, one
shouldn't overuse this module.
| 247 |
|
| 248 |
=head1 AUTHOR |
| 249 |
|
| 250 |
Marc Lehmann <schmorp@schmorp.de> |
| 251 |
http://home.schmorp.de/ |
| 252 |
|
| 253 |
=cut |
| 254 |
|