1 |
#!/opt/bin/perl |
2 |
|
3 |
# inspired by treescan by Jamie Lokier <jamie@imbolc.ucc.ie> |
4 |
# about 40% faster than the original version (on my fs and raid :) |
5 |
|
6 |
=head1 NAME |
7 |
|
8 |
treescan - scan directory trees, list dirs/files, stat, sync, grep |
9 |
|
10 |
=head1 SYNOPSIS |
11 |
|
12 |
treescan [OPTION...] [PATH...] |
13 |
|
14 |
-q, --quiet do not print list of files/directories |
15 |
-0, --print0 use null character instead of newline to separate names |
16 |
-s, --stat call stat on every entry, to get stat data into cache |
17 |
-d, --dirs only list dirs |
18 |
-f, --files only list files |
19 |
-p, --progress regularly print progress to stderr |
20 |
--sync open/fsync/close every entry |
21 |
-g, --grep=RE only list files that match the given perl RegEx |
22 |
|
23 |
=head1 DESCRIPTION |
24 |
|
25 |
The F<treescan> command scans directories and their contents |
26 |
recursively. By default it lists all files and directories (with trailing |
27 |
C</>), but it can optionally do various other things. |
28 |
|
29 |
If no paths are given, F<treescan> will use C<.>, the current directory. |
30 |
|
31 |
=head2 OPTIONS |
32 |
|
33 |
=over 4 |
34 |
|
35 |
=item -q, --quiet |
36 |
|
37 |
By default, F<treescan> prints the full paths of all directories or files |
38 |
it finds. This option disables printing of filenames completely. This is |
39 |
useful if you want to run F<treescan> solely for its side effects, such as |
40 |
pulling C<stat> data into memory. |
41 |
|
42 |
=item -0, --print0 |
43 |
|
44 |
Instead of using newlines, use null characters after each filename. This |
45 |
is useful to avoid quoting problems when piping the result into other |
46 |
programs (for example, GNU F<grep>, F<xargs> and so on all have options to |
47 |
deal with this). |
48 |
|
49 |
=item -s, --stat |
50 |
|
51 |
Normally, F<treescan> will use heuristics to avoid most C<stat> calls, |
52 |
which is what makes it so fast. This option forces it to C<stat> every file. |
53 |
|
54 |
This is only useful for the side effect of pulling the C<stat> data into |
55 |
the cache. If your disk cache is big enough, it will be filled with |
56 |
file meta data after F<treescan> is done, which can speed up subsequent |
57 |
commands considerably. Often, you can run F<treescan> in parallel with |
58 |
other directory-scanning programs to speed them up. |
59 |
|
60 |
=item -d, --dirs |
61 |
|
62 |
Only list directories, not file paths. This is useful if you quickly want |
63 |
a list of directories and their subdirectories. |
64 |
|
65 |
=item -f, --files |
66 |
|
67 |
Only list files, not directories. This is useful if you want to operate on |
68 |
all files in a hierarchy, and the directories would only get in the way. |
69 |
|
70 |
=item -p, --progress |
71 |
|
72 |
Regularly print some progress information to standard error. This is |
73 |
useful to get some progress information on long running tasks. Since |
74 |
the progress is printed to standard error, you can pipe the output of |
75 |
F<treescan> into other programs as usual. |
76 |
|
77 |
=item --sync |
78 |
|
79 |
The C<--sync> option can be used to make sure all the files/dirs in a tree |
80 |
are sync'ed to disk. For example this could be useful after unpacking an |
81 |
archive, to make sure the files hit the disk before deleting the archive |
82 |
file itself. |
83 |
|
84 |
=item -g, --grep=RE |
85 |
|
86 |
This applies a perl regular expression (see the L<perlre> manpage) to all paths that would normally be printed |
87 |
and will only print matching paths. |
88 |
|
89 |
The regular expression uses an C</s> (single line) modifier by default, so |
90 |
newlines are matched by C<.>. |
91 |
|
92 |
=back |
93 |
|
94 |
=head1 AUTHOR |
95 |
|
96 |
Marc Lehmann <schmorp@schmorp.de> |
97 |
http://home.schmorp.de/ |
98 |
|
99 |
=cut |
100 |
|
101 |
use common::sense; |
102 |
use Getopt::Long; |
103 |
use Time::HiRes (); |
104 |
use IO::AIO; |
105 |
|
106 |
our $VERSION = $IO::AIO::VERSION; |
107 |
|
108 |
# Command line handling. Short options may be bundled, option names are
# case sensitive, and option parsing stops at the first non-option argument.
Getopt::Long::Configure (qw(bundling no_ignore_case require_order auto_help auto_version));

# One flag per command line switch. Note the inverted naming of the two
# listing filters: --dirs sets $opt_nofiles and --files sets $opt_nodirs.
my ($opt_silent, $opt_print0, $opt_stat, $opt_nodirs, $opt_help,
    $opt_nofiles, $opt_grep, $opt_progress, $opt_sync);

# option specification => variable receiving the value
my @option_spec = (
   "quiet|q"    => \$opt_silent,
   "print0|0"   => \$opt_print0,
   "stat|s"     => \$opt_stat,
   "dirs|d"     => \$opt_nofiles,
   "files|f"    => \$opt_nodirs,
   "grep|g=s"   => \$opt_grep,
   "progress|p" => \$opt_progress,
   "sync"       => \$opt_sync,
   "help"       => \$opt_help,
);

unless (GetOptions @option_spec) {
   die "Usage: try $0 --help";
}

# --help: render the usage text from this file's POD and exit successfully
if ($opt_help) {
   require Pod::Usage;

   Pod::Usage::pod2usage (-verbose => 1, -exitval => 0);
}
133 |
|
134 |
# Without any path arguments, scan the current directory.
@ARGV = "." unless @ARGV;

my @todo; # list of dirs/files still left to scan

# Precompile the --grep pattern once, with /s so that "." also matches
# newlines. The test is for definedness, not truth: a pattern such as "0" is
# a false string but still a perfectly valid regular expression.
if (defined $opt_grep) {
   $opt_grep = eval { qr{$opt_grep}s }
      or die "$0: invalid regular expression for --grep: $@";
}

# running totals used by the --progress display
my ($n_dirs, $n_files, $n_stats) = (0, 0, 0);
# time of the last progress line and time the scan started
my ($n_last, $n_start) = (Time::HiRes::time) x 2;
142 |
|
143 |
# printfn $prefix, \@names, $suffix
#
# Print every name in @names to standard output as "$prefix$name$suffix",
# each followed by a NUL byte (--print0) or a newline.
#
#   $prefix - string prepended to every name (may be empty)
#   $files  - array reference with the names; it is never modified
#   $suffix - string appended to every name (optional, defaults to "")
#
# With --quiet nothing is printed at all, even when --print0 is also given.
# With --grep only names whose "$prefix$name" matches the pattern are printed.
sub printfn {
   my ($prefix, $files, $suffix) = @_;

   # --quiet wins over every other output option, so skip all the work
   return if $opt_silent;

   $suffix = "" unless defined $suffix;

   # Filter into a private copy: the caller keeps using its list afterwards
   # (counting, stat, sync), so it must not shrink to the matching names.
   my @names = @$files;
   @names = grep "$prefix$_" =~ $opt_grep, @names
      if $opt_grep;

   my $terminator = $opt_print0 ? "\0" : "\n";

   print map "$prefix$_$suffix$terminator", @names;
}
156 |
|
157 |
# scan $dir
#
# Queue an asynchronous scan of the directory $dir (given without trailing
# slash). When the scan completes, the directory and its non-directory
# entries are listed (subject to --dirs/--files), optionally lstat'ed
# (--stat) and synced (--sync), and all subdirectories found are pushed
# onto @todo for later scanning.
sub scan {
   my $dir = "$_[0]/";

   # run callbacks of requests that have already completed
   IO::AIO::poll_cb;

   # with --progress, print a status line once more than a second has
   # passed since the previous one
   if ($opt_progress && $n_last + 1 < Time::HiRes::time()) {
      $n_last = Time::HiRes::time();
      my $elapsed = $n_last - $n_start;

      printf STDERR "\r%d dirs (%g/s) %d files (%g/s) %d stats (%g/s) ",
         map { ($_, $_ / $elapsed) } $n_dirs, $n_files, $n_stats;
   }

   aioreq_pri -1;
   ++$n_dirs;

   aio_scandir $dir, 8, sub {
      # a callback without any arguments means the scan failed
      return warn "$dir: $!\n"
         unless @_;

      my ($subdirs, $entries) = @_;

      if (!$opt_nodirs) {
         printfn ("", [$dir]);
      }

      if (!$opt_nofiles) {
         printfn ($dir, $entries);
      }

      $n_files += scalar @$entries;

      if ($opt_stat) {
         # lstat every entry relative to the directory's wd object
         aio_wd $dir, sub {
            my ($wd) = @_;

            for my $name (@$entries) {
               aio_lstat [$wd, $name];
            }

            $n_stats += scalar @$entries;
         };
      }

      if ($opt_sync) {
         # sync every entry first, then the directory itself
         aio_wd $dir, sub {
            my ($wd) = @_;

            for my $name (@$entries) {
               aio_pathsync [$wd, $name];
            }

            aio_pathsync $wd;
         };
      }

      # @todo is consumed with pop, so queue the subdirectories in
      # descending order
      push @todo, map "$dir$_", reverse sort @$subdirs;
   };
}
206 |
|
207 |
IO::AIO::max_outstanding 100; # two fds per directory, so limit accordingly
IO::AIO::min_parallel 20;

# @todo is consumed from the end, so reverse the arguments to process them
# in command line order.
@todo = reverse @ARGV;

# Main loop: submit one queued path per iteration; when the queue is empty,
# wait for outstanding requests instead. Callbacks run from poll_cb and may
# push further directories onto @todo. The loop ends once no requests remain.
while () {
   if (@todo) {
      my $seed = pop @todo;

      # Strip trailing slashes; \z (not $) so that a name ending in "/\n"
      # is left alone instead of being turned into a different path.
      $seed =~ s{/+\z}{};

      # lstat "$seed/." rather than $seed itself
      aio_lstat "$seed/.", sub {
         if ($_[0]) {
            # a true status is treated as failure, $! holds the reason
            print STDERR "$seed: $!\n";
         } elsif (-d _) {
            scan $seed;
         } else {
            # Not a directory: list it like any other file. printfn
            # dereferences its second argument as an array, so the single
            # name must be wrapped in an array reference; a non-directory
            # gets no trailing slash.
            printfn "", [$seed], "" unless $opt_nofiles;
         }
      };
   } else {
      IO::AIO::poll_wait;
   }

   last unless IO::AIO::nreqs;

   IO::AIO::poll_cb;
}
233 |
|