1 | #!/opt/bin/perl |
1 | #!/opt/bin/perl |
2 | |
2 | |
3 | # inspired by treescan by Jamie Lokier <jamie@imbolc.ucc.ie> |
3 | # inspired by treescan by Jamie Lokier <jamie@imbolc.ucc.ie> |
4 | # about 40% faster than the original version (on my fs and raid :) |
4 | # about 40% faster than the original version (on my fs and raid :) |
5 | |
5 | |
6 | use strict; |
6 | =head1 NAME |
|
|
7 | |
|
|
8 | treescan - scan directory trees, list dirs/files, stat, sync, grep |
|
|
9 | |
|
|
10 | =head1 SYNOPSIS |
|
|
11 | |
|
|
12 | treescan [OPTION...] [PATH...] |
|
|
13 | |
|
|
14 | -q, --quiet do not print list of files/directories |
|
|
15 | -0, --print0 use null character instead of newline to separate names |
|
|
16 | -s, --stat call stat on every entry, to get stat data into cache |
|
|
17 | -d, --dirs only list dirs |
|
|
18 | -f, --files only list files |
|
|
19 | -p, --progress regularly print progress to stderr |
|
|
20 | --sync open/fsync/close every entry |
|
|
21 | -g, --grep=RE only list files that match the given perl RegEx |
|
|
22 | |
|
|
23 | =head1 DESCRIPTION |
|
|
24 | |
|
|
25 | The F<treescan> command scans directories and their contents |
|
|
26 | recursively. By default it lists all files and directories (with trailing |
|
|
27 | C</>), but it can optionally do various other things. |
|
|
28 | |
|
|
29 | If no paths are given, F<treescan> will use C<.>, the current directory. |
|
|
30 | |
|
|
31 | =head2 OPTIONS |
|
|
32 | |
|
|
33 | =over 4 |
|
|
34 | |
|
|
35 | =item -q, --quiet |
|
|
36 | |
|
|
37 | By default, F<treescan> prints the full paths of all directories or files |
|
|
38 | it finds. This option disables printing of filenames completely. This is |
|
|
39 | useful if you want to run F<treescan> solely for its side effects, such as |
|
|
40 | pulling C<stat> data into memory. |
|
|
41 | |
|
|
42 | =item -0, --print0 |
|
|
43 | |
|
|
44 | Instead of using newlines, use null characters after each filename. This |
|
|
45 | is useful to avoid quoting problems when piping the result into other |
|
|
46 | programs (for example, GNU F<grep>, F<xargs> and so on all have options to |
|
|
47 | deal with this). |
|
|
48 | |
|
|
49 | =item -s, --stat |
|
|
50 | |
|
|
51 | Normally, F<treescan> will use heuristics to avoid most C<stat> calls, |
|
|
52 | which is what makes it so fast. This option forces it to C<stat> every file. |
|
|
53 | |
|
|
54 | This is only useful for the side effect of pulling the C<stat> data into |
|
|
55 | the cache. If your disk cache is big enough, it will be filled with |
|
|
56 | file meta data after F<treescan> is done, which can speed up subsequent |
|
|
57 | commands considerably. Often, you can run F<treescan> in parallel with |
|
|
58 | other directory-scanning programs to speed them up. |
|
|
59 | |
|
|
60 | =item -d, --dirs |
|
|
61 | |
|
|
62 | Only list directories, not file paths. This is useful if you quickly want |
|
|
63 | a list of directories and their subdirectories. |
|
|
64 | |
|
|
65 | =item -f, --files |
|
|
66 | |
|
|
67 | Only list files, not directories. This is useful if you want to operate on |
|
|
68 | all files in a hierarchy, and the directories would only get in the way. |
|
|
69 | |
|
|
70 | =item -p, --progress |
|
|
71 | |
|
|
72 | Regularly print some progress information to standard error. This is |
|
|
73 | useful to get some progress information on long running tasks. Since |
|
|
74 | the progress is printed to standard error, you can pipe the output of |
|
|
75 | F<treescan> into other programs as usual. |
|
|
76 | |
|
|
77 | =item --sync |
|
|
78 | |
|
|
79 | The C<--sync> option can be used to make sure all the files/dirs in a tree |
|
|
80 | are sync'ed to disk. For example, this could be useful after unpacking an |
|
|
81 | archive, to make sure the files hit the disk before deleting the archive |
|
|
82 | file itself. |
|
|
83 | |
|
|
84 | =item -g, --grep=RE |
|
|
85 | |
|
|
86 | This applies a perl regular expression (see the L<perlre> manpage) to all paths that would normally be printed |
|
|
87 | and will only print matching paths. |
|
|
88 | |
|
|
89 | The regular expression uses an C</s> (single line) modifier by default, so |
|
|
90 | newlines are matched by C<.>. |
|
|
91 | |
|
|
92 | =back |
|
|
93 | |
|
|
94 | =head1 AUTHOR |
|
|
95 | |
|
|
96 | Marc Lehmann <schmorp@schmorp.de> |
|
|
97 | http://home.schmorp.de/ |
|
|
98 | |
|
|
99 | =cut |
|
|
100 | |
|
|
101 | use common::sense; |
7 | use Getopt::Long; |
102 | use Getopt::Long; |
8 | use Time::HiRes (); |
103 | use Time::HiRes (); |
9 | use IO::AIO; |
104 | use IO::AIO; |
10 | |
105 | |
11 | our $VERSION = $IO::AIO::VERSION; |
106 | our $VERSION = $IO::AIO::VERSION; |
12 | |
107 | |
13 | Getopt::Long::Configure ("bundling", "no_ignore_case", "require_order", "auto_help", "auto_version"); |
108 | Getopt::Long::Configure ("bundling", "no_ignore_case", "require_order", "auto_help", "auto_version"); |
14 | |
109 | |
15 | my ($opt_silent, $opt_print0, $opt_stat, $opt_nodirs, |
110 | my ($opt_silent, $opt_print0, $opt_stat, $opt_nodirs, $opt_help, |
16 | $opt_nofiles, $opt_grep, $opt_progress); |
111 | $opt_nofiles, $opt_grep, $opt_progress, $opt_sync); |
17 | |
112 | |
18 | GetOptions |
113 | GetOptions |
19 | "quiet|q" => \$opt_silent, |
114 | "quiet|q" => \$opt_silent, |
20 | "print0|0" => \$opt_print0, |
115 | "print0|0" => \$opt_print0, |
21 | "stat|s" => \$opt_stat, |
116 | "stat|s" => \$opt_stat, |
22 | "dirs|d" => \$opt_nofiles, |
117 | "dirs|d" => \$opt_nofiles, |
23 | "files|f" => \$opt_nodirs, |
118 | "files|f" => \$opt_nodirs, |
24 | "grep|g=s" => \$opt_grep, |
119 | "grep|g=s" => \$opt_grep, |
25 | "progress|p" => \$opt_progress, |
120 | "progress|p" => \$opt_progress, |
|
|
121 | "sync" => \$opt_sync, |
|
|
122 | "help" => \$opt_help, |
26 | or die "Usage: try $0 --help"; |
123 | or die "Usage: try $0 --help"; |
27 | |
124 | |
|
|
125 | if ($opt_help) { |
|
|
126 | require Pod::Usage; |
|
|
127 | |
|
|
128 | Pod::Usage::pod2usage ( |
|
|
129 | -verbose => 1, |
|
|
130 | -exitval => 0, |
|
|
131 | ); |
|
|
132 | } |
|
|
133 | |
28 | @ARGV = "." unless @ARGV; |
134 | @ARGV = "." unless @ARGV; |
29 | |
135 | |
30 | $opt_grep &&= qr{$opt_grep}s; |
136 | $opt_grep &&= qr{$opt_grep}s; |
31 | |
137 | |
32 | my ($n_dirs, $n_files, $n_stats) = (0, 0, 0); |
138 | my ($n_dirs, $n_files, $n_stats) = (0, 0, 0); |
33 | my $n_last; |
|
|
34 | my $n_start = Time::HiRes::time; |
139 | my ($n_last, $n_start) = (Time::HiRes::time) x 2; |
35 | |
140 | |
36 | sub printfn { |
141 | sub printfn { |
37 | my ($prefix, $files, $suffix) = @_; |
142 | my ($prefix, $files, $suffix) = @_; |
38 | |
143 | |
39 | if ($opt_grep) { |
144 | if ($opt_grep) { |
… | |
… | |
54 | |
159 | |
55 | IO::AIO::poll_cb; |
160 | IO::AIO::poll_cb; |
56 | |
161 | |
57 | if ($opt_progress and $n_last + 1 < Time::HiRes::time) { |
162 | if ($opt_progress and $n_last + 1 < Time::HiRes::time) { |
58 | $n_last = Time::HiRes::time; |
163 | $n_last = Time::HiRes::time; |
59 | printf STDERR "%d dirs %d files %d stats %g stats/s \r", $n_dirs, $n_files, $n_stats, $n_stats / ($n_last - $n_start) |
164 | my $d = $n_last - $n_start; |
60 | if $opt_progress; |
165 | printf STDERR "\r%d dirs (%g/s) %d files (%g/s) %d stats (%g/s) ", |
|
|
166 | $n_dirs, $n_dirs / $d, |
|
|
167 | $n_files, $n_files / $d, |
|
|
168 | $n_stats, $n_stats / $d; |
61 | } |
169 | } |
62 | |
170 | |
63 | aioreq_pri -1; |
171 | aioreq_pri -1; |
64 | ++$n_dirs; |
172 | ++$n_dirs; |
65 | aio_scandir $path, 8, sub { |
173 | aio_scandir $path, 8, sub { |
66 | my ($dirs, $files) = @_ |
174 | my ($dirs, $files) = @_ |
67 | or warn "$path: $!\n"; |
175 | or return warn "$path: $!\n"; |
68 | |
176 | |
69 | printfn "", [$path] unless $opt_nodirs; |
177 | printfn "", [$path] unless $opt_nodirs; |
70 | printfn $path, $files unless $opt_nofiles; |
178 | printfn $path, $files unless $opt_nofiles; |
71 | |
179 | |
72 | $n_files += @$files; |
180 | $n_files += @$files; |
… | |
… | |
78 | aio_lstat [$wd, $_] for @$files; |
186 | aio_lstat [$wd, $_] for @$files; |
79 | $n_stats += @$files; |
187 | $n_stats += @$files; |
80 | }; |
188 | }; |
81 | } |
189 | } |
82 | |
190 | |
|
|
191 | if ($opt_sync) { |
|
|
192 | aio_wd $path, sub { |
|
|
193 | my $wd = shift; |
|
|
194 | |
|
|
195 | aio_pathsync [$wd, $_] for @$files; |
|
|
196 | aio_pathsync $wd; |
|
|
197 | }; |
|
|
198 | } |
|
|
199 | |
83 | &scan ("$path$_") for @$dirs; |
200 | &scan ("$path$_") for @$dirs; |
84 | }; |
201 | }; |
85 | } |
202 | } |
86 | |
203 | |
87 | IO::AIO::max_outstanding 100; # two fds per directory, so limit accordingly |
204 | IO::AIO::max_outstanding 100; # two fds per directory, so limit accordingly |
88 | IO::AIO::min_parallel 20; |
205 | IO::AIO::min_parallel 20; |
89 | |
206 | |
90 | for my $seed (@ARGV) { |
207 | for my $seed (@ARGV) { |
91 | $seed =~ s/\/+$//; |
208 | $seed =~ s/\/+$//; |
92 | ++$n_stats; |
|
|
93 | aio_lstat "$seed/.", sub { |
209 | aio_lstat "$seed/.", sub { |
94 | if ($_[0]) { |
210 | if ($_[0]) { |
95 | print STDERR "$seed: $!\n"; |
211 | print STDERR "$seed: $!\n"; |
96 | } elsif (-d _) { |
212 | } elsif (-d _) { |
97 | scan $seed; |
213 | scan $seed; |