[ViewVC] Contents of: cvs/IO-AIO/bin/treescan

#!/opt/bin/perl

# inspired by treescan by Jamie Lokier <jamie@imbolc.ucc.ie>
# about 40% faster than the original version (on my fs and raid :)

=head1 NAME

treescan - scan directory trees, list dirs/files, stat, sync, grep

=head1 SYNOPSIS

   treescan [OPTION...] [PATH...]

      -q, --quiet    do not print list of files/directories
      -0, --print0   use null character instead of newline to separate names
      -s, --stat     call stat on every entry, to get stat data into cache
      -d, --dirs     only list dirs
      -f, --files    only list files
      -p, --progress regularly print progress to stderr
          --sync     open/fsync/close every entry
      -g, --grep=RE  only list files that match the given perl RegEx

=head1 DESCRIPTION

The F<treescan> command scans directories and their contents
recursively. By default it lists all files and directories (with trailing
C</>), but it can optionally do various other things.

If no paths are given, F<treescan> will use C<.>, the current directory.

=head2 OPTIONS

=over 4

=item -q, --quiet

By default, F<treescan> prints the full paths of all directories or files
it finds. This option disables printing of filenames completely. This is
useful if you want to run F<treescan> solely for its side effects, such as
pulling C<stat> data into memory.

=item -0, --print0

Instead of using newlines, use null characters after each filename. This
is useful to avoid quoting problems when piping the result into other
programs (for example, GNU F<grep>, F<xargs> and so on all have options to
deal with this).

=item -s, --stat

Normally, F<treescan> will use heuristics to avoid most C<stat> calls,
which is what makes it so fast. This option forces it to C<stat> every file.

This is only useful for the side effect of pulling the C<stat> data into
the cache. If your disk cache is big enough, it will be filled with
file meta data after F<treescan> is done, which can speed up subsequent
commands considerably. Often, you can run F<treescan> in parallel with
other directory-scanning programs to speed them up.

=item -d, --dirs

Only list directories, not file paths. This is useful if you quickly want
a list of directories and their subdirectories.

=item -f, --files

Only list files, not directories. This is useful if you want to operate on
all files in a hierarchy, and the directories would only get in the way.

=item -p, --progress

Regularly print some progress information to standard error. This is
useful to get some progress information on long running tasks. Since
the progress is printed to standard error, you can pipe the output of
F<treescan> into other programs as usual.

=item --sync

The C<--sync> option can be used to make sure all the files/dirs in a tree
are sync'ed to disk. For example this could be useful after unpacking an
archive, to make sure the files hit the disk before deleting the archive
file itself.

=item -g, --grep=RE

This applies a perl regular expression (see the L<perlre> manpage) to all paths that would normally be printed
and will only print matching paths.

The regular expression uses an C</s> (single line) modifier by default, so
newlines are matched by C<.>.

=back

=head1 AUTHOR

 Marc Lehmann <schmorp@schmorp.de>
 http://home.schmorp.de/

=cut

use common::sense;
use Getopt::Long;
use Time::HiRes ();
use IO::AIO;

# the script shares its version number with the IO::AIO module
our $VERSION = $IO::AIO::VERSION;

Getopt::Long::Configure (
   "bundling",
   "no_ignore_case",
   "require_order",
   "auto_help",
   "auto_version",
);

# command line switches, filled in by GetOptions below
my $opt_silent;   # -q, --quiet
my $opt_print0;   # -0, --print0
my $opt_stat;     # -s, --stat
my $opt_nodirs;   # -f, --files (i.e. do not list directories)
my $opt_help;     #     --help
my $opt_nofiles;  # -d, --dirs  (i.e. do not list files)
my $opt_grep;     # -g, --grep=RE
my $opt_progress; # -p, --progress
my $opt_sync;     #     --sync

GetOptions (
   "quiet|q"    => \$opt_silent,
   "print0|0"   => \$opt_print0,
   "stat|s"     => \$opt_stat,
   "dirs|d"     => \$opt_nofiles,
   "files|f"    => \$opt_nodirs,
   "grep|g=s"   => \$opt_grep,
   "progress|p" => \$opt_progress,
   "sync"       => \$opt_sync,
   "help"       => \$opt_help,
) or die "Usage: try $0 --help";

# --help: show the usage text via Pod::Usage and exit successfully
if ($opt_help) {
   require Pod::Usage;

   Pod::Usage::pod2usage (-exitval => 0, -verbose => 1);
}

# without any path arguments, scan the current directory
push @ARGV, "."
   unless @ARGV;

my @todo; # list of dirs/files still left to scan

# Compile the --grep pattern once, with /s so that "." also matches newlines.
# The check is for "defined and non-empty" rather than plain truth, because
# the pattern "0" is a false value in perl and would otherwise be left
# uncompiled (and thus never be applied). An empty pattern keeps meaning
# "no filter".
$opt_grep = qr{$opt_grep}s
   if defined $opt_grep and length $opt_grep;

# counters shown by --progress
my ($n_dirs, $n_files, $n_stats) = (0, 0, 0);
# time of the last progress report and of program start (both start as "now")
my ($n_last, $n_start) = (Time::HiRes::time) x 2;

# Print a list of names, each one as "$prefix$name$suffix" followed by a
# terminator (a null character with --print0, a newline otherwise).
#
#   $prefix - string put in front of every name (e.g. the directory path)
#   $files  - array reference with the names to print; it is not modified
#   $suffix - optional string appended to every name, defaults to ""
#
# With --grep, only names whose prefixed form ("$prefix$name", without the
# suffix) matches the pattern are printed. With --quiet nothing is printed
# at all, regardless of --print0.
sub printfn {
   my ($prefix, $files, $suffix) = @_;

   # --quiet disables printing completely, so it must win over --print0
   return if $opt_silent;

   $suffix = "" unless defined $suffix;

   # filter a private copy: the caller keeps working with its own list
   # (e.g. for --stat and --sync), which --grep must not shrink
   my @names = @$files;
   @names = grep "$prefix$_" =~ $opt_grep, @names
      if $opt_grep;

   my $eol = $opt_print0 ? "\0" : "\n";

   print map "$prefix$_$suffix$eol", @names;
}

# Queue an asynchronous scan of a single directory.
#
#   $path - directory name, without trailing slash (one is appended here)
#
# Pending IO::AIO callbacks are processed first, then (with --progress) a
# statistics line is written to STDERR if more than a second has passed
# since the last one. Finally an aio_scandir request is issued; its callback
# hands the directory and its files to printfn (subject to --files/--dirs),
# issues lstat and/or pathsync requests for the files (--stat, --sync) and
# appends all subdirectories to @todo.
sub scan {
   my $path = "$_[0]/";

   IO::AIO::poll_cb;

   if ($opt_progress and $n_last + 1 < Time::HiRes::time) {
      $n_last = Time::HiRes::time;

      my $elapsed = $n_last - $n_start;

      # each counter is followed by its average rate per second
      my @stats = map +($_, $_ / $elapsed), $n_dirs, $n_files, $n_stats;

      printf STDERR "\r%d dirs (%g/s) %d files (%g/s) %d stats (%g/s)       ", @stats;
   }

   # NOTE(review): presumably this sets the priority of the aio_scandir
   # request issued right below - confirm against the IO::AIO documentation
   aioreq_pri (-1);
   ++$n_dirs;
   aio_scandir ($path, 8, sub {
      # no results were passed: report $! and give up on this directory
      @_ or return warn "$path: $!\n";

      my ($subdirs, $entries) = @_;

      printfn ("", [$path])     unless $opt_nodirs;
      printfn ($path, $entries) unless $opt_nofiles;

      $n_files += @$entries;

      if ($opt_stat) {
         # lstat every file relative to a working directory object
         aio_wd ($path, sub {
            my ($wd) = @_;

            for my $name (@$entries) {
               aio_lstat ([$wd, $name]);
            }

            $n_stats += @$entries;
         });
      }

      if ($opt_sync) {
         # sync every file first, then the directory itself
         aio_wd ($path, sub {
            my ($wd) = @_;

            for my $name (@$entries) {
               aio_pathsync ([$wd, $name]);
            }

            aio_pathsync ($wd);
         });
      }

      # @todo is consumed with pop, so pushing the subdirectories in
      # descending order makes them come off in ascending order
      push @todo, map "$path$_", reverse sort @$subdirs;
   });
}

IO::AIO::max_outstanding 100; # two fds per directory, so limit accordingly
IO::AIO::min_parallel 20;

@todo = reverse @ARGV;

# Main loop: as long as there are paths in @todo, take one off the end and
# issue an lstat request for it; otherwise wait for IO::AIO activity. The
# loop ends as soon as IO::AIO reports no requests after an iteration.
while () {
   if (@todo) {
      my $seed = pop @todo;

      # strip trailing slashes, scan appends exactly one again
      $seed =~ s/\/+$//;

      # NOTE(review): lstat'ing "$seed/." instead of $seed presumably makes
      # symlinks to directories count as directories - confirm
      aio_lstat "$seed/.", sub {
         if ($_[0]) {
            print STDERR "$seed: $!\n";
         } elsif (-d _) {
            # "_" refers to the stat data of the lstat that just completed
            scan ($seed);
         } else {
            # printfn dereferences its second argument as an array, so the
            # name has to be wrapped in an array reference. Passing the bare
            # string either dies (strict refs) or silently dereferences a
            # package array named after the path, printing nothing.
            printfn ("", [$seed], "/");
         }
      };
   } else {
      IO::AIO::poll_wait;
   }

   last unless IO::AIO::nreqs;

   IO::AIO::poll_cb;
}

Revision:	1.21
Committed:	Wed Dec 30 07:45:33 2020 UTC (3 years, 5 months ago) by root
Branch:	MAIN
CVS Tags:	rel-4_81, rel-4_80, rel-4_78, rel-4_79, rel-4_75, rel-4_76, rel-4_77, HEAD
Changes since 1.20:	+0 -2 lines
Log Message:	4.75
#	Content
1	#!/opt/bin/perl
2
3	# inspired by treescan by Jamie Lokier <jamie@imbolc.ucc.ie>
4	# about 40% faster than the original version (on my fs and raid :)
5
6	=head1 NAME
7
8	treescan - scan directory trees, list dirs/files, stat, sync, grep
9
10	=head1 SYNOPSIS
11
12	treescan [OPTION...] [PATH...]
13
14	-q, --quiet do not print list of files/directories
15	-0, --print0 use null character instead of newline to separate names
16	-s, --stat call stat on every entry, to get stat data into cache
17	-d, --dirs only list dirs
18	-f, --files only list files
19	-p, --progress regularly print progress to stderr
20	--sync open/fsync/close every entry
21	-g, --grep=RE only list files that match the given perl RegEx
22
23	=head1 DESCRIPTION
24
25	The F<treescan> command scans directories and their contents
26	recursively. By default it lists all files and directories (with trailing
27	C</>), but it can optionally do various other things.
28
29	If no paths are given, F<treescan> will use C<.>, the current directory.
30
31	=head2 OPTIONS
32
33	=over 4
34
35	=item -q, --quiet
36
37	By default, F<treescan> prints the full paths of all directories or files
38	it finds. This option disables printing of filenames completely. This is
39	useful if you want to run F<treescan> solely for its side effects, such as
40	pulling C<stat> data into memory.
41
42	=item -0, --print0
43
44	Instead of using newlines, use null characters after each filename. This
45	is useful to avoid quoting problems when piping the result into other
46	programs (for example, GNU F<grep>, F<xargs> and so on all have options to
47	deal with this).
48
49	=item -s, --stat
50
51	Normally, F<treescan> will use heuristics to avoid most C<stat> calls,
52	which is what makes it so fast. This option forces it to C<stat> every file.
53
54	This is only useful for the side effect of pulling the C<stat> data into
55	the cache. If your disk cache is big enough, it will be filled with
56	file meta data after F<treescan> is done, which can speed up subsequent
57	commands considerably. Often, you can run F<treescan> in parallel with
58	other directory-scanning programs to speed them up.
59
60	=item -d, --dirs
61
62	Only lists directories, not file paths. This is useful if you quickly want
63	a list of directories and their subdirectories.
64
65	=item -f, --files
66
67	Only list files, not directories. This is useful if you want to operate on
68	all files in a hierarchy, and the directories would only get in the way.
69
70	=item -p, --progress
71
72	Regularly print some progress information to standard error. This is
73	useful to get some progress information on long running tasks. Since
74	the progress is printed to standard error, you can pipe the output of
75	F<treescan> into other programs as usual.
76
77	=item --sync
78
79	The C<--sync> option can be used to make sure all the files/dirs in a tree
80	are sync'ed to disk. For example this could be useful after unpacking an
81	archive, to make sure the files hit the disk before deleting the archive
82	file itself.
83
84	=item -g, --grep=RE
85
86	This applies a perl regular expression (see the L<perlre> manpage) to all paths that would normally be printed
87	and will only print matching paths.
88
89	The regular expression uses an C</s> (single line) modifier by default, so
90	newlines are matched by C<.>.
91
92	=back
93
94	=head1 AUTHOR
95
96	Marc Lehmann <schmorp@schmorp.de>
97	http://home.schmorp.de/
98
99	=cut
100
101	use common::sense;
102	use Getopt::Long;
103	use Time::HiRes ();
104	use IO::AIO;
105
106	our $VERSION = $IO::AIO::VERSION;
107
108	Getopt::Long::Configure ("bundling", "no_ignore_case", "require_order", "auto_help", "auto_version");
109
110	my ($opt_silent, $opt_print0, $opt_stat, $opt_nodirs, $opt_help,
111	$opt_nofiles, $opt_grep, $opt_progress, $opt_sync);
112
113	GetOptions
114	"quiet\|q" => \$opt_silent,
115	"print0\|0" => \$opt_print0,
116	"stat\|s" => \$opt_stat,
117	"dirs\|d" => \$opt_nofiles,
118	"files\|f" => \$opt_nodirs,
119	"grep\|g=s" => \$opt_grep,
120	"progress\|p" => \$opt_progress,
121	"sync" => \$opt_sync,
122	"help" => \$opt_help,
123	or die "Usage: try $0 --help";
124
125	if ($opt_help) {
126	require Pod::Usage;
127
128	Pod::Usage::pod2usage (
129	-verbose => 1,
130	-exitval => 0,
131	);
132	}
133
134	@ARGV = "." unless @ARGV;
135
136	my @todo; # list of dirs/files still left to scan
137
138	$opt_grep &&= qr{$opt_grep}s;
139
140	my ($n_dirs, $n_files, $n_stats) = (0, 0, 0);
141	my ($n_last, $n_start) = (Time::HiRes::time) x 2;
142
143	sub printfn {
144	my ($prefix, $files, $suffix) = @_;
145
146	if ($opt_grep) {
147	@$files = grep "$prefix$_" =~ $opt_grep, @$files;
148	}
149
150	if ($opt_print0) {
151	print map "$prefix$_$suffix\0", @$files;
152	} elsif (!$opt_silent) {
153	print map "$prefix$_$suffix\n", @$files;
154	}
155	}
156
157	sub scan {
158	my ($path) = @_;
159
160	$path .= "/";
161
162	IO::AIO::poll_cb;
163
164	if ($opt_progress and $n_last + 1 < Time::HiRes::time) {
165	$n_last = Time::HiRes::time;
166	my $d = $n_last - $n_start;
167	printf STDERR "\r%d dirs (%g/s) %d files (%g/s) %d stats (%g/s) ",
168	$n_dirs, $n_dirs / $d,
169	$n_files, $n_files / $d,
170	$n_stats, $n_stats / $d;
171	}
172
173	aioreq_pri -1;
174	++$n_dirs;
175	aio_scandir $path, 8, sub {
176	my ($dirs, $files) = @_
177	or return warn "$path: $!\n";
178
179	printfn "", [$path] unless $opt_nodirs;
180	printfn $path, $files unless $opt_nofiles;
181
182	$n_files += @$files;
183
184	if ($opt_stat) {
185	aio_wd $path, sub {
186	my $wd = shift;
187
188	aio_lstat [$wd, $_] for @$files;
189	$n_stats += @$files;
190	};
191	}
192
193	if ($opt_sync) {
194	aio_wd $path, sub {
195	my $wd = shift;
196
197	aio_pathsync [$wd, $_] for @$files;
198	aio_pathsync $wd;
199	};
200	}
201
202	push @todo, "$path$_"
203	for sort { $b cmp $a } @$dirs;
204	};
205	}
206
207	IO::AIO::max_outstanding 100; # two fds per directory, so limit accordingly
208	IO::AIO::min_parallel 20;
209
210	@todo = reverse @ARGV;
211
212	while () {
213	if (@todo) {
214	my $seed = pop @todo;
215	$seed =~ s/\/+$//;
216	aio_lstat "$seed/.", sub {
217	if ($_[0]) {
218	print STDERR "$seed: $!\n";
219	} elsif (-d _) {
220	scan $seed;
221	} else {
222	printfn "", $seed, "/";
223	}
224	};
225	} else {
226	IO::AIO::poll_wait;
227	}
228
229	last unless IO::AIO::nreqs;
230
231	IO::AIO::poll_cb;
232	}
233