Perl-Strip/Strip.pm

=head1 NAME

Perl::Strip - reduce file size by stripping whitespace, comments, pod etc.

=head1 SYNOPSIS

   use Perl::Strip;

=head1 DESCRIPTION

This module transforms perl sources into a more compact format. It does
this by removing most whitespace, comments, pod, and by some other means.

The resulting code looks obfuscated, but perl (and the deparser) don't
have any problems with that. Depending on the source file you can expect
about 30-60% "compression".

The main target for this module is low-diskspace environments, such as
L<App::Staticperl>, boot floppy/CDs/flash environments and so on.

See also the commandline utility L<perlstrip>.

=head1 METHODS

The C<Perl::Strip> class is a subclsass of L<PPI::Transform>, and as such
inherits all of it's methods, even the ones not documented here.

=over 4

=cut

package Perl::Strip;

our $VERSION = '1.1';
our $CACHE_VERSION = 2;

use common::sense;

use PPI;

use base PPI::Transform::;

=item my $transform = new Perl::Strip key => value...

Creates a new Perl::Strip transform object. It supports the following
parameters:

=over 4
 
=item optimise_size => $bool

By default, this module optimises I<compressability>, not raw size. This
switch changes that (and makes it slower).

=item keep_nl => $bool

By default, whitespace will either be stripped or replaced by a space. If
this option is enabled, then newlines will not be removed. This has the
advantage of keeping line number information intact (e.g. for backtraces),
but of course doesn't compress as well.

=item cache => $path

Since this module can take a very long time (minutes for the larger files
in the perl distribution), it can utilise a cache directory. The directory
will be created if it doesn't exist, and can be deleted at any time.

=back

=cut

# PPI::Transform compatible
sub document {
   my ($self, $doc) = @_;

   $self->{optimise_size} = 1; # more research is needed

   # special stripping for unicore/ files
   if (eval { $doc->child (1)->content =~ /^# .* (build by mktables|machine-generated .*mktables) / }) {

      for my $heredoc (@{ $doc->find (PPI::Token::HereDoc::) }) {
         my $src = join "", $heredoc->heredoc;

         # special stripping for unicore swashes and properties
         # much more could be done by going binary
         for ($src) {
            s/^(?:0*([0-9a-fA-F]+))?\t(?:0*([0-9a-fA-F]+))?\t(?:0*([0-9a-fA-F]+))?/$1\t$2\t$3/gm
               if $self->{optimise_size};

#            s{
#               ^([0-9a-fA-F]+)\t([0-9a-fA-F]*)\t
#            }{
#               # ww - smaller filesize, UU - compress better
#               pack "C0UU",
#                    hex $1,
#                    length $2 ? (hex $2) - (hex $1) : 0
#            }gemx;

            s/#.*\n/\n/mg;
            s/\s+\n/\n/mg;
         }

         # PPI seems to be mostly undocumented
         $heredoc->{_heredoc} = [split /$/, $src];
      }
   }

   $doc->prune (PPI::Token::Comment::);
   $doc->prune (PPI::Token::Pod::);

   # prune END stuff
   for (my $last = $doc->last_element; $last; ) {
      my $prev = $last->previous_token;

      if ($last->isa (PPI::Token::Whitespace::)) {
         $last->delete;
      } elsif ($last->isa (PPI::Statement::End::)) {
         $last->delete;
         last;
      } elsif ($last->isa (PPI::Token::Pod::)) {
         $last->delete;
      } else {
         last;
      }

      $last = $prev;
   }

   # prune some but not all insignificant whitespace
   for my $ws (@{ $doc->find (PPI::Token::Whitespace::) }) {
      my $prev = $ws->previous_token;
      my $next = $ws->next_token;

      if (!$prev || !$next) {
         $ws->delete;
      } else {
         if (
            $next->isa (PPI::Token::Operator::) && $next->{content} =~ /^(?:,|=|!|!=|==|=>)$/ # no ., because of digits. == float
            or $prev->isa (PPI::Token::Operator::) && $prev->{content} =~ /^(?:,|=|\.|!|!=|==|=>)$/
            or $prev->isa (PPI::Token::Structure::)
            or ($self->{optimise_size} &&
                ($prev->isa (PPI::Token::Word::)
                   && (PPI::Token::Symbol:: eq ref $next
                       || $next->isa (PPI::Structure::Block::)
                       || $next->isa (PPI::Structure::List::)
                       || $next->isa (PPI::Structure::Condition::)))
               )
         ) {
            $ws->delete;
         } elsif ($prev->isa (PPI::Token::Whitespace::)) {
            $ws->{content} = ' ';
            $prev->delete;
         } else {
            $ws->{content} = ' ';
         }
      }
   }

   # prune whitespace around blocks, also ";" at end of blocks
   if ($self->{optimise_size}) {
      # these usually decrease size, but decrease compressability more
      for my $struct (PPI::Structure::Block::, PPI::Structure::Condition::, PPI::Structure::List::) {
         for my $node (@{ $doc->find ($struct) }) {
            my $n1 = $node->first_token;
#            my $n2 = $n1->previous_token;
            my $n3 = $n1->next_token;
            $n1->delete if        $n1->isa (PPI::Token::Whitespace::);
#            $n2->delete if $n2 && $n2->isa (PPI::Token::Whitespace::); # unsafe! AE::timer $MAX_SIGNAL_LATENCY -($NOW - int$NOW)
            $n3->delete if $n3 && $n3->isa (PPI::Token::Whitespace::);
            my $n1 = $node->last_token;
            my $n2 = $n1->next_token;
            my $n3 = $n1->previous_token;
            $n1->delete if        $n1->isa (PPI::Token::Whitespace::);
            $n2->delete if $n2 && $n2->isa (PPI::Token::Whitespace::);
            $n3->{content} = "" # delete seems to trigger a bug inside PPI
                        if $n3 && ($n3->isa (PPI::Token::Whitespace::)
                                   || ($n3->isa (PPI::Token::Structure::) && $n3->content eq ";"));
         }
      }
   }

   # foreach => for
   for my $node (@{ $doc->find (PPI::Statement::Compound::) }) {
      if (my $n = $node->first_token) {
         $n->{content} = "for" if $n->{content} eq "foreach" && $n->isa (PPI::Token::Word::);
      }
   }

   # reformat qw() lists which often have lots of whitespace
   for my $node (@{ $doc->find (PPI::Token::QuoteLike::Words::) }) {
      if ($node->{content} =~ /^qw(.)(.*)(.)$/s) {
         my ($a, $qw, $b) = ($1, $2, $3);
         $qw =~ s/^\s+//;
         $qw =~ s/\s+$//;
         $qw =~ s/\s+/ /g;
         $node->{content} = "qw$a$qw$b";
      }
   }

   # prune return at end of sub-blocks
   #TODO:
   # PPI::Document
   #   PPI::Statement::Sub
   #     PPI::Token::Word    'sub'
   #     PPI::Token::Whitespace      ' '
   #     PPI::Token::Word    'f'
   #     PPI::Structure::Block       { ... }
   #       PPI::Statement
   #         PPI::Token::Word        'sub'
   #         PPI::Structure::Block   { ... }
   #           PPI::Statement::Break
   #             PPI::Token::Word    'return'
   #             PPI::Token::Whitespace      ' '
   #             PPI::Token::Number          '5'
   #         PPI::Token::Structure   ';'
   #       PPI::Statement::Compound
   #         PPI::Structure::Block   { ... }
   #           PPI::Statement::Break
   #             PPI::Token::Word    'return'
   #             PPI::Token::Whitespace      ' '
   #             PPI::Token::Number          '8'
   #       PPI::Statement::Break
   #         PPI::Token::Word        'return'
   #         PPI::Token::Whitespace          ' '
   #         PPI::Token::Number      '7'

   1
}

=item $perl = $transform->strip ($perl)

Strips the perl source in C<$perl> and returns the stripped source.

=cut

sub strip {
   my ($self, $src) = @_;

   my $filter = sub {
      my $ppi = new PPI::Document \$src
         or return;

      $self->document ($ppi)
         or return;

      $src = $ppi->serialize;
   };

   if (exists $self->{cache} && (2048 <= length $src)) {
      my $file = "$self->{cache}/" . Digest::MD5::md5_hex "$CACHE_VERSION \n" . (!!$self->{optimise_size}) . "\n\x00$src";

      if (open my $fh, "<:perlio", $file) {
         # zero size means unchanged
         if (-s $fh) {
            local $/;
            $src = <$fh>
         }
      } else {
         my $oldsrc = $src;

         $filter->();

         mkdir $self->{cache};

         if (open my $fh, ">:perlio", "$file~") {
            # write a zero-byte file if source is unchanged
            if ($oldsrc eq $src or (syswrite $fh, $src) == length $src) {
               close $fh;
               rename "$file~", $file;
            }
         }
      }
   } else {
      $filter->();
   }

   $src
}

=back

=head1 SEE ALSO

L<App::Staticperl>, L<Perl::Squish>.

=head1 AUTHOR

   Marc Lehmann <schmorp@schmorp.de>
   http://home.schmorp.de/

=cut

1;

Revision:	1.3
Committed:	Mon Jan 23 02:24:31 2012 UTC (12 years, 4 months ago) by root
Branch:	MAIN
CVS Tags:	rel-1_1
Changes since 1.2:	+11 -4 lines
Log Message:	1.1
#	User	Rev	Content
1	root	1.1	=head1 NAME
2
3			Perl::Strip - reduce file size by stripping whitespace, comments, pod etc.
4
5			=head1 SYNOPSIS
6
7			use Perl::Strip;
8
9			=head1 DESCRIPTION
10
11	root	1.2	This module transforms perl sources into a more compact format. It does
12			this by removing most whitespace, comments, pod, and by some other means.
13
14			The resulting code looks obfuscated, but perl (and the deparser) don't
15			have any problems with that. Depending on the source file you can expect
16			about 30-60% "compression".
17
18			The main target for this module is low-diskspace environments, such as
19			L<App::Staticperl>, boot floppy/CDs/flash environments and so on.
20
21			See also the commandline utility L<perlstrip>.
22
23	root	1.1	=head1 METHODS
24
25			The C<Perl::Strip> class is a subclsass of L<PPI::Transform>, and as such
26			inherits all of it's methods, even the ones not documented here.
27
28			=over 4
29
30			=cut
31
32			package Perl::Strip;
33
34	root	1.3	our $VERSION = '1.1';
35			our $CACHE_VERSION = 2;
36	root	1.1
37			use common::sense;
38
39			use PPI;
40
41			use base PPI::Transform::;
42
43			=item my $transform = new Perl::Strip key => value...
44
45			Creates a new Perl::Strip transform object. It supports the following
46			parameters:
47
48			=over 4
49
50			=item optimise_size => $bool
51
52			By default, this module optimises I<compressability>, not raw size. This
53			switch changes that (and makes it slower).
54
55	root	1.3	=item keep_nl => $bool
56
57			By default, whitespace will either be stripped or replaced by a space. If
58			this option is enabled, then newlines will not be removed. This has the
59			advantage of keeping line number information intact (e.g. for backtraces),
60			but of course doesn't compress as well.
61
62	root	1.1	=item cache => $path
63
64			Since this module can take a very long time (minutes for the larger files
65			in the perl distribution), it can utilise a cache directory. The directory
66			will be created if it doesn't exist, and can be deleted at any time.
67
68			=back
69
70			=cut
71
72			# PPI::Transform compatible
73			sub document {
74			my ($self, $doc) = @_;
75
76	root	1.2	$self->{optimise_size} = 1; # more research is needed
77
78	root	1.1	# special stripping for unicore/ files
79	root	1.3	if (eval { $doc->child (1)->content =~ /^# .* (build by mktables\|machine-generated .*mktables) / }) {
80	root	1.1
81			for my $heredoc (@{ $doc->find (PPI::Token::HereDoc::) }) {
82			my $src = join "", $heredoc->heredoc;
83
84			# special stripping for unicore swashes and properties
85			# much more could be done by going binary
86			for ($src) {
87	root	1.3	s/^(?:0([0-9a-fA-F]+))?\t(?:0([0-9a-fA-F]+))?\t(?:0*([0-9a-fA-F]+))?/$1\t$2\t$3/gm
88	root	1.1	if $self->{optimise_size};
89
90			# s{
91			# ^([0-9a-fA-F]+)\t([0-9a-fA-F]*)\t
92			# }{
93			# # ww - smaller filesize, UU - compress better
94			# pack "C0UU",
95			# hex $1,
96			# length $2 ? (hex $2) - (hex $1) : 0
97			# }gemx;
98
99			s/#.*\n/\n/mg;
100			s/\s+\n/\n/mg;
101			}
102
103			# PPI seems to be mostly undocumented
104			$heredoc->{_heredoc} = [split /$/, $src];
105			}
106			}
107
108			$doc->prune (PPI::Token::Comment::);
109			$doc->prune (PPI::Token::Pod::);
110
111			# prune END stuff
112			for (my $last = $doc->last_element; $last; ) {
113			my $prev = $last->previous_token;
114
115			if ($last->isa (PPI::Token::Whitespace::)) {
116			$last->delete;
117			} elsif ($last->isa (PPI::Statement::End::)) {
118			$last->delete;
119			last;
120			} elsif ($last->isa (PPI::Token::Pod::)) {
121			$last->delete;
122			} else {
123			last;
124			}
125
126			$last = $prev;
127			}
128
129			# prune some but not all insignificant whitespace
130			for my $ws (@{ $doc->find (PPI::Token::Whitespace::) }) {
131			my $prev = $ws->previous_token;
132			my $next = $ws->next_token;
133
134			if (!$prev \|\| !$next) {
135			$ws->delete;
136			} else {
137			if (
138			$next->isa (PPI::Token::Operator::) && $next->{content} =~ /^(?:,\|=\|!\|!=\|==\|=>)$/ # no ., because of digits. == float
139			or $prev->isa (PPI::Token::Operator::) && $prev->{content} =~ /^(?:,\|=\|\.\|!\|!=\|==\|=>)$/
140			or $prev->isa (PPI::Token::Structure::)
141			or ($self->{optimise_size} &&
142			($prev->isa (PPI::Token::Word::)
143			&& (PPI::Token::Symbol:: eq ref $next
144			\|\| $next->isa (PPI::Structure::Block::)
145			\|\| $next->isa (PPI::Structure::List::)
146			\|\| $next->isa (PPI::Structure::Condition::)))
147			)
148			) {
149			$ws->delete;
150			} elsif ($prev->isa (PPI::Token::Whitespace::)) {
151			$ws->{content} = ' ';
152			$prev->delete;
153			} else {
154			$ws->{content} = ' ';
155			}
156			}
157			}
158
159			# prune whitespace around blocks, also ";" at end of blocks
160			if ($self->{optimise_size}) {
161			# these usually decrease size, but decrease compressability more
162			for my $struct (PPI::Structure::Block::, PPI::Structure::Condition::, PPI::Structure::List::) {
163			for my $node (@{ $doc->find ($struct) }) {
164			my $n1 = $node->first_token;
165	root	1.2	# my $n2 = $n1->previous_token;
166	root	1.1	my $n3 = $n1->next_token;
167			$n1->delete if $n1->isa (PPI::Token::Whitespace::);
168	root	1.2	# $n2->delete if $n2 && $n2->isa (PPI::Token::Whitespace::); # unsafe! AE::timer $MAX_SIGNAL_LATENCY -($NOW - int$NOW)
169	root	1.1	$n3->delete if $n3 && $n3->isa (PPI::Token::Whitespace::);
170			my $n1 = $node->last_token;
171			my $n2 = $n1->next_token;
172			my $n3 = $n1->previous_token;
173			$n1->delete if $n1->isa (PPI::Token::Whitespace::);
174			$n2->delete if $n2 && $n2->isa (PPI::Token::Whitespace::);
175	root	1.2	$n3->{content} = "" # delete seems to trigger a bug inside PPI
176			if $n3 && ($n3->isa (PPI::Token::Whitespace::)
177	root	1.1	\|\| ($n3->isa (PPI::Token::Structure::) && $n3->content eq ";"));
178			}
179			}
180			}
181
182			# foreach => for
183			for my $node (@{ $doc->find (PPI::Statement::Compound::) }) {
184			if (my $n = $node->first_token) {
185			$n->{content} = "for" if $n->{content} eq "foreach" && $n->isa (PPI::Token::Word::);
186			}
187			}
188
189			# reformat qw() lists which often have lots of whitespace
190			for my $node (@{ $doc->find (PPI::Token::QuoteLike::Words::) }) {
191			if ($node->{content} =~ /^qw(.)(.*)(.)$/s) {
192			my ($a, $qw, $b) = ($1, $2, $3);
193			$qw =~ s/^\s+//;
194			$qw =~ s/\s+$//;
195			$qw =~ s/\s+/ /g;
196			$node->{content} = "qw$a$qw$b";
197			}
198			}
199
200			# prune return at end of sub-blocks
201			#TODO:
202			# PPI::Document
203			# PPI::Statement::Sub
204			# PPI::Token::Word 'sub'
205			# PPI::Token::Whitespace ' '
206			# PPI::Token::Word 'f'
207			# PPI::Structure::Block { ... }
208			# PPI::Statement
209			# PPI::Token::Word 'sub'
210			# PPI::Structure::Block { ... }
211			# PPI::Statement::Break
212			# PPI::Token::Word 'return'
213			# PPI::Token::Whitespace ' '
214			# PPI::Token::Number '5'
215			# PPI::Token::Structure ';'
216			# PPI::Statement::Compound
217			# PPI::Structure::Block { ... }
218			# PPI::Statement::Break
219			# PPI::Token::Word 'return'
220			# PPI::Token::Whitespace ' '
221			# PPI::Token::Number '8'
222			# PPI::Statement::Break
223			# PPI::Token::Word 'return'
224			# PPI::Token::Whitespace ' '
225			# PPI::Token::Number '7'
226
227			1
228			}
229
230			=item $perl = $transform->strip ($perl)
231
232			Strips the perl source in C<$perl> and returns the stripped source.
233
234			=cut
235
236			sub strip {
237			my ($self, $src) = @_;
238
239			my $filter = sub {
240			my $ppi = new PPI::Document \$src
241			or return;
242
243			$self->document ($ppi)
244			or return;
245
246			$src = $ppi->serialize;
247			};
248
249			if (exists $self->{cache} && (2048 <= length $src)) {
250			my $file = "$self->{cache}/" . Digest::MD5::md5_hex "$CACHE_VERSION \n" . (!!$self->{optimise_size}) . "\n\x00$src";
251
252			if (open my $fh, "<:perlio", $file) {
253			# zero size means unchanged
254			if (-s $fh) {
255			local $/;
256			$src = <$fh>
257			}
258			} else {
259			my $oldsrc = $src;
260
261			$filter->();
262
263			mkdir $self->{cache};
264
265			if (open my $fh, ">:perlio", "$file~") {
266			# write a zero-byte file if source is unchanged
267			if ($oldsrc eq $src or (syswrite $fh, $src) == length $src) {
268			close $fh;
269			rename "$file~", $file;
270			}
271			}
272			}
273			} else {
274			$filter->();
275			}
276
277			$src
278			}
279
280			=back
281
282			=head1 SEE ALSO
283
284			L<App::Staticperl>, L<Perl::Squish>.
285
286			=head1 AUTHOR
287
288			Marc Lehmann <schmorp@schmorp.de>
289			http://home.schmorp.de/
290
291			=cut
292
293			1;
294