ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/Perl-Strip/Strip.pm
Revision: 1.3
Committed: Mon Jan 23 02:24:31 2012 UTC (12 years, 4 months ago) by root
Branch: MAIN
CVS Tags: rel-1_1
Changes since 1.2: +11 -4 lines
Log Message:
1.1

File Contents

# User Rev Content
1 root 1.1 =head1 NAME
2    
3     Perl::Strip - reduce file size by stripping whitespace, comments, pod etc.
4    
5     =head1 SYNOPSIS
6    
7     use Perl::Strip;
8    
9     =head1 DESCRIPTION
10    
11 root 1.2 This module transforms perl sources into a more compact format. It does
12     this by removing most whitespace, comments, pod, and by some other means.
13    
14     The resulting code looks obfuscated, but perl (and the deparser) don't
15     have any problems with that. Depending on the source file you can expect
16     about 30-60% "compression".
17    
18     The main target for this module is low-diskspace environments, such as
19     L<App::Staticperl>, boot floppy/CDs/flash environments and so on.
20    
21     See also the commandline utility L<perlstrip>.
22    
23 root 1.1 =head1 METHODS
24    
25     The C<Perl::Strip> class is a subclsass of L<PPI::Transform>, and as such
26     inherits all of it's methods, even the ones not documented here.
27    
28     =over 4
29    
30     =cut
31    
32     package Perl::Strip;
33    
34 root 1.3 our $VERSION = '1.1';
35     our $CACHE_VERSION = 2;
36 root 1.1
37     use common::sense;
38    
39     use PPI;
40    
41     use base PPI::Transform::;
42    
43     =item my $transform = new Perl::Strip key => value...
44    
45     Creates a new Perl::Strip transform object. It supports the following
46     parameters:
47    
48     =over 4
49    
50     =item optimise_size => $bool
51    
52     By default, this module optimises I<compressability>, not raw size. This
53     switch changes that (and makes it slower).
54    
55 root 1.3 =item keep_nl => $bool
56    
57     By default, whitespace will either be stripped or replaced by a space. If
58     this option is enabled, then newlines will not be removed. This has the
59     advantage of keeping line number information intact (e.g. for backtraces),
60     but of course doesn't compress as well.
61    
62 root 1.1 =item cache => $path
63    
64     Since this module can take a very long time (minutes for the larger files
65     in the perl distribution), it can utilise a cache directory. The directory
66     will be created if it doesn't exist, and can be deleted at any time.
67    
68     =back
69    
70     =cut
71    
72     # PPI::Transform compatible
73     sub document {
74     my ($self, $doc) = @_;
75    
76 root 1.2 $self->{optimise_size} = 1; # more research is needed
77    
78 root 1.1 # special stripping for unicore/ files
79 root 1.3 if (eval { $doc->child (1)->content =~ /^# .* (build by mktables|machine-generated .*mktables) / }) {
80 root 1.1
81     for my $heredoc (@{ $doc->find (PPI::Token::HereDoc::) }) {
82     my $src = join "", $heredoc->heredoc;
83    
84     # special stripping for unicore swashes and properties
85     # much more could be done by going binary
86     for ($src) {
87 root 1.3 s/^(?:0*([0-9a-fA-F]+))?\t(?:0*([0-9a-fA-F]+))?\t(?:0*([0-9a-fA-F]+))?/$1\t$2\t$3/gm
88 root 1.1 if $self->{optimise_size};
89    
90     # s{
91     # ^([0-9a-fA-F]+)\t([0-9a-fA-F]*)\t
92     # }{
93     # # ww - smaller filesize, UU - compress better
94     # pack "C0UU",
95     # hex $1,
96     # length $2 ? (hex $2) - (hex $1) : 0
97     # }gemx;
98    
99     s/#.*\n/\n/mg;
100     s/\s+\n/\n/mg;
101     }
102    
103     # PPI seems to be mostly undocumented
104     $heredoc->{_heredoc} = [split /$/, $src];
105     }
106     }
107    
108     $doc->prune (PPI::Token::Comment::);
109     $doc->prune (PPI::Token::Pod::);
110    
111     # prune END stuff
112     for (my $last = $doc->last_element; $last; ) {
113     my $prev = $last->previous_token;
114    
115     if ($last->isa (PPI::Token::Whitespace::)) {
116     $last->delete;
117     } elsif ($last->isa (PPI::Statement::End::)) {
118     $last->delete;
119     last;
120     } elsif ($last->isa (PPI::Token::Pod::)) {
121     $last->delete;
122     } else {
123     last;
124     }
125    
126     $last = $prev;
127     }
128    
129     # prune some but not all insignificant whitespace
130     for my $ws (@{ $doc->find (PPI::Token::Whitespace::) }) {
131     my $prev = $ws->previous_token;
132     my $next = $ws->next_token;
133    
134     if (!$prev || !$next) {
135     $ws->delete;
136     } else {
137     if (
138     $next->isa (PPI::Token::Operator::) && $next->{content} =~ /^(?:,|=|!|!=|==|=>)$/ # no ., because of digits. == float
139     or $prev->isa (PPI::Token::Operator::) && $prev->{content} =~ /^(?:,|=|\.|!|!=|==|=>)$/
140     or $prev->isa (PPI::Token::Structure::)
141     or ($self->{optimise_size} &&
142     ($prev->isa (PPI::Token::Word::)
143     && (PPI::Token::Symbol:: eq ref $next
144     || $next->isa (PPI::Structure::Block::)
145     || $next->isa (PPI::Structure::List::)
146     || $next->isa (PPI::Structure::Condition::)))
147     )
148     ) {
149     $ws->delete;
150     } elsif ($prev->isa (PPI::Token::Whitespace::)) {
151     $ws->{content} = ' ';
152     $prev->delete;
153     } else {
154     $ws->{content} = ' ';
155     }
156     }
157     }
158    
159     # prune whitespace around blocks, also ";" at end of blocks
160     if ($self->{optimise_size}) {
161     # these usually decrease size, but decrease compressability more
162     for my $struct (PPI::Structure::Block::, PPI::Structure::Condition::, PPI::Structure::List::) {
163     for my $node (@{ $doc->find ($struct) }) {
164     my $n1 = $node->first_token;
165 root 1.2 # my $n2 = $n1->previous_token;
166 root 1.1 my $n3 = $n1->next_token;
167     $n1->delete if $n1->isa (PPI::Token::Whitespace::);
168 root 1.2 # $n2->delete if $n2 && $n2->isa (PPI::Token::Whitespace::); # unsafe! AE::timer $MAX_SIGNAL_LATENCY -($NOW - int$NOW)
169 root 1.1 $n3->delete if $n3 && $n3->isa (PPI::Token::Whitespace::);
170     my $n1 = $node->last_token;
171     my $n2 = $n1->next_token;
172     my $n3 = $n1->previous_token;
173     $n1->delete if $n1->isa (PPI::Token::Whitespace::);
174     $n2->delete if $n2 && $n2->isa (PPI::Token::Whitespace::);
175 root 1.2 $n3->{content} = "" # delete seems to trigger a bug inside PPI
176     if $n3 && ($n3->isa (PPI::Token::Whitespace::)
177 root 1.1 || ($n3->isa (PPI::Token::Structure::) && $n3->content eq ";"));
178     }
179     }
180     }
181    
182     # foreach => for
183     for my $node (@{ $doc->find (PPI::Statement::Compound::) }) {
184     if (my $n = $node->first_token) {
185     $n->{content} = "for" if $n->{content} eq "foreach" && $n->isa (PPI::Token::Word::);
186     }
187     }
188    
189     # reformat qw() lists which often have lots of whitespace
190     for my $node (@{ $doc->find (PPI::Token::QuoteLike::Words::) }) {
191     if ($node->{content} =~ /^qw(.)(.*)(.)$/s) {
192     my ($a, $qw, $b) = ($1, $2, $3);
193     $qw =~ s/^\s+//;
194     $qw =~ s/\s+$//;
195     $qw =~ s/\s+/ /g;
196     $node->{content} = "qw$a$qw$b";
197     }
198     }
199    
200     # prune return at end of sub-blocks
201     #TODO:
202     # PPI::Document
203     # PPI::Statement::Sub
204     # PPI::Token::Word 'sub'
205     # PPI::Token::Whitespace ' '
206     # PPI::Token::Word 'f'
207     # PPI::Structure::Block { ... }
208     # PPI::Statement
209     # PPI::Token::Word 'sub'
210     # PPI::Structure::Block { ... }
211     # PPI::Statement::Break
212     # PPI::Token::Word 'return'
213     # PPI::Token::Whitespace ' '
214     # PPI::Token::Number '5'
215     # PPI::Token::Structure ';'
216     # PPI::Statement::Compound
217     # PPI::Structure::Block { ... }
218     # PPI::Statement::Break
219     # PPI::Token::Word 'return'
220     # PPI::Token::Whitespace ' '
221     # PPI::Token::Number '8'
222     # PPI::Statement::Break
223     # PPI::Token::Word 'return'
224     # PPI::Token::Whitespace ' '
225     # PPI::Token::Number '7'
226    
227     1
228     }
229    
230     =item $perl = $transform->strip ($perl)
231    
232     Strips the perl source in C<$perl> and returns the stripped source.
233    
234     =cut
235    
236     sub strip {
237     my ($self, $src) = @_;
238    
239     my $filter = sub {
240     my $ppi = new PPI::Document \$src
241     or return;
242    
243     $self->document ($ppi)
244     or return;
245    
246     $src = $ppi->serialize;
247     };
248    
249     if (exists $self->{cache} && (2048 <= length $src)) {
250     my $file = "$self->{cache}/" . Digest::MD5::md5_hex "$CACHE_VERSION \n" . (!!$self->{optimise_size}) . "\n\x00$src";
251    
252     if (open my $fh, "<:perlio", $file) {
253     # zero size means unchanged
254     if (-s $fh) {
255     local $/;
256     $src = <$fh>
257     }
258     } else {
259     my $oldsrc = $src;
260    
261     $filter->();
262    
263     mkdir $self->{cache};
264    
265     if (open my $fh, ">:perlio", "$file~") {
266     # write a zero-byte file if source is unchanged
267     if ($oldsrc eq $src or (syswrite $fh, $src) == length $src) {
268     close $fh;
269     rename "$file~", $file;
270     }
271     }
272     }
273     } else {
274     $filter->();
275     }
276    
277     $src
278     }
279    
280     =back
281    
282     =head1 SEE ALSO
283    
284     L<App::Staticperl>, L<Perl::Squish>.
285    
286     =head1 AUTHOR
287    
288     Marc Lehmann <schmorp@schmorp.de>
289     http://home.schmorp.de/
290    
291     =cut
292    
293     1;
294