ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/Perl-Strip/Strip.pm
Revision: 1.4
Committed: Thu Aug 3 03:05:56 2023 UTC (10 months ago) by root
Branch: MAIN
Changes since 1.3: +13 -7 lines
Log Message:
*** empty log message ***

File Contents

# User Rev Content
1 root 1.1 =head1 NAME
2    
3     Perl::Strip - reduce file size by stripping whitespace, comments, pod etc.
4    
5     =head1 SYNOPSIS
6    
7     use Perl::Strip;
8    
9     =head1 DESCRIPTION
10    
11 root 1.2 This module transforms perl sources into a more compact format. It does
12     this by removing most whitespace, comments, pod, and by some other means.
13    
14     The resulting code looks obfuscated, but perl (and the deparser) don't
15     have any problems with that. Depending on the source file you can expect
16     about 30-60% "compression".
17    
18     The main target for this module is low-diskspace environments, such as
19     L<App::Staticperl>, boot floppy/CDs/flash environments and so on.
20    
21     See also the commandline utility L<perlstrip>.
22    
23 root 1.1 =head1 METHODS
24    
25     The C<Perl::Strip> class is a subclsass of L<PPI::Transform>, and as such
26     inherits all of it's methods, even the ones not documented here.
27    
28     =over 4
29    
30     =cut
31    
32     package Perl::Strip;
33    
34 root 1.4 our $VERSION = '1.2';
35     our $CACHE_VERSION = 3;
36 root 1.1
37     use common::sense;
38    
39     use PPI;
40    
41     use base PPI::Transform::;
42    
43     =item my $transform = new Perl::Strip key => value...
44    
45     Creates a new Perl::Strip transform object. It supports the following
46     parameters:
47    
48     =over 4
49    
50     =item optimise_size => $bool
51    
52     By default, this module optimises I<compressability>, not raw size. This
53     switch changes that (and makes it slower).
54    
55 root 1.3 =item keep_nl => $bool
56    
57     By default, whitespace will either be stripped or replaced by a space. If
58     this option is enabled, then newlines will not be removed. This has the
59     advantage of keeping line number information intact (e.g. for backtraces),
60     but of course doesn't compress as well.
61    
62 root 1.1 =item cache => $path
63    
64     Since this module can take a very long time (minutes for the larger files
65     in the perl distribution), it can utilise a cache directory. The directory
66     will be created if it doesn't exist, and can be deleted at any time.
67    
68     =back
69    
70     =cut
71    
72     # PPI::Transform compatible
73     sub document {
74     my ($self, $doc) = @_;
75    
76 root 1.2 $self->{optimise_size} = 1; # more research is needed
77    
78 root 1.1 # special stripping for unicore/ files
79 root 1.3 if (eval { $doc->child (1)->content =~ /^# .* (build by mktables|machine-generated .*mktables) / }) {
80 root 1.1
81     for my $heredoc (@{ $doc->find (PPI::Token::HereDoc::) }) {
82     my $src = join "", $heredoc->heredoc;
83    
84     # special stripping for unicore swashes and properties
85     # much more could be done by going binary
86     for ($src) {
87 root 1.3 s/^(?:0*([0-9a-fA-F]+))?\t(?:0*([0-9a-fA-F]+))?\t(?:0*([0-9a-fA-F]+))?/$1\t$2\t$3/gm
88 root 1.1 if $self->{optimise_size};
89    
90     # s{
91     # ^([0-9a-fA-F]+)\t([0-9a-fA-F]*)\t
92     # }{
93     # # ww - smaller filesize, UU - compress better
94     # pack "C0UU",
95     # hex $1,
96     # length $2 ? (hex $2) - (hex $1) : 0
97     # }gemx;
98    
99     s/#.*\n/\n/mg;
100     s/\s+\n/\n/mg;
101     }
102    
103     # PPI seems to be mostly undocumented
104     $heredoc->{_heredoc} = [split /$/, $src];
105     }
106     }
107    
108     $doc->prune (PPI::Token::Comment::);
109     $doc->prune (PPI::Token::Pod::);
110    
111     # prune END stuff
112     for (my $last = $doc->last_element; $last; ) {
113     my $prev = $last->previous_token;
114    
115     if ($last->isa (PPI::Token::Whitespace::)) {
116     $last->delete;
117     } elsif ($last->isa (PPI::Statement::End::)) {
118     $last->delete;
119     last;
120     } elsif ($last->isa (PPI::Token::Pod::)) {
121     $last->delete;
122     } else {
123     last;
124     }
125    
126     $last = $prev;
127     }
128    
129     # prune some but not all insignificant whitespace
130     for my $ws (@{ $doc->find (PPI::Token::Whitespace::) }) {
131     my $prev = $ws->previous_token;
132     my $next = $ws->next_token;
133    
134     if (!$prev || !$next) {
135     $ws->delete;
136     } else {
137 root 1.4 if ($next->isa (PPI::Token::Whitespace::)) {
138     $ws->delete;
139     } elsif (
140 root 1.1 $next->isa (PPI::Token::Operator::) && $next->{content} =~ /^(?:,|=|!|!=|==|=>)$/ # no ., because of digits. == float
141     or $prev->isa (PPI::Token::Operator::) && $prev->{content} =~ /^(?:,|=|\.|!|!=|==|=>)$/
142     or $prev->isa (PPI::Token::Structure::)
143     or ($self->{optimise_size} &&
144     ($prev->isa (PPI::Token::Word::)
145     && (PPI::Token::Symbol:: eq ref $next
146     || $next->isa (PPI::Structure::Block::)
147     || $next->isa (PPI::Structure::List::)
148     || $next->isa (PPI::Structure::Condition::)))
149     )
150     ) {
151 root 1.4 # perl has some idiotic warnigns about nonexisting operators
152     if ($prev->isa (PPI::Token::Operator::) && $prev->{content} eq "="
153     && $next->isa (PPI::Token::Operator::) && $next->{content} =~ /[+\-]/
154     ) {
155     # avoid "Reverse %s operator" diagnostic
156     } else {
157     $ws->delete;
158     }
159 root 1.1 } else {
160     $ws->{content} = ' ';
161     }
162     }
163     }
164    
165     # prune whitespace around blocks, also ";" at end of blocks
166     if ($self->{optimise_size}) {
167     # these usually decrease size, but decrease compressability more
168     for my $struct (PPI::Structure::Block::, PPI::Structure::Condition::, PPI::Structure::List::) {
169     for my $node (@{ $doc->find ($struct) }) {
170     my $n1 = $node->first_token;
171 root 1.2 # my $n2 = $n1->previous_token;
172 root 1.1 my $n3 = $n1->next_token;
173     $n1->delete if $n1->isa (PPI::Token::Whitespace::);
174 root 1.2 # $n2->delete if $n2 && $n2->isa (PPI::Token::Whitespace::); # unsafe! AE::timer $MAX_SIGNAL_LATENCY -($NOW - int$NOW)
175 root 1.1 $n3->delete if $n3 && $n3->isa (PPI::Token::Whitespace::);
176     my $n1 = $node->last_token;
177     my $n2 = $n1->next_token;
178     my $n3 = $n1->previous_token;
179     $n1->delete if $n1->isa (PPI::Token::Whitespace::);
180     $n2->delete if $n2 && $n2->isa (PPI::Token::Whitespace::);
181 root 1.2 $n3->{content} = "" # delete seems to trigger a bug inside PPI
182     if $n3 && ($n3->isa (PPI::Token::Whitespace::)
183 root 1.1 || ($n3->isa (PPI::Token::Structure::) && $n3->content eq ";"));
184     }
185     }
186     }
187    
188     # foreach => for
189     for my $node (@{ $doc->find (PPI::Statement::Compound::) }) {
190     if (my $n = $node->first_token) {
191     $n->{content} = "for" if $n->{content} eq "foreach" && $n->isa (PPI::Token::Word::);
192     }
193     }
194    
195     # reformat qw() lists which often have lots of whitespace
196     for my $node (@{ $doc->find (PPI::Token::QuoteLike::Words::) }) {
197     if ($node->{content} =~ /^qw(.)(.*)(.)$/s) {
198     my ($a, $qw, $b) = ($1, $2, $3);
199     $qw =~ s/^\s+//;
200     $qw =~ s/\s+$//;
201     $qw =~ s/\s+/ /g;
202     $node->{content} = "qw$a$qw$b";
203     }
204     }
205    
206     # prune return at end of sub-blocks
207     #TODO:
208     # PPI::Document
209     # PPI::Statement::Sub
210     # PPI::Token::Word 'sub'
211     # PPI::Token::Whitespace ' '
212     # PPI::Token::Word 'f'
213     # PPI::Structure::Block { ... }
214     # PPI::Statement
215     # PPI::Token::Word 'sub'
216     # PPI::Structure::Block { ... }
217     # PPI::Statement::Break
218     # PPI::Token::Word 'return'
219     # PPI::Token::Whitespace ' '
220     # PPI::Token::Number '5'
221     # PPI::Token::Structure ';'
222     # PPI::Statement::Compound
223     # PPI::Structure::Block { ... }
224     # PPI::Statement::Break
225     # PPI::Token::Word 'return'
226     # PPI::Token::Whitespace ' '
227     # PPI::Token::Number '8'
228     # PPI::Statement::Break
229     # PPI::Token::Word 'return'
230     # PPI::Token::Whitespace ' '
231     # PPI::Token::Number '7'
232    
233     1
234     }
235    
236     =item $perl = $transform->strip ($perl)
237    
238     Strips the perl source in C<$perl> and returns the stripped source.
239    
240     =cut
241    
242     sub strip {
243     my ($self, $src) = @_;
244    
245     my $filter = sub {
246     my $ppi = new PPI::Document \$src
247     or return;
248    
249     $self->document ($ppi)
250     or return;
251    
252     $src = $ppi->serialize;
253     };
254    
255     if (exists $self->{cache} && (2048 <= length $src)) {
256     my $file = "$self->{cache}/" . Digest::MD5::md5_hex "$CACHE_VERSION \n" . (!!$self->{optimise_size}) . "\n\x00$src";
257    
258     if (open my $fh, "<:perlio", $file) {
259     # zero size means unchanged
260     if (-s $fh) {
261     local $/;
262     $src = <$fh>
263     }
264     } else {
265     my $oldsrc = $src;
266    
267     $filter->();
268    
269     mkdir $self->{cache};
270    
271     if (open my $fh, ">:perlio", "$file~") {
272     # write a zero-byte file if source is unchanged
273     if ($oldsrc eq $src or (syswrite $fh, $src) == length $src) {
274     close $fh;
275     rename "$file~", $file;
276     }
277     }
278     }
279     } else {
280     $filter->();
281     }
282    
283     $src
284     }
285    
286     =back
287    
288     =head1 SEE ALSO
289    
290     L<App::Staticperl>, L<Perl::Squish>.
291    
292     =head1 AUTHOR
293    
294     Marc Lehmann <schmorp@schmorp.de>
295     http://home.schmorp.de/
296    
297     =cut
298    
299     1;
300