#!/usr/pkg/bin/perl
#
# unhtml - strip HTML tags from a file (or standard input) and write the
# plain text to a file (or standard output). See the POD below for usage.
#
use strict;
use warnings;
use HTML::Strip;

my $infile  = shift;
my $outfile = shift;

# Fall back to the standard streams when an argument is missing or empty.
# Test defined/length rather than truthiness so that a file literally
# named "0" is not mistaken for "no argument".
$infile  = "-"      unless defined $infile  && length $infile;
$outfile = "STDOUT" unless defined $outfile && length $outfile;

my $hs = HTML::Strip->new(emit_spaces => 0);

my $text_ref   = _slurp_file($infile);
my $clean_text = $hs->parse($$text_ref);

_burp_file($outfile, \$clean_text);

# _slurp_file($infile)
#
# Read the whole of $infile into memory and return a reference to the
# resulting string. The special name "-" means standard input.
# Dies if the file cannot be opened.
sub _slurp_file {
    my $infile = shift;

    my $from_stdin = $infile eq "-";
    my $fh;
    if ($from_stdin) {
        $fh = \*STDIN;
    }
    else {
        # Three-argument open: the path is never interpreted as a mode or
        # a pipe, so names such as ">foo" or "cmd |" are treated as plain
        # filenames instead of clobbering files or running commands.
        open( $fh, '<', $infile )
            or die "Unable to open $infile in _slurp_file: $!\n";
    }

    # Undefining the record separator makes a single read return everything.
    my $text = do { local $/; <$fh> };

    # Guard against an undefined read result so the caller can always
    # dereference the returned value without warnings.
    $text = "" unless defined $text;

    close($fh) unless $from_stdin;

    return \$text;
}

# _burp_file($outfile, $text_ref)
#
# Write the string referenced by $text_ref to $outfile. The special names
# "STDOUT" and "-" mean standard output. Dies if the file cannot be opened,
# written, or closed.
sub _burp_file {
    my $outfile  = shift;
    my $text_ref = shift;

    # "-" used to reach STDOUT through two-argument open's ">-" magic;
    # keep accepting it explicitly now that the open is three-argument.
    my $to_stdout = $outfile eq "STDOUT" || $outfile eq "-";

    my $fh;
    if ($to_stdout) {
        $fh = \*STDOUT;
    }
    else {
        open( $fh, '>', $outfile )
            or die "Unable to open $outfile in _burp_file: $!\n";
    }

    print {$fh} $$text_ref
        or die "Unable to write to $outfile in _burp_file: $!\n";

    # Buffered write errors (e.g. a full disk) only surface at close time,
    # so the close on a file we opened must be checked.
    if ( !$to_stdout ) {
        close($fh)
            or die "Unable to close $outfile in _burp_file: $!\n";
    }

    return;
}

=head1 SYNOPSIS

unhtml is a perl script that strips HTML tags from text.

=head1 VERSION

This documentation describes version 1.3 of unhtml

=head1 DESCRIPTION

Uses HTML::Strip to do the real work; this is a wrapper around that
module that allows you to specify command line arguments - standard
input/output is assumed if no args are given. If only one arg is
given, it is assumed to be the input pathname.

=head1 USAGE

Examples (the following have equivalent results):

=over 4

=item unhtml < foo.html > foo.txt

=item unhtml foo.html > foo.txt

=item unhtml foo.html foo.txt

=back

=head1 REQUIRED ARGUMENTS

None. Acts as a STDIN/STDOUT pipe with no arguments.

=head1 OPTIONS

None.

=head1 DEPENDENCIES

Requires HTML::Strip (perl -MCPAN -e 'install HTML::Strip' as root on
any Unix-based OS will work).

=head1 LICENSE

Copyright (c) 2010 slugmax@sdf.org

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at
your option) any later version.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.

=cut