#!/usr/bin/perl -w ############################################################################### # # htmlf.pl -- A simple http proxy server that fixes nasty html. # # Copyright (C) 1998-2000 Tony Finch # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA # 02111-1307, USA, or go to . # # Some of the code was inspired by Randal Schwartz's 100-line # anonymizing proxy. Thanks are due to Gisele Aas for libwww-perl and # to Larry Wall for perl itself. # # Version History: # # 1998/08/02 # 0.01 initial dead-end that didn't use libwww-perl # 0.02 simple libww-perl daemon based on HTTP::Daemon man page # 0.03 polished logging, signal handling, etc # 0.04 working proxying again # 0.05 filtering added # 0.90 pipelined upstream -> downstream content delivery # filtering doesn't work with pipelining # problems with old tcp conenctions hanging around in # the TIME_WAIT state # 1998/08/04 # 0.91 SO_REUSEADDR added # 0.92 generalised HTTP header adding # better log file handling # 0.93 access control list (suggestion from Mike Bristow) # restart on SIGHUP # 0.94 filter out selected URLs (advertisemants) # 1998/08/05 # 0.95 detach from terminal # 0.96 more tags stripped # 0.97 lynx-style frames (unbelievably cute code) # more comments in configuration # 0.98 fixed pipelined filtering tag fragmentation bug # (it was starting to get annoying) # fixed regexp performance bug # fixed bugs in bug fixes # 1998/08/09 # 0.99 configurable relocations # fixed bug in fragment handling caused by comments # improved handling of errors at startup # # 1998/08/28 # better error reporting for accept call # # 1998/09/09 # ability to lose privileges after startup # # 1999/03/24 # 1.00 un-animating GIFs (using code from Matt Welsh's Krisna proxy # http://now.cs.berkeley.edu/~mdw/projects/siva/krisna.pl) # 1.01 option to chroot at start-up # # 1999/03/25 # 1.10 httpd mode & completely overhauled rewriting engine # 1.11 lynx-style image links # # 2000/03/02 # belatedly CVSify # # 2000/04/17 # be more relaxed about hyperlink syntax # # $Header: /home/fanf/cvs/htmlf/htmlf.pl,v 1.3 2000/04/17 21:44:07 fanf Exp $ # require 5.004; use strict; my $version = "1.11"; my $name = "htmlf"; my $verstr = "$name/$version"; ############################################################################### # # subroutine declarations # sub alc (@); sub msg (@); sub bomb (@); sub do_client ($); sub do_request ($$); sub do_valid_request ($$); sub rcv_headers ($$); sub header_msg ($); sub respond403 ($$); sub relocate ($$); sub redirect ($$); ############################################################################### # # config stuff # my $confname = "shirt.www.demon.net"; # working directory my $root = "/proxy"; # if we are running as root, chroot to the working directory then # change to a nonprivileged user after opening the socket my $user = "web"; # these may be left undefined, in which case no pid file is created # and logging goes to stdout. my $pidfile = "etc/htmlf.pid"; my $logfile = "etc/htmlf.log"; # 0 = no messages # 1 = errors only # 2 = daemon activity # 3 = details of requests & responses my $msglevel = 3; # see also the arguments to use LWP::Debug below # arguments to HTTP::Daemon::new -- see the IO::Socket manpage my %socketargs = (LocalPort => 8080, # LocalHost => 'localhost', Reuse => 'yes'); # client access control list (each element is a regexp) my @allowed = ("^localhost\$", "^server\\.noc\\.demon\\.net\$", "^dotat\\.at\$"); my $paranoid = 'yes'; # or undef # protocol mode: either "httpd" or "proxy" my $mode = "httpd"; # html to remove my @shite_html_tags = ( 'small', 'big', # old fashioned size changers 'font', 'basefont', # newfangled size/font/colour changers 'style', # even newer size/font/colour changers 'script', # these are so broken it hurts 'blink', # shoot the man who thought of this one 'bgsound', # this is cretinous 'embed', # this just looks dubious 'noembed', # and this 'marquee', # why? what is it for? aargh! 'object', # nasty dynamic content 'param', # yet more nasty stuff 'meta', # belongs in the headers 'frameset' # yet more sucky implementation # 'base' # can cause frameish problems ); my @shite_html_attrs = ( 'bgcolor', # and horrors 'background', # images from the 'text', 'alink', # from the again 'vlink', 'link', # from the again again # 'color', # things (stripped with the tag) 'width', 'height', # table trouble 'style', # inline style info # 'type', # advisory content-type often seen with CSS 'target', # frame-related 'onmouseover', # arseing script shite 'onmouseout', # more of the same 'onclick' # yet again ); # mess up frames? my $framemode = 'lynx'; # or undef # mess up images? my $imagemode = 'lynx'; # or undef # (libwww-perl doesn't know about keep-alive, so leave the Connection # headers in these lists) # headers to remove my @shite_client_headers = qw(User-Agent From Referer Cookie Connection Proxy-Connection); my @shite_server_headers = qw(Set-Cookie); # headers to add my %nice_client_headers = (); my %nice_server_headers = (Connection => 'close'); # urls to block my @shite_sites = ("doubleclick\\.net"); # urls to remap my %move_site = ("/ftp\\.demon\\.co\\.uk/" => "/ftp.demon.net/"); # parent proxy configuration my @proxies = (['http','ftp'], 'http://www-cache.demon.co.uk:8080'); my @no_proxy = ('demon.net'); # get proxy config from environment? my $env_proxy = 0; ############################################################################### # # miscellaneous init stuff # use Socket; use HTTP::Daemon; use HTTP::Request; use HTTP::Response; use LWP::UserAgent; use LWP::Debug qw(+); # qw(+ -conns); # qw(-); # use IO::Handle; # my version of HTTP::Daemon with a different server name @HTMLF::Daemon::ISA = qw(HTTP::Daemon); sub HTMLF::Daemon::product_tokens { return $verstr ." ". HTTP::Daemon::product_tokens; } # the main socket my $daemon = new HTMLF::Daemon %socketargs; die "$name: new HTMLF::Daemon: $!\n" unless defined $daemon; if ($>) # already non-privileged { if ($root) { chdir $root or die "$name: chdir $root: $!\n"; } } else # lose privileges { my ($n,$p,$uid,$gid,@rest) = getpwnam $user; die "$name: bad user `$user'\n" if !$uid || !$gid; if ($root) { chdir $root or die "$name: chdir $root: $!\n"; # chroot $root or die "$name: chroot $root: $!\n"; } $< = $> = $uid; $( = $) = $gid; } # detach from tty { my $pid = fork; exit if $pid; die "$name: fork: $!" unless defined $pid; } if (defined $logfile) { open STDERR, ">> $logfile" or die "$name: open >> $logfile: $!"; STDERR->autoflush(1); } if (defined $pidfile) { open PID, "> $pidfile" or die "$name: open > $pidfile: $!"; print PID "$$\n"; close PID; } msg 2, "$verstr ($confname) listening on <", $daemon->url, ">"; ############################################################################### # # signal handlers # sub exit_handler { $SIG{KILL} = $SIG{QUIT} = $SIG{INT} = 'DEFAULT'; bomb "Exiting: SIG", shift; } $SIG{KILL} = $SIG{QUIT} = $SIG{INT} = \&exit_handler; sub pipe_handler { $SIG{PIPE} = \&pipe_handler; msg 2, "whoops! SIG", shift; # goto main; } $SIG{PIPE} = \&pipe_handler; sub hup_handler { $SIG{HUP} = $SIG{KILL} = $SIG{QUIT} = $SIG{INT} = 'DEFAULT'; close $daemon; msg 2, "Re-executing: SIG", shift; # funky syntax to shut up the stupid warning eval { exec $0 }; bomb "exec $0: $!"; } $SIG{HUP} = \&hup_handler; sub child_handler { $SIG{CHLD} = \&child_handler; my $pid = wait; msg 2, "Child $pid returned $?"; } $SIG{CHLD} = \&child_handler; ############################################################################### # # how to munge things # # optimise the list matching my $allowed = join "|", @allowed; my $shite_site = join "|", @shite_sites; # information needed for URL rewriting in httpd mode my ($localhost,$remotehost); my %munger = do { my $dquotstr = '"[^"]*"'; # match double-quoted string my $squotstr = "'[^']*'"; # match single-quoted string my $word = "[^>='\"\\s-]+"; # an attribute value my $notstr = "[^>\'\"]"; # text that isn't a string and doesn't end a tag # no quantifier on $notstr because it's used inside quantified brackets # handlers for tags my %taghandler = map {( $_ => sub {""} )} @shite_html_tags, map "/$_", @shite_html_tags; if ($framemode eq 'lynx') { $taghandler{'frame'} = sub { my $tag = shift; my $attr = shift; my ($s,$t,$d) = @$attr{'src', 'title', 'longdesc'}; $d ||= $s; $t &&= "$t: "; return "

Frame: $t$d

"; } ; } if ($imagemode eq 'lynx') { $taghandler{'img'} = sub { my $tag = shift; my $attr = shift; my ($s,$a) = @$attr{'src', 'alt'}; $a ||= $s; $a =~ s/"(.*)"/$1/; return "$a (image)"; } ; } msg 3, "we have handlers for the tags ", join ', ', keys %taghandler; # handlers for attributes my $attrkiller = sub { my $attr = shift; my $val = shift; msg 3, "attrkiller removing $attr"; delete $val->{$attr}; } ; my %attrhandler = map {( $_ => $attrkiller )} @shite_html_attrs; if ($mode eq 'httpd') { my $urlmangler = sub { my $attr = shift; my $val = shift; msg 3, "urlmangler $localhost $remotehost $val->{$attr}"; $val->{$attr} =~ s|^["']?http://(.*)["']?$|"http://$localhost/$1"|; $val->{$attr} =~ s|^["']?/(.*)["']?$|"/$remotehost/$1"|; msg 3, "urlmangler output = $val->{$attr}"; } ; $attrhandler{'src'} = $urlmangler; $attrhandler{'href'} = $urlmangler; } msg 3, "we have handlers for the attributes ", join ', ', keys %attrhandler; # match a whole tag my $tagarg = "$notstr|$dquotstr|$squotstr"; my $tag_re = "<\\s*(/?$word)\\s*((?:$tagarg)*)>"; msg 3, "Regexp to match tags: $tag_re"; # match an attribute my $attrval = "$dquotstr|$squotstr|$word"; my $attr_re1 = "($word\\s*=\\s*(?:$attrval)|$word)"; my $attr_re2 = "($word)\\s*=\\s*($attrval)|($word)"; msg 3, "Regexp to match attributes: $attr_re2"; # dispatch the appropriate handler functions my $dispatcher = sub { my $tag = lc shift; my $attr = shift; msg 3, "dispatching $tag $attr"; my @attr = ($attr =~ m/$attr_re1/gio); msg 3, "attributes are @attr"; my %attr = alc map { m/$attr_re2/io; $2 ? ($1 => $2) : ($3 => "") } @attr; msg 3, "parsed attributes are ", join " ", map $attr{$_} ? "$_=$attr{$_}" : $_, keys %attr; foreach $attr (keys %attr) { $attrhandler{$attr}->($attr, \%attr) if defined $attrhandler{$attr}; } my $ret; if ($taghandler{$tag}) { return $taghandler{$tag}->($tag, \%attr); } else { my $ret = join " ", $tag, map $attr{$_} ? "$_=$attr{$_}" : $_, keys %attr; return "<$ret>"; } } ; # save a fragment of a tag at the end of a buffer my $pdqstr = '"[^"]*'; my $psqstr = "'[^']*"; # partial strings # the [^!] prevents matching comments that may contain unmatched quotes my $tagfrag = "(<|<[^!](?:$tagarg)*(?:$pdqstr|$psqstr)?)\$"; my $fragment; msg 3, "Regexp to match tag fragments: $tagfrag"; my $htmlmunger = sub { my ($html) = $fragment . shift; msg 3, "Munging html. length = ", length $html; # call the dispatcher for each tag $html =~ s/$tag_re/&$dispatcher($1,$2)/gioe; # save fragments; if the html is ok this will be empty at eof $fragment = ($html =~ s/$tagfrag// ? $1 : ""); msg 3, "Munged. fragment = $fragment"; return $html; } ; my $gifmunger = sub { my $gif = $fragment.shift; return $gif if $gif !~ /^GIF/; # $magic & 128 indicates the presence of a gct # $magic & 7 is the size of the gct my $magic = substr $gif, 10, 1; my $gctlen = ($magic & 128) ? 2 << ($magic & 7) : 0; msg 3, "Munging GIF, gctlen = $gctlen"; if (13 # header and logical screen descriptor + $gctlen*3 # global colour table + 19 # animation info < length $gif) { $gif =~ s|\x21\xff\x0bNETSCAPE|\x21\xfe\x0bNETSCAPE|; $gif =~ s|\x21\xff\x0b\xffNETSCAPE|\x21\xfe\x0b\xffNETSCAPE|; } else { $fragment = $gif; $gif = ""; } return $gif; } ; ('text/html' => $htmlmunger, 'image/gif' => $gifmunger); } ; ############################################################################### # # set up a user-agent for the fetches # (it's global for efficiency reasons) # # a version of LWP::UserAgent that doesn't do redirects or authentication @HTMLF::UserAgent::ISA = qw(LWP::UserAgent); sub HTMLF::UserAgent::redirect_ok { 0 } sub HTMLF::UserAgent::request { LWP::Debug::trace('()'); my $self = shift; $self->simple_request(@_); } my $useragent = new HTMLF::UserAgent; $useragent->agent($verstr); # AFAICT, leaving parse_head on causes the response object to change # (by acquiring headers) while html content is received; however, # we've probably already sent the response, so there seems little # point in doing this extra parsing. Alternatively, perhaps subverting # HTML::HeadParser so that it does the mangling would be a cute # implementation... $useragent->parse_head(0); # set up proxying config if ($env_proxy) { $useragent->env_proxy; } else { $useragent->no_proxy(@no_proxy); while (@proxies) { my $scheme = shift @proxies; my $proxy = shift @proxies; $useragent->proxy($scheme, $proxy); } } ############################################################################### # # main loop # main: while (1) { my $client = $daemon->accept; $client or bomb "accept: $!"; my $clientname = gethostbyaddr $client->peeraddr, AF_INET; msg 2, "Connection from ", $client->peerhost, " [", $clientname || "?", "] port ", $client->peerport; if ($paranoid) { my ($name, $aliases, $addrtype, $length, @addrs) = gethostbyname $clientname; unless (grep { $client->peeraddr eq $_ } @addrs) { msg 2, "Paranoid DNS lookup failed -- reverse addrs are ", join ", ", map { inet_ntoa $_ } @addrs; next; } } if ($client->peerhost !~ m/$allowed/io && $clientname && $clientname !~ m/$allowed/io) { msg 2, "Access denied to client"; next; } my $pid = fork; msg 1, "fork: $!" unless defined $pid; msg 3, "Forked: $pid" if $pid; next if $pid != 0; # closes $client as a side-effect do_client $client; close $client; exit; } ############################################################################### # # handle a connection from a client # sub do_client ($) { my $client = shift; my $request = $client->get_request; if (!defined $request) { msg 1, "get_request: ", $! || "null request"; } else { do_request $client, $request; } } ############################################################################### # # deal with a request # sub do_request ($$) { my ($client, $request) = @_; my $url = $request->url; msg 2, "Request: $url"; $request->headers->scan(headermsg('>')); if ($url->scheme !~ /^(http|gopher|ftp|nntp)$/) { respond403 $client, "This proxy cannot handle @{[$url->scheme]} URLs.\n"; } # Randal's funky httpd/proxy mode URL check hack elsif ($mode ne "proxy" and $url->rel->netloc) { respond403 $client, "Absolute URLs are invalid in httpd mode.\n"; } elsif ($mode ne "httpd" and not $url->rel->netloc) { respond403 $client, "Relative URLs are invalid in proxy mode.\n"; } elsif ($url =~ m/$shite_site/io) { respond403 $client, "That site is too shite."; } else { foreach my $regexp (keys %move_site) { return relocate $client, $url if $url =~ s/$regexp/$move_site{$regexp}/i; } if ($mode ne 'httpd') { do_valid_request $client, $request; } else { if ($url =~ s|http://([^/]+)/([^/]+)/(.*)|http://$2/$3|) { $localhost = $1; $remotehost = $2; $request->url($url); $request->header(Host => $remotehost); do_valid_request $client, $request; } elsif ($url =~ s|http://([^/]+)/([^/]+)|http://$1/$2/|) { redirect $client, $url; } else { respond403 $client, "Please use a URL of the form ${url}www.example.com/\n"; } } } } ############################################################################### # # pass a valid request on to a server # sub do_valid_request ($$) { my ($client, $request) = @_; $request->remove_header(@shite_client_headers); $request->header(%nice_client_headers); msg 3, "Cleaned request:"; $request->headers->scan(headermsg('>')); msg 3, "Fetching..."; my $munger = undef; my $donehead = 0; my $response = $useragent->request($request, sub { my ($content, $response) = @_; $munger = rcv_headers $response, $client unless $donehead++; # handle pipelined content transfer msg 3, "Received ", length($content), " bytes"; $content = &$munger($content) if $munger; $client->print($content); msg 3, "Sent ", length($content), " bytes"; } ); # in case of empty content rcv_headers $response, $client unless $donehead++; msg 3, "Done."; } ############################################################################### # # handle response headers # sub rcv_headers ($$) { my ($response, $client) = @_; msg 2, "Response: ", $response->status_line; $response->headers->scan(headermsg('<')); my $munger = $munger{$response->content_type}; # this header is no longer valid $response->remove_header('Content-Length') if $munger && $response->content_length; $response->remove_header(@shite_server_headers); $response->header(%nice_server_headers); msg 3, "Cleaned response: "; $response->headers->scan(headermsg('<')); $client->send_response($response); msg 3, "Sent response"; return $munger; } ############################################################################### # # canned responses # sub respond403 ($$) { my ($client, $content) = @_; my $response = HTTP::Response->new(403, "Forbidden"); $response->content($content); msg 2, "Response: ", $response->status_line; $response->headers->scan(headermsg('<')); $client->send_response($response); msg 3, "Done."; } sub relocate ($$) { my ($client, $url) = @_; my $response = HTTP::Response->new(302, "Moved Temporarily"); $response->header('location', $url); msg 2, "Response: ", $response->status_line; $response->headers->scan(headermsg('<')); $client->send_response($response); msg 3, "Done."; } sub redirect ($$) { my ($client, $url) = @_; my $response = HTTP::Response->new(301, "Moved Permanently"); $response->header('location', $url); msg 2, "Response: ", $response->status_line; $response->headers->scan(headermsg('<')); $client->send_response($response); msg 3, "Done."; } ############################################################################### # # prettyprint headers # sub headermsg ($) { my ($quotechar) = @_; return sub { my ($header,$value) = @_; $value =~ s/\n/ /; msg 3, "$quotechar $header: $value"; } } ############################################################################### # # messages # sub msg (@) { my $level = shift; print STDERR scalar gmtime, " : $0 ($$) : ", @_, "\n" unless $level > $msglevel; } sub bomb (@) { msg 1, @_; exit; } # taking liberties with someone else's private parts here sub LWP::Debug::_log { my $msg = shift; $msg =~ s/\n$//; # ensure no trailing "\n" my($package,$filename,$line,$sub) = caller(2); msg 0, "$sub: $msg"; } ############################################################################### # # lower-case alternate elements of an array # sub alc (@) { my $i; map { $i++ & 1 ? $_ : lc $_ } @_; } ###############################################################################