Corpora: Re: Html concordancing

From: Noord G.J.M. van (vannoord@let.rug.nl)
Date: Wed May 10 2000 - 15:19:12 MET DST

  • Next message: Mark Davies: "Corpora: Subsets and "partially-tagged" corpora"

    below is a web-script that does something similar to webcorp.

    -- 
    Gertjan van Noord Alfa-informatica, RUG,  Postbus 716, 9700 AS Groningen
    vannoord at let dot rug dot nl            http://www.let.rug.nl/~vannoord/
    

    #!/usr/local/bin/perl -w # © Gertjan van Noord, 1997. # mailto:vannoord@let.rug.nl BEGIN { use CGI::Carp qw(carpout); open(LOG,">>/users1/vannoord/tmp/log/nc") or die "Unable to open /users1/vannoord/tmp/log/nc: $!\n"; carpout(*LOG); }

    use strict; use CGI qw(:standard); use LWP::UserAgent; use HTML::LinkExtor; use HTML::TreeBuilder; use HTML::FormatText;

    my %languages= ( 'XX' => 'Any language', 'zh' => 'Chinese', 'cs' => 'Czech', 'da' => 'Danish', 'nl' => 'Dutch', 'en' => 'English', 'et' => 'Estonian', 'fi' => 'Finnish', 'fr' => 'French', 'de' => 'German', 'el' => 'Greek', 'he' => 'Hebrew', 'hu' => 'Hungarian', 'is' => 'Icelandic', 'it' => 'Italian', 'ja' => 'Japanese', 'ko' => 'Korean', 'lv' => 'Latvian', 'lt' => 'Lithuanian', 'no' => 'Norwegian', 'pl' => 'Polish', 'pt' => 'Portuguese', 'ro' => 'Romanian', 'ru' => 'Russian', 'es' => 'Spanish', 'sv' => 'Swedish', );

    my ($url,$ignore_url,@links,$word);

    param('nr',0) unless param('nr'); param('nr',200) unless param('nr') < 200;

    param('lang','nl') unless param('lang'); param('lang','XX') unless exists $languages{param('lang')};

    $word=param("word");

    my %engines = ( 'surfnet' => sub { $url='http://search.surfnet.nl/cgi-bin/search.pl?nbq=10&fmt=c&lang='. param('lang').'&Web=on&zoekterm=' . $word . '&stq='.param('nr'); $ignore_url='^[/#]|surfnet.nl|pdf|mailto:|altavista'; }, 'altavista' => sub { $url='http://www.altavista.com/cgi-bin/query?pg=q&kl='.param('lang'). '&text=yes&q=' . $word . '&stq='.param('nr'); $ignore_url='^[/#]|www.altavista.com'; }, 'telegraaf' => sub { $url='http://green.telegraafnet.nl/?query='.$word . '&page='.int((param('nr')+10)/10); $ignore_url='^[/#]|telegraaf|www.autovisie.nl|www.prive.nl|www.linux.org'; } );

    if (param('engine')) { param('engine','surfnet') unless exists $engines{param('engine')}; }

    my $opt_s='(?:(?:[\.\?\!][\'\"]*)\s+|(?:[\t ]*\n\n\s*))'; # end-of-sentence (wie het weet mag het zeggen). # herkent geen afkortingen...

    print header, start_html(-'title'=>'NetKwic', -'author'=>'vannoord@let.rug.nl', -'style'=>{'src'=>'/~vannoord/vn.css'}), h1('NetKwic'), start_form, "Type in a word:\n", textfield(-'name'=>'word'), radio_group(-'name'=>'engine', -'values'=>['surfnet','altavista','telegraaf'], -'default'=>'surfnet'), popup_menu(-'name'=>'lang', -'values'=> [ sort { $languages{$a} cmp $languages{$b} }(keys(%languages))], -'default'=>'nl', -'labels'=>\%languages), submit();

    if ($word && param('engine')) { my $ua = new LWP::UserAgent; $ua->agent('NetKwic © 1997 G.J. van Noord, vannoord@let.rug.nl'.$ua->agent); $ua->timeout(10); $ua->max_size(30000);

    $engines{param('engine')}->();

    my $self = self_url; $self =~ s/&nr=[0-9]+//g;

    print "<a href=", $self, "&nr=", param('nr')+10, ">More with <i>$word</i></a><br><p><ol>\n";

    # links extractor. my $p = HTML::LinkExtor->new( sub { my ($tag,$attr,$link)=@_; if ( $tag eq 'a' && $attr eq 'href' && $link !~ /$ignore_url/o) { push(@links,$link); } });

    my $req=new HTTP::Request GET => $url; my $res = $ua->request($req,sub {$p->parse($_[0])}); die "Error: " . $res->code . " " . $res->message ."\n" if $res->is_error; my $link; foreach $link (@links) { printf("<li><a href=%s>%s</a><br>\n<table>\n",$link,$link); $req = new HTTP::Request GET => $link; $res = $ua->request($req); if ($res->is_success) { if ($res->header('Content_Type') =~ m|text/|) { my $html=HTML::TreeBuilder->new(); $html->parse($res->content); my $formatter=HTML::FormatText->new(); my $input = $formatter->format($html); $input =~ tr/\r//d; # does this get through? $input =~ s/\s*[-=][-=][-=]+\s*/\n\n/sg; $input =~ s/\[IMAGE\]/\n\n/sg; $input =~ s/\n +/\n/sg; $input =~ s/\[TABLE NOT SHOWN\]/\n\n/sg; my $znr; while ($input =~ /.*?(?:$opt_s)/sg) { $_=$&; $znr++; tr/\n\t / /s; if (/$word/io) { s/($word)/<b>$1<\/b>/ig; printf("<tr><td>%s</td><td>%s</td></tr>",$znr,$_); } } print "</table>\n"; if ($res->header('X-Content-Range')) { print "(truncated)"; } } else { #not text print "</table><i>\n"; print $res->header('Content_Type'); print "</i></li>\n"; } } else { #link failed print "</table><i>\n"; print $res->code() . ": ".$res->message(); print "</i></li>\n"; } print "</li>\n"; } } print end_form;

    print "</ol>", hr(), end_html;



    This archive was generated by hypermail 2b29 : Wed May 10 2000 - 15:18:18 MET DST