#!/usr/bin/env jperl # -*-perl-*- ## Googlia-0.1 ## by Yusuke Shinyama ## ## 必要なもの: jperl5, chasen, jcode.pl ## ## つかいかた: ## $ ./googlia.prl < input.html > output.html ## use IPC::Open2; require 'jcode.pl'; $CHASEN='chasen'; # chasen のパス # 無視したい名詞のリストを @nounlist に入れてください。 @nounlist = ( "新山" ); foreach my $i (@nounlist) { $fnouns{$i} = 1; } sub noun($$) { my ($n, $r) = @_; if (4 <= $r && $n !~ /^[ぁ-ん]+$/) { $n =~ s/^\s+//; $n =~ s/\s+$//; my $x = join('', map { sprintf("%%%02x",$_) } unpack('C*', jcode'sjis($n))); printf '%s', $x, $n; } else { print $n; } } sub conv($) { my ($t) = @_; my $s = '', $nouns = 0, $r = 0; $t =~ s/\s/__SPC__/g; print TO "$t\n"; while() { chop; if ($_ eq 'EOS') { if ($nouns) { noun($s, $r); } else { print $s; } return; } my @F = split("\t", $_); $F[0] =~ s/__SPC__/ /g; if ($F[3] =~ /^名詞/ || $F[3] =~ /^接頭辞/ || $F[3] =~ /^未知語/) { if (!$nouns) { print $s; $s = ''; } $nouns = 1; $s .= $F[0]; $r += length($F[0]) if (!$fnouns{$F[0]}); } else { if ($nouns) { noun($s, $r); $s = ''; } $nouns = 0; $r = 0; $s .= $F[0]; } } } open2(FROM, TO, $CHASEN); select(TO); $|=1; select(STDOUT); while(<>) { chop; my $x, $s = jcode'euc($_); while($s ne '') { if ($s =~ /^(<[^>]+>)/ || $s =~ /^(\s+)/) { $x = $1; print $x; if ($s =~ /^/) { $link = 0; } } elsif ($s =~ /^([^<]+)/) { $x = $1; if (!$link && $x =~ /[ぁ-んァ-ヴー亜-熙々]/) { conv($x); } else { print $x; } } else { last; } $s = substr($s, length($x)); } print "\n"; } close(FROM);