#!/usr/local/bin/perl -w # # Copyright (c) 1998,1999,2005 David Hiebeler # For licensing information, see the "printLicense" function below. # # File: cedictlookup, version 1.2.1, July 2005 # By: David Hiebeler # Dept of Mathematics and Statistics # University of Maine # Orono, ME 04469-5752 # http://www.math.umaine.edu/faculty/hiebeler # # Version 1.2.1: July 2005 # Version 1.2: June 2005 # Version 1.1.01: June 2005 # Version 1.1: June 1999 # Version 1.0: August 1998 (under the name `chnvlookup') # # Any comments about this software are appreciated, especially # suggested improvements. # # Perl script for doing Chinese vocabulary lookup (by Chinese, pinyin, or # English) from CEDICT-format vocabulary files. # (See "http://www.mandarintools.com/cedict.html" for # information about CEDICT.) # # See cedictlookup.doc for documentation. # Quick & dirty "how-to" for those who don't like documentation: # cedictlookup -vf /my/chinese/vocab/file.gb # Then type in words in Chinese, pinyin, or English, and it will try # to look them up in your vocab file. You'll have to read the docs (or # try to decipher the output of "cedictlookup -help") if you want to use # multiple vocabulary files simultaneously, do non-exact matches, or # enable/disable FastExact searching. # # Note that this script will ignore any lines it encounters in # vocabulary files which are not in cedict format. In particular, it # will ignore any blank lines, and discard any comments which # begin with '#' (whether the comment is the only thing on a line, or # at the end of a line). You may want to use the "cedictcheckformat" # script to catch any lines in your vocabulary files which are not in # strict CEDICT format. If you don't have it already, it's also available # at my web site, at the URL above. # # This script should work correctly on both GB and BIG5 files. # It is not yet tested on UTF-8. # # # Wishlist / known bugs: # o) Allow FastExact match searching for English (although that will probably # turn out to be too much of a memory hog). # o) Allow you to change the FastExact settings without restarting # (as you can change Match Mode and Anchor Mode). # o) "0-tone" (or "forgotten-tone") pinyin lookups do not work with # Match Mode = Shorter. # o) Further improve performance of vocabulary lookups. # o) Allow some options to sort the output in different ways, e.g. # alphabetically by pinyin (using the algorithm that "cedictsort" uses), # or maybe first by length of Chinese and then alphabetically within that. # It should also remove duplicate entries after sorting. # # History: # 22 July 2005: version 1.2.1 # 24 June 2005: version 1.2 # Just renumberings to stay synchronized with the whole package. # 24 June 2005: version 1.1.01 # Updated my address info above # 08 June 1999: version 1.1 # Added "fastexact" matches in Chinese and pinyin. These use # Perl's built-in hashing algorithm to find exact matches to Chinese # words much more quickly than the code was able to do by searching # through the entire vocabulary list. # 10 Dec 1998: version 1.0.1 # added code to turn "uu" into "u:" or vice-versa in the # pinyin field if the user requests it, to handle the fact that both # forms were present in cedict for some time, or may be present in # various Chinese documents you encounter. Note, if you activate # this conversion, it will be done both when reading in the # vocabulary files, and when processing pinyin user input. # You can activate conversion by using either the "-uu2u:" or "-u:2uu" # command-line arguments. # 05 August 1998: original version, 1.0 # ################################################################ # I was using the following for timing the code; commented out for now. # use Benchmark; # Define a couple of constants $uu2uc = 1; $uc2uu = 2; sub printLicense { print <<"END_OF_LICENSE"; cedictlookup version 1.2 June 24, 2005 Copyright (C) 1998,1999,2005 David Hiebeler Dept of Mathematics and Statistics University of Maine Orono, ME 04469-5752 http://www.math.umaine.edu/faculty/hiebeler This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA END_OF_LICENSE } # # Set up default values for parameters # sub setdefaults { # 1=case-insensitive matching of English words, 0=case-sensitive matching $caseInsensitive = 1; # whether we are logging to a file (for debugging) $doLog = 0; # whether to give some extra output (for debugging) $verbose = 0; # we slightly change our behavior when called by emacs $underEmacs = 0; $HOME = $ENV{'HOME'}; # directory where vocab files are $vocabDir = "$HOME/chinese/words/vocab"; # colon-separated list of vocab files # $vocabFiles = "vocabulary.gb:syhykb3.gb:yuwen5.gb:yuyan.gb:cedict.gb"; $vocabFiles = "vocabulary.gb:yuwen5.gb:yuyan.gb:cedict.gb"; # default MatchMode and AnchorMode to use $matchMode = "exact"; $anchorMode = "start"; # What kind (if any) of conversion to do in the pinyin field # (converting "uu" to "u:" or vice-versa). $uConvert = 0; # default is to do no conversion # whether or not to remove skill level information from English field $removeSkillLevels = 1; # What kind fast exact matches to do. For now, only Chinese fast exact # matches are supported (and enabled by default). # Note that if you want the default to be "c+" for example, you should # still set this value to "c", and set the next variable to 1 (i.e. # the continueAfterFastMatch variable indicates whether or not you # want the '+'). $fastExactMatches = "c"; # whether or not to continue with general matches after a successful # fast match $continueAfterFastMatch = 0; # explicitly initialize empty hashes %fastMatchChineseHash = (); %fastMatchPinyinHash = (); } # # Print parameter values # sub printsettings { print "verbose = $verbose, underEmacs = $underEmacs, caseInsensitive = $caseInsensitive\n"; print "vocabDir = `$vocabDir'\n"; print "vocabFiles = `$vocabFiles'\n"; print "matchMode = `$matchMode', anchorMode = `$anchorMode'\n"; print "uConvert = "; if ($uConvert == 0) { print "none\n"; } elsif ($uConvert == $uu2uc) { print "uu2u:\n"; } elsif ($uConvert == $uc2uu) { print "u:2uu\n"; } else { die "Illegal value: $uConvert\n"; } print "rmskill = $removeSkillLevels\n"; print "fastExactMatches = "; if ($fastExactMatches eq "") { print "(none)\n"; } else { if ($fastExactMatches =~ m/c/) { print "chinese "; } if ($fastExactMatches =~ m/p/) { print "pinyin "; } if ($continueAfterFastMatch) { print "+\n"; } else { print "\n"; } } } # # Print usage message and exit # sub printusage { print "Usage: $0 [-v] [-vd path] [-i | +i] [-emacs] [-uu2u: | -u:2uu]\n"; print " [-vf fname1:fname2:...:fnameN] [-rmskill 0|1] [-mm e|s|l] [-am s|e|n]\n"; print " [-fastexact 0|c] [-license]\n"; print "\n"; print " -v : verbose mode (print extra info at startup and when scanning user input)\n"; print " -emacs : for use when invoked as subprocess under emacs\n"; print " -i : case-insensitive matching for English (enabled by default)\n"; print " +i : do case-sensitive matching of English words\n"; print " -vd path : set vocabDir, directory containing vocabulary files\n"; print " -vf fname1:fname2:...:fnameN : colon-separated list of vocabulary filenames\n"; print " -rmskill 0|1 : do(1) or don't(0) remove skill level info from English field\n"; print " -mm e|s|l : match mode -- (e)xact, (s)horter, (l)onger\n"; print " -am s|e|n : anchor mode -- (s)tart, (e)nd, or (n)one\n"; print " Only used for non-exact match mode; e.g. anchor mode of `start'\n"; print " means your word must be at the beginning of the match\n."; print " -uu2u: : Turn pinyin entries like `nuu3' into `nu:3' (default = don't)\n"; print " -u:2uu : Turn pinyin entries like `nu:3' into `nuu3' (default = don't)\n"; print " -fastexact 0|c : Do fast exact matches in addition to slower pattern matches\n"; print " (This option can also be abbreviated as `-fe').\n"; print " `0' means don't do any fast exact matches.\n"; print " `c' means do fast exact matches for Chinese (which uses extra memory).\n"; print " -license: Print license information and exit.\n"; exit 2; } # # Read a line from a file or stdin, removing comments which begin with "#", # and ignoring empty lines (or lines which only have a comment). # sub getline { if ($#_ == -1) { while (<>) { next if /^\s*#/; next if /^\s*$/; s/#.*$//; chop; return $_; } return undef; } elsif ($#_ == 0) { $fh = $_[0]; } else { die "getlinefp must be called with a single argument or no arguments"; } while (<$fh>) { next if /^\s*#/; next if /^\s*$/; s/#.*$//; chop; return $_; } return undef; } # # Read in a cedict-format vocabulary file # sub readvocabfile { my $chinese; my $english; my $pinyin; my $chineseLength; my $i = 0; open(INFILE, $_[0]) or die "Couldn't open infile `$_[0]'"; READVOCABLOOP: while ($line=getline("INFILE")) { # handle case where line has skill level(s) at beginning if ($line =~ m@^\s*[0-9]+\s*(.+)\s*\[(.+)\]\s*(/.*/)\s*$@) { ($chinese,$pinyin,$english) = ($1,$2,$3); $chinese =~ s/\s+$//; # truncate trailing spaces on chinese } # line doen't have skill level numbers at beginning elsif ($line =~ m@^\s*(.+)\s*\[(.+)\]\s*(/.*/)\s*$@) { ($chinese,$pinyin,$english) = ($1,$2,$3); $chinese =~ s/\s+$//; # truncate trailing spaces on chinese } else { # die "Invalid line '$line'"; # ok, don't be so harsh, we will just ignore it next READVOCABLOOP; } # print "chinese='$chinese', pinyin='$pinyin', english='$english'\n"; $chineseLength = length($chinese); # Convert "uu" into "u:" or vice-versa in pinyin field, # if the user requested it. if ($uConvert == $uu2uc) { $pinyin =~ s/uu/u:/; } elsif ($uConvert == $uc2uu) { $pinyin =~ s/u:/uu/; } # Remove skill level info from English field if requested if ($removeSkillLevels) { $english =~ s@/=[0-9]+=/$@/@; } # now put everything into the main array of hashes $wordList[$vocabIndex]->{"chinese"} = $chinese; $wordList[$vocabIndex]->{"english"} = $english; $wordList[$vocabIndex]->{"pinyin"} = $pinyin; $wordList[$vocabIndex]->{"chineseLength"} = $chineseLength; if ($fastExactMatches =~ m/c/) { # store index of this entry in a hash for fast lookup later. push @{$fastMatchChineseHash{$chinese}}, $vocabIndex; } if ($fastExactMatches =~ m/p/) { # store index of this entry in a hash for fast lookup later. push @{$fastMatchPinyinHash{$pinyin}}, $vocabIndex; } $vocabIndex++; $i++; } close INFILE; print "read $i words\n"; if ($doLog) { print LOGFP "read $i words\n"; } } # # Parse the command-line arguments # sub getargs { $thisarg = shift(); while (defined($thisarg)) { if ($thisarg eq "-v") { # verbose $verbose = 1; } elsif ($thisarg eq "-emacs") { # being run under emacs $underEmacs = 1; } elsif ($thisarg eq "-i") { # case-insensitive English matching $caseInsensitive = 1; } elsif ($thisarg eq "+i") { # case-sensitive English matching $caseInsensitive = 0; } elsif ($thisarg eq "-uu2u:") { $uConvert = $uu2uc; } elsif ($thisarg eq "-u:2uu") { $uConvert = $uc2uu; } elsif ($thisarg eq "-license") { printLicense(); exit(0); } else { $secondarg = shift; if (! defined($secondarg)) { printusage; } if ($thisarg eq "-vd") { # vocabulary directory $vocabDir = $secondarg; } elsif ($thisarg eq "-vf") { # vocabulary file(s) $vocabFiles = $secondarg; } elsif ($thisarg eq "-rmskill") { $removeSkillLevels = $secondarg; } elsif (($thisarg eq "-fastexact") || ($thisarg eq "-fe")) { if ($secondarg =~ s/0//) { $fastExactMatches = ""; } else { $fastExactMatches = ""; if ($secondarg =~ s/c//) { $fastExactMatches .= "c"; } if ($secondarg =~ s/p//) { $fastExactMatches .= "p"; } if ($secondarg =~ s/\+$//) { $continueAfterFastMatch = 1; } if ($secondarg =~ s/-$//) { $continueAfterFastMatch = 0; } } if ($secondarg ne "") { printusage; } } elsif ($thisarg eq "-mm") { # match mode $matchModeStr = $secondarg; if ($matchModeStr eq "e") { $matchMode = "exact"; } elsif ($matchModeStr eq "s") { $matchMode = "shorter"; } elsif ($matchModeStr eq "l") { $matchMode = "longer"; } else { printusage; } } elsif ($thisarg eq "-am") { # anchor mode $anchorModeStr = $secondarg; if ($anchorModeStr eq "s") { $anchorMode = "start"; } elsif ($anchorModeStr eq "e") { $anchorMode = "end"; } elsif ($anchorModeStr eq "n") { $anchorMode = "none"; } else { printusage; } } elsif ($thisarg eq "-log") { # do extra logging to a file # (for debugging) $logFname = $secondarg; $doLog = 1; } else { printusage; } } $thisarg = shift; } } # # Check to see whether a string contains Chinese, pinyin, or English. # This routine returns one of the following strings: # "english", "chinese", or "pinyin" # It also handles a couple of special flags, "-wp" or "-we" at the # beginning of the string, which indicates the user wants to do "wildcard # matching". # sub classifyCPE { $mycopy = $_[0]; # so we can modify the string $firstChar = ord($mycopy); if ($firstChar >= 128) { # if high bit on first char is set, assume chinese return "chinese"; } $mycopy =~ s/[\.,\(\)]+/ /g; # strip out periods, commas, and parentheses if (!(@words = split(" ", $_[0]))) { # split into words die "Fatal error in classifyCPE(), on `$_[0]'\n"; } # Special case -- if the beginning of the string is a "-wp", it # indicates "wildcard pinyin", i.e. the person wants to do partial matches, # letting "jin" match "jin4", "jing1", etc. if ($words[0] =~ m/^(?=-)(?:-\w+\s+)*-wp/) { $_[0] =~ s/^\s*-wp\s*//; # surgically remove the "-wp" from string # Convert "uu" into "u:" or vice-versa in pinyin field, # if the user requested it. if ($uConvert = $uu2uc) { $_[0] =~ s/uu/u:/; } elsif ($uConvert = $uc2uu) { $_[0] =~ s/u:/uu/; } @words = split(" ", $_[0]); # split string into words again foreach $word (@words) { $word =~ s/^(.*)$/\[\^\\s\]\*$1/; $word =~ s/^(.*[^0-5])$/$1\[\^\\s\]\*/; # turn word # into a nice regexp so it can # be used for matching later # (unless this word already # has a tone on it) } $_[0] = join(' ', @words); # join words back into one string # See comment below about pinyin tone 0 for explanation of this line $_[0] =~ s/0/[1-5]/g; return "pinyin"; } # If the beginning of the string is a "-we", it # indicates "wildcard english", with partial matches (very much like # "-wp" above). if ($words[0] =~ m/^(?=-)(?:-\w+\s+)*-we/) { $_[0] =~ s/^\s*-we\s*//; # surgically remove "-we" from string @words = split(" ", $_[0]); foreach $word (@words) { $word =~ s/^(.*)$/\.\*$1\.\*/; } $_[0] = join(' ', @words); return "english"; } # If we got this far, just check the first word and make a guess from that if ($words[0] =~ m/[a-zA-z]+[012345]$/) { # if the first word is letters followed by a digit from 0-5 (presumably # a pinyin tone), then assume it really is pinyin. # But pinyin tone 0 is a special case, meaning the person doesn't # actually know which tone it is, so we should replace any zero's # by a character class matching any digit 1-5. if ($_[0] =~ m/0(\s|$)/) { $forgotTone = 1; $_[0] =~ s/0/[1-5]/g; } # Convert "uu" into "u:" or vice-versa in pinyin field, # if the user requested it. if ($uConvert = $uu2uc) { $_[0] =~ s/uu/u:/; } elsif ($uConvert = $uc2uu) { $_[0] =~ s/u:/uu/; } return "pinyin"; } else { return "english"; # otherwise, assume it's English } } # # Used for sorting an array of references to hashes, by the Chinese length sub byChineseLength { $$a->{"chineseLength"} <=> $$b->{"chineseLength"}; } # # Look up a word. This is the main routine which does all the work. # The word to look up will be the first element of the argument array @_, # and the language to use (chinese, pinyin, or english) will be in $_[1]. # The matchMode and anchorMode parameters to use are passed in as # $_[2] and $_[3], respectively. # # Performance note: when MatchMode is "exact" or "longer", I use "eval" # to do the main search loop. This lets me build the pattern-matching # regexp dynamically each time this function is executed, but not have # the regexp be recompiled 12,000+ times as I loop through the # vocabulary file (or however big cedict is when you read this). # From my timings, however, this doesn't seem to make a very big difference. # If anyone has some suggestions for improving the performance of vocab # lookups, I'd appreciate hearing them, since in my opinion this routine # is somewhat slow (not surprising, since it's pretty simple-minded). # sub lookupWord { my ($i, $foundMatch, $numMatchesFound, $matchMode, $anchorMode, $fastExactMatches, $continueAfterFastMatch); my ($lastChinese, $lastPinyin, $lastEnglish, $headStr, $fastRef); my (@matchingWordList, @fastIndexArray); my %fastSeen; $lookFor = $_[0]; # just to make things more readable $languageToUse = $_[1]; # same here $caseStr = ""; if ($_[1] eq "english") { if ($caseInsensitive) { $caseStr= "(?i)"; } } elsif (($_[1] ne "chinese") && ($_[1] ne "pinyin")) { die "Unknown lookup method `$_[1]'"; } $matchMode = $_[2]; $anchorMode = $_[3]; $fastExactMatches = $_[4]; $continueAfterFastMatch = $_[5]; $foundMatch = 0; %fastSeen = (); if (($_[1] eq "chinese") && (($fastExactMatches =~ m/c/) || ($matchMode eq "exact"))) { $fastRef = $fastMatchChineseHash{$lookFor}; if (defined($fastRef)) { $foundMatch = 1; if ($matchMode ne "exact") { print "Fast exact matches:\n"; } @fastIndexArray = @{$fastRef}; foreach $i (@fastIndexArray) { print "$wordList[$i]->{chinese} [$wordList[$i]->{pinyin}] $wordList[$i]->{english}\n"; $fastSeen{$i}++; } unless ($continueAfterFastMatch) { return; } if ($matchMode ne "exact") { print "Other matches:\n"; } } } if (($_[1] eq "pinyin") && (($fastExactMatches =~ m/p/) || ($matchMode eq "exact"))) { $fastRef = $fastMatchPinyinHash{$lookFor}; if (defined($fastRef)) { $foundMatch = 1; if ($matchMode ne "exact") { print "Fast exact matches:\n"; } @fastIndexArray = @{$fastRef}; foreach $i (@fastIndexArray) { print "$wordList[$i]->{chinese} [$wordList[$i]->{pinyin}] $wordList[$i]->{english}\n"; $fastSeen{$i}++; } unless ($continueAfterFastMatch) { return; } if ($matchMode ne "exact") { print "Other matches:\n"; } } } if ($matchMode eq "exact") { if ($_[1] eq "chinese") { $matchStr = "^$lookFor\$"; } if ($_[1] eq "pinyin") { $matchStr = "^$lookFor\$"; } elsif ($_[1] eq "english") { $matchStr = "$caseStr/$lookFor/"; } # turn '/' into '\/' in search string, since we are going to eval # the pattern-match, rather than run it directly $matchStr =~ s@/@\\/@g; if ($verbose) { print "matchStr = `$matchStr'\n"; } $searchCmd = ""; $searchCmd .= "for (\$i=0; \$i < $numVocabWords; \$i++) {"; $searchCmd .= "if (\$wordList[\$i]->{$_[1]} =~ m/$matchStr/) {"; $searchCmd .= "print \"\$wordList[\$i]->{chinese} [\$wordList[\$i]->{pinyin}] \$wordList[\$i]->{english}\\n\";"; if ($doLog) { $searchCmd .= "print LOGFP \"\$wordList[\$i]->{chinese} [\$wordList[\$i]->{pinyin}] \$wordList[\$i]->{english}\\n\";"; } $searchCmd .= "\$foundMatch = 1;"; $searchCmd .= "}"; $searchCmd .= "}"; eval $searchCmd; } elsif ($matchMode eq "longer") { if ($anchorMode eq "start") { if ($_[1] eq "chinese") { $matchStr = "^$lookFor"; } elsif ($_[1] eq "pinyin") { $matchStr = "^$lookFor"; } elsif ($_[1] eq "english") { $matchStr = "$caseStr/${lookFor}[\\s/]"; } } elsif ($anchorMode eq "end") { # since Chinese characters are 2 bytes each, we need this little # bit of nonsense to avoid matching 2 bytes which straddle 2 # separate Chinese characters (it happens more than you might # expect). if ($_[1] eq "chinese") { $matchStr = "^(..)*$lookFor\$"; } elsif ($_[1] eq "pinyin") { $matchStr = "(^|\\s)$lookFor\$"; } elsif ($_[1] eq "english") { $matchStr = "[\\s/]$caseStr$lookFor/"; } } elsif ($anchorMode eq "none") { if ($_[1] eq "chinese") { $matchStr = "^(..)*$lookFor"; } elsif ($_[1] eq "pinyin") { $matchStr = "(^|\\s)$lookFor"; } elsif ($_[1] eq "english") { $matchStr = "[\\s/]$caseStr${lookFor}[\\s/]"; } } else { die "Unknown anchorMode `$anchorMode'\n"; } # turn '/' into '\/' in search string, since we are going to eval # the pattern-match, rather than run it directly $matchStr =~ s@/@\\/@g; if ($verbose) { print "matchStr = `$matchStr'\n"; } $searchCmd = ""; $searchCmd .= "for (\$i=0; \$i < $numVocabWords; \$i++) {\n"; # for: i $searchCmd .= " unless (\$fastSeen{\$i}) {\n"; # unless: fastSeen $searchCmd .= " if (\$wordList[\$i]->{$_[1]} =~ m/$matchStr/) {\n"; # if: wordList $searchCmd .= " print \"\$wordList[\$i]->{chinese} [\$wordList[\$i]->{pinyin}] \$wordList[\$i]->{english}\\n\";\n"; if ($doLog) { $searchCmd .= " print LOGFP \"\$wordList[\$i]->{chinese} [\$wordList[\$i]->{pinyin}] \$wordList[\$i]->{english}\\n\";\n"; } $searchCmd .= " \$foundMatch = 1;\n"; $searchCmd .= " }"; # if: wordList $searchCmd .= " }"; # unless: fastSeen $searchCmd .= "}"; # for: i if ($verbose) { print "searchCmd = `$searchCmd'\n"; } eval $searchCmd; } elsif ($matchMode eq "shorter") { if (($_[1] ne "chinese") && ($_[1] ne "pinyin")) { print "`shorter' matchMode can only be used with chinese and pinyin lookups\n"; return; } $numMatchesFound = 0; if ($anchorMode eq "start") { $headStr = "^"; for ($i=0; $i < $numVocabWords; $i++) { unless ($fastSeen{$i}) { $matchStr = $headStr . $wordList[$i]->{$languageToUse}; if ($lookFor =~ m/$matchStr/) { $matchingWordList[$numMatchesFound] = \${wordList[$i]}; $numMatchesFound++; $foundMatch = 1; } } } } elsif ($anchorMode eq "end") { if ($_[1] eq "chinese") { $headStr = "^(..)*"; } else { $headStr = "(^|\\s)"; } for ($i=0; $i < $numVocabWords; $i++) { unless ($fastSeen{$i}) { $matchStr = $headStr . $wordList[$i]->{$languageToUse} . "\$"; if ($lookFor =~ m/$matchStr/) { $matchingWordList[$numMatchesFound] = \${wordList[$i]}; $numMatchesFound++; $foundMatch = 1; } } } } elsif ($anchorMode eq "none") { if ($_[1] eq "chinese") { $headStr = "^(..)*"; } else { $headStr = "(^|\\s)"; } for ($i=0; $i < $numVocabWords; $i++) { unless ($fastSeen{$i}) { $matchStr = $headStr . $wordList[$i]->{$languageToUse}; if ($lookFor =~ m/$matchStr/) { $matchingWordList[$numMatchesFound] = \${wordList[$i]}; $numMatchesFound++; $foundMatch = 1; } } } } # print out the matches sorted by length (we want longer matches # displayed last where they will be seen more easily, since they # are presumably closer to the whole phrase which we wanted to match). $lastChinese = ""; $lastPinyin = ""; $lastEnglish = ""; foreach $i (sort byChineseLength @matchingWordList) { # If this entry is identical to the previous one, don't bother # printing it. Although for this to be maximally useful, # we should do alphabetical sorting first if (! ((${$i}->{chinese} eq $lastChinese) && (${$i}->{pinyin} eq $lastPinyin) && (${$i}->{english} eq $lastEnglish))) { print ${$i}->{"chinese"}, " ", ${$i}->{"pinyin"}, " ", ${$i}->{"english"}, "\n"; if ($doLog) { print LOGFP ${$i}->{"chinese"}, " ", ${$i}->{"pinyin"}, " ", ${$i}->{"english"}, "\n"; } $lastChinese = ${$i}->{chinese}; $lastPinyin = ${$i}->{pinyin}; $lastEnglish = ${$i}->{english}; } } } if ($foundMatch == 0) { print "No match found.\n"; } } sub prompt { if ($underEmacs) { print "====\n"; } else { print "\nEnter word:\n"; } if ($doLog) { print LOGFP "\nEnter word:\n"; } } ############## # Main program ############## my $thisTimeMatchMode; my $thisTimeAnchorMode; setdefaults(); # split apart arguments which have spaces in them; this is because when # this script is called from emacs, sometimes multiple arguments may # be packed into one string, separated by spaces (e.g. "-mm e"). @ARGV = split(" ", join(" ", @ARGV)); getargs(@ARGV); print "cedictlookup version 1.2 June 24, 2005\n"; print "Chinese Vocabulary Lookup\n"; print "Copyright (C) 1998,1999,2005\n"; print " David Hiebeler\n"; print " Dept of Mathematics and Statistics\n"; print " University of Maine\n"; print " Orono, ME 04469-5752\n"; print " http://www.math.umaine.edu/faculty/hiebeler\n"; print "cedictlookup comes with ABSOLUTELY NO WARRANTY.\n"; print "Enter `-license' for more information.\n"; use FileHandle; if ($doLog) { open(LOGFP, ">$logFname") or die "Couldn't open logfile `$logFname'\n"; select(LOGFP); $| = 1; # make unbuffered LOGFP->autoflush(1); print LOGFP "verbose = $verbose, underEmacs = $underEmacs, caseInsensitive = $caseInsensitive\n"; print LOGFP "vocabDir = `$vocabDir'\n"; print LOGFP "vocabFiles = `$vocabFiles'\n"; print LOGFP "matchMode = `$matchMode', anchorMode = `$anchorMode'\n"; } select(STDOUT); $| = 1; STDOUT->autoflush(1); print "cedictlookup: Reading Chinese vocabulary files...\n"; # Append "/" to vocabDir if it doesn't already have one if (! ($vocabDir =~ m@/$@)) { $vocabDir .= "/"; } if ($verbose) { printsettings(); } $vocabIndex = 0; foreach $vFile (split(/:/, $vocabFiles)) { $tmpStr = $vocabDir . $vFile; print "reading `$tmpStr'..."; if ($doLog) { print LOGFP "reading `$tmpStr'... "; } readvocabfile($tmpStr); } $numVocabWords = $vocabIndex; print "cedictlookup: Done reading vocabulary files.\n"; print "cedictlookup: Got $numVocabWords entries.\n"; prompt(); INPUTLOOP: while (defined($line = )) { chop $line; $line =~ s/^\s+//; # remove any leading spaces $line =~ s/\s+$//; # and trailing spaces if ($line eq "q") { exit 0; } $thisTimeMatchMode = $matchMode; $thisTimeAnchorMode = $anchorMode; $thisTimeFastExactMatches = $fastExactMatches; $thisTimeContinueAfterFastMatch = $continueAfterFastMatch; if ($line eq "-license") { printLicense(); prompt(); next INPUTLOOP; } if ($line eq "-q") { print "matchMode = $matchMode, anchorMode = $anchorMode\n"; print "fastExactMatches = "; if ($fastExactMatches eq "") { print "(none)\n"; } else { if ($fastExactMatches =~ m/c/) { print "chinese "; } if ($fastExactMatches =~ m/p/) { print "pinyin "; } if ($continueAfterFastMatch) { print "+\n"; } else { print "\n"; } } prompt(); next INPUTLOOP; } while ($line =~ m/^(?=-)(?:-\w+\s+)*-(mm|am) (\S+)/) { if ($1 eq "mm") { if ($2 eq "e") { $thisTimeMatchMode = "exact"; } elsif ($2 eq "s") { $thisTimeMatchMode = "shorter"; } elsif ($2 eq "l") { $thisTimeMatchMode = "longer"; } $line =~ s/\-mm \w\s*//; } elsif ($1 eq "am") { if ($2 eq "s") { $thisTimeAnchorMode = "start"; } elsif ($2 eq "e") { $thisTimeAnchorMode = "end"; } elsif ($2 eq "n") { $thisTimeAnchorMode = "none"; } $line =~ s/\-am \w\s*//; } elsif (($1 eq "fe") || ($1 eq "fastexact")) { # This code will never actually get executed, because # the "fastexact" and "fe" flags aren't in the pattern match # above. Later I may allow the user to change the FastExact # mode in the middle of things, by building the hash table # if necessary. But not yet. $firstarg = $1; $secondarg = $2; if ($secondarg =~ s/0//) { $thisTimeFastExactMatches = ""; } else { $thisTimeFastExactMatches = ""; if ($secondarg =~ s/c//) { $thisTimeFastExactMatches .= "c"; } if ($secondarg =~ s/c//) { $thisTimeFastExactMatches .= "p"; } if ($secondarg =~ s/\+$//) {$thisTimeContinueAfterFastMatch=1;} if ($secondarg =~ s/\-$//) {$thisTimeContinueAfterFastMatch=0;} } if ($secondarg ne "") {print "Illegal option for `$firstarg' argument\n";} $line =~ s/\-$firstarg \S+\s*//; } else { die "That's funny, I shouldn't be able to die this way."; } } if ($verbose) { print "line = `$line'\n"; print "mm = $thisTimeMatchMode, am = $thisTimeAnchorMode, fastExactMatches = $thisTimeFastExactMatches, continueAfterFastMatch = $thisTimeContinueAfterFastMatch\n"; } if ($line eq "") { # special case -- if we encounter an empty line, set the local # values of matchMode and anchorMode to the global copies. This # allows you to enter e.g. the line "+mm l -am n" to "permanently" # set the matchMode to "longer", and the anchorMode to "none", # at least until the user changes them again. We also display # the new settings for matchMode and anchorMode. $matchMode = $thisTimeMatchMode; $anchorMode = $thisTimeAnchorMode; $fastExactMatches = $thisTimeFastExactMatches; $continueAfterFastMatch = $thisTimeContinueAfterFastMatch; print "matchMode = $matchMode, anchorMode = $anchorMode\n"; print "fastExactMatches = "; if ($fastExactMatches eq "") { print "(none)\n"; } else { if ($fastExactMatches =~ m/c/) { print "chinese "; } if ($fastExactMatches =~ m/p/) { print "pinyin "; } if ($continueAfterFastMatch) { print "+\n"; } else { print "\n"; } } prompt(); next INPUTLOOP; } $forgotTone = 0; $typeOfInput = classifyCPE($line); if ($line =~ m/^-/) { warn "Hmm, looks like you tried to specify a flag on the input,\n"; warn "but I didn't recognize it. Ignoring input; try again!\n"; prompt(); next INPUTLOOP; } if ($verbose) { print "typeOfInput = `$typeOfInput'\n"; print "line = `$line'\n"; } if ($doLog) { print LOGFP "typeOfInput = `$typeOfInput'\n"; print LOGFP "line = `$line'\n"; } if ($forgotTone && ($thisTimeMatchMode eq "shorter")) { print "0-tone (forgotten-tone) pinyin lookups currently do not\n"; print "work with Match Mode = Shorter, sorry.\n"; prompt(); next INPUTLOOP; } # timing code is commented out for now... # $t0 = new Benchmark; if ($typeOfInput eq "pinyin") { # Convert "uu" into "u:" or vice-versa in pinyin field, # if the user requested it. if ($uConvert = $uu2uc) { $line =~ s/uu/u:/; } elsif ($uConvert = $uc2uu) { $line =~ s/u:/uu/; } } if ($underEmacs) { print "Searching...\n"; } lookupWord($line, $typeOfInput, $thisTimeMatchMode, $thisTimeAnchorMode, $thisTimeFastExactMatches, $thisTimeContinueAfterFastMatch); # $t1 = new Benchmark; # $td = timediff($t1, $t0); # print "word lookup took: ", timestr($td), "\n"; prompt(); }