#!/usr/local/bin/perl -w
#
# Copyright (c) 2005 David Hiebeler
# For licensing information, see the "printLicense" function below.
#
# File: cedictnew, version 1.2.1, July 2005
# By: David Hiebeler
#     Dept of Mathematics and Statistics
#     University of Maine
#     Orono, ME 04469-5752
#     http://www.math.umaine.edu/faculty/hiebeler
#
# Version 1.2.1, July 2005
#
#
# This is a perl script for operating on CEDICT-format files (see
# "http://www.mandarintools.com/cedict.html" for information about CEDICT).
# First it reads definitions from one or more files.  Then, it reads
# definitions from a second set of files.  It only outputs entries from the
# second set of files which were not contained in the first set.  This is e.g.
# a good way to take a set of personal vocabulary files you have and find
# out which ones are not contained in CEDICT, so that you can submit them
# to the CEDICT maintainers.
#
# Usage: cedictnew [-o outFile] [-newtype 0|1|2|3] file1a file2a file3a ... + file1b file2b file3b ...
#   The "+" flag separates the first and second sets of files.
#
# The default newtype is 3 (see explanation below).
#
# Matches are determined by both the Chinese and pinyin fields, because
# sometimes a Chinese character has different pronunciations (and also
# different meanings).  So if the Chinese field matches in two entries,
# but the pinyin does not, they are NOT considered to be the same.
# And if the Chinese and pinyin match, the two entries are considered
# to be a match, even if the English definitions are different.
#
# When checking files, if the current entry does NOT match any seen yet,
# it will be appended to the output.
#
# If the current entry HAS already been seen, then one of several things
# will happen, depending on which "newtype" you specify (0, 1, 2, or 3):
#  0) The current entry will simply be discarded, even if any of its English
#     definitions are different from the earlier ones.
# 1) The current entry will still be displayed on the output, but the special # field "/!!!!!/" will be appended to its English definition. This is # basically to let you do checks by hand, by giving you something easy # to search for. # 2) The current entry's English definitions will be compared with previous # matching entries for this Chinese/pinyin term. If there are any # English definitions which have not been seen yet, then this term will # be displayed on the output with only the new English definitions. # 3) This is like option 2 in that only English definitions which have not been # seen before will be displayed, but the special "/!!!!!/" definition will # also be appended to any terms that were displayed in this way. This is # so you can check the merged terms by hand. This is the default behavior. # # You can use the "-nma x" argument to set the "NeutralMatchesAny" flag. # The value "x" should be either 0 or 1. If you use 1, it means a neutral # tone (i.e. tone 5) matches any tone. This is because it's a pretty # common mistake (at least it used to be, for me) to put the character's # regular tone in the pinyin field for a character, if I don't realize its # tone becomes neutral in that particular word. E.g. consider the word # "jie4 zhi5", which means "(finger) ring", I might have listed it as # "jie4 zhi3" in my vocabulary files, since that "zhi" character normally # has the third tone when it's on its own. This option helps catch such # things when merging. # # You can use the "-uu2u:" command-line argument to turn all pinyin entries # like "nuu3" into "nu:3", and the "-u:2uu" argument to do the opposite, # i.e. turn "nu:3" into "nuu3". (This feature is available because # both forms have appeared in various versions of CEDICT). # # This script should work correctly on both GB and BIG5 files. # It is not yet tested on UTF-8. 
#
# Note that this script will exit if it encounters any lines not
# in cedict format, with the following exception:  it will ignore (and
# discard) any blank lines, and discard any comments which begin
# with '#' (whether the comment is the only thing on a line, or at the
# end of a line).  You may want to use the "cedictcheckformat" script
# first to catch any lines in your vocabulary file which are not in strict
# CEDICT format.  You may also want to use "cedictsort" after merging,
# to sort the results.
#
# Wishlist:
#  o) Allow the option of ignoring the pinyin field, since then it could
#     catch entries which have mistakes in the pinyin (other than just
#     mistakes about characters changing to neutral tone).  It would probably
#     be best to do this only for multi-character words, so that it wouldn't
#     flag all of the single characters which have multiple pronunciations.
#
# History:
#  22 July 2005: original version, 1.2.1 (numbered to be in synch w/ package)

# Define a couple of constants.  $uConvert is compared against these two
# values to decide which pinyin conversion (if any) was requested.
$uu2uc = 1;    # selected by "-uu2u:"
$uc2uu = 2;    # selected by "-u:2uu"

# Debugging output is off unless "-V" is given.
$debug = 0;

#
# Print the license text on standard output.
# Takes no arguments.
#
sub printLicense {
    print <<"END_OF_LICENSE";
cedictnew version 1.2
June 24, 2005
Copyright (C) 1998,1999 David Hiebeler
Dept of Mathematics and Statistics
University of Maine
Orono, ME 04469-5752
http://www.math.umaine.edu/faculty/hiebeler

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
END_OF_LICENSE
}

#
# Set up default parameter values.
# Sets the globals $outFname, $neutralMatchesAny, $uConvert and $newType.
#
sub setupdefaults {
    $outFname          = "-";    # "-" means standard output
    $neutralMatchesAny = 0;
    $uConvert          = 0;      # neither $uu2uc nor $uc2uu: no conversion
    # The header comment and the usage message both document 3 as the
    # default newtype, so the code has to agree with them.
    $newType           = 3;
}

#
# Print a usage message and exit.
# Does not return: exits with status 2.
#
sub printusage {
    print "Usage: $0 [-V] [-license] [-nma 0|1] [-o outFname] [-newtype 0|1|2|3] [-uu2u: | -u:2uu] file1a [file2a ...] + file1b [file2b ...]\n";
    print " The \"+\" separates the first set of files from the second set; only entries\n";
    print " of the second set which are not in the first set are output.\n";
    print " -V: turn on verbose (debugging) info\n";
    print " -license: print the license and exit\n";
    print " -nma 0|1 : Neutral Matches Any (1 for on, 0 for off) -- for comparing two\n";
    print " entries to see if they are the same; on means neutral tone\n";
    print " matches any tone\n";
    print " -o outFname : specify where to put the output (default = stdout)\n";
    print " -newtype 0|1|2|3 : 0: Don't output duplicate entries\n";
    print " 1: Output duplicates as is, with extra '/!!!!!/' at end of English def\n";
    print " 2: Output duplicates by displaying only new unique English definitions\n";
    print " 3: combo of 1,2: show dupes intelligently, but also mark with '/!!!!!/'\n";
    print " (default = 3: intelligent marked display of unique definitions)\n";
    print " -uu2u: : Turn pinyin entries like `nuu3' into `nu:3' (default = don't)\n";
    print " -u:2uu : Turn pinyin entries like `nu:3' into `nuu3' (default = don't)\n";
    exit 2;
}

#
# Read a line, removing comments which begin with "#", and ignoring
# empty lines (or lines which only have a comment).
#
# With no argument the line is read from <> (the files named in @ARGV, or
# standard input); with one argument it is read from that filehandle.
# Returns the cleaned-up line, or nothing (undef in scalar context) once the
# input is exhausted.  $getlinelinenum counts every raw line read, including
# the skipped ones.
#
$getlinelinenum = 0;

sub getline {
    die "getline must be called with at most one argument" if @_ > 1;
    my $fh = $_[0];

    # A lexical is used for the line so the caller's $_ is left alone.
    while (defined(my $line = defined($fh) ? <$fh> : <>)) {
        $getlinelinenum++;
        next if $line =~ /^\s*#/;    # comment-only line
        next if $line =~ /^\s*$/;    # blank line
        $line =~ s/#.*$//;           # strip a trailing comment
        # chomp, not chop: the last line of a file may lack a newline, and
        # chop would then remove the closing "/" of the entry instead.
        chomp $line;
        return $line;
    }
    return;
}

#
# Return 1 if the two strings have the same pinyin, otherwise return 0
# The two strings are in $_[0] and $_[1]
#
# The strings are split on whitespace and compared syllable by syllable.
# If the global $neutralMatchesAny is true, a syllable ending in tone 5 also
# matches the same syllable followed by any single digit.
#
sub samePinyin {
    my @words1 = split(" ", $_[0]);
    my @words2 = split(" ", $_[1]);

    return 0 if scalar(@words1) != scalar(@words2);

    for my $i (0 .. $#words1) {
        my ($word1, $word2) = ($words1[$i], $words2[$i]);
        next if $word1 eq $word2;
        return 0 unless $neutralMatchesAny;

        # Whichever syllable is in the neutral tone supplies the base
        # spelling; the other one must be that base plus one tone digit.
        my ($neutral, $other) =
            ($word1 =~ /5\z/) ? ($word1, $word2) : ($word2, $word1);
        return 0 unless $neutral =~ /\A(.*)5\z/s;
        my $base = $1;
        # \Q...\E keeps characters such as ":" "(" or "." in the pinyin from
        # being treated as regex syntax.
        return 0 unless $other =~ /\A\Q$base\E\d\z/;
    }

    # if we got this far, it must have been a match
    return 1;
}

# merge English definitions into a definition list
#   $_[0] contains the original definition string, containing one or more
#     English definitions
#   $_[1] contains the new definitions
#   If $_[2] contains 2: intelligently merge new def (i.e.
#     append it if it's
#     not already in the existing list of definitions, otherwise do nothing)
#   If $_[2] contains 3: intelligently merge new def, and if we actually
#     append anything to the original defs, mark the new definition string
#     with /!!!!!/ if that mark isn't already there
#
# Returns a "/"-delimited string holding ONLY the definitions from $_[1]
# which do not occur in $_[0] (each one listed once), or "" if there are
# none.  The original string itself is not modified; callers append the
# returned text to their stored entry themselves.
sub newEnglish {
    my ($origEnglishStr, $newEnglishStr, $mergeType) = @_;

    # Both strings start with "/", so split yields an empty first field,
    # which is discarded.
    my @origEnglish = split("/", $origEnglishStr);
    my @newEnglish  = split("/", $newEnglishStr);
    shift(@origEnglish);
    shift(@newEnglish);

    if ($debug) {
        print "# origEnglish:\n# ";
        foreach my $englishWord (@origEnglish) {
            print "`$englishWord' ";
        }
        print "\n";
        print "# newEnglish:\n# ";
        foreach my $englishWord (@newEnglish) {
            print "`$englishWord' ";
        }
        print "\n";
    }

    my %seenEnglish = map { $_ => 1 } @origEnglish;
    my @freshDefs;
    foreach my $newE (@newEnglish) {
        if ($seenEnglish{$newE}) {
            if ($debug) {
                print "# word was old: `$newE'\n";
            }
            next;
        }
        if ($debug) {
            print "# got new word: `$newE'\n";
        }
        # Remember it, so a definition repeated within the new entry is
        # reported only once.
        $seenEnglish{$newE} = 1;
        push @freshDefs, $newE;
    }

    return "" unless @freshDefs;

    my $newStr = "/" . join("/", @freshDefs) . "/";
    if ($mergeType == 3 && $newStr !~ m@/!!!!!/@) {
        # add special "!!!!!" field to English
        # definition, if it isn't there already
        $newStr .= "!!!!!/";
    }
    return $newStr;
}

#
# read in a vocabulary file
#   Filename to read from is in $_[0]
#   Reference to array of references to hashes for first vocabulary list
#     to use is in $_[1]
#   Reference to array of references to hashes for second vocabulary list
#     to use is in $_[2]
#   If $_[3] is 1, we are saving to "hidden" vocab list, which won't
#   be displayed.  If $_[3] is 2, we are processing the second batch of
#   files, and should check against things we've already seen.
#
# Each accepted line has the form "[levels] chinese [pinyin] /english/".
# Uses the globals $uConvert, $newType, %fastMatchChineseHash (Chinese
# field -> list of indices into the first list), $vocabIndex1 and
# $vocabIndex2 (next free index in each list).  Dies if the file cannot be
# opened or if a line is not in cedict format.
#
sub readvocabfile {
    my ($fileName, $arrayRef1, $arrayRef2, $mode) = @_;
    my ($levels, $chinese, $pinyin, $english);

    open(my $inFh, '<', $fileName)
        or die "Couldn't open infile '$fileName': $!";

  READVOCABLOOP:
    # "defined" so that a line consisting of just "0" cannot end the loop.
    while (defined(my $line = getline($inFh))) {
        # handle case where line has skill level(s) at beginning
        if ($line =~ m@^\s*([0-9]+)\s*(.+)\s*\[(.+)\]\s*(/.*/)\s*$@) {
            ($levels, $chinese, $pinyin, $english) = ($1, $2, $3, $4);
            $chinese =~ s/\s+$//;    # truncate trailing spaces on chinese
            $levels .= " ";
        }
        # line doesn't have skill level numbers at beginning
        elsif ($line =~ m@^\s*(.+)\s*\[(.+)\]\s*(/.*/)\s*$@) {
            ($chinese, $pinyin, $english) = ($1, $2, $3);
            $chinese =~ s/\s+$//;    # truncate trailing spaces on chinese
            $levels = "";
        }
        else {
            $line =~ s/[\n\r]//g;
            print "Invalid line: `$line'\n";
            die "Invalid line encountered";
        }

        # Convert "uu" into "u:" or vice-versa in pinyin field,
        # if the user requested it.  /g is needed because the field holds
        # one syllable per character, so it can contain several of them.
        if ($uConvert == $uu2uc) {
            $pinyin =~ s/uu/u:/g;
        }
        elsif ($uConvert == $uc2uu) {
            $pinyin =~ s/u:/uu/g;
        }

        # Indices of first-list entries with the same Chinese field.
        my @candidates = @{ $fastMatchChineseHash{$chinese} || [] };

        if ($mode == 1) {
            # Just store this in first vocabulary list.
            # But quietly do intelligent merges as we do so.
            foreach my $i (@candidates) {
                next unless samePinyin($arrayRef1->[$i]{pinyin}, $pinyin);
                # we've seen this entry before
                my $tmpStr = newEnglish($arrayRef1->[$i]{english}, $english, 2);
                $tmpStr =~ s@^/@@;    # remove leading slash
                $arrayRef1->[$i]{english} .= $tmpStr;
                next READVOCABLOOP;
            }

            # now put everything into the main array of hashes
            # (we only get here if this entry hadn't already been seen)
            $arrayRef1->[$vocabIndex1] = {
                levels  => $levels,
                chinese => $chinese,
                english => $english,
                pinyin  => $pinyin,
            };
            push @{ $fastMatchChineseHash{$chinese} }, $vocabIndex1;
            $vocabIndex1++;
        }
        else {    # $mode is 2
            # add this to the second vocabulary list, if it's new
            foreach my $i (@candidates) {
                next unless samePinyin($arrayRef1->[$i]{pinyin}, $pinyin);
                # we've seen this entry before
                if ($newType == 0) {
                    # don't do duplicates, so just drop this entry
                    next READVOCABLOOP;
                }
                elsif ($newType == 1) {
                    # just add "!!!!!" to English definition of duplicate
                    # words -- once, even if several first-list entries
                    # match (possible when neutral tone matches any tone)
                    $english .= "!!!!!/" unless $english =~ m@/!!!!!/\z@;
                }
                else {
                    # first find the English defs which are new relative
                    # to the existing entry
                    my $tmpStr = newEnglish($arrayRef1->[$i]{english},
                                            $english, $newType);
                    # then, only output new definitions
                    next READVOCABLOOP if $tmpStr eq "";
                    $english = $tmpStr;
                    $tmpStr =~ s@^/@@;    # remove leading slash
                    $arrayRef1->[$i]{english} .= $tmpStr;
                }
            }

            # now put everything into the main array of hashes
            $arrayRef2->[$vocabIndex2] = {
                levels  => $levels,
                chinese => $chinese,
                english => $english,
                pinyin  => $pinyin,
            };
            $vocabIndex2++;
        }
    }
    close $inFh;
}

#
# Print out the vocabulary list
#   $_[0] is the filehandle to print to.  Every entry of the global
#   @wordList2 is written as one cedict-format line.
#
sub printVocab {
    my $fh = $_[0];
    foreach my $word (@wordList2) {
        print {$fh} "$word->{levels}",
            "$word->{chinese} [$word->{pinyin}] $word->{english}\n";
    }
}

##############
# Main program
##############

setupdefaults();

# Parse leading options; the first argument which is not an option is left
# in $thisarg as the first file name.  "defined" is tested rather than
# truth so that an argument of "0" does not end the loop.
while (defined($thisarg = shift(@ARGV))) {
    if ($thisarg eq "-o") {
        $outFname = shift(@ARGV);
        printusage() unless defined($outFname);
    }
    elsif ($thisarg eq "-nodup") {
        $newType = 0;
    }
    elsif ($thisarg eq "-license") {
        printLicense();
        exit(0);
    }
    elsif ($thisarg eq "-V") {
        $debug = 1;
    }
    elsif ($thisarg eq "-nma") {
        $neutralMatchesAny = shift(@ARGV);
        printusage()
            unless defined($neutralMatchesAny)
            && $neutralMatchesAny =~ /^[01]$/;
    }
    elsif ($thisarg eq "-uu2u:") {
        $uConvert = $uu2uc;
    }
    elsif ($thisarg eq "-u:2uu") {
        $uConvert = $uc2uu;
    }
    elsif ($thisarg eq "-newtype") {
        $newType = shift(@ARGV);
        # $newType is compared numerically later on, so reject anything
        # which is not one of the four documented values.
        printusage() unless defined($newType) && $newType =~ /^[0-3]$/;
    }
    else {
        last;
    }
}

# "-" (the default) means standard output; anything else is a file name.
if ($outFname eq "-") {
    $outFh = \*STDOUT;
}
else {
    open($outFh, '>', $outFname)
        or die "Couldn't open output file `$outFname'\n";
}

@wordList1 = ();
@wordList2 = ();
$vocabIndex1 = 0;
$vocabIndex2 = 0;
%fastMatchChineseHash = ();

if ($debug) {
    print "# ==== Reading first set of files ====\n";
}
while (defined($thisarg) && ($thisarg ne "+")) {
    $savedVocabIndex = $vocabIndex1;
    readvocabfile($thisarg, \@wordList1, \@wordList2, 1);
    print "# Read ", $vocabIndex1 - $savedVocabIndex,
        " new entries from file `$thisarg'\n";
    $thisarg = shift(@ARGV);
}

# Running out of arguments here means no "+" was given, so there is no
# second set of files and nothing can be output.
if (!defined($thisarg)) {
    warn "$0: no \"+\" separator found, so there is no second set of files\n";
}

if ($debug) {
    print "# ==== Reading second set of files ====\n";
}
$thisarg = shift(@ARGV);    # skip over "+"
while (defined($thisarg) && ($thisarg ne "+")) {
    $savedVocabIndex = $vocabIndex2;
    readvocabfile($thisarg, \@wordList1, \@wordList2, 2);
    print "# Added ", $vocabIndex2 - $savedVocabIndex,
        " new entries from file `$thisarg'\n";
    $thisarg = shift(@ARGV);
}

printVocab($outFh);
close($outFh) or die "Couldn't finish writing output `$outFname': $!\n";