#!/usr/local/bin/perl -w
#
# Copyright (c) 1998,2005 David Hiebeler
# For licensing information, see the "printLicense" function below.
#
# File: cedictmerge, version 1.2.1, July 2005
# By: David Hiebeler
#     Dept of Mathematics and Statistics
#     University of Maine
#     Orono, ME 04469-5752
#     http://www.math.umaine.edu/faculty/hiebeler
#
# Version 1.2.1: July 2005
# Version 1.2: June 2005
# Version 1.1.01: June 2005
# Version 1.1: December 1998
# Version 1.0: July 1998
#
#
# This is a perl script for merging one or more CEDICT-format files (see
# "http://www.mandarintools.com/cedict.html" for
# information about CEDICT).
#
# Usage: cedictmerge [-o outFile] [-mergetype 0|1|2|3] file1 [file2 file3 file4 ... fileN]
#
# The program sequentially reads through all of the files in order,
# sequentially processing entries as it finds them.  It doesn't really care
# which file an entry was in, thus e.g. it merges duplicates even within
# one file, which is why it can be useful to run cedictmerge on a single file.
#
# The default mergetype is 2 (see explanation below).
#
# Matches are determined by both the Chinese and pinyin fields, because
# sometimes a Chinese character has different pronunciations (and also
# different meanings).  So if the Chinese field matches in two entries,
# but the pinyin does not, they are NOT considered to be the same.
# And if the Chinese and pinyin match, the two entries are considered
# to be a match, even if the English definitions are different.
#
# When merging files, if the current entry has NOT been seen yet, it will
# be appended to the output.
#
# If the current entry HAS already been seen, then one of several things
# will happen, depending on which "mergetype" you specify (0, 1, 2, or 3):
#   0) The current entry will simply be discarded, even if any of its English
#      definitions are different from the earlier ones.
#   1) The current entry will still be displayed on the output, but the special
#      field "/!!!!!/" will be appended to its English definition.  This is
#      basically to let you do merges by hand, by giving you something easy
#      to search for.
#   2) The current entry will be merged into previous matching entries.
#      Each English definition in the current entry will be compared with the
#      English definitions seen so far for this Chinese/pinyin term; if this
#      English definition has not been seen so far, it will be appended to
#      the definition of the earlier instance of this entry.  This is the
#      most automated option, and is the default.
#   3) This is like option 2 in that English definitions which have not been
#      seen before will be merged into already-encountered entries, except that
#      the special "/!!!!!/" definition is also appended to any terms that
#      were merged in this way.  This is so you can check the merged terms;
#      e.g. maybe two definitions were the same but one had a typo, or used
#      an abbreviation, etc. so they didn't technically match, but you want to
#      step in and fix things.
#
# You can use the "-nma x" argument to set the "NeutralMatchesAny" flag.
# The value "x" should be either 0 or 1.  If you use 1, it means a neutral
# tone (i.e. tone 5) matches any tone.  This is because it's a pretty
# common mistake (at least for me) to put the "intrinsic tone" in the pinyin
# field for a character, if I don't realize the character's tone becomes
# neutral in that particular word.  E.g. consider the word "jie4 zhi5", which
# means "(finger) ring", I would tend to list it as "jie4 zhi3" in my
# vocabulary files, since that "zhi" character normally has the third tone
# when it's on its own.  This option helps catch such things when merging.
#
# You can use the "-uu2u:" command-line argument to turn all pinyin entries
# like "nuu3" into "nu:3", and the "-u:2uu" argument to do the opposite,
# i.e. turn "nu:3" into "nuu3".
# (This feature is available because
# both forms have appeared in various versions of CEDICT).
#
# This script should work correctly on both GB and BIG5 files.
# It is not yet tested on UTF-8.
#
# Note that this script will exit if it encounters any lines not
# in cedict format, with the following exception: it will ignore (and
# discard) any blank lines, and discard any comments which begin
# with '#' (whether the comment is the only thing on a line, or at the
# end of a line).  You may want to use the "cedictcheckformat" script
# first to catch any lines in your vocabulary file which are not in strict
# CEDICT format.  You may also want to use "cedictsort" after merging,
# to sort the results.
#
# Wishlist:
#   o) Allow the option of ignoring the pinyin field, since then it could
#      catch entries which have mistakes in the pinyin (other than just
#      mistakes about characters changing to neutral tone).  It would probably
#      be best to do this only for multi-character words, so that it wouldn't
#      flag all of the single characters which have multiple pronunciations.
#
# History:
#   22 July 2005: version 1.2.1
#     The code was updated to use hashes, which makes it run MUCH faster.
#     It used to take several minutes for me to run it against the current
#     version of CEDICT, but now only takes a couple of seconds.  Note that
#     this uses more memory.  If you have very little memory and need to
#     merge large files, use an older version of this program.
#   24 June 2005: version 1.2
#     Just a renumbering to stay synchronized with the whole package.
#   24 June 2005: version 1.1.01
#     Updated my address info above, and some slight tweaks to
#     documentation above, mainly adding an example for the "-nma" argument.
#   10 Dec 1998: version 1.0.1
#     Added code to turn "uu" into "u:" or vice-versa in the
#     pinyin field if the user requests it, to handle the fact that both
#     forms have been present in cedict for some time now.
#   29 July 1998: original version, 1.0

use strict;
use warnings;

# Values of $uConvert that select the optional pinyin rewrite.
our $uu2uc = 1;    # rewrite "uu" as "u:"
our $uc2uu = 2;    # rewrite "u:" as "uu"

# Run-time options.  setupdefaults() assigns the defaults and the
# command-line parser in the main program overrides them.
our ($outFname, $neutralMatchesAny, $uConvert, $mergeType);

# Shared vocabulary state:
#   @wordList             - one hash ref per entry, with the keys
#                           "levels", "chinese", "pinyin" and "english"
#   $vocabIndex           - index at which the next entry is stored in @wordList
#   %fastMatchChineseHash - Chinese field => array ref of @wordList indices of
#                           the first entry seen for each distinct pinyin
our @wordList             = ();
our $vocabIndex           = 0;
our %fastMatchChineseHash = ();

# Number of raw input lines read so far by getline(), summed over all files.
our $getlinelinenum = 0;

#
# Print the program banner and license notice on stdout.
#
sub printLicense {
    print <<'END_OF_LICENSE';
cedictmerge version 1.2.1 July 22, 2005
Copyright (C) 1998,1999 David Hiebeler
Dept of Mathematics and Statistics
University of Maine
Orono, ME 04469-5752
http://www.math.umaine.edu/faculty/hiebeler

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
END_OF_LICENSE
}

#
# Set up default parameter values.
#
sub setupdefaults {
    $outFname          = "-";    # "-" means write to stdout
    $neutralMatchesAny = 0;
    $uConvert          = 0;      # 0 = leave the pinyin untouched
    $mergeType         = 2;
}

#
# Print a usage message (on stdout) and exit with status 2.
#
sub printusage {
    print "Usage: $0 [-nma 0|1] [-o outFname] [-mergetype 0|1|2|3] [-uu2u: | -u:2uu] file1 [ file2 file3 ... fileN]\n";
    print " -nma 0|1 : Neutral Matches Any (1 for on, 0 for off) -- for comparing two\n";
    print " entries to see if they are the same; on means neutral tone\n";
    print " matches any tone\n";
    print " -o outFname : specify where to put the output (default = stdout)\n";
    print " -mergetype 0|1|2|3 : 0: Don't output duplicate entries\n";
    print " 1: Output duplicates as is, with extra '/!!!!!/' at end of English def\n";
    print " 2: Merge duplicates by including only new unique English definitions\n";
    print " 3: combo of 1,2: merge dupes intelligently, but also mark with '/!!!!!/'\n";
    print " (default = 2: intelligent merging of unique definitions)\n";
    print " -uu2u: : Turn pinyin entries like `nuu3' into `nu:3' (default = don't)\n";
    print " -u:2uu : Turn pinyin entries like `nu:3' into `nuu3' (default = don't)\n";
    exit 2;
}

#
# Read the next meaningful line of input.
#
# Lines that are blank or contain only a comment are skipped; on any other
# line, everything from the first "#" onwards is removed, and so is the
# line terminator.
#
# Arguments: optionally a filehandle (a glob, a glob reference, a lexical
#            handle, or the name of a bareword handle).  With no argument
#            the lines come from the <> operator.
# Returns:   the cleaned line, or nothing (undef in scalar context) at end
#            of input.  Callers should test the result with defined().
# Side effect: $getlinelinenum is incremented for every raw line read.
#
sub getline {
    die "getline must be called with at most one argument" if @_ > 1;
    my $fh = $_[0];

    # Callers are allowed to pass a handle *name* such as "INFILE".
    no strict 'refs';

    while (defined(my $line = defined($fh) ? <$fh> : <>)) {
        $getlinelinenum++;
        next if $line =~ /^\s*#/;
        next if $line =~ /^\s*$/;
        $line =~ s/#.*$//;

        # Only strip a real line terminator: a final line with no newline
        # must keep its last character.
        $line =~ s/\r?\n\z//;
        return $line;
    }
    return;
}

#
# Return 1 if $neutral is a neutral-tone syllable (it ends in "5") and
# $other is the same syllable followed by any single tone digit; return 0
# otherwise.  The syllable text is matched literally.
#
sub _neutralToneMatches {
    my ($neutral, $other) = @_;
    return 0 unless $neutral =~ /\A(.*)5\z/s;
    my $base = $1;
    return $other =~ /\A\Q$base\E\d\z/ ? 1 : 0;
}

#
# Return 1 if the two strings have the same pinyin, otherwise return 0.
# The two strings are in $_[0] and $_[1]; each is a whitespace-separated
# list of syllables.  When $neutralMatchesAny is true, a syllable in
# neutral tone (tone 5) on either side matches the same syllable in any
# tone on the other side.
#
sub samePinyin {
    my @words1 = split(" ", $_[0]);
    my @words2 = split(" ", $_[1]);

    return 0 if @words1 != @words2;

    for my $i (0 .. $#words1) {
        my ($word1, $word2) = ($words1[$i], $words2[$i]);
        next if $word1 eq $word2;
        return 0 unless $neutralMatchesAny;
        return 0
            unless _neutralToneMatches($word1, $word2)
                || _neutralToneMatches($word2, $word1);
    }

    # if we got this far, it must have been a match
    return 1;
}

#
# Merge English definitions into a definition list.
#
# $_[0] contains the original definition string, containing one or more
#       English definitions in the form "/def1/def2/.../"
# $_[1] contains the new definitions, in the same form
# $_[2] is the merge type (2 if omitted):
#       2: append every new definition which is not already present
#       3: as 2, and if anything was actually appended, also append the
#          "/!!!!!/" marker unless the string already contains it
#
# Returns the merged definition string.
#
sub mergeEnglish {
    my ($origEnglishStr, $newEnglishStr, $mergeType) = @_;
    $mergeType = 2 unless defined $mergeType;

    my @origEnglish = split("/", $origEnglishStr);
    my @newEnglish  = split("/", $newEnglishStr);

    # Both strings start with "/", so the first field of each is empty.
    shift(@origEnglish);
    shift(@newEnglish);

    my %seenEnglish = map { $_ => 1 } @origEnglish;

    # Must be local to this call: a leftover true value from an earlier
    # call would mark entries to which nothing was added.
    my $changedDef = 0;

    foreach my $newE (@newEnglish) {
        next if $newE eq "";

        # Recording the definition here also stops one which is repeated
        # inside the new entry from being appended twice.
        next if $seenEnglish{$newE}++;
        $origEnglishStr .= $newE . "/";
        $changedDef = 1;
    }

    if ($changedDef && $mergeType == 3 && $origEnglishStr !~ m@/!!!!!/@) {
        # add special "!!!!!" field to English definition
        $origEnglishStr .= "!!!!!/";
    }
    return $origEnglishStr;
}

#
# Read in a vocabulary file and add its entries to the vocabulary list,
# handling duplicates according to $mergeType.
#
# Filename to read from is in $_[0] ("-" reads standard input).
# Reference to the array of hash references to fill is in $_[1].
#
# Uses and updates the globals $vocabIndex and %fastMatchChineseHash, and
# reads $mergeType and $uConvert.  Dies if the file cannot be opened or if
# it contains a line which is not in CEDICT format.
#
sub readvocabfile {
    my ($fname, $arrayRef) = @_;

    my $in;
    if ($fname eq "-") {
        $in = \*STDIN;
    }
    else {
        open($in, '<', $fname)
            or die "Couldn't open infile '$fname': $!";
    }

    # getline() counts lines across all files; remember where this file
    # started so that errors can report a line number within the file.
    my $firstLineNum = $getlinelinenum;

    READVOCABLOOP:
    while (defined(my $line = getline($in))) {
        my ($levels, $chinese, $pinyin, $english);

        # handle case where line has skill level(s) at beginning
        if ($line =~ m@^\s*([0-9]+)\s*(.+)\s*\[(.+)\]\s*(/.*/)\s*$@) {
            ($levels, $chinese, $pinyin, $english) = ("$1 ", $2, $3, $4);
        }
        # line doesn't have skill level numbers at beginning
        elsif ($line =~ m@^\s*(.+)\s*\[(.+)\]\s*(/.*/)\s*$@) {
            ($levels, $chinese, $pinyin, $english) = ("", $1, $2, $3);
        }
        else {
            $line =~ tr/\r\n//d;
            print "Invalid line: `$line'\n";
            die "Invalid line encountered in '$fname', input line "
                . ($getlinelinenum - $firstLineNum) . "\n";
        }
        $chinese =~ s/\s+$//;    # truncate trailing spaces on chinese

        # Convert "uu" into "u:" or vice-versa in pinyin field, if the
        # user requested it.  The pinyin field may hold several syllables,
        # so every occurrence has to be converted.
        if ($uConvert == $uu2uc) {
            $pinyin =~ s/uu/u:/g;
        }
        elsif ($uConvert == $uc2uu) {
            $pinyin =~ s/u:/uu/g;
        }

        my $foundDup = 0;

        # Everything we find in the hash has the same chinese, but we need
        # to check to see if it has the same pinyin.  It may be a character
        # with different sounds/meanings.
        DUPLOOP:
        foreach my $i (@{ $fastMatchChineseHash{$chinese} || [] }) {
            next DUPLOOP unless samePinyin($arrayRef->[$i]{pinyin}, $pinyin);

            $foundDup = 1;
            if ($mergeType == 0) {
                # don't do duplicates, so don't add this entry to the
                # vocabulary list; go read the next entry.
                next READVOCABLOOP;
            }
            elsif ($mergeType == 1) {
                # just add "!!!!!" to English definition of duplicate words
                $english .= "!!!!!/";
            }
            elsif ($mergeType == 2 || $mergeType == 3) {
                # merge in the English definitions of the duplicated words.
                $arrayRef->[$i]{english} =
                    mergeEnglish($arrayRef->[$i]{english}, $english, $mergeType);
                next READVOCABLOOP;
            }
            last DUPLOOP;    # we found duplicate, so exit inner loop
        }

        # now put everything into the main array of hashes
        $arrayRef->[$vocabIndex] = {
            levels  => $levels,
            chinese => $chinese,
            english => $english,
            pinyin  => $pinyin,
        };

        # Only the first entry for a given chinese/pinyin pair is indexed.
        if (!$foundDup) {
            push @{ $fastMatchChineseHash{$chinese} }, $vocabIndex;
        }
        $vocabIndex++;
    }

    close($in) unless $fname eq "-";
    return;
}

#
# Print out the vocabulary list (the global @wordList), one entry per line.
# The filehandle to print to is in $_[0]; it may be a handle or the name
# of a bareword handle.
#
sub printVocab {
    my $fh = $_[0];

    # Callers are allowed to pass a handle *name* such as "OUTFP".
    no strict 'refs';

    foreach my $word (@wordList) {
        print {$fh} "$word->{levels}",
            "$word->{chinese} [$word->{pinyin}] $word->{english}\n";
    }
    return;
}

##############
# Main program
##############
setupdefaults();

# Parse the leading options.  The first argument which is not a recognized
# option is left in $thisarg and is treated as the first input file.
my $thisarg;
while (defined($thisarg = shift(@ARGV))) {
    if ($thisarg eq "-o") {
        $outFname = shift(@ARGV);
        printusage() unless defined $outFname;
    }
    elsif ($thisarg eq "-nodup") {
        $mergeType = 0;
    }
    elsif ($thisarg eq "-license") {
        printLicense();
        exit(0);
    }
    elsif ($thisarg eq "-nma") {
        $neutralMatchesAny = shift(@ARGV);
        printusage() unless defined $neutralMatchesAny;
    }
    elsif ($thisarg eq "-uu2u:") {
        $uConvert = $uu2uc;
    }
    elsif ($thisarg eq "-u:2uu") {
        $uConvert = $uc2uu;
    }
    elsif ($thisarg eq "-mergetype") {
        $mergeType = shift(@ARGV);

        # The merge type is compared numerically later on, so reject
        # anything which is not one of the documented values.
        printusage() unless defined $mergeType && $mergeType =~ /\A[0-3]\z/;
    }
    else {
        last;
    }
}

# An output name of "-" selects stdout.
my $out;
if ($outFname eq "-") {
    $out = \*STDOUT;
}
else {
    open($out, '>', $outFname)
        or die "Couldn't open output file `$outFname': $!\n";
}

while (defined($thisarg)) {
    my $savedVocabIndex = $vocabIndex;
    readvocabfile($thisarg, \@wordList);
    print "# Added ", $vocabIndex - $savedVocabIndex,
        " new entries from file `$thisarg'\n";
    $thisarg = shift(@ARGV);
}

printVocab($out);

# Buffered write errors only show up when the file is closed.
if ($outFname ne "-") {
    close($out) or die "Couldn't close output file `$outFname': $!\n";
}