#!/usr/local/bin/perl -w # # Copyright (c) 1998,2005 David Hiebeler # For licensing information, see the "printLicense" function below. # # # File: cedictsort, version 1.2.1, July 2005 # By: David Hiebeler # Dept of Mathematics and Statistics # University of Maine # Orono, ME 04469-5752 # http://www.math.umaine.edu/faculty/hiebeler # # Version 1.2.1: July 2005 # Version 1.2: June 2005 # Version 1.1.01: June 2005 # Version 1.1: December 1998 # Version 1.0: July 1998 # # This is a Perl script for sorting a CEDICT-format file (see # "http://www.mandarintools.com/cedict.html" for # information about CEDICT), based on pinyin romanization of the # pronunciation. It tries to sort in the standard order that you # see in most Chinese (hardcopy) dictionaries. # # Usage: cedictsort file1 [file2 file3 ...] # You can sort stdin, by using "-" as a file name, e.g. # cat myfile.gb | cedictsort - # The results are displayed on stdout. # # Use can use the "-uu2u:" command-line argument to turn pinyin entries # like "nuu3" into "nu:3", and the "-u:2uu" argument to do the opposite, # i.e. turn "nu:3" into "nuu3". (This feature is available because # both forms have appeared in various versions of CEDICT). # # This script should work correctly on both GB and BIG5 files. # It is not yet tested on UTF-8. # # Note that this script will exit if it encounters any lines not # in cedict format, with the following exception: it will ignore (and # discard) any blank lines, and discard any comments which begin # with '#' (whether the comment is the only thing on a line, or at the # end of a line). You should use the "cedictcheckformat" script # to catch any lines in your vocabulary file which are not in strict # CEDICT format, before trying to sort. # # History: # 22 July 2005: version 1.2.1 # 24 June 2005: version 1.2 # Just renumberings to stay synchronized with the whole package. # 24 Jun 2005: version 1.1.01 # Updated my address info above, and commented out the # 'print "hi there\n";' debugging line which seemed to have been # accidentally left in for the past 6.5 years! # 10 Dec 1998: version 1.0.1 # Added code to turn "uu" into "u:" or vice-versa in the # pinyin field if the user requests it, to handle the fact that both # forms were present in cedict for some time, or may be present in # various Chinese documents you encounter. # 31 July 1998: version 1.0.0.1 # Added code to print out "levels" info for each entry, # if present (the code previously could read files with levels info, # but discarded the information when outputting sorted results) # 29 July 1998: original version, 1.0 # # The two Chinese characters "bu4" and "yi1" will be sorted by their # "standalone" pinyin pronunciation, even though they actually change their # tones depending on the character following them. $bu4gb = "²»"; $yi1gb = "Ò»"; $bu4big5 = "¤£"; $yi1big5 = "¤@"; # Define a couple of constants $uu2uc = 1; $uc2uu = 2; sub printLicense { print <<"END_OF_LICENSE"; cedictsort version 1.2 June 24, 2005 Copyright (C) 1998,1999 David Hiebeler Dept of Mathematics and Statistics University of Maine Orono, ME 04469-5752 http://www.math.umaine.edu/faculty/hiebeler This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA END_OF_LICENSE } # # Print a usage message and exit. # sub printusage { print "Usage: $0 [-uu2u: | -u:2uu] file1 [file2 file3 ...]\n"; print "You can sort standard input, by using '-' as a file name, e.g.\n"; print " cat myfile.gb | cedictsort -\n"; print " -uu2u: Turn pinyin entries like `nuu3' into `nu:3' (default = don't)\n"; print " -u:2uu : Turn pinyin entries like `nu:3' into `nuu3' (default = don't)\n"; exit 2; } # # Read a line from a file or stdin, removing comments which begin with "#", # and ignoring empty lines (or lines which only have a comment) # sub getline { if ($#_ == -1) { while (<>) { next if /^\s*#/; next if /^\s*$/; s/#.*$//; chop; return $_; } return undef; } elsif ($#_ == 0) { $fh = $_[0]; } else { die "getlinefp must be called with a single argument or no arguments"; } while (<$fh>) { next if /^\s*#/; next if /^\s*$/; s/#.*$//; chop; return $_; } return undef; } # # read in a vocabulary file # sub readvocabfile { my $levels; my $chinese; my $english; my $pinyin; my $chineseLength; my $i = 0; my $tmpChineseChar; my $tmpChinese; open(INFILE, $_[0]) or die "Couldn't open infile '$_[0]'"; while ($line=getline("INFILE")) { # handle case where line has skill level(s) at beginning if ($line =~ m@^\s*([0-9]+)\s*(.+)\s*\[(.+)\]\s*(/.*/)\s*$@) { ($levels,$chinese,$pinyin,$english) = ($1,$2,$3,$4); $chinese =~ s/\s+$//; # truncate trailing spaces on chinese $levels .= " "; } # line doesn't have skill level numbers at beginning elsif ($line =~ m@^\s*(.+)\s*\[(.+)\]\s*(/.*/)\s*$@) { ($chinese,$pinyin,$english) = ($1,$2,$3); $chinese =~ s/\s+$//; # truncate trailing spaces on chinese $levels = ""; } else { $line =~ s/[\n\r]//; print "Invalid line: `$line'\n"; die "Invalid line encountered"; } # Convert "uu" into "u:" or vice-versa in pinyin field, # if the user requested it. if ($uConvert == $uu2uc) { $pinyin =~ s/uu/u:/; } elsif ($uConvert == $uc2uu) { $pinyin =~ s/u:/uu/; } # we associate a unique number (kept track of by the variable # $chineseCharsHashCounter) with every Chinese character which appears # in an entry, to use for sorting later $tmpChinese = $chinese; $tmpChinese =~ s/[\.,\(\)\w]+/ /g; # strip out periods, commas, parens, letters, and numbers $chineseLength = length($tmpChinese) / 2; for ($i=0; $i < $chineseLength; $i++) { $tmpChineseChar = substr($tmpChinese, $i*2, 2); if (! defined($chineseChars{$tmpChineseChar})) { $chineseChars{"$tmpChineseChar"} = ++$chineseCharsHashCounter; } } # now put everything into the main array of hashes $wordList[$vocabIndex]->{"levels"} = "$levels"; $wordList[$vocabIndex]->{"chinese"} = $chinese; $wordList[$vocabIndex]->{"english"} = $english; $wordList[$vocabIndex]->{"pinyin"} = $pinyin; # used for extreme tie-breaking when sorting $wordList[$vocabIndex]->{"index"} = $vocabIndex; $vocabIndex++; } close INFILE; } # # Here is the heart of this Perl script. This is the function which # is used (via the Perl "sort" routine) to sort everything. # sub byRealPinyinAndChinese { # if we just sort by the pinyin (or rather the adjusted "realpinyin" # which handles the special characters "bu4" and "yi1"), it won't properly # group characters with the same pronunciation (i.e. it may group entries # with different characters with the same sound, and split up entries # with the same character). So we have to go through the realpinyin a # word (or rather, a Chinese character) at a time, and if the sounds are # the same, check the actual Chinese characters. If the Chinese # characters are the same too, go to the next pinyin sound (and character, # if necessary), until a difference is found or we fall off the end of # one of the strings. If we fall off the end of a string, the shorter # string is considered to come "first". # first some local variables to use my ($aLength, $bLength, $minLength, $aChinese, $bChinese); my ($aChineseChar,$bChineseChar, $aCharInd, $bCharInd, @aPinyin, @bPinyin); $aChinese = $a->{chinese}; $bChinese = $b->{chinese}; $aChinese =~ s/[\.,\(\)\w]+/ /g; # strip out periods, commas, parens, letters, and numbers $bChinese =~ s/[\.,\(\)\w]+/ /g; # strip out periods, commas, parens, letters, and numbers $aLength = length($aChinese) / 2; $bLength = length($bChinese) / 2; $minLength = ($aLength < $bLength) ? $aLength : $bLength; # split pinyin into separate words so we can go through one at a time @aPinyin = split(' ', $a->{realpinyin}); @bPinyin = split(' ', $b->{realpinyin}); for ($i=0; $i < $minLength; $i++) { # if for some reason one of the pinyin fields is shorter than we # expect, just let that one "win" (come first). if (!defined($aPinyin[$i])) { return -1; } if (!defined($bPinyin[$i])) { return 1; } if ($aPinyin[$i] lt $bPinyin[$i]) { return -1; } elsif ($bPinyin[$i] lt $aPinyin[$i]) { return 1; } else { # shoot, the pinyin is the same, have to look at characters $aChineseChar = substr($aChinese, $i*2, 2); $aCharInd = $chineseChars{$aChineseChar}; $bChineseChar = substr($bChinese, $i*2, 2); $bCharInd = $chineseChars{$bChineseChar}; if ($aCharInd < $bCharInd) { return -1; } elsif ($bCharInd < $aCharInd) { return 1; } # if we get here, then the chinese characters were the same # too, so we just continue the loop and try the next char } } # if we get here, the strings were the same, up to the length of the # shorter one. The shorter one will be considered "first". if ($aLength < $bLength) { return -1; } elsif ($bLength < $aLength) { return 1; } else { # strings are totally the same; use their indices as tie-breakers if ($a->{"index"} < $b->{"index"}) { return -1; } elsif ($b->{"index"} < $a->{"index"}) { return 1; } else { die "That's funny, I really shouldn't be able to die this way while sorting"; } } # This was the old simple way, which I hoped would work, but didn't, because # shorter strings would win over longer ones, so all the short strings # beginning with different "bu4" characters would come before all the longer # strings beginning with the different "bu4" characters, etc. # # $a->{"realpinyin"} cmp $b->{"realpinyin"} # or # $a->{"chinese"} cmp $b->{"chinese"} # or # $a cmp $b; } # # This routine takes the pinyin of a given vocabulary entry, and modifies # it (or rather, a copy of it) slightly. If it sees the Chinese character # "bu4" (meaning "not/no"), then it makes sure the character is labelled # as "bu4" in pinyin, rather than "bu2" as it is pronounced when it comes # before another 4th-tone character. Similarly, it makes sure "yi1" (one) # is always labelled as "yi1" in pinyin, rather than "yi2" or "yi4" as it # can also be pronounced. This is so entries containing these characters # will be sorted by the Chinese characters with their original pronunciation, # as real Chinese dictionaries do (or at least the ones I've seen). sub buildRealPinyin { # local vars my ($tmpChinese, $chineseLength); foreach $word (@wordList) { $tmpChinese = $word->{"chinese"}; $tmpChinese =~ s/[\.,\(\)\w]+/ /g; # strip out periods, commas, parens, letters, and numbers $chineseLength = length($tmpChinese) / 2; $word->{"realpinyin"} = $word->{"pinyin"}; $word->{"realpinyin"} =~ s/[\.,\(\)]+/ /g; # strip out periods, commas, and parentheses # separate the pinyin into words @pinyinWords = split(" ", $word->{"realpinyin"}); # now go through one pinyin word at a time, and adjust if necessary for ($i=0; $i < $chineseLength; $i++) { $tmpStr = substr($tmpChinese, $i*2, 2); if (($tmpStr eq $bu4gb) || ($tmpStr eq $bu4big5)) { $pinyinWords[$i] = "bu4"; } elsif (($tmpStr eq $yi1gb) || ($tmpStr eq $yi1big5)) { $pinyinWords[$i] = "yi1"; } } # put the words back together $word->{"realpinyin"} = join(' ', @pinyinWords); } } # # Print out the vocabulary list # sub printVocab { foreach $word (@wordList) { print "$word->{levels}", "$word->{chinese} [$word->{pinyin}] $word->{english}\n"; } } # # I just used this routine for debugging when writing the script... # sub printHash { foreach $key (keys %chineseChars) { print "$key -> $chineseChars{$key}\n"; } } ############## # Main program ############## # Gee, the main program is pretty simple after all the above stuff. # print "hi there\n"; $uConvert = 0; while ($thisarg = shift()) { if ($thisarg eq "-uu2u:") { $uConvert = $uu2uc; } elsif ($thisarg eq "-u:2uu") { $uConvert = $uc2uu; } elsif ($thisarg eq "-license") { printLicense(); exit(0); } else { # put back this arg, it is supposed to be a filename unshift(@ARGV, $thisarg); last; } } $chineseCharsHashCounter = 0; # used to give every Chinese char a unique number %chineseChars = (); # hash which will hold the unique numbers for Chinese chars if ($#{ARGV} < 0) { printusage; } $vocabIndex = 0; # read in the file(s) foreach $fileName ( @ARGV ) { readvocabfile($fileName); } buildRealPinyin(); # the next 4 lines were only here for debugging... #printVocab(); #print "================\n"; #printHash(); #print "================\n"; @wordList = sort byRealPinyinAndChinese @wordList; printVocab();