#!/usr/local/bin/perl -w
#
# Copyright (c) 1998,1999,2005 David Hiebeler
# For licensing information, see the "printLicense" function below.
#
# File: cedictcheckformat, version 1.2.1, July 2005
# By: David Hiebeler
#     Dept of Mathematics and Statistics
#     University of Maine
#     Orono, ME 04469-5752
#     http://www.math.umaine.edu/faculty/hiebeler
#
# Version 1.2.1: July 2005
# Version 1.2: June 2005
# Version 1.1.01: June 2005
# Version 1.1: June 1999
# Version 1.0: July 1998
#
# This is a Perl script for checking a cedict-format vocabulary file.
# For now, it just reads the vocabulary list from stdin.
#
# Basically, it checks to see if lines are of the form
# "chinese-characters [pinyin stuff] /English definitions/maybe more defs/",
# as follows:
#
# 1) It catches lines that are missing trailing slashes, the square brackets
#    around the pinyin, etc.
# 2) It checks to see whether every pinyin word consists of lower-case letters
#    and ends with a digit from 1-5.
# 3) If the "-num" flag is specified, it also allows a string of digits at
#    the beginning of the line, indicating difficulty levels for practicing
#    the characters.  However, note that this feature will be discontinued
#    in a future release (unless someone says they really want it).  I now
#    embed the skill-level information that I use inside the English definition
#    field.
# 4) It checks to see whether the number of Chinese characters and number
#    of pinyin words match, if you use the "-checklen" argument.
#
# Sample usage: "cedictcheckformat < cedict.gb"
# Run "cedictcheckformat -help" for usage info.
#
# This script should work correctly on both GB and BIG5 files.
# It is not yet tested on UTF-8.
#
# Wish-list:
#   * Allow square brackets inside the English field (even though CEDICT
#     format says not to put them there)
#   * Allow commas and slashes inside the pinyin field (again despite CEDICT
#     recommendations)
#   * Actually check the pinyin field to make sure it only contains valid
#     pinyin, as opposed to any random sequences of lower-case letters followed
#     by a digit from 1-5.
#
# History:
#   22 July 2005: version 1.2.1
#   24 June 2005: version 1.2
#      Just renumberings to stay synchronized with the whole package.
#   24 Jun 2005: version 1.1.01
#      Updated my address info above
#   10 Jun 1999: version 1.1
#      Added code to check whether the number of Chinese characters
#      and number of pinyin words are equal.
#   08 Sep 1998: version 1.0.1
#      Bug fix to allow a colon after a "u" in the pinyin field
#      (e.g. so pinyin words such as "nu:3" will be recognized,
#      which is an alternate form of writing "nuu3").
#   29 Jul 1998: original version, 1.0

#
# Print the copyright and licensing notice on stdout.
#
sub printLicense {
    # Non-interpolating here-doc: the notice is emitted verbatim.
    print <<'END_OF_LICENSE';
cedictcheckformat version 1.2 June 24, 2005
Copyright (C) 1998,1999 David Hiebeler
Dept of Mathematics and Statistics
University of Maine
Orono, ME 04469-5752
http://www.math.umaine.edu/faculty/hiebeler

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
END_OF_LICENSE
}


#
# Read a line, removing comments which begin with "#", and ignoring
# empty lines (or lines which only have a comment).
#
# Called with no argument, input comes from the "<>" operator (the files
# named in @ARGV, or stdin when @ARGV is empty).  Called with one argument,
# input comes from that filehandle.  Any other argument count is fatal.
#
# Returns the next meaningful line without its line terminator, or undef
# at end of input.  Text left of a trailing comment is returned as-is,
# including any whitespace before the "#".
#
# Every line read, skipped or not, increments the global $getlinelinenum,
# which callers use to report line numbers.
#
$getlinelinenum = 0;
sub getline {
    die "getline must be called with at most one argument" if @_ > 1;
    my ($fh) = @_;

    # A lexical line variable is used so the caller's $_ is not clobbered.
    while (defined(my $line = defined $fh ? readline($fh) : <>)) {
	$getlinelinenum++;
	next if $line =~ /^\s*#/;
	next if $line =~ /^\s*$/;
	# chomp, not chop: the last line of a file may lack a newline, and
	# chop would then eat its final character (usually the closing "/").
	chomp $line;
	$line =~ s/#.*$//;
	return $line;
    }
    return;
}


#
# Print a usage message and exit.
#
# The message goes to stdout and the exit status is 2.
#
sub printusage {
    print
	"Usage: $0 [-strictpy | -nostrictpy] [-strictfmt | -nostrictfmt]\n",
	" [-filter | -nofilter] [-checklen | -nochecklen] [-license]\n",
	" -strictpy: don't allow commas, periods, and parentheses in the pinyin\n",
	" -strictfmt: be more picky about spaces between Chinese and pinyin, etc.\n",
	" -filter: run as a filter, output good lines (otherwise, bad lines are\n",
	" printed, along with their line numbers)\n",
	" -checklen: check the lengths of the Chinese and pinyin fields (i.e. check to\n",
	" see that the number of Chinese characters and number of pinyin words\n",
	" are equal)\n",
	" -license: Print licensing information\n",
	"\n\n",
	"By default, \"strictfmt\" is enabled.\n";
    exit(2);
}


#
# Check the validity of a pinyin phrase.
# Return 1 if the pinyin is bad, return 0 if it is good
#
# The argument is copied first, so the caller's string is never modified.
# Unless the global $strictpy is set, runs of periods, commas and
# parentheses are treated as word separators.  The phrase is then split on
# whitespace; it is bad if no words remain, or if any word is not made of
# zero or more lower-case letters (each optionally followed by "u:") ending
# in a single tone digit 1-5.
#
sub checkpinyin {
    my ($phrase) = @_;

    if (!$strictpy) {
	$phrase =~ s/[.,()]+/ /g;   # periods, commas, parentheses -> space
    }

    my @words = split(" ", $phrase);
    return 1 if !@words;

    foreach my $word (@words) {
	return 1 if $word !~ m/^(?:[a-z](?:u:)?)*[12345]$/;
    }
    return 0;
}


##############
# Main program
##############

# Option defaults.  These stay package globals because checkpinyin()
# reads $strictpy and the file does not enable "use strict".
$donum = 0;
$strictpy = 0;
$strictfmt = 1;
$filter = 0;
$checklen = 0;

# Every on/off option "-name" has a "-noname" twin; map name -> variable.
my %optionVar = (
    num       => \$donum,
    strictpy  => \$strictpy,
    strictfmt => \$strictfmt,
    filter    => \$filter,
    checklen  => \$checklen,
);

# "defined" keeps an argument such as "0" from silently ending the parse.
while (defined(my $arg = shift @ARGV)) {
    if ($arg eq "-license") {
	printLicense();
	exit(0);
    }
    elsif ($arg =~ /^-(no)?(\w+)$/ && exists $optionVar{$2}) {
	# "-name" switches the option on, "-noname" switches it off.
	${ $optionVar{$2} } = defined $1 ? 0 : 1;
    }
    else {
	printusage();   # does not return
    }
}

# Build the line pattern once.  Capture 1 is the Chinese field, capture 2
# the pinyin between the square brackets.
#   -num:         require a leading run of digits (difficulty level).
#   -strictfmt:   Chinese field is one whitespace-free token followed by
#                 whitespace; otherwise anything up to the "[" is accepted.
my $numPrefix = $donum ? '[0-9]+\s+' : '';
my $chnField = $strictfmt ? '(\S+)\s+' : '(.*)';
my $lineRe = qr{^\s*${numPrefix}${chnField}\[(.+)\]\s*/.*/\s*$};

# The length check compares bytes, and assumes each Chinese character
# occupies two bytes, as in the GB and BIG5 encodings.
my $bytesPerChar = 2;

# "defined" keeps a data line consisting of "0" from ending the loop early.
while (defined(my $line = getline())) {
    my $bad = 0;

    if ($line =~ $lineRe) {
	my ($chinese, $pinyin) = ($1, $2);

	$bad = 1 if checkpinyin($pinyin);

	if ($checklen) {
	    # In non-strict mode the Chinese field may carry whitespace;
	    # it must not count towards the character total.
	    (my $chars = $chinese) =~ s/\s+//g;
	    my @syllables = split(" ", $pinyin);
	    $bad = 1 if length($chars) != @syllables * $bytesPerChar;
	}
    }
    else {
	$bad = 1;
    }

    # Checker mode reports each bad line once, with its line number;
    # filter mode passes only the good lines through.
    if ($bad) {
	print "$getlinelinenum: $line\n" if !$filter;
    }
    elsif ($filter) {
	print "$line\n";
    }
}