774 lines
26 KiB
Plaintext
Executable File
774 lines
26 KiB
Plaintext
Executable File
: Use /bin/sh
|
|
#
|
|
# $Id: munchlist.X,v 1.38 1992/01/04 22:08:18 geoff Exp $
|
|
#
|
|
# Copyright 1987, 1988, 1989, by Geoff Kuenning, Manhattan Beach, CA
|
|
# Permission for non-profit use is hereby granted.
|
|
# All other rights reserved.
|
|
# See "version.h" for a more complete copyright notice.
|
|
#
|
|
# $Log: munchlist.X,v $
|
|
# Revision 1.38 1992/01/04 22:08:18 geoff
|
|
# Don't use the non-portable "-x" test; use "-r" as a substitute. Fix a
|
|
# misplaced switch in a sort command.
|
|
#
|
|
# Revision 1.37 91/09/11 23:22:53 geoff
|
|
# Add the "-u" flag to the sort of CROSSEXPANDED. Add a PAIRUP step to
|
|
# the calculation of MINIMALAFFIXES to protect against affix files that
|
|
# generate so many options that they break limited versions of awk.
|
|
#
|
|
# Revision 1.36 91/07/27 20:48:33 geoff
|
|
# Make the default language tables be configurable from the Makefile,
|
|
# the way they should have been all along.
|
|
#
|
|
# Revision 1.35 91/07/04 00:04:22 geoff
|
|
# Add support for the -T switch (passed to icombine).
|
|
#
|
|
# Revision 1.34 91/07/03 18:21:04 geoff
|
|
# Don't use the ":-" notation in defining TMPDIR, since some
|
|
# braindamaged Bourne shells don't handle it.
|
|
#
|
|
# Revision 1.33 91/06/23 22:14:51 geoff
|
|
# Fix a typo that was a syntax error
|
|
#
|
|
# Revision 1.32 91/01/27 00:43:36 geoff
|
|
# Add the "-u" switch to the final sort to make absolutely sure that there
|
|
# are no duplicates in the output file.
|
|
#
|
|
# Revision 1.31 90/09/05 02:35:38 geoff
|
|
# Before ln'ing things, rm -f them to handle BSD systems in which ln correctly
|
|
# refuses to make links to existing files.
|
|
#
|
|
# Revision 1.30 89/12/27 22:30:48 geoff
|
|
# Be sure to specify -W0 when crunching the word list.
|
|
#
|
|
# Revision 1.29 89/04/27 23:33:13 geoff
|
|
# Add support for the selectable flag marker character.
|
|
#
|
|
# Revision 1.28 89/02/18 00:52:32 geoff
|
|
# Add another icombine step to make sure that unnecessary capitalizations
|
|
# are dropped.
|
|
#
|
|
# Revision 1.27 88/12/26 02:31:35 geoff
|
|
# Update the copyright notice.
|
|
#
|
|
# Revision 1.26 88/11/25 19:53:09 geoff
|
|
# Add the -c (convert old language table) option.
|
|
#
|
|
# Revision 1.25 88/11/16 02:20:09 geoff
|
|
# Make the default "wchars" argument be "-wA" so that ispell won't get
|
|
# a null argument. Make sure the fake hash file ends in ".hash".
|
|
#
|
|
# Revision 1.24 88/04/11 01:37:47 geoff
|
|
# Accept "-" to indicate standard input, and "--" to indicate the end of
|
|
# the options.
|
|
#
|
|
# Revision 1.23 88/04/03 23:11:49 geoff
|
|
# Fix the -w option so it doesn't require quoting any more, and so it
|
|
# doesn't unintentionally add a blank to the -w list.
|
|
#
|
|
# Revision 1.22 88/03/30 00:14:11 geoff
|
|
# Replace two sorts with uniqs for speed.
|
|
#
|
|
# Revision 1.21 88/03/29 00:29:43 geoff
|
|
# Return to using icombine rather than buildhash to combine suffixes.
|
|
# Also, be sure to include the suffixes in the sort -u steps so we don't get
|
|
# stuck with one suffix per root.
|
|
#
|
|
# Revision 1.20 88/03/27 01:03:22 geoff
|
|
# Make the large awk script be a separate file, so it doesn't core-dump
|
|
# some shells. Run buildhash with the -s switch. Build and use
|
|
# various hash files only if the word lists are of nonzero size. Save
|
|
# the statistics files for MINIMALAFFIXES in debug mode.
|
|
#
|
|
# Revision 1.19 88/03/12 02:45:00 geoff
|
|
# Add a comment about the uselessness of the -w option. Fix error exits
|
|
# on buildhash failures to actually exit. Sort MINIMALAFFIXES correctly
|
|
# (folded) so the following buildhash will run properly. Replace the final
|
|
# comm step with an ispell -l step.
|
|
#
|
|
#
|
|
# Revision 1.18 88/02/28 23:17:44 geoff
|
|
# Fix an English error in a comment.
|
|
#
|
|
# Revision 1.17 88/02/20 23:13:33 geoff
|
|
# Many major changes to support the new dictionary structure, capitalization
|
|
# handling, and ispell/buildhash options.
|
|
#
|
|
# Revision 1.16 87/09/24 23:47:00 geoff
|
|
# Allow icombine to be gotten from the current directory, so that the
|
|
# script can be run before installation is complete.
|
|
#
|
|
# Revision 1.15 87/09/14 22:38:47 geoff
|
|
# Add copyright comments
|
|
#
|
|
# Revision 1.14 87/07/28 22:50:23 geoff
|
|
# Remove the -e switch, and add -D. Completely redo the cross-product
|
|
# handling.
|
|
#
|
|
# Revision 1.13 87/07/20 23:23:11 geoff
|
|
# Major rewrites and improvements to make it independent of the
|
|
# contents of the language tables.
|
|
#
|
|
# Revision 1.12 87/04/24 20:32:43 geoff
|
|
# Sort the input to icombine properly.
|
|
#
|
|
# Revision 1.11 87/04/21 23:29:07 geoff
|
|
# Swap the order of the final sort and combine, so that combine is sure
|
|
# to get stuff in the right order. This unfortunately increases temp file
|
|
# requirements.
|
|
#
|
|
# Revision 1.10 87/04/19 22:53:22 geoff
|
|
# Add capitalization handling. Mostly this consists of folding the sorts
|
|
# and running more expand scripts.
|
|
#
|
|
# Revision 1.9 87/03/28 23:17:58 geoff
|
|
# Get rid of a now-obsolete tr
|
|
#
|
|
# Revision 1.8 87/03/28 19:22:01 geoff
|
|
# Fix the problem that prevented recognition of root words that were
|
|
# already in the main dictionary. Also make sure every ispell is
|
|
# passed the stuff from the -w option.
|
|
#
|
|
# Revision 1.7 87/03/27 17:21:36 geoff
|
|
# Replace all the awks and one of the sorts with icombines.
|
|
#
|
|
# Revision 1.6 87/03/26 19:09:45 geoff
|
|
# Remove DEFDICT; it's obsolete
|
|
#
|
|
# Revision 1.5 87/03/26 00:30:46 geoff
|
|
# Integrate Rich Salz's changes/improvements
|
|
#
|
|
# Revision 1.4 87/03/24 22:54:42 geoff
|
|
# Handle zero arguments correctly
|
|
#
|
|
# Revision 1.3 87/03/13 22:37:55 geoff
|
|
# Add the -d option and code to strip out words covered by the main dictionary.
|
|
# Also make sure that /dev/null is always used for main and personal
|
|
# dictionaries so ispell doesn't get confused or start slow.
|
|
#
|
|
# Revision 1.2 87/03/08 20:31:16 geoff
|
|
# Major changes to make faster and to make it work right (or at least better).
|
|
#
|
|
# Revision 1.1 87/03/01 02:20:15 geoff
|
|
# Initial revision
|
|
#
|
|
#
|
|
# Given a list of words for ispell, generate a reduced list
|
|
# in which all possible affixes have been collapsed. The reduced
|
|
# list will match the same list as the original.
|
|
#
|
|
# Usage:
|
|
#
|
|
# munchlist [-l lang] [-c lang] [-T suff] [-s hashfile] [-D] [-w chars] [file] ...
|
|
#
|
|
# Options:
|
|
#
|
|
# -l lang Specifies the language table to be used. The default
|
|
# is "$LIBDIR/english.aff".
|
|
# -c lang Specifies "conversion" language table. If this option is
|
|
# given, the input file(s) will be assumed to be described by
|
|
# this table, rather than the table given in the -l option.
|
|
# This may be used to convert between incompatible language
|
|
# tables. (When in doubt, use this option -- it doesn't
|
|
# hurt, and it may save you from creating a dictionary that has
|
|
# illegal words in it). The default is no conversion.
|
|
# -T suff Specifies that the source word lists are in the format
|
|
# of a "suff"-suffixed file, rather than in the
|
|
# canonical form. For example, "-T tex" specifies that
|
|
# string characters in the word lists are in TeX format.
|
|
# The string character conversions are taken from the language
|
|
# table specified by the "-l" switch.
|
|
# -s Remove any words that are already covered by the
|
|
# dictionary in 'hashfile'. The words will be removed
|
|
# only if all affixes are covered. This option should not be
|
|
# specified when the main dictionary is being munched.
|
|
# 'Hashfile' must have been created with the language
|
|
# table given in the -l option, but this is not checked.
|
|
# -D Leave temporary files for debugging purposes
|
|
# -w Passed on to ispell (specify chars that are part of a word)
|
|
# Unfortunately, special characters must be quoted twice
|
|
# rather than once when invoking this script. Also, since
|
|
# buildhash doesn't accept this option, the final ispell -l
|
|
# step ignores it, making it somewhat less than useful.
|
|
#
|
|
# The given input files are merged, then processed by 'ispell -c'
|
|
# to generate possible affix lists; these are then combined
|
|
# and reduced. The final result is written to standard output.
|
|
#
|
|
# For portability to older systems, I have avoided getopt.
|
|
#
|
|
# Geoff Kuenning
|
|
# 2/28/87
|
|
#
|
|
#
# Default settings.  Everything here may be overridden by the option
# parser further down.
#
# LIBDIR is where icombine and the default language tables are looked up.
LIBDIR=/usr/lib
# Temporary files go in $TMPDIR when the caller has set it (even to an
# empty string, exactly as the "${TMPDIR-...}" form would behave).
# Otherwise use the traditional /usr/tmp, falling back to /tmp on systems
# that no longer provide /usr/tmp.  The ":-" notation is deliberately
# avoided; see the revision 1.34 note at the top of the file.
case "${TMPDIR+set}" in
    set)
        TDIR=$TMPDIR
        ;;
    *)
        if [ -d /usr/tmp ]
        then
            TDIR=/usr/tmp
        else
            TDIR=/tmp
        fi
        ;;
esac
# Common prefix of every temporary file; the process id keeps concurrent
# runs apart.
TMP=${TDIR}/munch$$
# Extra arguments handed to every sort invocation (none by default).
SORTTMP=
# Prefer an icombine in the current directory over the one in LIBDIR.
if [ -r ./icombine ]
then
    COMBINE=./icombine
else
    COMBINE=${LIBDIR}/icombine
fi

debug=no
dictopt=
langtabs=${LIBDIR}/english.aff
convtabs=
strip=no
icflags=
# The following value of "wchars" is necessary to prevent ispell from
# receiving a null argument if -w is not specified.  As long as "A" is
# a member of the existing character set, ispell will ignore the argument.
wchars=-wA
|
|
#
# Command-line parsing.  Options are consumed from the front of the
# argument list; parsing stops at "--" (which is discarded), at "-"
# (which is left in place for cat to read as standard input), or at the
# first argument that does not begin with "-".  Whatever remains is the
# list of input files.
#
usage='Usage: munchlist [-l lang] [-c lang] [-T suff] [-s hashfile] [-D] [-w chars] [file] ...'
while [ $# != 0 ]
do
    # Options that take a value must actually be followed by one;
    # otherwise the shifts below would run past the end of the
    # argument list.
    case "$1" in
        -l | -c | -s | -T | -w)
            if [ $# -lt 2 ]
            then
                echo "munchlist: option $1 requires an argument" 1>&2
                echo "$usage" 1>&2
                exit 2
            fi
            ;;
    esac
    case "$1" in
        -l)
            # Language table: taken as given, or looked up in LIBDIR.
            if [ -r "$2" ]
            then
                langtabs="$2"
            elif [ -r "${LIBDIR}/$2" ]
            then
                langtabs="${LIBDIR}/$2"
            else
                echo "Can't open language table '$2'" 1>&2
                exit 1
            fi
            shift
            ;;
        -c)
            # Conversion language table: same lookup rule as -l.
            if [ -r "$2" ]
            then
                convtabs="$2"
            elif [ -r "${LIBDIR}/$2" ]
            then
                convtabs="${LIBDIR}/$2"
            else
                echo "Can't open conversion language table '$2'" 1>&2
                exit 1
            fi
            shift
            ;;
        -s)
            # Strip words already covered by the named hash file.
            dictopt="-d $2"
            strip=yes
            shift
            ;;
        -D)
            debug=yes
            ;;
        -T)
            # Passed through to icombine.
            icflags="-T $2"
            shift
            ;;
        -w)
            # Passed through to ispell as a single "-wCHARS" argument.
            wchars="-w$2"
            shift
            ;;
        --)
            shift
            break
            ;;
        -)
            break
            ;;
        -*)
            echo "$usage" 1>&2
            exit 2
            ;;
        *)
            break
            ;;
    esac
    shift
done
|
|
# On hangup, interrupt, broken pipe or terminate, remove every temporary
# file and exit with a failure status.
trap "/bin/rm -f ${TMP}*; exit 1" 1 2 13 15
#
# Symbolic names for the temporary files, purely for readability.  Each
# one is the common prefix ${TMP} followed by a single distinguishing
# letter.
#
EXPANDEDINPUT="${TMP}a"
STRIPPEDINPUT="${TMP}b"
CRUNCHEDINPUT="${TMP}c"
PRODUCTLIST="${TMP}d"
EXPANDEDPAIRS="${TMP}e"
LEGALFLAGLIST="${TMP}f"
JOINEDPAIRS="${TMP}g"
MINIMALAFFIXES="${TMP}h"
CROSSROOTS="${TMP}i"
CROSSEXPANDED="${TMP}j"
CROSSPAIRS="${TMP}k"
CROSSILLEGAL="${TMP}l"
ILLEGALCOMBOS="${TMP}m"
FAKEDICT="${TMP}n"
# Ispell insists that hash files have a ".hash" suffix
FAKEHASH="${TMP}o.hash"
AWKSCRIPT="${TMP}p"
|
|
# In debug mode, give every temporary file a second, descriptively named
# hard link in ${TDIR}.  The final cleanup removes only the ${TMP}* names.
case "$debug" in
    yes)
        # Make sure every temporary file exists so it can be linked.
        touch $EXPANDEDINPUT $STRIPPEDINPUT $CRUNCHEDINPUT $PRODUCTLIST \
            $EXPANDEDPAIRS $LEGALFLAGLIST $JOINEDPAIRS $MINIMALAFFIXES \
            $CROSSROOTS $CROSSEXPANDED $CROSSPAIRS $CROSSILLEGAL \
            $ILLEGALCOMBOS $FAKEDICT $FAKEHASH $AWKSCRIPT
        # Clear out links left over from an earlier run; ln will not
        # replace an existing file.
        rm -f ${TDIR}/EXPANDEDINPUT ${TDIR}/STRIPPEDINPUT \
            ${TDIR}/CRUNCHEDINPUT ${TDIR}/PRODUCTLIST \
            ${TDIR}/EXPANDEDPAIRS ${TDIR}/LEGALFLAGLIST \
            ${TDIR}/JOINEDPAIRS ${TDIR}/MINIMALAFFIXES \
            ${TDIR}/CROSSROOTS ${TDIR}/CROSSEXPANDED \
            ${TDIR}/CROSSPAIRS ${TDIR}/CROSSILLEGAL \
            ${TDIR}/ILLEGALCOMBOS ${TDIR}/FAKEDICT \
            ${TDIR}/FAKEHASH.hash ${TDIR}/AWKSCRIPT
        ln $EXPANDEDINPUT ${TDIR}/EXPANDEDINPUT
        ln $STRIPPEDINPUT ${TDIR}/STRIPPEDINPUT
        ln $CRUNCHEDINPUT ${TDIR}/CRUNCHEDINPUT
        ln $PRODUCTLIST ${TDIR}/PRODUCTLIST
        ln $EXPANDEDPAIRS ${TDIR}/EXPANDEDPAIRS
        ln $LEGALFLAGLIST ${TDIR}/LEGALFLAGLIST
        ln $JOINEDPAIRS ${TDIR}/JOINEDPAIRS
        ln $MINIMALAFFIXES ${TDIR}/MINIMALAFFIXES
        ln $CROSSROOTS ${TDIR}/CROSSROOTS
        ln $CROSSEXPANDED ${TDIR}/CROSSEXPANDED
        ln $CROSSPAIRS ${TDIR}/CROSSPAIRS
        ln $CROSSILLEGAL ${TDIR}/CROSSILLEGAL
        ln $ILLEGALCOMBOS ${TDIR}/ILLEGALCOMBOS
        ln $FAKEDICT ${TDIR}/FAKEDICT
        ln $FAKEHASH ${TDIR}/FAKEHASH.hash
        ln $AWKSCRIPT ${TDIR}/AWKSCRIPT
        ;;
esac
|
|
#
|
|
# Create a dummy dictionary to hold a compiled copy of the language
|
|
# table. Initially, it holds the conversion table, if it exists.
|
|
#
|
|
# With no -c option, the "conversion" table is simply the language table.
if [ "X$convtabs" = X ]
then
    convtabs="$langtabs"
fi
# A one-word dictionary is enough to get the table compiled into a hash
# file.
echo 'QQQQQQQQ' > $FAKEDICT
if buildhash -s $FAKEDICT $convtabs $FAKEHASH
then
    :
else
    echo "Couldn't create fake hash file" 1>&2
    /bin/rm -f ${TMP}*
    exit 1
fi
|
|
#
|
|
# Collect all the input and expand all the affix options (ispell -e),
|
|
# and preserve (sorted) for later joining in EXPANDEDINPUT. The icombine
|
|
# step is to make sure that unneeded capitalizations (e.g., Farmer and farmer)
|
|
# are weeded out. The first sort must be folded for icombine; the second
|
|
# must be unfolded for join.
|
|
#
|
|
# Read the word list (standard input when no files were named), expand
# every affix flag, and put one word on each line.  The explicit test on
# $# keeps cat from being handed an empty argument list.  The first sort
# is folded, as icombine requires; the second is a plain sort, as join
# requires later on.  Sort keys use the POSIX "-k" notation because the
# older "+pos -pos" form is no longer accepted by current sorts.
if [ $# -eq 0 ]
then
    ispell "$wchars" -e -d $FAKEHASH -p /dev/null | tr " " '\012'
else
    cat "$@" | ispell "$wchars" -e -d $FAKEHASH -p /dev/null | tr " " '\012'
fi \
  | sort $SORTTMP -u -k 1f,1 -k 1 \
  | $COMBINE $icflags $langtabs \
  | sort $SORTTMP -u > $EXPANDEDINPUT
|
|
#
|
|
# If a conversion table existed, recreate the fake hash file with the
|
|
# "real" language table.
|
|
#
|
|
# Nothing to do when the conversion table and the language table are the
# same; otherwise recompile the fake hash file from the real table.
case "$convtabs" in
    $langtabs)
        ;;
    *)
        if buildhash -s $FAKEDICT $langtabs $FAKEHASH
        then
            :
        else
            echo "Couldn't create fake hash file" 1>&2
            /bin/rm -f ${TMP}*
            exit 1
        fi
        ;;
esac
# The dummy dictionary (and anything derived from its name) is no longer
# needed.
/bin/rm -f ${FAKEDICT}*
|
|
#
|
|
# If the -s (strip) option was specified, remove all
|
|
# expanded words that are covered by the dictionary. This produces
|
|
# the final list of expanded words that this dictionary must cover.
|
|
# Leave the list in STRIPPEDINPUT.
|
|
#
|
|
case "X$strip" in
    Xno)
        # No -s option: STRIPPEDINPUT is just another name for
        # EXPANDEDINPUT.
        rm -f $STRIPPEDINPUT
        ln $EXPANDEDINPUT $STRIPPEDINPUT
        case "$debug" in
            yes)
                # Re-create the debugging link so it follows the new file.
                rm -f ${TDIR}/STRIPPEDINPUT
                ln $STRIPPEDINPUT ${TDIR}/STRIPPEDINPUT
                ;;
        esac
        ;;
    *)
        # Keep only the words the -s dictionary does not accept.
        ispell "$wchars" -l $dictopt -p /dev/null \
            < $EXPANDEDINPUT > $STRIPPEDINPUT
        ;;
esac
|
|
#
|
|
# Figure out what the flag-marking character is.
|
|
#
|
|
# Pull the "flagmarker" line out of ispell's dump of the compiled table.
flagmarker=`ispell -D -d $FAKEHASH | sed -n '/^flagmarker/s/flagmarker //p'`
# When the reported value starts with a backslash, the marker proper is
# the character after it.
case "$flagmarker" in
    \\*)
        flagmarker=`expr "$flagmarker" : '.\(.\)'`
        ;;
esac
|
|
#
|
|
# Munch the input to generate roots and affixes (ispell -c). We are
|
|
# only interested in words that have at least one affix (egrep $flagmarker);
|
|
# the next step will pick up the rest. Some of the roots are illegal. We
|
|
# use join to restrict the output to those root words that are found
|
|
# in the original dictionary.
|
|
#
|
|
# Munch STRIPPEDINPUT into root/flag form, one entry per line, keep only
# entries that carry a flag marker, and join against EXPANDEDINPUT so that
# only roots present in the original word list survive.  Sort keys use the
# POSIX "-k" notation because the older "+pos -pos" form is no longer
# accepted by current sorts.
ispell "$wchars" -c -W0 -d $FAKEHASH -p /dev/null < $STRIPPEDINPUT \
  | tr " " '\012' \
  | egrep "$flagmarker" \
  | sort $SORTTMP -u "-t$flagmarker" -k 1,1 -k 2 \
  | join "-t$flagmarker" - $EXPANDEDINPUT > $CRUNCHEDINPUT
|
|
#
|
|
# We now have a list of legal roots, and of affixes that apply to the
|
|
# root words. However, it is possible for some affix flags to generate more
|
|
# than one output word. For example, with the flag table entry
|
|
#
|
|
# flag R: . > ER
|
|
# . > ERS
|
|
#
|
|
# the input "BOTHER" will generate an entry "BOTH/R" in CRUNCHEDINPUT. But
|
|
# this will accept "BOTHER" and "BOTHERS" in the dictionary, which is
|
|
# wrong (in this case, though it's good English).
|
|
#
|
|
# To cure this problem, we first have to know which flags generate which
|
|
# expansions. We use ispell -ee to expand the flags (the second e causes
|
|
# the root and flag to be included in the output), and run the result of
|
|
# that through sed to get pairs suitable for joining. In the example
|
|
# above, we would get
|
|
#
|
|
# BOTH/R BOTHER
|
|
# BOTH/R BOTHERS
|
|
#
|
|
# We save this in EXPANDEDPAIRS for the next step.
|
|
#
|
|
# First, a small sed script that we'll use a lot. It takes "a b c d" and
|
|
# produces "a b", "a c", and "a d". Lines without blanks are ignored.
|
|
#
|
|
# sed program, meant to be run with "sed -n".  On a line that contains a
# blank it prints the first word paired with the second, then restarts on
# the first word plus the remaining words, so "a b c d" yields "a b",
# "a c" and "a d".  Lines without a blank produce no output.
PAIRUP='/ /{
s;^\([^ ]*\) \([^ ]*\)\(.*\);\1 \2\
\1\3;
P
D
}'
|
|
# Expand every root/flag entry, split the result into "root/flags word"
# pairs, and sort on the expanded word (second field) ready for the join
# that follows.  "-k 2" is the POSIX spelling of the obsolete "+1" key.
ispell "$wchars" -ee -d $FAKEHASH -p /dev/null < $CRUNCHEDINPUT \
  | sed -n "$PAIRUP" \
  | sort $SORTTMP -k 2 > $EXPANDEDPAIRS
|
|
#
|
|
# Now we want to extract the lines in EXPANDEDPAIRS in which the second field
|
|
# is *not* listed in the original dictionary EXPANDEDINPUT; these illegal
|
|
# lines contain the flags we cannot include without accepting illegal words.
|
|
# It is somewhat easier to extract those which actually are listed (with
|
|
# join), and then use comm to strip these from EXPANDEDPAIRS to get the
|
|
# illegal expansions, together with the flags that generate them (we must
|
|
# re-sort EXPANDEDPAIRS before running comm). Sed
|
|
# gets rid of the expansion and uniq gets rid of duplicates. Comm then
|
|
# selects the remainder of the list from CRUNCHEDINPUT and puts it in
|
|
# LEGALFLAGLIST. The final step is to use a sort and icombine to put
|
|
# the list into a one-entry-per-root format.
|
|
#
|
|
# BTW, I thought of using cut for the sed step (on systems that have it),
|
|
# but it turns out that sed is faster!
|
|
#
|
|
# Pairs whose expansion (field 2) appears in EXPANDEDINPUT are legal.
# "-1 2 -o 1.1,1.2" is the POSIX spelling of the obsolete
# "-j1 2 -o 1.1 1.2".
join -1 2 -o 1.1,1.2 $EXPANDEDPAIRS $EXPANDEDINPUT \
  | sort $SORTTMP -u > $JOINEDPAIRS

# comm needs EXPANDEDPAIRS in whole-line order, not second-field order.
sort $SORTTMP -o $EXPANDEDPAIRS $EXPANDEDPAIRS

# Pairs found only in EXPANDEDPAIRS are illegal.  Drop the expansion to
# leave the offending root/flag, remove those entries from CRUNCHEDINPUT,
# and let icombine merge what is left into one entry per root.  Each
# temporary file is removed as soon as the command reading it finishes.
comm -13 $JOINEDPAIRS $EXPANDEDPAIRS \
  | (sed -e 's; .*$;;' ; /bin/rm -f $JOINEDPAIRS $EXPANDEDPAIRS) \
  | uniq \
  | (comm -13 - $CRUNCHEDINPUT ; /bin/rm -f $CRUNCHEDINPUT) \
  | sort $SORTTMP -u "-t$flagmarker" -k 1f,1 -k 1 \
  | $COMBINE $langtabs > $LEGALFLAGLIST
|
|
|
|
#
|
|
# LEGALFLAGLIST now contains root/flag combinations that, when expanded,
|
|
# produce only words from EXPANDEDPAIRS. However, there is still a
|
|
# problem if the language tables have any cross-product flags. A legal
|
|
# root may appear in LEGALFLAGLIST with two flags that participate
|
|
# in cross-products. When such a dictionary entry is expanded,
|
|
# the cross-products will generate some extra words that may not
|
|
# be in EXPANDEDPAIRS. We need to remove these from LEGALFLAGLIST.
|
|
#
|
|
# The first step is to collect the names of the flags that participate
|
|
# in cross-products. Ispell will dump the language tables for us, and
|
|
# sed is a pretty handy way to strip out extra information. We use
|
|
# uniq -c and a numerical sort to put the flags in approximate order of how
|
|
# "productive" they are (in terms of how likely they are to generate a lot
|
|
# of output words). The least-productive flags are given last and will
|
|
# be removed first.
|
|
#
|
|
# Dump the compiled language table and reduce it to one "p X" line per
# starred prefix-flag entry and one "s X" line per starred suffix-flag
# entry.  uniq -c then counts the entries for each flag, and the final
# sort orders by descending count, then by flag.
# NOTE(review): the patterns below expect exactly one blank before
# "flag"; confirm this against the actual "ispell -D" output format.
ispell -D -d $FAKEHASH \
  | sed -n '1,$s/:.*$//
/^flagmarker/d
/^prefixes/,/^suffixes/s/^ flag \*/p /p
/^suffixes/,$s/^ flag \*/s /p' \
  | sort $SORTTMP \
  | uniq -c \
  | sort $SORTTMP -k 1rn,1 -k 3 > $PRODUCTLIST
|
|
|
|
# Cross-product pruning.  This only applies when the language tables
# contain at least one cross-product prefix flag and at least one
# cross-product suffix flag.
if [ `egrep ' p ' $PRODUCTLIST | wc -l` -gt 0 \
  -a `egrep ' s ' $PRODUCTLIST | wc -l` -gt 0 ]
then
    #
    # The language tables allow cross products.  See if LEGALFLAGLIST has
    # any roots with multiple cross-product flags.  Put them in CROSSROOTS.
    #
    preflags=`sed -n 's/^[ 0-9]*p //p' $PRODUCTLIST | tr -d '\012'`
    sufflags=`sed -n 's/^[ 0-9]*s //p' $PRODUCTLIST | tr -d '\012'`
    egrep "$flagmarker.*[$preflags].*[$sufflags]|$flagmarker.*[$sufflags].*[$preflags]" \
        $LEGALFLAGLIST \
        > $CROSSROOTS

    #
    # We will need an awk script; it's so big that it core-dumps my shell
    # under certain conditions.  The rationale behind the script is
    # commented where the script is used.  Note that you may want to
    # change this script for languages other than English.
    #
    # The FLAGMARKER substitution needs a sed delimiter that differs from
    # the flag marker itself.
    #
    case "$flagmarker" in
        /)
            sedchar=:
            ;;
        *)
            sedchar=/
            ;;
    esac
    sed -e "s/PREFLAGS/$preflags/" -e "s/SUFFLAGS/$sufflags/" \
        -e "s;ILLEGALCOMBOS;$ILLEGALCOMBOS;" \
        -e "s${sedchar}FLAGMARKER${sedchar}$flagmarker${sedchar}" \
        > $AWKSCRIPT << 'ENDOFAWKSCRIPT'
BEGIN \
    {
    preflags = "PREFLAGS"
    sufflags = "SUFFLAGS"
    illegalcombos = "ILLEGALCOMBOS"
    flagmarker = "FLAGMARKER"
    pflaglen = length (preflags)
    for (i = 1;  i <= pflaglen;  i++)
        pflags[i] = substr (preflags, i, 1);
    sflaglen = length (sufflags)
    for (i = 1;  i <= sflaglen;  i++)
        sflags[i] = substr (sufflags, i, 1);
    }
    {
    len = length ($2)
    pnew2 = ""
    snew2 = ""
    pbad = ""
    sbad = ""
    sufs = 0
    pres = 0
    for (i = 1;  i <= len;  i++)
        {
        curflag = substr ($2, i, 1)
        for (j = 1;  j <= pflaglen;  j++)
            {
            if (pflags[j] == curflag)
                {
                pres++
                pnew2 = substr ($2, 1, i - 1) substr ($2, i + 1)
                pbad = curflag
                }
            }
        for (j = 1;  j <= sflaglen;  j++)
            {
            if (sflags[j] == curflag)
                {
                sufs++
                snew2 = substr ($2, 1, i - 1) substr ($2, i + 1)
                sbad = curflag
                }
            }
        }
    if (pres == 1)
        {
        print $1 flagmarker pnew2
        print $1 flagmarker pbad >> illegalcombos
        }
    else if (sufs == 1)
        {
        print $1 flagmarker snew2
        print $1 flagmarker sbad >> illegalcombos
        }
    else if (pres > 0)
        {
        print $1 flagmarker pnew2
        print $1 flagmarker pbad >> illegalcombos
        }
    else
        {
        print $1 flagmarker snew2
        print $1 flagmarker sbad >> illegalcombos
        }
    }
ENDOFAWKSCRIPT
    : > $ILLEGALCOMBOS
    dbnum=0
    while [ -s $CROSSROOTS ]
    do
        #
        # CROSSROOTS contains the roots whose cross-product expansions
        # might be illegal.  We now need to locate the actual illegal ones.
        # We do this in much the same way we created LEGALFLAGLIST from
        # CRUNCHEDINPUT.  First we make CROSSEXPANDED, which is analogous
        # to EXPANDEDPAIRS.
        #
        ispell "$wchars" -ee -d $FAKEHASH -p /dev/null < $CROSSROOTS \
          | sed -n "$PAIRUP" \
          | sort $SORTTMP -k 2 > $CROSSEXPANDED
        #
        # Now we join CROSSEXPANDED against EXPANDEDINPUT to produce
        # CROSSPAIRS, and then comm that against CROSSEXPANDED to
        # get CROSSILLEGAL, the list of illegal cross-product flag
        # combinations.
        #
        join -1 2 -o 1.1,1.2 $CROSSEXPANDED $EXPANDEDINPUT \
          | sort $SORTTMP -u > $CROSSPAIRS

        sort $SORTTMP -u -o $CROSSEXPANDED $CROSSEXPANDED

        comm -13 $CROSSPAIRS $CROSSEXPANDED \
          | sed -e 's; .*$;;' \
          | uniq > $CROSSILLEGAL

        if [ "$debug" = yes ]
        then
            # Keep a numbered copy of this pass's files.  Stale links
            # must be removed first because ln will not replace them.
            # NOTE(review): CROSSEXP.$dbnum is cleared here but nothing
            # in this section creates it -- confirm whether a link to
            # CROSSEXPANDED was intended.
            mv $CROSSROOTS $TDIR/CROSSROOTS.$dbnum
            rm -f $TDIR/CROSSEXP.$dbnum $TDIR/CROSSPAIRS.$dbnum \
                $TDIR/CROSSILLEGAL.$dbnum
            ln $CROSSPAIRS $TDIR/CROSSPAIRS.$dbnum
            ln $CROSSILLEGAL $TDIR/CROSSILLEGAL.$dbnum
        fi
        #
        # Now it is time to try to clear up the illegalities.  For
        # each word in the illegal list, remove one of the cross-product
        # flags.  The flag chosen is selected in an attempt to cure the
        # problem quickly, as follows:  (1) if there is only one suffix
        # flag or only one prefix flag, we remove that.  (2) If there is
        # a prefix flag, we remove the "least desirable" (according to
        # the order of preflags).  (This may be pro-English prejudice,
        # and you might want to change this if your language is
        # prefix-heavy).  (3) Otherwise we remove the least-desirable
        # suffix flag.
        #
        # The output of the awk script becomes the new CROSSROOTS.  In
        # addition, we add the rejected flags to ILLEGALCOMBOS (this is
        # done inside the awk script) so they can be removed from
        # LEGALFLAGLIST later.
        #
        awk "-F$flagmarker" -f $AWKSCRIPT $CROSSILLEGAL > $CROSSROOTS
        if [ "$debug" = yes ]
        then
            # Remove the working names so the next pass writes fresh
            # files rather than the ones the numbered links point to.
            /bin/rm -f $CROSSEXPANDED $CROSSPAIRS $CROSSILLEGAL
            dbnum=`expr $dbnum + 1`
        fi
    done
    /bin/rm -f $CROSSEXPANDED $CROSSPAIRS $CROSSILLEGAL $AWKSCRIPT
    #
    # Now we have, in ILLEGALCOMBOS, a list of root/flag combinations
    # that must be removed from LEGALFLAGLIST to get the final list
    # of truly legal flags.  ILLEGALCOMBOS has one flag per line, so
    # by turning LEGALFLAGLIST into this form (sed), it's an
    # easy task for comm.  We have to recombine flags again after the
    # extraction, to get all flags for a given root on the same line so
    # that cross-products will come out right.
    #
    # NOTE(review): the sed program below hard-codes "/" as the flag
    # marker instead of using $flagmarker; confirm its behavior for
    # language tables that select a different marker.
    #
    if [ -s $ILLEGALCOMBOS ]
    then
        sort $SORTTMP -u -o $ILLEGALCOMBOS $ILLEGALCOMBOS
        sort $SORTTMP $LEGALFLAGLIST \
          | sed '/\/../{
s;^\(.*\)/\(.\)\(.*\);\1/\2\
\1/\3;
P
D
}' \
          | comm -23 - $ILLEGALCOMBOS \
          | sort $SORTTMP -u "-t$flagmarker" -k 1f,1 -k 1 \
          | $COMBINE $langtabs > $CROSSROOTS
        mv $CROSSROOTS $LEGALFLAGLIST
        if [ "$debug" = yes ]
        then
            rm -f ${TDIR}/LEGALFLAGLIST1
            ln $LEGALFLAGLIST ${TDIR}/LEGALFLAGLIST1
        fi
    fi
fi
/bin/rm -f $PRODUCTLIST $CROSSROOTS $ILLEGALCOMBOS $EXPANDEDINPUT
|
|
#
|
|
# We now have (in LEGALFLAGLIST) a list of roots and flags which will
|
|
# accept words taken from EXPANDEDINPUT and no others (though some of
|
|
# EXPANDEDINPUT is not covered by this list). However, many of the
|
|
# expanded words can be generated in more than one way. For example,
|
|
# "bather" can be generated from "bath/R" and "bathe/R". This wastes
|
|
# unnecessary space in the raw dictionary and, in some cases, in the
|
|
# hash file as well. The solution is to list the various ways of
|
|
# getting a given word and choose exactly one. All other things being
|
|
# equal, we want to choose the one with the shortest root and the most
|
|
# flags. The awk script takes care of this by providing us with a field
|
|
# to sort on.
|
|
#
|
|
# The ispell/awk combination is similar to the ispell/sed pipe
|
|
# used to generate EXPANDEDPAIRS, except that the awk adds an extra
|
|
# field giving the sort order. The first sort gets things in order
|
|
# so the first root listed is the one we want, and the second sort (-um)
|
|
# then selects that first root. Sed strips the expansion from the root,
|
|
# and a final sort -u generates MINIMALAFFIXES, the final list of affixes
|
|
# that (more or less) minimally covers what it can from EXPANDEDINPUT.
|
|
#
|
|
# Incidentally, the sed in the pipe below is necessary only because
|
|
# some versions of awk can't handle large numbers of fields (e.g.,
|
|
# over 100). Otherwise we could just loop over the fields.
|
|
#
|
|
# Expand LEGALFLAGLIST into "root/flags word" pairs and let awk append the
# root length and the number of flags.  The first sort orders by expanded
# word, then shortest root, then most flags; the merge-only "sort -um"
# keeps one line per expanded word; sed drops everything after the
# root/flags entry; the last sort is folded and removes duplicates.
# Sort keys use the POSIX "-k" notation because the older "+pos -pos"
# form is no longer accepted by current sorts.
ispell "$wchars" -ee -d $FAKEHASH -p /dev/null < $LEGALFLAGLIST \
  | sed -n "$PAIRUP" \
  | awk '
    {
    rootl = index ($1, "'"$flagmarker"'")
    nflags = length (substr ($1, rootl)) - 1
    rootl--
    print $1, $2, rootl, nflags
    }' \
  | sort $SORTTMP -k 2,2 -k 3n,3 -k 4rn,4 -k 1,1 \
  | sort $SORTTMP -um -k 2,2 \
  | sed -e 's; .*$;;' \
  | sort $SORTTMP -u "-t$flagmarker" -k 1f,1 -k 1 > $MINIMALAFFIXES
/bin/rm -f $LEGALFLAGLIST
|
|
#
|
|
# Now we're almost done. MINIMALAFFIXES covers some (with luck, most)
|
|
# of the words in STRIPPEDINPUT. Now we must create a list of the remaining
|
|
# words (those omitted by MINIMALAFFIXES) and add it to MINIMALAFFIXES.
|
|
# The best way to do this is to actually build a partial dictionary from
|
|
# MINIMALAFFIXES in FAKEHASH, and then use ispell -l to list the words that
|
|
# are not covered by this dictionary. This must then be combined with the
|
|
# reduced version of MINIMALAFFIXES and sorted to produce the final result.
|
|
#
|
|
# Write the final word list to standard output, then remove every
# temporary file.  Sort keys use the POSIX "-k" notation because the older
# "+pos -pos" form is no longer accepted by current sorts.
if [ -s $MINIMALAFFIXES ]
then
    # Build a hash file from MINIMALAFFIXES so that ispell -l can report
    # the words it does not cover.
    if buildhash -s $MINIMALAFFIXES $langtabs $FAKEHASH > /dev/null
    then
        :
    else
        echo "Couldn't create intermediate hash file" 1>&2
        /bin/rm -f ${TMP}*
        exit 1
    fi
    if [ "$debug" = yes ]
    then
        # Keep the .cnt and .stat files that sit next to MINIMALAFFIXES.
        rm -f ${TDIR}/MINAFFIXES.cnt ${TDIR}/MINAFFIXES.stat
        ln $MINIMALAFFIXES.cnt ${TDIR}/MINAFFIXES.cnt
        ln $MINIMALAFFIXES.stat ${TDIR}/MINAFFIXES.stat
    fi
    # Uncovered words, followed by the combined affix entries, merged
    # into one folded, duplicate-free list.
    (ispell "$wchars" -l -d $FAKEHASH -p /dev/null < $STRIPPEDINPUT; \
        $COMBINE $langtabs < $MINIMALAFFIXES) \
      | sort $SORTTMP "-t$flagmarker" -u -k 1f,1 -k 1
else
    # MINIMALAFFIXES is empty; just produce a sorted version of
    # STRIPPEDINPUT
    sort $SORTTMP "-t$flagmarker" -u -k 1f,1 -k 1 $STRIPPEDINPUT
fi
/bin/rm -f ${TMP}*
|