429 lines
17 KiB
Plaintext
429 lines
17 KiB
Plaintext
#
|
|
# $Id: english.aff,v 1.11 91/09/12 00:01:33 geoff Exp $
|
|
#
|
|
# Copyright 1987, 1988, 1989, by Geoff Kuenning, Manhattan Beach, CA
|
|
# Permission for non-profit use is hereby granted.
|
|
# All other rights reserved.
|
|
# See "version.h" for a more complete copyright notice.
|
|
#
|
|
# $Log: english.aff,v $
|
|
# Revision 1.11 91/09/12 00:01:33 geoff
|
|
# Fix the definition of "texchars" to reflect the new requirements, and
|
|
# fix the comments describing that statement.
|
|
#
|
|
# Revision 1.10 91/09/04 18:00:46 geoff
|
|
# Fix a tiny typo.
|
|
#
|
|
# Revision 1.9 91/05/27 21:35:12 geoff
|
|
# Move the nroffchars and texchars statements to conform with the new
|
|
# statement ordering restrictions.
|
|
#
|
|
# Revision 1.8 89/04/30 17:50:31 geoff
|
|
# Rewrite the comments (and the character-set declarations) to describe and
|
|
# conform to the new yacc grammar.
|
|
#
|
|
# Revision 1.7 89/02/18 00:51:23 geoff
|
|
# Add definitions of "nroffchars" and "texchars", primarily to serve as
|
|
# an example. (Ken Stevens)
|
|
#
|
|
# Revision 1.6 88/12/26 02:24:03 geoff
|
|
# Update the copyright notice.
|
|
#
|
|
# Revision 1.5 88/02/20 23:10:19 geoff
|
|
# Add comments about what flags are in use. Fix a cosmetic case error.
|
|
#
|
|
# Revision 1.4 87/09/14 22:38:13 geoff
|
|
# Add copyright comments
|
|
#
|
|
# Revision 1.3 87/09/09 00:16:17 geoff
|
|
# Add comments about how charset statements now control collating order.
|
|
#
|
|
# Revision 1.2 87/08/02 17:17:03 geoff
|
|
# Add three prefixes (A: re, I: in, and U: un), cross-product flags, and
|
|
# extensive commentary roughly documenting the table format, in case this
|
|
# file gets distributed by itself.
|
|
#
|
|
# Revision 1.1 87/06/28 23:13:54 geoff
|
|
# Initial revision
|
|
#
|
|
#
|
|
# Affix table for English
|
|
#
|
|
# Brief description of table syntax (* in syntax descriptions means one
|
|
# or more occurrences; square brackets and braces have their usual meaning):
|
|
#
|
|
# Although the grammar is designed for a line-oriented layout, it is actually
|
|
# a free-format yacc grammar and can be laid out weirdly if you want.
|
|
# Comments are started by "#" and continue to the end of the line.
|
|
# Backslashes are supported in the usual fashion (\nnn, plus specials
|
|
# \n, \r, \t, \v, \f, \b, and the new hex format \xnn). Any character
|
|
# with special meaning to the parser can be changed to an uninterpreted
|
|
# token by backslashing it; for example you can declare a flag named
|
|
# 'asterisk' or 'colon' with "flag \*:" or "flag \::".
|
|
#
|
|
# table : [headers] [prefixes] [suffixes]
|
|
#
|
|
# One of "prefixes" and "suffixes" is required. They can appear in either
|
|
# order.
|
|
#
|
|
# headers : {options | char-sets }*
|
|
#
|
|
# The headers contain global descriptions and options. These are used to
|
|
# describe the character sets used by the language, the characters that have
|
|
# special meaning to the formatter, and the defaults for certain ispell
|
|
# options.
|
|
#
|
|
# options : { fmtr-stmt | bool-stmt | flag-stmt }
|
|
#
|
|
# fmtr-stmt : { nroff-stmt | tex-stmt }
|
|
#
|
|
# A fmtr-stmt describes characters that have special meaning to a formatter.
|
|
# Normally, this statement is not necessary, but some languages may have
|
|
# preempted the usual defaults for use as language-specific characters. In
|
|
# this case, these statements may be used to redefine the special characters
|
|
# expected by the formatter.
|
|
#
|
|
# nroff-stmt : "nroffchars" string
|
|
#
|
|
# The nroff-stmt describes the characters that have special meaning to the
|
|
# nroff/troff family of formatters. A string of exactly five characters must
|
|
# be given. The default string is "().\*" and refers to the meanings of those
|
|
# five characters in an unmodified nroff. In other words, the backslash is
|
|
# used to introduce special formatter sequences, the asterisk is used to
|
|
# introduce string substitutions, the left parenthesis is used after a backslash
|
|
# to introduce two-character symbol names, and the period is used at the
|
|
# beginning of a line to introduce formatter commands. (The right parenthesis
|
|
# is not currently used, but is included for completeness.)
|
|
#
|
|
# tex-stmt : "texchars" string
|
|
#
|
|
# The tex-stmt is similarly used to describe characters that have special
|
|
# meaning to the TeX and LaTeX formatters. A string of exactly 13 characters
|
|
# must be given. The default string is "()[]{}<>\$*.%" and refers to the
|
|
# meanings of those characters in an unmodified TeX processor. (The
|
|
# dot, square brackets, and angle brackets also apply to the "tib"
|
|
# bibliographic processor.)
|
|
#
|
|
# bool-stmt : { "compoundwords" on-or-off | "allaffixes" on-or-off }
|
|
#
|
|
# on-or-off : { "on" | "off" }
|
|
#
|
|
# A bool-stmt controls certain ispell defaults that are best made
|
|
# language-specific. The "compoundwords" statement controls the default
|
|
# for the "-B" and "-C" options, which control whether run-together words
|
|
# are acceptable spellings. "Compoundwords" should usually be turned on
|
|
# for languages such as German, where legal words may be formed from the
|
|
# concatenation of shorter words. The default is "off".
|
|
#
|
|
# The "allaffixes" statement controls the default for the "-P" and "-m"
|
|
# options, which control whether ispell will suggest possible
|
|
# affix derivations for misspelled words even if possible corrections
|
|
# are found in the dictionary. This option should usually be turned on
|
|
# when you are developing the dictionary for a language, and off for
|
|
# languages that have fairly complete dictionaries. The default is "off".
|
|
#
|
|
# flag-stmt : "flagmarker" character
|
|
#
|
|
# The flag-stmt describes the character which is used to separate affix
|
|
# flags from the root word in a raw dictionary file. This must be a
|
|
# character which is not found in any word (including in string characters;
|
|
# see below). The default is "/" because this character is not normally
|
|
# used to represent special characters in any language.
|
|
#
|
|
# char-sets : { char-stmt | string-stmt }
|
|
#
|
|
# The character-set section describes the characters that can be part of
|
|
# a word. A char-stmt describes single characters; a string-stmt
|
|
# describes characters that must appear together as a string. Either may
|
|
# also describe conversion between upper and lower case.
|
|
#
|
|
# char-stmt : "wordchars" character-range
|
|
# | "wordchars" lowercase-range uppercase-range
|
|
# | "boundarychars" character-range
|
|
# | "boundarychars" lowercase-range uppercase-range
|
|
# string-stmt : "stringchar" string
|
|
# | "stringchar" lowercase-string uppercase-string
|
|
#
|
|
# Characters described with the "boundarychars" statement are considered
|
|
# part of a word only if they are embedded between characters declared with
|
|
# the "wordchars" or "stringchar" statements.
|
|
#
|
|
# If two ranges or strings are given in a statement, the first describes
|
|
# characters that are interpreted as lowercase and the second describes
|
|
# uppercase. In the case of a string-stmt, the two strings must be of the
|
|
# same length. Also, in a string-stmt, the actual strings may contain
|
|
# both uppercase and lowercase characters themselves without difficulty; for instance
|
|
# the statement
|
|
#
|
|
# stringchar \\*(sS \\*(Ss
|
|
#
|
|
# is legal and will not interfere with (or be interfered with by) other
|
|
# declarations of "s" and "S" as lower and upper case, respectively.
|
|
#
|
|
# A final note on string characters: some languages (e.g., German) collate
|
|
# certain special characters as if they were strings. For example, "a-umlaut"
|
|
# is traditionally sorted as if it were "ae". Ispell is not capable of this;
|
|
# each character must be treated as an individual entity. So in certain cases,
|
|
# ispell will sort a list of words into a different order than the standard
|
|
# "dictionary" order for the subject language.
|
|
#
|
|
# A range is given as in egrep and the shell: [a-z] means lowercase
|
|
# alphabetics; [^a-z] means all but lowercase, etc. If more than
|
|
# one statement of a type is given, the characters given are combined
|
|
# to produce the final list of characters.
|
|
#
|
|
# In addition to defining the character sets, the order of these statements
|
|
# defines the collating sequence for sorts. If a range is given, the
|
|
# characters defined by that statement will collate in their ASCII order.
|
|
#
|
|
# The character-declaration statements have a rather strange behavior caused
|
|
# by the need to match each lowercase character with its uppercase equivalent.
|
|
# In any given "wordchars" or "boundarychars" statement, the characters in
|
|
# each range are first sorted into ASCII collating sequence, then matched
|
|
# one-for-one with the other range. (The two ranges must have the same
|
|
# number of characters). Thus, for example, the two statements:
|
|
#
|
|
# wordchars [aeiou] [AEIOU]
|
|
# wordchars [aeiou] [UOIEA]
|
|
#
|
|
# would produce exactly the same effect. To get the vowels to match
|
|
# up "wrong", you would have to use separate statements:
|
|
#
|
|
# wordchars a U
|
|
# wordchars e O
|
|
# wordchars i I
|
|
# wordchars o E
|
|
# wordchars u A
|
|
#
|
|
# which would cause myupper('e') to be 'O', and mylower('O') to be 'e'.
|
|
#
|
|
# This should normally be a problem only with languages which have been
|
|
# forced to use a strange ASCII collating sequence. If your uppercase
|
|
# and lowercase letters both collate in the same order, and your lowercase
|
|
# characters are selected from the "lowercase specials" -- `~{}| --
|
|
# you shouldn't have to worry about this "feature".
|
|
#
|
|
# The prefixes and suffixes sections have exactly the same syntax, except
|
|
# that one is introduced with "prefixes" and one with "suffixes". I'll
|
|
# describe them together.
|
|
#
|
|
# prefixes : "prefixes" table*
|
|
# suffixes : "suffixes" table*
|
|
# table : flagdef*
|
|
# flagdef : "flag" [*] <character> ":" replacement*
|
|
#
|
|
# A prefix or suffix table consists of an introductory keyword and a list
|
|
# of flag definitions. Flags can be defined more than once, in which case
|
|
# the definitions are combined. Each flag controls one or more "replacements"
|
|
# which are conditionally applied to the beginnings or endings of various
|
|
# words.
|
|
#
|
|
# Flags are named by a single character. Depending on a configuration file
|
|
# define, this character can be either an uppercase letter (the default
|
|
# configuration) or any 7-bit ASCII character. Most languages should be
|
|
# able to get along with just 26 flags, though.
|
|
#
|
|
# If an asterisk is placed before the flag character, it means that this
|
|
# flag participates in cross-product formation. This only matters if the
|
|
# file describes both prefixes and suffixes. If so, all prefixes and
|
|
# suffixes marked with an asterisk will be applied in all combinations
|
|
# to the root word. This can form a large number of words quickly, some
|
|
# of which may be illegal, so watch out. If cross-products produce illegal
|
|
# words, "munchlist" will not produce those flag combinations, and you
|
|
# will just waste time (yours and ispell's).
|
|
#
|
|
# replacement : condition* ">" [ "-" strip-string "," ] append-string
|
|
#
|
|
# A replacement is a conditional rule for modifying a root word. Up to
|
|
# 8 conditions may be specified. If the conditions are satisfied, the
|
|
# rules on the right-hand side of the replacement are followed. If a
|
|
# strip-string is given, it is first stripped from the beginning or ending
|
|
# (as appropriate) of the root word. Then the append-string is added at
|
|
# that point. For example, the condition "." means "any word", and the
|
|
# condition "Y" means "any word ending in Y". The following (suffix)
|
|
# replacements:
|
|
#
|
|
# . > MENT
|
|
# Y > -Y,IES
|
|
#
|
|
# would change "induce" to "inducement" and "fly" to "flies". (If they
|
|
# were controlled by the same flag, they would also change "fly" to "flyment",
|
|
# which might not be what was wanted).
|
|
#
|
|
# No matter how much you might wish it, the strings on the right must be
|
|
# strings of specific characters, not ranges. The reasons are rooted deeply
|
|
# in the way ispell works, and it would be difficult or impossible to provide
|
|
# for more flexibility. For example, you might wish to write:
|
|
#
|
|
# [EY] > -[EY],IES
|
|
#
|
|
# Too bad. You have to write it as two separate rules:
|
|
#
|
|
# E > -E,IES
|
|
# Y > -Y,IES
|
|
#
|
|
# condition : { "." | character | range }
|
|
#
|
|
# A condition is a restriction on the characters that adjoin and/or are
|
|
# replaced by the right-hand side of the replacement. Up to 8 conditions
|
|
# may be given, which should be enough context for anyone. The right-hand
|
|
# side will be applied only if the conditions in the replacement are
|
|
# satisfied. The conditions also implicitly define a length; roots
|
|
# shorter than the number of conditions will not pass the test. (As a
|
|
# special case, a condition of a single dot "." defines a length of zero,
|
|
# so that the rule applies to all words indiscriminately).
|
|
#
|
|
# Conditions that are single characters should be separated by white space.
|
|
#
|
|
# An example from below:
|
|
#
|
|
# flag *S:
|
|
# [^AEIOU]Y > -Y,IES # As in imply > implies
|
|
# [AEIOU]Y > S # As in convey > conveys
|
|
# [SXZH] > ES # As in fix > fixes
|
|
# [^SXZHY] > S # As in bat > bats
|
|
#
|
|
# The first line applies to words ending in Y, but not in vowel-Y. The
|
|
# second takes care of the vowel-Y words. The third then handles those
|
|
# words that end in a sibilant or near-sibilant, and the last picks up
|
|
# everything else.
|
|
#
|
|
# Note that the conditions are written very carefully so that they apply
|
|
# to disjoint sets of words. In particular, note that the fourth line
|
|
# excludes words ending in Y as well as the obvious SXZH. Otherwise, it
|
|
# would convert "imply" into "implys".
|
|
#
|
|
# Although we do not do so in this file, we could also have a flag
|
|
# generate more than one variation on a root word. For example, we
|
|
# could extend the "R" flag as follows:
|
|
#
|
|
# flag *R:
|
|
# E > R # As in skate > skater
|
|
# E > RS # As in skate > skaters
|
|
# [^AEIOU]Y > -Y,IER # As in multiply > multiplier
|
|
# [^AEIOU]Y > -Y,IERS # As in multiply > multipliers
|
|
# [AEIOU]Y > ER # As in convey > conveyer
|
|
# [AEIOU]Y > ERS # As in convey > conveyers
|
|
# [^EY] > ER # As in build > builder
|
|
# [^EY] > ERS # As in build > builders
|
|
#
|
|
# This flag would generate both "skater" and "skaters" from "skate". This
|
|
# capability can be very useful in languages that make use of noun, verb,
|
|
# and adjective endings. For instance, one could define a single flag
|
|
# that generated all of the German "weak" verb endings.
|
|
#
|
|
|
|
# Formatter-special characters.  After backslash processing both strings
# equal the documented defaults, so these two statements mainly serve as
# examples.  The doubled backslash stands for one literal backslash.
nroffchars ().\\*
|
|
# NOTE(review): "[" and ">" are backslashed presumably because they would
# otherwise be read as the start of a range and the replacement arrow.
# The resulting string must be exactly 13 characters long.
texchars ()\[]{}<\>\\$*.%
|
|
|
|
# And now to the English table itself
|
|
|
|
# First we declare the character set. Since it's English, it's easy.
|
|
# The only special character is the apostrophe, so that possessives can
|
|
# be handled. We declare it as a boundary character, so that quoting with
|
|
# single quotes doesn't confuse things. The apostrophe is the only
|
|
# character that gets such treatment.
|
|
#
|
|
# We declare the apostrophe first so that "Jon's" collates before "Jonas".
|
|
# (This is the way ASCII does it).
|
|
#
|
|
|
|
# The apostrophe counts as part of a word only when it is embedded between
# word characters.  It is declared first so that it collates before letters.
boundarychars '
|
|
# The 26 letters: first range is lowercase, second is the matching uppercase.
wordchars [a-z] [A-Z]
|
|
|
|
# Here's a record of flags used, in case you want to add new ones.
|
|
# Right now, we fit within the minimal MASKBITS definition.
|
|
#
|
|
#               ABCDEFGHIJKLMNOPQRSTUVWXYZ
|
|
# Used:         *  *  ****  ** * ***** ***
|
|
#               A  D  GHIJ  MN P RSTUV XYZ
|
|
# Available:     -- --    --  - -     -
|
|
#                BC EF    KL  O Q     W
|
|
|
|
# Now the prefix table. There are only three prefixes that are truly
|
|
# frequent in English, and none of them seem to need conditional variations.
|
|
#
|
|
# Start of the prefix table: the flag definitions that follow, up to the
# "suffixes" keyword, describe prefixes.
prefixes
|
|
|
|
# Flag A: prepend "RE".  The single "." condition matches any root word.
# The "*" makes this flag participate in cross-products with starred suffixes.
flag *A:
|
|
. > RE # As in enter > reenter
|
|
|
|
# Flag I: prepend "IN" to any root word (unconditional "." rule).
# Starred, so it participates in cross-products with starred suffixes.
flag *I:
|
|
. > IN # As in disposed > indisposed
|
|
|
|
# Flag U: prepend "UN" to any root word (unconditional "." rule).
# Starred, so it participates in cross-products with starred suffixes.
flag *U:
|
|
. > UN # As in natural > unnatural
|
|
|
|
# Finally, the suffixes. These are exactly the suffixes that came out
|
|
# with the original "ispell"; I haven't tried to improve them. The only
|
|
# thing I did besides translate them was to add selected cross-product flags.
|
|
#
|
|
# Start of the suffix table: every flag definition from here on is a suffix.
suffixes
|
|
|
|
# Flag V: the -IVE ending.  A final E is stripped before IVE is added;
# any other ending simply has IVE appended.  The two conditions are disjoint.
# No "*": this flag does not take part in cross-products with prefixes.
flag V:
|
|
E > -E,IVE # As in create > creative
|
|
[^E] > IVE # As in prevent > preventive
|
|
|
|
# Flag N: one flag covering three endings, selected by the root's last letter:
#   ends in E      -> strip E, add ION
#   ends in Y      -> strip Y, add ICATION
#   anything else  -> add EN
# Starred, so it participates in cross-products with starred prefixes.
flag *N:
|
|
E > -E,ION # As in create > creation
|
|
Y > -Y,ICATION # As in multiply > multiplication
|
|
[^EY] > EN # As in fall > fallen
|
|
|
|
# Flag X: the same three rules as flag N with a trailing S on each ending
# (IONS, ICATIONS, ENS).  Conditions are identical to those of flag N.
# Starred, so it participates in cross-products with starred prefixes.
flag *X:
|
|
E > -E,IONS # As in create > creations
|
|
Y > -Y,ICATIONS # As in multiply > multiplications
|
|
[^EY] > ENS # As in weak > weakens
|
|
|
|
# Flag H: the -TH ending.  A final Y is replaced by IETH; any other root
# simply has TH appended.
# No "*": this flag does not take part in cross-products with prefixes.
flag H:
|
|
Y > -Y,IETH # As in twenty > twentieth
|
|
[^Y] > TH # As in hundred > hundredth
|
|
|
|
# Flag Y: append "LY" to any root word (unconditional "." rule; nothing is
# stripped, whatever the root ends in).
# Starred, so it participates in cross-products with starred prefixes.
flag *Y:
|
|
. > LY # As in quick > quickly
|
|
|
|
# Flag G: the -ING ending.  A final E is stripped before ING is added;
# any other root has ING appended unchanged.
# Starred, so it participates in cross-products with starred prefixes.
flag *G:
|
|
E > -E,ING # As in file > filing
|
|
[^E] > ING # As in cross > crossing
|
|
|
|
# Flag J: the -INGS ending; same conditions as flag G with a trailing S.
# Starred, so it participates in cross-products with starred prefixes.
flag *J:
|
|
E > -E,INGS # As in file > filings
|
|
[^E] > INGS # As in cross > crossings
|
|
|
|
# Flag D: the -ED ending.  The four conditions cover disjoint sets of roots:
#   ends in E            -> add D
#   ends in non-vowel+Y  -> strip Y, add IED
#   ends in neither E/Y  -> add ED
#   ends in vowel+Y      -> add ED
# Starred, so it participates in cross-products with starred prefixes.
flag *D:
|
|
E > D # As in create > created
|
|
[^AEIOU]Y > -Y,IED # As in imply > implied
|
|
[^EY] > ED # As in cross > crossed
|
|
[AEIOU]Y > ED # As in convey > conveyed
|
|
|
|
# Flag T: the -EST ending.  The four conditions cover disjoint sets of roots:
#   ends in E            -> add ST
#   ends in non-vowel+Y  -> strip Y, add IEST
#   ends in vowel+Y      -> add EST
#   ends in neither E/Y  -> add EST
# No "*": this flag does not take part in cross-products with prefixes.
flag T:
|
|
E > ST # As in late > latest
|
|
[^AEIOU]Y > -Y,IEST # As in dirty > dirtiest
|
|
[AEIOU]Y > EST # As in gray > grayest
|
|
[^EY] > EST # As in small > smallest
|
|
|
|
# Flag R: the -ER ending.  The four conditions cover disjoint sets of roots:
#   ends in E            -> add R
#   ends in non-vowel+Y  -> strip Y, add IER
#   ends in vowel+Y      -> add ER
#   ends in neither E/Y  -> add ER
# Starred, so it participates in cross-products with starred prefixes.
flag *R:
|
|
E > R # As in skate > skater
|
|
[^AEIOU]Y > -Y,IER # As in multiply > multiplier
|
|
[AEIOU]Y > ER # As in convey > conveyer
|
|
[^EY] > ER # As in build > builder
|
|
|
|
# Flag Z: the -ERS ending; same conditions as flag R with a trailing S on
# each ending (RS, IERS, ERS).
# Starred, so it participates in cross-products with starred prefixes.
flag *Z:
|
|
E > RS # As in skate > skaters
|
|
[^AEIOU]Y > -Y,IERS # As in multiply > multipliers
|
|
[AEIOU]Y > ERS # As in convey > conveyers
|
|
[^EY] > ERS # As in build > builders
|
|
|
|
# Flag S: the -S / -ES ending.  The conditions cover disjoint sets of roots:
#   ends in non-vowel+Y    -> strip Y, add IES
#   ends in vowel+Y        -> add S
#   ends in S, X, Z or H   -> add ES
#   anything else          -> add S
# The last rule excludes Y as well as SXZH so it cannot also fire on the
# Y-ending roots already handled by the first two rules.
# Starred, so it participates in cross-products with starred prefixes.
flag *S:
|
|
[^AEIOU]Y > -Y,IES # As in imply > implies
|
|
[AEIOU]Y > S # As in convey > conveys
|
|
[SXZH] > ES # As in fix > fixes
|
|
[^SXZHY] > S # As in bat > bats
|
|
|
|
# Flag P: the -NESS ending.  The three conditions cover disjoint sets of roots:
#   ends in non-vowel+Y  -> strip Y, add INESS
#   ends in vowel+Y      -> add NESS
#   does not end in Y    -> add NESS
# Starred, so it participates in cross-products with starred prefixes.
flag *P:
|
|
[^AEIOU]Y > -Y,INESS # As in cloudy > cloudiness
|
|
[AEIOU]Y > NESS # As in gray > grayness
|
|
[^Y] > NESS # As in late > lateness
|
|
|
|
# Flag M: append "'S" to any root word (unconditional "." rule).  This relies
# on the apostrophe being declared with "boundarychars" earlier in the file.
# Starred, so it participates in cross-products with starred prefixes.
flag *M:
|
|
. > 'S # As in dog > dog's
|