: Use /bin/sh
# NOTE(review): the line above is the no-op ':' command with two arguments,
# not a '#!' interpreter line.  It looks like the old convention for marking
# a Bourne-shell script -- confirm before replacing it with '#!/bin/sh'.

# This script munches the Norwegian word-lists.

# Usage: munch.sh foo

# Munches the file norsk.foo and puts result in norsk.foo.new.  The
# file norsk.foo uses norsk.aff and norsk.foo.new uses norsk.aff.new.
# Both affix files must be present, even if they are identical.

# Due to bugs in ispell, special care has been taken.  Warning:
# Munching norsk.base takes 120M disk-space and a lot of time!

# Naming scheme: every file handled by this script is called "norsk" plus
# a suffix; the word-list suffix is taken from the first argument.
#
# NOTE(review): LANG is also the name of the standard locale variable.  If
# it is exported in the caller's environment, the value assigned here is
# inherited by the tools started later -- confirm that this is intended.
LANG="norsk"
AFF=".aff"
NEW=".new"
EXT=".$1"

# Intermediate files are removed when this is "true"; set it to "false"
# while debugging to keep them.
DELETEFILES=true

FROMAFF="${LANG}${AFF}"     # affix file that goes with the input word-list
TOAFF="${FROMAFF}${NEW}"    # affix file that goes with the output word-list
SOURCE="${LANG}${EXT}"      # word-list to munch
TARGET="${SOURCE}${NEW}"    # munched word-list

# munchlist does not handle the combination of *J and ~Z flags well by
# default: everything is mapped to *J.  To prevent that, temporary copies
# of both affix files (*.munch) are made for use by munchlist only.
#
#   FIXAFFREGEXP  On rule lines carrying the "#~" marker, replaces the four
#                 spaces that follow the "> ..." part with XXXX and appends
#                 " *HACK*" to the line.  The rules in the ~ flags must
#                 contain the string "#~" as a marker for this to work!
#   NORMALFLAG    Removes the "~" in front of a one- or two-character flag
#                 name on "flag" lines.
#
# Both expressions are written as plain POSIX basic regular expressions:
# the interval \{1,2\} is used instead of the GNU-only "\?" operator and
# the "^" anchor is kept outside the group, so non-GNU sed implementations
# match them the same way.

NORMALFLAG='s/^\(flag  *\)~\(.\{1,2\}:\)/\1\2/'
FIXAFFREGEXP='s/\(.*> *[-,A-Z]*\)    \( *#~.*\)$/\1XXXX\2 *HACK*/'

echo "Making fake affix files"
sed -e "${FIXAFFREGEXP}" -e "${NORMALFLAG}" "${FROMAFF}" > "${FROMAFF}.munch"
sed -e "${FIXAFFREGEXP}" -e "${NORMALFLAG}" "${TOAFF}" > "${TOAFF}.munch"

# Ispell might core-dump in the last part of the munchlist script if
# the whole dictionary is processed in one run; more precisely, if one
# gets hash overflows.  So let's split it.  For some reason this split
# works.  No words starting with the letters selected by the first grep
# below arise from prefix flags, so we lose nothing.
#
# NOTE(review): the bracket expressions in the two grep patterns look
# empty here; presumably they held non-ASCII letters that were lost in a
# character-set conversion -- confirm against a pristine copy before
# running this script.
#
# The first run handles the lines of ${SOURCE} selected by the first grep
# (output in ${TARGET}.ae), the second run the lines selected by the
# complementary pattern (output in ${TARGET}.nae).  Both runs hand the
# *.munch affix files to munchlist through -c and -l.

echo
echo "Munchig first part..."
grep "^[]"  ${SOURCE} | \
  munchlist -v -c ${FROMAFF}.munch -l ${TOAFF}.munch > ${TARGET}.ae
echo
echo "Munching second part..."
grep "^[^]" ${SOURCE} | \
  munchlist -v -c ${FROMAFF}.munch -l ${TOAFF}.munch > ${TARGET}.nae

# The *.munch affix files are removed here when DELETEFILES is "true".
$DELETEFILES && rm ${FROMAFF}.munch ${TOAFF}.munch

# Merge the dictionaries and remove some redundant flags that munchlist
# does not find.
#
# The test at the end proves that it works.  No docs :-)
#
# Pipeline stages:
#   1. cat + sort: concatenate both partial results, sort on the part
#      before the first "/" (case folded) and then on the whole line, and
#      drop duplicate lines.  The keys are given with -k; the obsolete
#      "+0f -1 +0" spelling of the same keys is rejected by current sort
#      implementations.
#   2. first sed: single-line rewrites, each removing one flag letter when
#      another one is present on the same entry: V after "et/...M", W after
#      "e/...M" or "e/...B", V after "er/...I", and C before M after "e/".
#   3. second sed: works on a two-line window (N ... P D).  When a line
#      "WORD/flags" is directly followed by "WORDe/flags", the R flag is
#      removed from the second line.  WORD is restricted by the pattern to
#      lower-case a-z with a constrained two-character ending.  This stage
#      relies on the GNU "\|" alternation operator.
#   4. third sed: same two-line window.  When a line "WORD/..." whose flags
#      start with an optional run of A/B followed by E is directly followed
#      by a line starting with "WORDer/AI", the E flag is removed from the
#      first line.
#
# In stages 3 and 4 the "$ p" / "$ d" commands print and drop the pattern
# space once the last input line has been read; otherwise P and D emit the
# first line of the window and continue with the remainder.

echo
echo "Removing some redundant flags..."

cat "${TARGET}.ae" "${TARGET}.nae" | \
sort -t/ -u -k1f,1 -k1 | \
sed \
 -e "s/\(et\/.*M.*\)V/\1/" \
 -e "s/\(e\/.*M.*\)W/\1/" \
 -e "s/\(er\/.*I.*\)V/\1/" \
 -e "s/\(e\/.*B.*\)W/\1/" \
 -e "s/\(e\/.*\)C\(.*M\)/\1\2/" | \
sed -e N -e "s/^\([a-z]*\([^e][^r]\|[e][^r]\|[r][^e]\)\)\/\([A-Zt-z]*\)\\
\1e\/\([A-Zt-z]*\)R\([A-Zt-z]*\)$/\1\/\3\\
\1e\/\4\5/g"  -e "$ p" -e "$ d" -e P -e D | \
sed -e N -e "s/^\([a-z]*\)\(\/[AB]*\)E\(.*\)\\
\1er\/AI/\1\2\3\\
\1er\/AI/"  -e "$ p" -e "$ d" -e P -e D  > "${TARGET}"

# The two partial results are removed here when DELETEFILES is "true".
$DELETEFILES && rm "${TARGET}.ae" "${TARGET}.nae"

# Let's compare the dictionaries to see if munchlist has done a good
# job.  If it hasn't, check if ispell core-dumped.  If it did, try to
# split the dictionary differently at the top of this file.
#
# Both word-lists are run through munchlist with the flag-less affix file
# null.aff given as the -l file; the two outputs (*.expanded) are then
# compared with diff.

echo
echo "Comparing the old and the new dictionaries..."

# We need an affix file without any flags: take ${TOAFF}, delete everything
# from the "prefixes" line onwards, and append a dummy suffix section.
# The empty second address (//) reuses the first regular expression, so the
# deleted range ends at the next "prefixes" line, or at end of file if
# there is none.

sed -e '/^prefixes *$/,//d' "${TOAFF}" > null.aff
cat >> null.aff <<'EOF'

suffixes

flag *z:
  Y Y Y Y Y   >   YYYYYY
EOF

munchlist -c "${FROMAFF}" -l null.aff "${SOURCE}" > "${SOURCE}.expanded"
munchlist -c "${TOAFF}" -l null.aff "${TARGET}" > "${TARGET}.expanded"

# null.aff is removed here when DELETEFILES is "true".
$DELETEFILES && rm null.aff

# Check if munchlist has done a good job.  Put the result in
# ${SOURCE}.diff: the wc counts of both files first, then the diff output.

{
  wc "${SOURCE}.expanded" "${TARGET}.expanded"
  diff "${SOURCE}.expanded" "${TARGET}.expanded"
} > "${SOURCE}.diff"
$DELETEFILES && rm "${SOURCE}.expanded" "${TARGET}.expanded"

echo
echo "Look in ${SOURCE}.diff for sizes and differenes."
