/* #includes */ /*{{{C}}}*//*{{{*/
#undef  _POSIX_SOURCE
#define _POSIX_SOURCE   1
#undef  _POSIX_C_SOURCE
#define _POSIX_C_SOURCE 2

#include <assert.h>
#include <ctype.h>
#include <errno.h>
#include <locale.h>
#include <math.h>
#include <nl_types.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <getopt.h>
#include <config.h>

#include "misc.h"
#include "sentence.h"
/*}}}*/
/* #defines */ /*{{{*/
/* Messages are looked up in set 1 of the message catalogue catd; the string
   literal is the fallback used when the catalogue has no such message. */
#define USAGE1  catgets(catd,1, 1,"Usage: style [-L language] [-l length] [-r ari] [file ...]\n")
#define USAGE2  catgets(catd,1, 2,"       style [--language language] [--print-long length] [--print-ari ari]\n             [file ...]\n")
#define USAGE3  catgets(catd,1, 3,"       style --version\n")
#define MORE    catgets(catd,1, 4,"Try style -h|--help for more information.\n")
#define HELP1   catgets(catd,1, 5,"Analyse surface characteristics of a document.\n")
#define HELP2   catgets(catd,1, 6,"-L, --language    set the document language.\n")
#define HELP3   catgets(catd,1, 7,"-l, --print-long  print all sentences longer than <length> words\n")
#define HELP4   catgets(catd,1, 8,"-r, --print-ari   print all sentences with an ARI greater than <ari>\n")
#define HELP5   catgets(catd,1, 9,"-h, --help        print this message\n")
#define HELP6   catgets(catd,1,10,"    --version     print the version\n")
#define NOFILE  catgets(catd,1,11,"style: can not open %s: %s\n")
#define NOSENT  catgets(catd,1,12,"No sentences found.\n")
#define NOMEM   catgets(catd,1,13,"style: out of memory\n")
/*}}}*/

/* hit counting types */ /*{{{*/
/**
 * Histogram of positive integers.  It is defined before the variables
 * below because an object with internal linkage must not be defined with
 * an incomplete type.
 */
struct Hit /*{{{*/
{
  int *data;     /* data[n-1] is the number of times n has been noted */
  int capacity;  /* number of elements allocated for data */
  int size;      /* largest n noted so far, 0 if none */
};
/*}}}*/
/*}}}*/

/* variables */ /*{{{*/
static nl_catd catd;                 /* message catalogue used by the message macros */
static const char *lc_ctype;         /* character set name, taken from the environment */
enum lc_ctype_int { ASCII, ISO_8859_1 };
static enum lc_ctype_int lc_ctype_int; /* lc_ctype mapped to one of the known character sets */
static const char *docLanguage;      /* document language; a leading "de" selects German rules */

static int characters;               /* letters and digits that are part of words */
static int syllables;
static int words;
static int shortwords;               /* words counted with exactly one syllable */
static int longwords;                /* words longer than 6 characters */
static int bigwords;                 /* words counted with 3 or more syllables */
static int sentences;
static int questions;                /* sentences ending in '?' */
static int imperatives;
static int shortestLine,shortestLength; /* sentence number and word count of the shortest sentence */
static int longestLine,longestLength;   /* sentence number and word count of the longest sentence */
static int paragraphs;
static int printLongSentences=0;     /* if non-zero, print sentences with at least that many words */
static float printARI=0.0;           /* if non-zero, print sentences with a greater ARI */
static struct Hit lengths;           /* histogram of sentence lengths in words */
/*}}}*/

/* hit counting functions */ /*{{{*/
/**
 * Initialise an empty histogram with room for three counters, all zero.
 * Prints NOMEM and terminates the program if no memory is available.
 * @param hit the histogram to initialise
 */
static void newHit(struct Hit *hit) /*{{{*/
{
  hit->capacity=3;
  hit->size=0;
  /* calloc hands out zeroed counters, so no separate memset is needed */
  hit->data=calloc((size_t)hit->capacity,sizeof *hit->data);
  if (hit->data==NULL)
  {
    /* the text comes from a message catalogue, so it must not be used as a format string */
    fputs(NOMEM,stderr);
    exit(1);
  }
}
/*}}}*/
/**
 * Count one occurrence of the value n, growing the histogram if needed.
 * Prints NOMEM and terminates the program if no memory is available.
 * @param hit the histogram
 * @param n the value to count, must be positive
 */
static void noteHit(struct Hit *hit, int n) /*{{{*/
{
  assert(n>0);
  /* with assertions compiled out, a non-positive n would write in front of the array */
  if (n<=0) return;
  if (n>hit->capacity)
  {
    const int newCapacity=n*2;
    int *grown=realloc(hit->data,(size_t)newCapacity*sizeof *grown);

    if (grown==NULL)
    {
      /* the text comes from a message catalogue, so it must not be used as a format string */
      fputs(NOMEM,stderr);
      exit(1);
    }
    /* realloc leaves the added counters uninitialised */
    memset(grown+hit->capacity,0,(size_t)(newCapacity-hit->capacity)*sizeof *grown);
    hit->data=grown;
    hit->capacity=newCapacity;
  }
  ++hit->data[n-1];
  if (n>hit->size) hit->size=n;
}
/*}}}*/
/*}}}*/

/* readability formulas */ /*{{{*/
/**
 * Calculate the Kincaid formula (reading grade):
 * 11.8*syllables/words + 0.39*words/sentences - 15.59.
 * @param syllables number of syllables
 * @param words number of words
 * @param sentences number of sentences
 * @returns the reading grade
 */
static double kincaid(int syllables, int words, int sentences) /*{{{*/
{
  const double syllablesPerWord=(double)syllables/words;
  const double wordsPerSentence=(double)words/sentences;

  return 11.8*syllablesPerWord+0.39*wordsPerSentence-15.59;
}
/*}}}*/

/**
 * Calculate the Automated Readability Index (reading grade):
 * 4.71*letters/words + 0.5*words/sentences - 21.43.
 * @param letters number of letters
 * @param words the number of words
 * @param sentences the number of sentences
 * @returns the reading grade
 */
static double ari(int letters, int words, int sentences) /*{{{*/
{
  const double lettersPerWord=(double)letters/words;
  const double wordsPerSentence=(double)words/sentences;

  return 4.71*lettersPerWord+0.5*wordsPerSentence-21.43;
}
/*}}}*/

/**
 * Calculate the Coleman-Liau formula:
 * 5.89*letters/words - 0.3*(sentences per 100 words) - 15.8.
 * @param letters number of letters
 * @param words the number of words
 * @param sentences the number of sentences
 * @returns the reading grade
 */
static double coleman_liau(int letters, int words, int sentences) /*{{{*/
{
  const double lettersPerWord=(double)letters/words;
  /* the sentence term is the number of sentences per 100 words, not per word divided by 100 */
  const double sentencesPer100Words=100.0*(double)sentences/words;

  return 5.89*lettersPerWord-0.3*sentencesPer100Words-15.8;
}
/*}}}*/

/**
 * Calculate the Flesch reading ease formula:
 * 206.835 - 84.6*syllables/words - 1.015*words/sentences.
 * @param syllables number of syllables
 * @param words number of words
 * @param sentences number of sentences
 * @returns the reading ease score
 */
static double flesch(int syllables, int words, int sentences) /*{{{*/
{
  const double syllablesPerWord=(double)syllables/words;
  const double wordsPerSentence=(double)words/sentences;

  return 206.835-84.6*syllablesPerWord-1.015*wordsPerSentence;
}
/*}}}*/

/**
 * Calculate the fog index:
 * (words/sentences + percentage of big words) * 0.4.
 * @param words the number of words in the text
 * @param bigwords the number of big words; this file counts a word as big
 *        if it has three or more syllables
 * @param sentences the number of sentences
 * @returns the fog index
 */
static double fog(int words, int bigwords, int sentences) /*{{{*/
{
  const double wordsPerSentence=(double)words/sentences;
  const double bigwordPercentage=(double)bigwords/words*100;

  return (wordsPerSentence+bigwordPercentage)*0.4;
}
/*}}}*/

/**
 * Calculate 1. neue Wiener Sachtextformel (WSFT) as
 * 0.1935*bigwords/words + 0.1672*words/sentences - 0.1297*longwords/words
 * - 0.0327*shortwords/words - 0.875.
 * NOTE(review): the formula is usually published with the three word
 * classes given as percentages and with a positive weight for long words;
 * confirm whether the scaling and sign used here are intended.
 * @param words the number of words in the text
 * @param shortwords the number of words that contain one syllable
 * @param longwords the number of words that are longer than 6 characters
 * @param bigwords the number of big words; this file counts a word as big
 *        if it has three or more syllables
 * @param sentences number of sentences
 * @returns the index
 */
static double wsft(int words, int shortwords, int longwords, int bigwords, int sentences) /*{{{*/
{
  const double bigTerm=0.1935*(double)bigwords/words;
  const double sentenceTerm=0.1672*(double)words/sentences;
  const double longTerm=0.1297*(double)longwords/words;
  const double shortTerm=0.0327*(double)shortwords/words;

  return bigTerm+sentenceTerm-longTerm-shortTerm-0.875;
}
/*}}}*/

/**
 * Calculate the Wheeler-Smith formula: average sentence length in words
 * times the percentage of big words, divided by 10.
 * @param grade receives the grade level (school year 5 to 10).  It is set
 *        to 0 if the index is lower than any grade and to 99 if it is
 *        higher than any grade.
 * @param words the number of words in the text
 * @param bigwords the number of big words; this file counts a word as big
 *        if it has three or more syllables
 * @param sentences number of sentences
 * @returns the Wheeler-Smith index
 */
static double wheeler_smith(int *grade, int words, int bigwords, int sentences) /*{{{*/
{
  /* percentage/10 equals fraction*10; using the bare fraction divided by
     10 would keep the index far below every grade threshold */
  const double idx=((double)words)/sentences*10.0*((double)bigwords)/words;

  if (idx<=16) *grade=0;
  else if (idx<=20) *grade=5;
  else if (idx<=24) *grade=6;
  else if (idx<=29) *grade=7;
  else if (idx<=34) *grade=8;
  else if (idx<=38) *grade=9;
  else if (idx<=42) *grade=10;
  else *grade=99;
  return idx;
}
/*}}}*/

/**
 * Calculate the Lix formula of Bjornsson from Sweden: average sentence
 * length in words plus the percentage of long words.
 * @param grade receives the grade level (school year 5 to 11).  It is set
 *        to 0 if the index is lower than any grade and to 99 if it is
 *        higher than any grade.
 * @param words the number of words in the text
 * @param longwords the number of words that are longer than 6 characters
 * @param sentences number of sentences
 * @returns the Lix index
 */
static double lix(int *grade, int words, int longwords, int sentences) /*{{{*/
{
  /* long words enter as a percentage; the bare fraction would never get
     the index near the grade thresholds */
  const double idx=((double)words)/sentences+100.0*((double)longwords)/words;

  if (idx<34) *grade=0;
  else if (idx<38) *grade=5;
  else if (idx<41) *grade=6;
  else if (idx<44) *grade=7;
  else if (idx<48) *grade=8;
  else if (idx<51) *grade=9;
  else if (idx<54) *grade=10;
  else if (idx<57) *grade=11;
  else *grade=99;
  return idx;
}
/*}}}*/

/**
 * Calculate SMOG-Grading: the square root of the number of big words
 * normalised to a sample of 30 sentences, plus 3 (minus 2 for German
 * texts, that is if docLanguage starts with "de").
 * @param bigwords the number of big words; this file counts a word as big
 *        if it has three or more syllables
 * @param sentences number of sentences
 * @returns the SMOG grade
 */
static double smog(int bigwords, int sentences) /*{{{*/
{
  /* scale the count to 30 sentences: bigwords*30/sentences, not bigwords*sentences */
  const double bigwordsPer30Sentences=((double)bigwords)*30.0/((double)sentences);

  if (strncmp(docLanguage,"de",2)==0) return sqrt(bigwordsPer30Sentences)-2.0;
  else return sqrt(bigwordsPer30Sentences)+3.0;
}
/*}}}*/
/*}}}*/

/**
 * Test if the word is an imperative.  This function uses docLanguage to
 * determine the used language: German if it starts with "de", English
 * otherwise.
 * NOTE(review): a word matches if it merely starts with a list entry, so
 * "willow" matches "will"; l is not used.  This may be intended to catch
 * inflected forms -- confirm before tightening it to an exact match.
 * @param word start of the word inside a NUL terminated string
 * @param l the word's length
 * @returns 1 if the word starts with a listed imperative, 0 otherwise
 */
static int imperative(const char *word, size_t l) /*{{{*/
{
  /* The non-ASCII letters are written as ISO 8859-1 octal escapes so the
     list does not depend on the encoding of this source file:
     \337 is sharp s, \374 is u umlaut. */
  static const char *de[]= /* German imperatives */ /*{{{*/
  {
    "mu\337", "m\374ssen", "werden", "wird", "soll", (const char*)0
  };
  /*}}}*/
  static const char *en[]= /* English imperatives */ /*{{{*/
  {
    "shall", "must", "will", "should", (const char*)0
  };
  /*}}}*/
  const char **list;

  (void)l;
  if (strncmp(docLanguage,"de",2)==0) list=de;
  else list=en;
  for (; *list; ++list)
  {
    /* strncmp stops at the sentence's terminating NUL, so this never reads past the string */
    if (strncmp(*list,word,strlen(*list))==0) return 1;
  }
  return 0;
}
/*}}}*/

/* syllable counting */ /*{{{*/
/**
 * Check if the character is pronounced as a vowel.  Only lower case
 * letters are recognised.  With the ISO 8859-1 character set, the umlauts
 * a, o and u are vowels, too.
 * @param c the character
 * @returns non-zero if c is a vowel, 0 otherwise
 */
static int vowel(char c) /*{{{*/
{
  const int plain=(c=='a' || c=='e' || c=='i' || c=='o' || c=='u' || c=='y');

  switch (lc_ctype_int)
  {
    case ASCII: return plain;
    /* The umlauts are written as ISO 8859-1 octal escapes (\344 a umlaut,
       \366 o umlaut, \374 u umlaut) so they do not depend on the encoding
       of this source file.  Such a constant has the value of the
       corresponding char, so the comparison is right for both signed and
       unsigned char. */
    case ISO_8859_1: return (plain || c=='\344' || c=='\366' || c=='\374');
    default: assert(0);
  }
  /* only reached for an unknown character set if assertions are compiled out */
  return 0;
}
/*}}}*/

/**
 * Count syllables for English words by counting vowel-consonant pairs.  A
 * trailing "ed" is not looked at.
 * @param s the word
 * @param l the word's length
 * @returns the number of pairs found, but at least 1
 */
static int syll_en(const char *s, size_t l) /*{{{*/
{
  const char *p=s;
  size_t left=l;
  int pairs=0;

  if (left>=2 && p[left-2]=='e' && p[left-1]=='d') left-=2;
  while (left>0)
  {
    if (left>=2 && vowel(p[0]) && !vowel(p[1]))
    {
      /* a vowel followed by a consonant: count it and skip both */
      ++pairs;
      p+=2;
      left-=2;
    }
    else
    {
      ++p;
      --left;
    }
  }
  if (pairs==0) return 1;
  return pairs;
}
/*}}}*/

/**
 * Count syllables for German words.  If the word starts with a vowel,
 * vowel-consonant pairs are counted and a final e that follows a
 * consonant counts as well.  Otherwise consonant-vowel pairs are counted.
 * This algorithm fails on "vor-ueber".
 * @param s the word
 * @param l the word's length
 * @returns the number of syllables counted, but at least 1
 */
static int syll_de(const char *s, size_t l) /*{{{*/
{
  const size_t wordLength=l;
  const char *p=s;
  size_t left=l;
  int pairs=0;

  if (vowel(*p))
  {
    while (left>0)
    {
      if (left>=2 && vowel(p[0]) && !vowel(p[1]))
      {
        ++pairs;
        p+=2;
        left-=2;
      }
      else if (left==1 && wordLength>1 && !vowel(p[-1]) && p[0]=='e')
      {
        /* final e after a consonant */
        ++pairs;
        ++p;
        --left;
      }
      else
      {
        ++p;
        --left;
      }
    }
  }
  else
  {
    while (left>0)
    {
      if (left>=2 && !vowel(p[0]) && vowel(p[1]))
      {
        ++pairs;
        p+=2;
        left-=2;
      }
      else
      {
        ++p;
        --left;
      }
    }
  }
  if (pairs==0) return 1;
  return pairs;
}
/*}}}*/

/**
 * Count syllables.  Depending on the document language, the right
 * counting function is called: the German one if docLanguage starts with
 * "de", the English one otherwise.
 * @param s the word, must not be a null pointer
 * @param l the word's length, at least 1
 * @returns the number of syllables as counted by the chosen function
 */
static int syll(const char *s, size_t l) /*{{{*/
{
  int german;

  assert(s!=(const char*)0);
  assert(l>=1);
  german=(strncmp(docLanguage,"de",2)==0);
  return (german ? syll_de(s,l) : syll_en(s,l));
}
/*}}}*/
/*}}}*/

/**
 * Process one sentence and update the global statistics.  A sentence of
 * length 0 only increments the paragraph counter.  Words are runs of
 * letters, numbers are runs of digits; a number counts as one word with
 * one syllable.  If requested by printLongSentences or printARI, the
 * sentence is printed.
 * @param str sentence, NUL terminated
 * @param length its length
 * @param file name printed in front of a reported sentence
 * @param line number printed after the file name (presumably the line of
 *        the sentence -- confirm against the caller)
 */
static void style(const char *str, size_t length, const char *file, int line) /*{{{*/
{
  int inword=0;
  int innumber=0;
  int wordLength=-1;
  int sentWords=0;
  int sentLetters=0;
  const char *s=str;

  if (length==0) { ++paragraphs; return; }
  assert(str!=(const char*)0);
  assert(length>=2);
  /* The terminating NUL is processed like any other separator, so a word
     or number that ends the string is finished, too. */
  for (;;)
  {
    /* <ctype.h> functions need the value of an unsigned char; a plain
       char may be negative for ISO 8859-1 letters */
    const int ch=(unsigned char)*s;

    if (inword)
    {
      if (!isalpha(ch))
      {
        const int count=syll(s-wordLength,wordLength);

        inword=0;
        syllables+=count;
        if (count>=3) ++bigwords;
        else if (count==1) ++shortwords;
        if (wordLength>6) ++longwords;
        if (imperative(s-wordLength,wordLength)) ++imperatives;
      }
      else
      {
        ++wordLength;
        ++characters;
        ++sentLetters;
      }
    }
    else if (innumber)
    {
      if (!isdigit(ch))
      {
        innumber=0;
        ++syllables;
      }
      else
      {
        ++wordLength;
        ++characters;
        ++sentLetters;
      }
    }
    else
    {
      if (isalpha(ch))
      {
        ++words;
        ++sentWords;
        inword=1;
        wordLength=1;
        ++characters;
        ++sentLetters;
      }
      else if (isdigit(ch))
      {
        ++words;
        ++sentWords;
        innumber=1;
        wordLength=1;
        ++characters;
        ++sentLetters;
      }
    }
    if (ch=='\0') break;
    ++s;
  }
  ++sentences;
  if (shortestLine==0 || sentWords<shortestLength)
  {
    shortestLine=sentences;
    shortestLength=sentWords;
  }
  if (longestLine==0 || sentWords>longestLength)
  {
    longestLine=sentences;
    longestLength=sentWords;
  }
  if (str[length-1]=='?') ++questions;
  /* noteHit only accepts positive values; a sentence without words has no histogram slot */
  if (sentWords>0) noteHit(&lengths,sentWords);
  /* the ARI divides by the number of words, so it is only computed if there are any */
  if ((printLongSentences && sentWords>=printLongSentences) || (printARI && sentWords>0 && ari(sentLetters,sentWords,1)>printARI)) printf("%s:%d: %s\n",file,line,str);
}
/*}}}*/

int main(int argc, char *argv[]) /*{{{*/
{
  /* variables */ /*{{{*/
  int usage=0,c;
  static struct option lopts[]=
  {
    { "help", no_argument, 0, 'h' },
    { "print-long", required_argument, 0, 'l' },
    { "language", required_argument, 0, 'L' },
    { "print-ari", required_argument, 0, 'r' },
    { "version", no_argument, 0, 'v' },
    { (const char*)0, 0, 0, '\0' }
  };
  /*}}}*/

  /* locale */ /*{{{*/
  setlocale(LC_ALL,"");
  catd=catopen("style",0);
  /*}}}*/
  /* parse options */ /*{{{*/
#if 0
  lc_ctype=setlocale(LC_CTYPE,(const char*)0);
  docLanguage=setlocale(LC_MESSAGES,(const char*)0);
#else
  if ((lc_ctype=getenv("LC_CTYPE"))==(const char*)0) lc_ctype="C";
  if ((docLanguage=getenv("LC_MESSAGES"))==(const char*)0) docLanguage="C";
#endif
  if (strstr(lc_ctype,"8859-1")) lc_ctype_int=ISO_8859_1;
  else lc_ctype_int=ASCII;
  while ((c=getopt_long(argc,argv,"l:L:r:h",lopts,(int*)0))!=EOF) switch(c)
  {
    case 'l':
    {
      char *end;
      printLongSentences=strtol(optarg,&end,10);
      if (end==optarg || *end!='\0') usage=1;
      break;
    }
    case 'L':
    {
      docLanguage=optarg;
      break;
    }
    case 'r':
    {
      char *end;
      printARI=strtod(optarg,&end);
      if (end==optarg || *end!='\0') usage=1;
      break;
    }
    case 'v': fputs("GNU style " VERSION "\n",stdout); exit(0);
    case 'h': usage=2; break;
    default: usage=1; break;
  }
  if (usage==1)
  {
    fputs(USAGE1,stderr);
    fputs(USAGE2,stderr);
    fputs(USAGE3,stderr);
    fputs("\n",stderr);
    fputs(MORE,stderr);
    exit(1);
  }
  else if (usage==2)
  {
    fputs(USAGE1,stdout);
    fputs(USAGE2,stdout);
    fputs(USAGE3,stdout);
    fputs("\n",stdout);
    fputs(HELP1,stdout);
    fputs("\n",stdout);
    fputs(HELP2,stdout);
    fputs(HELP3,stdout);
    fputs(HELP4,stdout);
    exit(0);
  }
  /*}}}*/
  newHit(&lengths);
  if (optind==argc) sentence("style",catd,stdin,"(stdin)",style,docLanguage);
  else while (optind<argc)
  {
    FILE *fp;
    if ((fp=fopen(argv[optind],"r"))==(FILE*)0) fprintf(stderr,NOFILE,argv[optind],strerror(errno));
    {
      sentence("style",catd,fp,argv[optind],style,docLanguage);
      fclose(fp);
    }
    ++optind;
  }
  if (sentences==0)
  {
    printf(NOSENT);
  }
  else
  {
    int wsg;
    int lixg;
    int i,shortLength,shortSent,longLength,longSent;

    printf("readability grades:\n");
    printf("        Kincaid: %.1f\n",kincaid(syllables,words,sentences));
    printf("        ARI: %.1f\n",ari(characters,words,sentences));
    printf("        Coleman-Liau: %.1f\n",coleman_liau(characters,words,sentences));
    printf("        Flesch Index: %.1f\n",flesch(syllables,words,sentences));
    printf("        Fog Index: %.1f\n",fog(words,bigwords,sentences));
    printf("        1. WSFT Index: %.1f\n",wsft(words,shortwords,longwords,bigwords,sentences));
    printf("        Wheeler-Smith Index: %.1f = ",wheeler_smith(&wsg,words,bigwords,sentences));
    if (wsg==0) printf("below school year 5\n");
    else if (wsg==99) printf("higher than school year 10\n");
    else printf("school year %d\n",wsg);
    printf("        Lix: %.1f = ",lix(&lixg,words,longwords,sentences));
    if (lixg==0) printf("below school year 5\n");
    else if (lixg==99) printf("higher than school year 11\n");
    else printf("school year %d\n",lixg);
    printf("        SMOG-Grading: %.1f\n",smog(bigwords,sentences));

    printf("sentence info:\n");
    printf("        %d characters\n",characters);
    printf("        %d words, average length %.2f characters = %.2f syllables\n",words,((double)characters)/words,((double)syllables)/words);
    printf("        %d sentences, average length %.1f words\n",sentences,((double)words)/sentences);
    shortLength=((double)words)/sentences-4.5;
    if (shortLength<1) shortLength=1;
    for (i=0,shortSent=0; i<=shortLength; ++i) shortSent+=lengths.data[i];
    printf("        %d%% (%d) short sentences (at most %d words)\n",100*shortSent/sentences,shortSent,shortLength);
    longLength=((double)words)/sentences+10.5;
    for (i=longLength,longSent=0; i<=lengths.size; ++i) longSent+=lengths.data[i];
    printf("        %d%% (%d) long sentences (at least %d words)\n",100*longSent/sentences,longSent,longLength);
    printf("        %d paragraphs, average length %.1f sentences\n",paragraphs,((double)sentences)/paragraphs);
    printf("        %d questions, %d imperatives\n",questions,imperatives);
    printf("        longest sent %d wds at sent %d; shortest sent %d wds at sent %d\n",longestLength,longestLine,shortestLength,shortestLine);

/*
Missing output:

sentence types:
        simple 100% (1) complex   0% (0)
        compound   0% (0) compound-complex   0% (0)
word usage:
        verb types as % of total verbs
        tobe 100% (1) aux   0% (0) inf   0% (0)
        passives as % of non-inf verbs   0% (0)
        types as % of total
        prep 0.0% (0) conj 0.0% (0) adv 0.0% (0)
        noun 25.0% (1) adj 25.0% (1) pron 25.0% (1)
        nominalizations   0 % (0)
sentence beginnings:
        subject opener: noun (0) pron (1) pos (0) adj (0) art (0) tot 100%
        prep   0% (0) adv   0% (0)
        verb   0% (0)  sub_conj   0% (0) conj   0% (0)
        expletives   0% (0)
*/
  }
  exit(0);
}
/*}}}*/
