/*
** Copyright 1998 - 2009 Double Precision, Inc.  See COPYING for
** distribution information.
*/


/*
** $Id: html.c,v 1.29 2009/02/04 23:53:32 mrsam Exp $
*/
#include	<stdio.h>
#include	<ctype.h>
#include	<string.h>
#include	<stdlib.h>
#include	"config.h"
#include	"cgi/cgi.h"
#include	"sqwebmail.h"
#include	"rfc2045/rfc2045.h"

#include	"html.h"

/*
** Decode HTML character references in the NUL-terminated string p, in place.
**
** - Decimal numeric references ("&#NNN;", the ';' is optional) are replaced
**   by the single byte whose value is the number truncated to 8 bits.  A
**   reference that evaluates to 0 produces no output.
** - "&lt;", "&gt;", "&amp;" and "&quot;" (case-insensitive) are replaced by
**   the character they name.
** - Any other "&letters;" sequence is removed from the string.
** - A '&' that does not start one of the above is copied through unchanged.
**
** The result is never longer than the input, so the write pointer never
** overtakes the read pointer.
*/
void decodehtmlchar(char *p)
{
	char	*q;

	for (q=p; *p; )
	{
		size_t	i;

		if (*p != '&')
		{
			*q++=*p++;
			continue;
		}

		if (p[1] == '#')
		{
			unsigned c=0;

			/*
			** Consume exactly one digit per iteration.  Advancing
			** twice would drop every other digit and could step
			** over the terminating NUL.
			*/
			for (p += 2; isdigit((int)(unsigned char)*p); p++)
				c=c * 10 + (unsigned)(*p - '0');
			c=(unsigned char)c;
			if (c)	*q++=(char)c;
			if (*p == ';')	p++;
			continue;
		}

		/* Find the end of a run of letters following the '&' */
		for (i=1; p[i]; i++)
			if (!isalpha((int)(unsigned char)p[i]))	break;

		if (p[i] != ';')
		{
			/* Not a named entity, keep the '&' literally */
			*q++=*p++;
			continue;
		}

		/* Lowercase the entity name so the comparisons below are
		** case-insensitive; i ends up as the full entity length. */
		for (i=0; p[i] != ';'; i++)
			p[i]=(char)tolower((int)(unsigned char)p[i]);
		++i;
		if (strncmp(p, "&lt;", 4) == 0)
		{
			*q++ = '<';
		}
		else if ( strncmp(p, "&gt;",4) == 0)
		{
			*q++ = '>';
		}
		else if ( strncmp(p, "&amp;", 5) == 0)
		{
			*q++ = '&';
		}
		else if ( strncmp(p, "&quot;", 6) == 0)
		{
			*q++ = '"';
		}
		/* Unrecognized entities produce no output at all */
		p += i;
	}
	*q=0;
}

/*
	HTML sanitization filter.  Transforms HTML as follows:

	The following tags are dropped:

		<SCRIPT>, </SCRIPT>, <APP>, </APP>, <APPLET>, </APPLET>, 
		<SERVER>, </SERVER>, <OBJECT>, </OBJECT>, <HTML>, </HTML>, 
		<HEAD>, </HEAD>, <BODY>, </BODY>, <META>, <TITLE>, </TITLE>,
		<FRAME>, </FRAME>, <LINK>, <IFRAME> and </IFRAME>.

	The ONLOAD, ONMOUSEOVER, and all other ON* attributes are removed.

	Attributes BACKGROUND, STYLE, TARGET, CODE, ACTION, CODETYPE and
	LANGUAGE are removed.
	TARGET=_blank is added to all <A> tags.

	HREF, SRC, or LOWSRC attributes that do not specify a URL of http:,
	https:, ftp:, gopher:, wais:, or telnet:, are removed.

	Everything between <STYLE> and </STYLE>, and between <SCRIPT> and
	</SCRIPT>, is dropped.
*/

/*
** Append one character to the tag collection buffer, growing the buffer in
** 256 byte steps as needed.
**
** A single tag is limited to 1024 characters (DOS protection).  The limit is
** applied to the length of the tag currently being collected, not to the
** allocated size, so that one oversized tag does not prevent all subsequent
** tags from being collected.  Characters beyond the limit are discarded,
** except that a terminating NUL (c == 0) overwrites the last stored
** character so the buffer is always left NUL-terminated for the string
** functions that parse it.
**
** If the buffer cannot be grown, the character is silently dropped.
*/
static void addtagbuf(struct htmlfilter_info *info, int c)
{
	if (info->tagbuflen >= 1024)
	{
		/* DOS attack - truncate the tag, but keep it terminated */
		if (c == 0 && info->tagbuf)
			info->tagbuf[info->tagbuflen-1]=0;
		return;
	}

	if (info->tagbuflen >= info->tagbufsize)
	{
		/* realloc(NULL, n) behaves like malloc(n) */
		char	*newtagbuf=realloc(info->tagbuf, info->tagbufsize+256);

		if (!newtagbuf)
			return;

		info->tagbuf=newtagbuf;
		info->tagbufsize += 256;
	}
	info->tagbuf[info->tagbuflen++]=(char)c;
}

/* Parse the contents of tagbuf into individual attributes.  If argument is
** NULL, just the count of attributes is returned.  That's the first pass.
** On the second pass the argument points to a struct tagattrinfo array which
** we initialize.
**
** The first attribute is -- obviously -- the actual tag.
*/

/*
** Split the NUL-terminated contents of info->tagbuf into whitespace
** separated name[=value] items.
**
** tai == NULL: only count the items.
** tai != NULL: additionally fill in one struct tagattrinfo per item.  All
**              pointers stored there point into info->tagbuf; lengths are
**              in bytes and values exclude any surrounding quotes.  Items
**              without "=value" get tagvalue == 0 and tagvaluelen == 0.
**
** Malformed input (whitespace before '=', an unterminated quoted value, or
** '=' as the last character) causes the entire tag buffer to be overwritten
** with spaces, so a subsequent pass finds no items at all.
**
** Returns the number of items found.
*/
static size_t parseattr(struct htmlfilter_info *info, struct tagattrinfo *tai)
{
	size_t	count=0;
	char	*p;

	for (p=info->tagbuf; *p; )
	{
		while (*p && isspace((int)(unsigned char)*p))	p++;
		if (!*p)	break;

		++count;
		if (tai)
		{
			tai->tagname=p;
			tai->tagnamelen=0;
			tai->atagstart=p;
		}
		while (*p && !isspace((int)(unsigned char)*p) && *p != '=')
		{
			++p;
			if (tai)	++tai->tagnamelen;
		}
		if (*p != '=')	/* No attribute value */
		{
			if (tai)
			{
				tai->tagvalue=0;
				tai->tagvaluelen=0;
			}
			while (*p && isspace((int)(unsigned char)*p))	p++;
			if (*p == '=')
				/* "name =value" is not accepted: blank the tag */
				memset(info->tagbuf, ' ',
				       strlen(info->tagbuf));

		}
		else
		{
			char	quote;

			++p;
			quote=*p;
			if (quote == '"' || quote == '\'')
				/* Attr value in quotes */
			{
				++p;
				if (tai)
				{
					tai->tagvalue=p;
					tai->tagvaluelen=0;
				}
				while (*p && *p != quote)
				{
					++p;
					if (tai)	++tai->tagvaluelen;
				}
				if (*p)	p++;
				else
				{
					/* Missing closing quote: blank the tag */
					memset(info->tagbuf, ' ', strlen(info->tagbuf));
				}
			}
			else
			{
				if (quote == 0)
					/* '=' at the very end: blank the tag */
					memset(info->tagbuf, ' ', strlen(info->tagbuf));

				if (tai)
				{
					tai->tagvalue=p;
					tai->tagvaluelen=0;
				}
				/*
				** Unquoted value: runs up to the next
				** whitespace.  tagvalue stays at the start of
				** the value, only the length is counted.
				*/
				while (*p && !isspace((int)(unsigned char)*p))
				{
					p++;
					if (tai)	++tai->tagvaluelen;
				}
			}
		}
		if (tai)
		{
			tai->ataglen=p-tai->atagstart;
			++tai;
		}
	}
	return (count);
}

/*
** Pre-process info->tagbuf and (re)build the info->tagattr array from it.
**
** 1. Every '<' inside the tag is replaced by a space.
** 2. Every '&' is checked: if the run of ASCII letters/digits following it
**    is not ended by ';' or '=', the '&' itself is overwritten with NUL.
** 3. The attributes are counted, the tagattr array is enlarged when it is
**    too small (with 16 spare slots), and then filled in by a second
**    parseattr() pass.  On allocation failure tagattrlen is set to 0.
*/
static void parsetagbuf(struct htmlfilter_info *info)
{
	static const char entity_chars[]=
		"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
	char *p;

	for (p=info->tagbuf; (p=strchr(p, '<')) != NULL; )
		*p=' ';

	for (p=info->tagbuf; *p; p++)
	{
		char *amp;

		if (*p != '&')
			continue;

		amp=p++;

		/* Skip over the candidate entity name */
		while (*p && strchr(entity_chars, *p) != NULL)
			++p;

		if (*p != ';' && *p != '=')
			*amp=0;

		/* Step back one; the loop increment resumes the scan at the
		** character that ended the name. */
		--p;
	}

	info->tagattrlen=parseattr(info, 0);
	if (info->tagattrlen > info->tagattrsize)
	{
		/* realloc(NULL, n) behaves like malloc(n) */
		struct tagattrinfo *grown=
			realloc(info->tagattr,
				(info->tagattrlen+16)*sizeof(*info->tagattr));

		if (grown == NULL)
		{
			info->tagattrlen=0;
			return;
		}

		info->tagattrsize=info->tagattrlen+16;
		info->tagattr=grown;
	}
	info->tagattrlen=parseattr(info, info->tagattr);
}

/* See if this attribute is the one we're looking for */

/*
** Returns non-zero when the name of attribute *i equals the NUL-terminated
** string l, compared case-insensitively and over the full length of both.
*/
static int is_attr(struct tagattrinfo *i, const char *l)
{
	const size_t	want=strlen(l);

	if (i->tagnamelen != want)
		return (0);

	return (strncasecmp(i->tagname, l, want) == 0);
}

/* If this is the tag we're looking for */

/*
** Returns non-zero when the first parsed item of the current tag (the tag
** name itself) equals l.  Returns 0 when nothing was parsed.
*/
static int is_htmltag(struct htmlfilter_info *info, const char *l)
{
	if (!info->tagattrlen)
		return (0);

	return (is_attr(info->tagattr, l));
}

/* See if the attribute value starts with what we're looking for */

/*
** Returns 1 when string v, after skipping leading whitespace, begins with
** string l (case-insensitive comparison); 0 otherwise, or when v is NULL.
*/
static int is_valuestart(const char *v, const char *l)
{
	if (v == NULL)
		return (0);

	for ( ; isspace((int)(unsigned char)*v); ++v)
		;

	return (strncasecmp(v, l, strlen(l)) == 0);
}

/*
	htmlfilter() is repeatedly called to filter the HTML text.  htmlfilter()
	will call the callback that was passed to htmlfilter_alloc() with the
	filtered text, more or less on a one to one basis.

	htmlfilter_alloc() must be called before the first invocation of
	htmlfilter().  Because the HTML can be fed in arbitrary quantities,
	htmlfilter() implements a state machine whose state is kept in the
	struct htmlfilter_info that htmlfilter_alloc() initializes.
*/

/* Non-zero while at least one <STYLE> or <SCRIPT> element is open.  Expands
** to an expression that uses a variable named "info" from the enclosing
** scope. */
#define skipping() (info->instyletag || info->inscripttag)

/* Defined further down; htmlfilter() calls it for each complete tag. */
static void filtered_tag(struct htmlfilter_info *);

/*
** Allocate a zero-initialized filter state that starts in the "intext"
** state and delivers filtered output to func.
**
** Returns NULL when memory cannot be allocated.  Release the result with
** htmlfilter_free().
*/
struct htmlfilter_info *htmlfilter_alloc( void (*func)(const char *, size_t))
{
	struct htmlfilter_info *info=calloc(1, sizeof(*info));

	if (info != NULL)
	{
		/* Everything else, including the STYLE/SCRIPT nesting
		** counters, is already zero. */
		info->cur_state=intext;
		info->htmlfiltered_func=func;
	}
	return info;
}

/*
** Release a filter state together with the attribute array and tag buffer
** it owns.  info itself must not be NULL.
*/
void htmlfilter_free(struct htmlfilter_info *info)
{
	/* free() accepts NULL, so unallocated buffers need no special case */
	free(info->tagattr);
	free(info->tagbuf);
	free(info);
}

/*
** Set the prefix used to wash HTML links.  Only the pointer is stored, the
** string is not copied.
*/

void htmlfilter_washlink(struct htmlfilter_info *info, const char *p)
{
	info->washlink = p;
}

/*
** Set the base URL that is handed to rfc2045_append_url() together with
** each link value.  Only the pointer is stored, the string is not copied.
*/
void htmlfilter_contentbase(struct htmlfilter_info *info, const char *p)
{
	info->contentbase = p;
}

/*
** Set the prefix that replaces "mailto:" in links.  Only the pointer is
** stored, the string is not copied.
*/
void htmlfilter_washlinkmailto(struct htmlfilter_info *info, const char *p)
{
	info->washlinkmailto = p;
}

/*
** Register the callback used to translate cid: references, and the opaque
** argument passed back to it.  The callback receives the text following
** "cid:" and returns a string that the filter later releases with free().
*/
void htmlfilter_convertcid(struct htmlfilter_info *info,
			   char *(*cidfunc)(const char *, void *), void *arg)
{
	info->convertcid_arg = arg;
	info->htmlconvertcid_func = cidfunc;
}

/*
** Feed the next s bytes of HTML, starting at p, through the filter.
**
** Plain text is passed to the output callback as it is seen; complete tags
** are collected in the tag buffer and handed to filtered_tag().  SGML
** declarations (<!...>) and comments (<!-- ... -->) are discarded.  The
** parser state is kept in *info, so the input may be split at any byte.
**
** While inside <STYLE> or <SCRIPT> (see skipping()) no text is emitted.
** A '>' that occurs in plain text is emitted as the "&gt;" entity.
*/
void htmlfilter(struct htmlfilter_info *info, const char *p, size_t s)
{
size_t	l;
size_t	start=0;	/* First byte of p not yet output or consumed */

	for (l=0; l<s; l++)
	{
		switch (info->cur_state)	{
		case intext:
			if (p[l] == '>')
			{
				/*
				** Stray '>' in text: escape it.  Nothing may
				** be emitted while inside STYLE/SCRIPT, but
				** the text is consumed either way.
				*/
				if (!skipping())
				{
					(*info->htmlfiltered_func)(p+start, l-start);
					(*info->htmlfiltered_func)("&gt;", 4);
				}
				start=l+1;
			}

			if (p[l] != '<')	continue;
			if (!skipping())
				(*info->htmlfiltered_func)(p+start, l-start);
					/* Output everything up until the tag */
			info->cur_state=seenlt;
			info->tagbuflen=0;
			break;
		case seenlt:
			if (p[l] == '>')
			{
				info->cur_state=intext;
				start=l+1;
				if (!skipping())
					(*info->htmlfiltered_func)("<>", 2);
						/* Eh? */
				continue;
			}
			if (isspace((int)(unsigned char)p[l]))
				break;
			if (p[l] == '!')
				info->cur_state=seenltbang;
			else if (p[l] != '/'
				 && !isalpha((int)(unsigned char)p[l]))
			{
				/* Not a tag after all: drop the '<' and this
				** character, resume plain text after it */
				start=l+1;
				info->cur_state=intext;
				break;
			}
			else
				info->cur_state=intag;
			addtagbuf(info, p[l]);
			break;
		case intag:
			/* We're in a tag (not a <!-- comment)
			collect the contents in tagbuf, until > is seen */

			info->cur_state=intag;
			if (p[l] == '>')
			{
				start=l+1;
				info->cur_state=intext;
				filtered_tag(info);	/* Filter this tag */
				continue;
			}
			addtagbuf(info, p[l]);
			continue;

		case skiptag:
			if (p[l] == '>')
			{
				start=l+1;
				info->cur_state=intext;
			}
			continue;
		case seenltbang:
			/* We have <!.  If - is not here, this is a SGML tag */
			if (p[l] != '-')
			{
				info->cur_state=skiptag;
				continue;
			}

			addtagbuf(info, p[l]);
			info->cur_state=seenltbangdash;
			continue;

		case seenltbangdash:

			/* We have <!-. If - is not here, this is a SGML tag,
			otherwise we're in a comment, which is skipped */

			if (p[l] != '-')
			{
				info->cur_state=skiptag;
				continue;
			}

			start=l+1;
			info->cur_state=incomment;
			continue;

			/* Look for end of comment */

		case incomment:
			if (p[l] == '-')	info->cur_state=incommentseendash;
			continue;
		case incommentseendash:
			info->cur_state= p[l] == '-' ? incommentseendashdash
						:incomment;
			continue;
		case incommentseendashdash:
			if (p[l] == '-')	continue;
			if (p[l] != '>')
			{
				info->cur_state=incomment;
				continue;
			}
			info->cur_state=intext;
			start=l+1;
			continue;
		}
	}

	/* When we're done with this chunk, if we're doing plain text, pass
	** along whatever has not been output yet.  In every other state the
	** remainder of the chunk has already been buffered or is discarded. */

	switch (info->cur_state)	{
	case intext:
		if (!skipping())
			(*info->htmlfiltered_func)(p+start, l-start);
		break;
	default:
		break;
	}
}

/* Ok, wash an HREF link.  The entire A (or whatever) element is in tagbuf.
** tag=value pairs have been parsed into tagattr array.
**
** Our argument is the index of the HREF (or SRC) link, which points back
** into tagbuf.
**
** We build a new element, and then rebuild the tagbuf structure.
*/

static char *redirectencode(const char *, size_t );

/* replacelink takes care of replacing the contents of one tag's value. */

/*
** Replace the value of attribute number l of the current tag with the
** string p.
**
** A new tag string is assembled from the text before the old value, p, and
** the text after the old value.  It is then pushed back into the tag buffer
** through addtagbuf() and re-parsed with parsetagbuf(), which refreshes
** info->tagattr.  Nothing happens when the attribute has no value.  When
** memory runs out the tag buffer is reset to an empty string.
*/
static void replacelink(struct htmlfilter_info *info, size_t l, const char *p)
{
	struct tagattrinfo *attr=&info->tagattr[l];
	const char	*tail;
	char	*rebuilt;
	size_t	headlen;
	size_t	newlen;
	size_t	n;

	if (attr->tagvalue == NULL)
		return;

	headlen=attr->tagvalue - info->tagbuf;
	tail=attr->tagvalue + attr->tagvaluelen;
	newlen=strlen(p);

	/* Slightly more than needed: the old value is counted as well */
	rebuilt=malloc(strlen(info->tagbuf)+newlen+1);

	if (rebuilt == NULL)
	{
		info->tagbuflen=0;
		addtagbuf(info, 0);
		return;
	}

	memcpy(rebuilt, info->tagbuf, headlen);
	memcpy(rebuilt+headlen, p, newlen);
	strcpy(rebuilt+headlen+newlen, tail);

	/* tail points into the old buffer; it is not used past this point,
	** because addtagbuf() may move the buffer. */
	info->tagbuflen=0;
	for (n=0; rebuilt[n] != 0; n++)
		addtagbuf(info, rebuilt[n]);
	addtagbuf(info, 0);
	parsetagbuf(info);
	free(rebuilt);
}
/*
** Replace the value of attribute number l with info->washlink followed by
** the redirect-encoded form of value (see redirectencode()).
**
** When memory runs out the tag buffer is reset to an empty string.
*/
static void dowashlink(struct htmlfilter_info *info,
		       size_t l, const char *value)
{
	char	*url, *p;

	url=redirectencode(value, strlen(value));

	p=url ? malloc(strlen(url)+strlen(info->washlink)+1):NULL;

	if (!p)
	{
		/* url may have been allocated even though p was not */
		free(url);
		info->tagbuflen=0;
		addtagbuf(info, 0);
		return;
	}

	strcat(strcpy(p, info->washlink), url);
	replacelink(info, l, p);
	free(p);
	free(url);
}

/*
** Replace the value of attribute number l with info->washlinkmailto
** followed by mailtolink (the text after "mailto:").  The first '?' in the
** appended part is changed to '&'.
**
** When memory runs out the tag buffer is reset to an empty string.
*/
static void dowashlinkmailto(struct htmlfilter_info *info,
			     size_t l, const char *mailtolink)
{
	const size_t	prefixlen=strlen(info->washlinkmailto);
	const size_t	mailtolinklen=strlen(mailtolink);
	char	*newlink=malloc(prefixlen+1+mailtolinklen);
	char	*qmark;

	if (newlink == NULL)
	{
		info->tagbuflen=0;
		addtagbuf(info, 0);
		return;
	}

	memcpy(newlink, info->washlinkmailto, prefixlen);
	/* The +1 copies the terminating NUL as well */
	memcpy(newlink+prefixlen, mailtolink, mailtolinklen+1);

	qmark=strchr(newlink+prefixlen, '?');
	if (qmark != NULL)
		*qmark='&';

	replacelink(info, l, newlink);
	free(newlink);
}

/*
** Replace the value of attribute number l, a cid: link, with whatever the
** registered conversion callback returns for it.  The callback is given the
** text following the first four characters of link, and its result is
** released with free().  Without a callback, or when the callback returns
** NULL, the value is replaced by an empty string.
**
** NOTE(review): the fixed offset of 4 assumes link begins with "cid:"
** without leading whitespace - confirm against the callers.
**
** When memory runs out the tag buffer is reset to an empty string.
*/
static void dowashcid(struct htmlfilter_info *info, size_t l, const char *link)
{
size_t linklen=strlen(link);
char	*p;

	p=malloc(linklen+1);

	if (!p)
	{
		info->tagbuflen=0;
		addtagbuf(info, 0);
		return;		/* Must not fall through and write to NULL */
	}

	memcpy(p, link, linklen);
	p[linklen]=0;

	if (!info->htmlconvertcid_func)
		*p=0;
	else
	{
		char	*q=(*info->htmlconvertcid_func)(p+4,
							info->convertcid_arg);

		free(p);
		p=q;
	}
	replacelink(info, l, p ? p:"");
	free(p);
}

/*
** Copy the first l bytes of p, decode HTML character references in the
** copy with decodehtmlchar(), and return the result of cgiurlencode() on
** it.  Returns NULL when the temporary copy cannot be allocated.
*/
static char *redirectencode(const char *p, size_t l)
{
	char	*encoded;
	char	*copy=malloc(l+1);

	if (copy == NULL)
		return NULL;

	memcpy(copy, p, l);
	copy[l]='\0';

	decodehtmlchar(copy);
	encoded=cgiurlencode(copy);

	free(copy);
	return (encoded);
}


/*
** Look for an attribute called tagname in the current tag.  Index 0 holds
** the tag name itself and is never searched, which lets 0 double as the
** "not found" return value.
*/
static size_t find_tag(struct htmlfilter_info *info, const char *tagname)
{
	size_t	idx=1;

	while (idx < info->tagattrlen)
	{
		if (is_attr(&info->tagattr[idx], tagname))
			return (idx);
		++idx;
	}
	return (0);
}

/*
	Decide what to do with this tag
*/

static void filtered_tag(struct htmlfilter_info *info)
{
	size_t	l;
	int	open_window=0;
 
	addtagbuf(info, 0);
	parsetagbuf(info);

	if ( is_htmltag(info, "SCRIPT"))
	{
		++info->inscripttag;
		return;
	}

	if ( is_htmltag(info, "/SCRIPT"))
	{
		if (info->inscripttag)
			--info->inscripttag;
		return;
	}

	if (	is_htmltag(info, "TITLE") || is_htmltag(info, "/TITLE") ||
		is_htmltag(info, "FRAME") || is_htmltag(info, "/FRAME") ||
		is_htmltag(info, "IFRAME") || is_htmltag(info, "/IFRAME") ||
		is_htmltag(info, "APP") || is_htmltag(info, "/APP") ||
		is_htmltag(info, "APPLET") || is_htmltag(info, "/APPLET") ||
		is_htmltag(info, "SERVER") || is_htmltag(info, "/SERVER") ||
		is_htmltag(info, "OBJECT") || is_htmltag(info, "/OBJECT") ||
		is_htmltag(info, "HTML") || is_htmltag(info, "/HTML") ||
		is_htmltag(info, "HEAD") || is_htmltag(info, "/HEAD") ||
		is_htmltag(info, "BODY") || is_htmltag(info, "/BODY") ||
		is_htmltag(info, "LINK") || is_htmltag(info, "META"))
	{
		return;
	}

	if ( is_htmltag(info, "STYLE"))
	{
		++info->instyletag;
		return;
	}

	if ( is_htmltag(info, "/STYLE"))
	{
		if (info->instyletag)
			--info->instyletag;
		return;
	}

	if (skipping())	return;


	for (l=1; l<info->tagattrlen; l++)
	{
		if (info->tagattr[l].tagnamelen > 2 &&
			strncasecmp(info->tagattr[l].tagname, "ON", 2) == 0)
		{
			memset(info->tagattr[l].atagstart, ' ',
				info->tagattr[l].ataglen);
		}
	}

	if (is_htmltag(info, "IMG"))
	{
	size_t	nsrc=find_tag(info, "src");
	size_t	nalt=find_tag(info, "alt");
	int	ignore_img=0;

		/*
		** An IMG with a cid: URL is passed along, untouched, with
		** the URL being processed.  This is handled later.
		*/

		if (nsrc && info->htmlconvertcid_func)
		{
		const char *p=info->tagattr[nsrc].tagvalue;
		size_t s=info->tagattr[nsrc].tagvaluelen;

			while (s)
			{
				if ( !isspace((int)(unsigned char)*p)) break;

				++p;
				--s;
			}
			if (s >= 4 && strncasecmp(p, "cid:", 4) == 0)
			{
				nsrc=0;
				ignore_img=1;
				/* Handle tags below */
			}
		}

		if (nsrc)
		{
		char	*r;
		char	*q;
		char	*alt=0;

			r=malloc(info->tagattr[nsrc].tagvaluelen+1);
			if (!r)
				return;

			memcpy(r, info->tagattr[nsrc].tagvalue,
				info->tagattr[nsrc].tagvaluelen);
			r[info->tagattr[nsrc].tagvaluelen]=0;
			q=rfc2045_append_url(info->contentbase, r);
			free(r);

			for (r=q; *r; r++)
				if (*r == '"' ||
					*r == '<' || *r == '>')	*r=0;
					/* Someone's playing games with us */

			if (nalt)
			{
				alt=malloc(info->tagattr[nalt].tagvaluelen+1);
				if (!alt)
					return;

				memcpy(alt, info->tagattr[nalt].tagvalue,
					info->tagattr[nalt].tagvaluelen);
				alt[info->tagattr[nalt].tagvaluelen]=0;
			}

			(*info->htmlfiltered_func)("<a target=\"_blank\" href=\"", 25);

			if (info->washlink)	dowashlink(info, nsrc, q);
			else	replacelink(info, nsrc, q);

			(*info->htmlfiltered_func)(info->tagattr[nsrc].tagvalue,
					info->tagattr[nsrc].tagvaluelen);
			(*info->htmlfiltered_func)("\">", 2);
			if (alt)
				(*info->htmlfiltered_func)(alt, strlen(alt));
			else
			{
				(*info->htmlfiltered_func)("[", 1);
				(*info->htmlfiltered_func)(q, strlen(q));
				(*info->htmlfiltered_func)("]", 1);
			}
			(*info->htmlfiltered_func)("</a>", 4);
			free(q);
			if (alt)	free(alt);
		}

		if (!ignore_img)
			return;
	}

	/* Attempt to automatically plug in any holes */

	for (l=1; l<info->tagattrlen; l++)
	{
		if (is_attr(info->tagattr+l, "target") ||
			is_attr(info->tagattr+l, "code") ||
			is_attr(info->tagattr+l, "language") ||
			is_attr(info->tagattr+l, "action") ||
		    is_attr(info->tagattr+l, "background") ||
		    is_attr(info->tagattr+l, "style") ||
			(is_attr(info->tagattr+l, "type")
			 && !is_htmltag(info, "BLOCKQUOTE")) ||
			is_attr(info->tagattr+l, "codetype"))
			memset(info->tagattr[l].atagstart, ' ',
				info->tagattr[l].ataglen);

		if (is_attr(info->tagattr+l, "href")
			|| is_attr(info->tagattr+l, "src")
			|| is_attr(info->tagattr+l, "lowsrc"))
		{
		char	*p=malloc(info->tagattr[l].tagvaluelen+1), *q;
		size_t	n;
		int	goodhref=0;

			if (!p)
				return;

			memcpy(p, info->tagattr[l].tagvalue, info->tagattr[l].tagvaluelen);
			p[info->tagattr[l].tagvaluelen]=0;

			q=rfc2045_append_url(info->contentbase, p);
			free(p);

			for (p=q; *p; p++)
				if (*p == '"' ||
					*p == '<' || *p == '>')	*p=0;
					/* Someone's playing games with us */

			for (n=0; q[n]; n++)
				if (q[n] == ':')
				{
					if (is_valuestart(q, "cid:"))
					{
						goodhref=1;
						dowashcid(info, l, q);
					}
					else if ((is_valuestart(q, "http:") ||
						is_valuestart(q, "https:")) &&
						is_attr(info->tagattr+l, "href"))
/* block src/lowsrc tags in anything but IMG.
Ex: <input type="image" src="http://"> -- don't render as a redirect
URL
*/
					{
						goodhref=1;
						if (info->washlink)
							dowashlink(info, l, q);
						else
							replacelink(info, l, q);
						if (is_htmltag(info, "A"))
							open_window=1;
						break;
					}
					else if ( is_valuestart(q, "mailto:"))
					{
						goodhref=1;
						dowashlinkmailto(info,
								 l, strchr(q,
									   ':')
								 +1);
					}
					else if ( is_valuestart(q, "ftp:") ||
						is_valuestart(q, "gopher:") ||
						is_valuestart(q, "wais:") ||
						is_valuestart(q, "telnet:"))
					{
						goodhref=1;
						if (is_htmltag(info, "A"))
							open_window=1;
					}
					break;
				}
			if (!goodhref)
			{
				memset(info->tagattr[l].atagstart, ' ',
					info->tagattr[l].ataglen);
			}
			free(q);
		}
	}

	(*info->htmlfiltered_func)("<", 1);
	(*info->htmlfiltered_func)(info->tagbuf, strlen(info->tagbuf));
	if (open_window)
		(*info->htmlfiltered_func)(" target=\"_blank\"", 16);
	(*info->htmlfiltered_func)(">", 1);

}
