/*****************************************************************************/ /* HTML.c Processing associated with reading and composing HTML-format messages. COPYRIGHT --------- Copyright (C) 2005-2022 Mark G.Daniel This program, comes with ABSOLUTELY NO WARRANTY. This is free software, and you are welcome to redistribute it under the conditions of the GNU GENERAL PUBLIC LICENSE, version 3, or any later version. VERSION HISTORY --------------- 25-FEB-2010 MGD HtmlToPlain() supply charset for UTF-8 deentify HtmlCharacterDeEntify() allow for UTF-8 charset 04-JUL-2006 MGD bugfix; HtmlSkipTagTo() quotes outside of tags 20-JUN-2006 MGD HtmlInDisguise() is it really not plain but HTML text? 14-MAR-2006 MGD refine HtmlNumericDeEntify() and HtmlCharacterDeEntify() 13-MAR-2006 MGD HtmlSanitise() suppress "background-image" 01-FEB-2005 MGD initial */ /*****************************************************************************/ #ifdef SOYMAIL_VMS_V7 #undef _VMS_V6_SOURCE #define _VMS_V6_SOURCE #undef __VMS_VER #define __VMS_VER 70000000 #undef __CRTL_VER #define __CRTL_VER 70000000 #endif #pragma nomember_alignment /* standard C header files */ #include #include #include #include #include #include #include #include /* VMS related header files */ #include #include #include /* application header file */ #include "soymail.h" #include "cgilib.h" #include "config.h" #include "html.h" #include "message.h" #define FI_LI __FILE__, __LINE__ /******************/ /* global storage */ /******************/ /********************/ /* external storage */ /********************/ extern BOOL Debug, WatchEnabled; extern CONFIG_DATA SoyMailConfig; /*****************************************************************************/ /* Reduce HTML markup down to a semblance of plain-text. Relies on the underlying HTML to provide white-space, indenting, etc. Essentially just strips the markup out and adds carriage-control as necessary. Will format ordered and unordered lists. 
*/ int HtmlToPlain ( char *HtmlPtr, char *CharSetPtr ) { #define MAX_LIST_DEPTH 16 int ListIndex; int ListType [MAX_LIST_DEPTH+1]; char *cptr, *sptr, *tptr; /*********/ /* begin */ /*********/ if (WatchEnabled) WatchThis ("HtmlToPlain()"); /* ensure all potentially 'interesting' entities are revealed */ HtmlNumericDeEntify (HtmlPtr); ListIndex = 0; memset (&ListType, 0, sizeof(ListType)); cptr = sptr = HtmlPtr; while (*cptr) { while (*cptr && *cptr != '<') *sptr++ = *cptr++; if (!*cptr) break; if (!memcmp (cptr, ""); continue; } if (HtmlThisTag (cptr, "SCRIPT")) { cptr += HtmlSkipTagTo (cptr, ""); continue; } if (HtmlThisTag (cptr, "STYLE")) { cptr += HtmlSkipTagTo (cptr, ""); continue; } if (HtmlThisTag (cptr, "IMG")) { cptr += HtmlSkipTagTo (cptr, ">"); continue; } if (HtmlThisTag (cptr, "P")) *sptr++ = '\n'; else if (HtmlThisTag (cptr, "B")) *sptr++ = '*'; else if (HtmlThisTag (cptr, "U")) *sptr++ = '_'; else if (HtmlThisTag (cptr, "I")) *sptr++ = '/'; else if (HtmlThisEndTag (cptr, "LI")) *sptr = '\0'; else if (HtmlThisTag (cptr, "LI")) { if (ListType[ListIndex] < 0) { *sptr++ = ' '; *sptr++ = '*'; *sptr++ = ' '; *sptr++ = ' '; } else { *sptr++ = ' '; *sptr++ = ListType[ListIndex]; *sptr++ = '.'; *sptr++ = ' '; if (ListType[ListIndex] < 'z') ListType[ListIndex]++; if (ListType[ListIndex] == '9' + 1) ListType[ListIndex] = 'a'; } } else if (HtmlThisEndTag (cptr, "UL")) { if (ListIndex > 1) ListIndex--; } else if (HtmlThisTag (cptr, "UL")) { if (ListIndex < MAX_LIST_DEPTH) ListIndex++; ListType[ListIndex] = -1; } else if (HtmlThisEndTag (cptr, "OL")) { if (ListIndex > 1) ListIndex--; } else if (HtmlThisTag (cptr, "OL")) { if (ListIndex < MAX_LIST_DEPTH) ListIndex++; ListType[ListIndex] = '1'; } cptr += HtmlSkipTagTo (cptr, ">"); } *sptr = '\0'; HtmlCharacterDeEntify (HtmlPtr, CharSetPtr&&!strcmp(CharSetPtr,"UTF-8")); return (MessageCollapseLineBreaks (HtmlPtr, 2)); } /*****************************************************************************/ /* Remove all HTML 
tags except those (probably) harmless tags permitted in the tests. Ensure only known anchor HREFs schemes are permitted. In particular try to disable all scripting capabilities. Bit rude and crude and with lots of assumptions! Done in situ because resultant text will always be smaller or the same size. Return number of characters remaining. CAUTION!! Sanitising anything is like skating on thin ice! */ int HtmlSanitise (char *HtmlPtr) { char *cptr, *sptr, *tptr; /*********/ /* begin */ /*********/ if (WatchEnabled) WatchThis ("HtmlSanitise()"); /* ensure all potentially 'interesting' entities are revealed */ HtmlNumericDeEntify (HtmlPtr); /* sanitise */ cptr = sptr = HtmlPtr; while (*cptr) { while (*cptr && *cptr != '<') *sptr++ = *cptr++; if (!*cptr) break; if (!memcmp (cptr, ""); continue; } if (HtmlThisTag (cptr, "TITLE")) { /* absorb the title */ cptr += HtmlSkipTagTo (cptr, ""); continue; } if (HtmlThisTag (cptr, "SCRIPT")) { /**********/ /* script */ /**********/ /* absorb and add the following warning to the displayed page */ static char ScriptHere [] = "<SCRIPT>"; cptr += HtmlSkipTagTo (tptr = cptr, ""); if (cptr - tptr >= sizeof(ScriptHere)) for (tptr = ScriptHere; *tptr; *sptr++ = *tptr++); continue; } if (HtmlThisTag (cptr, "IMG")) { /*********/ /* image */ /*********/ /* absorb and add the following informational to the displayed page */ static char ImgHere [] = "<IMG>"; cptr += HtmlSkipTagTo (tptr = cptr, ">"); if (cptr - tptr >= sizeof(ImgHere)) for (tptr = ImgHere; *tptr; *sptr++ = *tptr++); continue; } if (HtmlThisTag (cptr, "STYLE")) { /*********/ /* style */ /*********/ /* lose all attributes other than what's specifically allowed */ while (*cptr && *cptr != '>' && !isspace(*cptr)) *sptr++ = *cptr++; while (*cptr) { while (isspace (*cptr)) cptr++; if (!*cptr || *cptr == '>') break; if (strsame (cptr, "type=", 5)) { *sptr++ = ' '; cptr += HtmlSpanAttrib (tptr = cptr); while (tptr < cptr) *sptr++ = *tptr++; continue; } /* absorb the attribute */ cptr 
+= HtmlSpanAttrib (cptr); } /* now, include any intervening embedded style */ cptr += HtmlSkipTagTo (tptr = cptr, ""); while (tptr < cptr) { /* do not allow any importing of style sheets! */ if (*tptr == '@' && strsame (tptr, "@import", 7)) { memcpy (sptr, "_imp0rt", 7); sptr += 7; tptr += 7; } else if (tolower(*tptr) == 'b' && strsame (tptr, "background-image", 16)) { memcpy (sptr, "backgr0und_image", 16); sptr += 16; tptr += 16; } else *sptr++ = *tptr++; } continue; } if (HtmlThisTag (cptr, "A")) { /**********/ /* anchor */ /**********/ /* lose all attributes other than what's specifically allowed */ while (*cptr && *cptr != '>' && !isspace(*cptr)) *sptr++ = *cptr++; while (*cptr) { while (isspace (*cptr)) cptr++; if (!*cptr || *cptr == '>') break; /* lose everything but a known HREF */ if (strsame (cptr, "href=", 5)) { /* only allow this subset of known schemes to be specified */ if (strsame (cptr+6, "http://", 7) || strsame (cptr+6, "https://", 8) || strsame (cptr+6, "ftp://", 6) || strsame (cptr+6, "mailto:", 7)) { *sptr++ = ' '; cptr += HtmlSpanAttrib (tptr = cptr); while (tptr < cptr) *sptr++ = *tptr++; } else cptr += HtmlSpanAttrib (cptr); continue; } /* absorb the attribute */ cptr += HtmlSpanAttrib (cptr); } continue; } if (HtmlThisTag (cptr, "B") || HtmlThisTag (cptr, "BLOCKQUOTE") || HtmlThisTag (cptr, "BODY") || HtmlThisTag (cptr, "BR") || HtmlThisTag (cptr, "EM") || HtmlThisTag (cptr, "HR") || HtmlThisTag (cptr, "I") || HtmlThisTag (cptr, "LI") || HtmlThisTag (cptr, "NOSCRIPT") || HtmlThisTag (cptr, "OL") || HtmlThisTag (cptr, "P") || HtmlThisTag (cptr, "PRE") || HtmlThisTag (cptr, "SPAN") || HtmlThisTag (cptr, "STRONG") || HtmlThisTag (cptr, "TABLE") || HtmlThisTag (cptr, "TD") || HtmlThisTag (cptr, "TH") || HtmlThisTag (cptr, "TR") || HtmlThisTag (cptr, "U") || HtmlThisTag (cptr, "UL")) { /***********/ /* allowed */ /***********/ /* lose all attributes other than what's specifically allowed */ while (*cptr && *cptr != '>' && !isspace(*cptr)) *sptr++ 
= *cptr++; while (*cptr) { while (isspace (*cptr)) cptr++; if (!*cptr || *cptr == '>') break; /* of course not all of these apply to every tag! */ if (strsame (cptr, "align=", 6) || strsame (cptr, "Alink=", 6) || strsame (cptr, "BGCOLOR=", 8) || strsame (cptr, "border=", 7) || strsame (cptr, "cellspacing=", 12) || strsame (cptr, "cellpadding=", 12) || strsame (cptr, "class=", 6) || strsame (cptr, "colspan=", 8) || strsame (cptr, "HEIGHT=", 7) || strsame (cptr, "id=", 3) || strsame (cptr, "link=", 5) || strsame (cptr, "name=", 5) || strsame (cptr, "nowrap", 6) || strsame (cptr, "RULES=", 6) || strsame (cptr, "rowspan=", 8) || strsame (cptr, "style=", 6) || strsame (cptr, "TEXT=", 5) || strsame (cptr, "valign=", 7) || strsame (cptr, "Vlink=", 6) || strsame (cptr, "width=", 6)) { *sptr++ = ' '; cptr += HtmlSpanAttrib (tptr = cptr); while (tptr < cptr) *sptr++ = *tptr++; continue; } /* absorb the attribute */ cptr += HtmlSpanAttrib (cptr); } continue; } /**************/ /* disallowed */ /**************/ cptr += HtmlSkipTagTo (cptr, ">"); } *sptr = '\0'; return (MessageCollapseLineBreaks (HtmlPtr, 1)); } /*****************************************************************************/ /* Return true if the next characters in the string represent the specified tag. */ int HtmlThisTag ( char *sptr, char *tagptr ) { /*********/ /* begin */ /*********/ /* if oops */ if (*sptr++ != '<') return (FALSE); /* if end of tag */ if (*sptr == '/') sptr++; while (*sptr && *tagptr) if (toupper (*sptr++) != toupper (*tagptr++)) return (FALSE); if (*tagptr || (*sptr != '>' && !isspace(*sptr))) return (FALSE); return (TRUE); } /*****************************************************************************/ /* Return true if the next characters in the string represent the specified end-of-tag. 
*/ int HtmlThisEndTag ( char *sptr, char *tagptr ) { /*********/ /* begin */ /*********/ /* if oops */ if (*sptr++ != '<') return (FALSE); /* if not end of tag */ if (*sptr != '/') return (FALSE); sptr++; while (*sptr && *tagptr) if (toupper (*sptr++) != toupper (*tagptr++)) return (FALSE); if (*tagptr || (*sptr != '>' && !isspace(*sptr))) return (FALSE); return (TRUE); } /*****************************************************************************/ /* Find the specified tag (to close a tag with intervening text) taking into account quoting. Return the number of characters consumed. */ int HtmlSkipTagTo ( char *HtmlPtr, char *CloseTag ) { BOOL InsideTag; int taglen; char *cptr; /*********/ /* begin */ /*********/ if (WatchEnabled) WatchThis ("HtmlSkipTagTo() !AZ", CloseTag); InsideTag = FALSE; cptr = HtmlPtr; taglen = strlen(CloseTag); while (*cptr) { while (*cptr && *cptr != *CloseTag && *cptr != '<' && *cptr != '>' && *cptr != '\"' && *cptr != '\'') cptr++; if (*cptr == '\"') { cptr++; if (!InsideTag) continue; while (*cptr && *cptr != '\"') cptr++; if (*cptr) cptr++; continue; } if (*cptr == '\'') { cptr++; if (!InsideTag) continue; while (*cptr && *cptr != '\'') cptr++; if (*cptr) cptr++; continue; } if (*cptr == '<') InsideTag = TRUE; else if (*cptr == '>') { if (taglen == 1) { cptr += taglen; break; } InsideTag = FALSE; } if (taglen > 1 && strsame (cptr, CloseTag, taglen)) { cptr += taglen; break; } cptr++; } return (cptr-HtmlPtr); } /*****************************************************************************/ /* */ int HtmlSpanAttrib (char *HtmlPtr) { char *cptr; /*********/ /* begin */ /*********/ if (WatchEnabled) WatchThis ("HtmlSpanAttrib()"); cptr = HtmlPtr; while (*cptr) { while (*cptr && !isspace(*cptr) && *cptr != '>' && *cptr != '\"' && *cptr != '\'') cptr++; if (*cptr == '\"') { cptr++; while (*cptr && *cptr != '\"') cptr++; if (*cptr) cptr++; continue; } if (*cptr == '\'') { cptr++; while (*cptr && *cptr != '\'') cptr++; if (*cptr) cptr++; 
continue; } break; } return (cptr-HtmlPtr); } /*****************************************************************************/ /* Ensure all potentially 'interesting' and numeric entities are revealed in order to prevent characters they might represent from being 'hidden' from HTML stripping. Done in situ because resultant text will always be smaller or the same size. Return number of characters remaining. */ int HtmlNumericDeEntify (char *HtmlPtr) { int enval; char *cptr, *sptr; /*********/ /* begin */ /*********/ if (WatchEnabled) WatchThis ("HtmlNumericDeEntify() !AZ", HtmlPtr); cptr = sptr = HtmlPtr; while (*cptr) { if (*(USHORTPTR)cptr != '&#') { *sptr++ = *cptr++; continue; } if (!memcmp (cptr, " ", 7) || !memcmp (cptr, " ", 7) || !memcmp (cptr, "‘", 7) || !memcmp (cptr, "’", 7) || !memcmp (cptr, "“", 7) || !memcmp (cptr, "”", 7) || !memcmp (cptr, "…", 7)) { /* a few, common UTF-8 exceptions */ *sptr++ = *cptr++; continue; } /* convert a numeric entity to it's character equivalent */ cptr += 2; if (*cptr == 'x') enval = strtol(cptr+1, NULL, 16); else enval = atol(cptr); while (*cptr && *cptr != ';') cptr++; /* if badly formed entity */ if (!*cptr) break; cptr++; /* only cater for 8 bits, untranslated character an ¿ */ if (enval <= 0 || enval >= 256) enval = 191; *sptr++ = enval; } *sptr = '\0'; return (sptr-HtmlPtr); } /*****************************************************************************/ /* Convert numeric HTML entities in a string into their character equivalents (e.g. "&" to '&', "� to 0x00, etc.) Also converts common alphabetic entities (e.g. "&", ", <", etc.) but not all (any that are not recognised are left untouched). Does not URL-decode! Resultant string is always the same size or smaller so it can be done in-situ! Returns the size of the resultant string. Copied from CgiLibHtmlDeEntify() and modified so that control characters (<=0x1f) are just absorbed. Optional, elementary UTF-8 encoding. 
*/

/*
'TextPtr' is the NUL-terminated text, converted in place.  With 'Utf8' false
each entity becomes a single 8 bit character and numeric entities above 255
are absorbed; with 'Utf8' true each entity becomes the UTF-8 encoding of its
code point (one to four bytes, never longer than the entity it replaces).

Numeric entities may be decimal ("&#233;") or hexadecimal ("&#xe9;") and the
trailing ';' is optional.  An '&' that does not begin a recognised entity,
including one at the very end of the text, is kept as an ordinary character.
Numeric values that are not valid code points (surrogates, or beyond
U+10FFFF) are absorbed.
*/

int HtmlCharacterDeEntify
(
char *TextPtr,
BOOL Utf8
)
{
   /* names exclude the leading '&'; 'len' includes the trailing ';' */
   static const struct HtmlEntityStruct
   {
      const char  *ent;
      int  len,
           val;
   }
   HtmlEntity [] =
   {
      { "amp;", 4, '&' },
      { "lt;", 3, '<' },
      { "gt;", 3, '>' },
      { "quot;", 5, '\"' },
      { "apos;", 5, '\'' },
      { "rsquo;", 6, '\'' },
      { "lsquo;", 6, '`' },
      { "nbsp;", 5, 160 },
      { "iexcl;", 6, 161 },
      { "cent;", 5, 162 },
      { "pound;", 6, 163 },
      { "curren;", 7, 164 },
      { "yen;", 4, 165 },
      { "brvbar;", 7, 166 },
      { "sect;", 5, 167 },
      { "uml;", 4, 168 },
      { "copy;", 5, 169 },
      { "ordf;", 5, 170 },
      { "laquo;", 6, 171 },
      { "not;", 4, 172 },
      { "shy;", 4, 173 },
      { "reg;", 4, 174 },
      { "macr;", 5, 175 },
      { "deg;", 4, 176 },
      { "plusmn;", 7, 177 },
      { "sup2;", 5, 178 },
      { "sup3;", 5, 179 },
      { "acute;", 6, 180 },
      { "micro;", 6, 181 },
      { "para;", 5, 182 },
      { "middot;", 7, 183 },
      { "cedil;", 6, 184 },
      { "sup1;", 5, 185 },
      { "ordm;", 5, 186 },
      { "raquo;", 6, 187 },
      { "frac14;", 7, 188 },
      { "frac12;", 7, 189 },
      { "frac34;", 7, 190 },
      { "iquest;", 7, 191 },
      { "Agrave;", 7, 192 },
      { "Aacute;", 7, 193 },
      { "Acirc;", 6, 194 },
      { "Atilde;", 7, 195 },
      { "Auml;", 5, 196 },
      { "Aring;", 6, 197 },
      { "AElig;", 6, 198 },
      { "Ccedil;", 7, 199 },
      { "Egrave;", 7, 200 },
      { "Eacute;", 7, 201 },
      { "Ecirc;", 6, 202 },
      { "Euml;", 5, 203 },
      { "Igrave;", 7, 204 },
      { "Iacute;", 7, 205 },
      { "Icirc;", 6, 206 },
      { "Iuml;", 5, 207 },
      { "ETH;", 4, 208 },
      { "Ntilde;", 7, 209 },
      { "Ograve;", 7, 210 },
      { "Oacute;", 7, 211 },
      { "Ocirc;", 6, 212 },
      { "Otilde;", 7, 213 },
      { "Ouml;", 5, 214 },
      { "times;", 6, 215 },
      { "Oslash;", 7, 216 },
      { "Ugrave;", 7, 217 },
      { "Uacute;", 7, 218 },
      { "Ucirc;", 6, 219 },
      { "Uuml;", 5, 220 },
      { "Yacute;", 7, 221 },
      { "THORN;", 6, 222 },
      { "szlig;", 6, 223 },
      { "agrave;", 7, 224 },
      { "aacute;", 7, 225 },
      { "acirc;", 6, 226 },
      { "atilde;", 7, 227 },
      { "auml;", 5, 228 },
      { "aring;", 6, 229 },
      { "aelig;", 6, 230 },
      { "ccedil;", 7, 231 },
      { "egrave;", 7, 232 },
      { "eacute;", 7, 233 },
      { "ecirc;", 6, 234 },
      { "euml;", 5, 235 },
      { "igrave;", 7, 236 },
      { "iacute;", 7, 237 },
      { "icirc;", 6, 238 },
      { "iuml;", 5, 239 },
      { "eth;", 4, 240 },
      { "ntilde;", 7, 241 },
      { "ograve;", 7, 242 },
      { "oacute;", 7, 243 },
      { "ocirc;", 6, 244 },
      { "otilde;", 7, 245 },
      { "ouml;", 5, 246 },
      { "divide;", 7, 247 },
      { "oslash;", 7, 248 },
      { "ugrave;", 7, 249 },
      { "uacute;", 7, 250 },
      { "ucirc;", 6, 251 },
      { "uuml;", 5, 252 },
      { "yacute;", 7, 253 },
      { "thorn;", 6, 254 },
      { "yuml;", 5, 255 },
      { NULL, 0, 0 }
   };

   int  idx;
   long  ch;
   char  *cptr, *sptr, *zptr;

   /*********/
   /* begin */
   /*********/

   if (Debug) fprintf (stdout, "HtmlCharacterDeEntify() |%s|\n", TextPtr);

   cptr = sptr = TextPtr;
   while (*cptr)
   {
      if (*cptr != '&')
      {
         *sptr++ = *cptr++;
         continue;
      }

      if (cptr[1] == '#')
      {
         /******************/
         /* numeric entity */
         /******************/

         zptr = NULL;
         ch = 0;
         if (cptr[2] == 'x' || cptr[2] == 'X')
         {
            if (isxdigit ((unsigned char)cptr[3]))
               ch = strtol (cptr+3, &zptr, 16);
         }
         else
         if (isdigit ((unsigned char)cptr[2]))
            ch = strtol (cptr+2, &zptr, 10);

         if (!zptr)
         {
            /* no digits, not an entity, the '&' is just a character */
            *sptr++ = *cptr++;
            continue;
         }

         /* step over the digits and the (optional) terminating semicolon */
         cptr = zptr;
         if (*cptr == ';') cptr++;

         /* control characters are just absorbed */
         if (ch < 32) continue;

         if (!Utf8)
         {
            /* as is anything that will not fit into 8 bits */
            if (ch > 255) continue;
            *sptr++ = (char)(ch & 0xff);
            continue;
         }
         /* drop thru to UTF-8 encode */
      }
      else
      {
         /****************/
         /* named entity */
         /****************/

         /* strncmp() stops at the terminator, nothing beyond it is read */
         ch = 0;
         for (idx = 0; HtmlEntity[idx].ent; idx++)
         {
            if (strncmp (cptr+1, HtmlEntity[idx].ent, HtmlEntity[idx].len))
               continue;
            ch = HtmlEntity[idx].val;
            cptr += 1 + HtmlEntity[idx].len;
            break;
         }

         if (!ch)
         {
            /* unknown entity; keep the '&', the rest follows as text */
            *sptr++ = *cptr++;
            continue;
         }

         if (!Utf8)
         {
            *sptr++ = (char)(ch & 0xff);
            continue;
         }
         /* drop thru to UTF-8 encode */
      }

      /*********/
      /* UTF-8 */
      /*********/

      if (ch < 0x80)
         *sptr++ = (char)ch;
      else
      if (ch < 0x800)
      {
         *sptr++ = (char)(0xc0 | (ch >> 6));
         *sptr++ = (char)(0x80 | (ch & 0x3f));
      }
      else
      if (ch >= 0xd800 && ch <= 0xdfff)
      {
         /* surrogates are not characters, absorb */
      }
      else
      if (ch < 0x10000)
      {
         *sptr++ = (char)(0xe0 | (ch >> 12));
         *sptr++ = (char)(0x80 | ((ch >> 6) & 0x3f));
         *sptr++ = (char)(0x80 | (ch & 0x3f));
      }
      else
      if (ch <= 0x10ffff)
      {
         *sptr++ = (char)(0xf0 | (ch >> 18));
         *sptr++ = (char)(0x80 | ((ch >> 12) & 0x3f));
         *sptr++ = (char)(0x80 | ((ch >> 6) & 0x3f));
         *sptr++ = (char)(0x80 | (ch & 0x3f));
      }
      /* else beyond the Unicode range, absorb */
   }
   *sptr = '\0';

   if (Debug) fprintf (stdout, "|%s|\n", TextPtr);

   return (sptr-TextPtr);
}

/*****************************************************************************/
/*
Elementary UTF-8 encoding for characters > 127.
*/ char* HtmlUtf8Encode (char *TextPtr) { int ch, cnt; unsigned char *cptr, *sptr, *tptr; /*********/ /* begin */ /*********/ if (Debug) fprintf (stdout, "HtmlUtf8Encode() |%s|\n", TextPtr); for (cptr = (unsigned char*)TextPtr; *cptr && *cptr <= 127; cptr++); if (!*cptr) return (TextPtr); cnt = 0; while (*cptr) if (*cptr++ > 127) cnt++; tptr = CgiLibVeeMemCalloc ((cptr-(unsigned char*)TextPtr)+cnt+1); if (!tptr) ErrorExit (vaxc$errno, FI_LI); sptr = tptr; cptr = (unsigned char*)TextPtr; while (*cptr) { if (*cptr <= 127) { *sptr++ = *cptr++; continue; } /* UTF-8 */ ch = *cptr++; if (ch > 255) { *sptr++ = 0xe0 | ((ch & 0xf000) >> 12); *sptr++ = 0x80 | ((ch & 0x0f00) >> 6) | ((ch & 0xc0) >> 6); *sptr++ = 0x80 | (ch & 0x3f); } else if (ch > 127) { *sptr++ = 0xc0 | ((ch & 0xc0) >> 6); *sptr++ = 0x80 | (ch & 0x3f); } else *sptr++ = ch; } *sptr = '\0'; if (Debug) fprintf (stdout, "|%s|\n", tptr); return ((char*)tptr); } /*****************************************************************************/ /* Check the leading text for what looks like a common HTML tag. */ BOOL HtmlInDisguise (char *TextPtr) { char *cptr; /*********/ /* begin */ /*********/ if (Debug) fprintf (stdout, "HtmlInDisguise()\n"); if (!TextPtr) return (FALSE); for (cptr = TextPtr; *cptr && isspace(*cptr); cptr++); if (WatchEnabled) WatchThis ("!8AZ", cptr); if (*cptr != '<') return (FALSE); if (!memcmp (cptr, "