()\\[\\]\\\\.,;:\\s@"]+(?:\\.[^<>()\\[\\]\\\\.,;:\\s@"]+)*)|(?:".+"))@(?:(?:\\[[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}])|(?:(?:[a-z\\-0-9]+\\.)+[a-z]{2,}))'; /** * Encodes characters as HTML numeric entities to make it marginally more * difficult for web scrapers to grab sensitive info. If `text` starts with * `mailto:` only the email address following it will be obfuscated. */ public static function escapeObfuscated(string $text): string { if (str_starts_with($text, 'mailto:')) { return 'mailto:' . self::escapeObfuscated(mb_substr($text, 7)); } $html = ''; $l = mb_strlen($text); for ($p = 0; $p < $l; $p++) { $cp = mb_ord(mb_substr($text, $p, 1)); $html .= "&#{{$cp}}"; } return $html; } /** * Removes illegal characters from an HTML attribute name. */ public static function scrubAttributeName(string $name): string { return mb_ereg_replace('[\\t\\n\\f \\/>"\'=]+', '', $name); } /** * Strips one or more leading indents from a line or lines of markdown. An * indent is defined as 4 spaces or one tab. Incomplete indents (i.e. 1-3 * spaces) are treated like one indent level. * * @param string|string[] $line * @param int $levels * @return string|string[] */ public static function stripIndent(string|array &$line, int $levels=1): string|array { $regex = "^(?: {1,4}|\\t){{$levels}}"; return is_array($line) ? array_map(fn(string $l): string => mb_ereg_replace($regex, '', $l), $line) : mb_ereg_replace($regex, '', $line); } /** * Counts the number of indent levels in a line of text. Partial indents * (1 to 3 spaces) are counted as one indent level unless `fullIndentsOnly` * is `true`. */ public static function countIndents(string &$line, bool $fullIndentsOnly=false): int { // normalize indents to tabs $t = mb_ereg_replace($fullIndentsOnly ? 
/**
 * Token type enum for `MDToken`.
 */
enum MDTokenType {
    case Text;
    /**
     * Only used for the leading and trailing whitespace around a run of text,
     * not every single whitespace character.
     */
    case Whitespace;

    case Underscore;
    case Asterisk;
    case Slash;
    case Tilde;
    case Bang;
    case Backtick;
    case Equal;
    case Caret;

    case Label; // content=label
    case URL; // content=URL, extra=title
    case Email; // content=email address, extra=title
    case SimpleLink; // content=URL
    case SimpleEmail; // content=email address
    case Footnote; // content=symbol
    case Modifier; // modifier=MDTagModifier

    case HTMLTag; // tag=MDHTMLTag

    /** Wildcard for `MDToken::findFirstTokens` */
    case META_AnyNonWhitespace;
    /** Wildcard for `MDToken::findFirstTokens` */
    case META_OptionalWhitespace;
}

/**
 * Search results from `MDToken::findFirstTokens`.
 */
class MDTokenMatch {
    /** @var MDToken[] tokens that matched the pattern, in order */
    public array $tokens;
    /** Index into the searched array where the match begins. */
    public int $index;

    /**
     * @param MDToken[] $tokens matched tokens
     * @param int $index index of the first matched token
     */
    public function __construct(array $tokens, int $index) {
        $this->tokens = $tokens;
        $this->index = $index;
    }
}

/**
 * Search results from `MDToken::findPairedTokens`.
 */
class MDPairedTokenMatch {
    /** @var MDToken[] tokens matching the start pattern */
    public array $startTokens;
    /** @var MDToken[] tokens between the start and end matches */
    public array $contentTokens;
    /** @var MDToken[] tokens matching the end pattern */
    public array $endTokens;
    /** Index of the first start token. */
    public int $startIndex;
    /** Index of the first content token. */
    public int $contentIndex;
    /** Index of the first end token. */
    public int $endIndex;
    /** Token count from the first start token through the last end token. */
    public int $totalLength;

    /**
     * @param MDToken[] $startTokens
     * @param MDToken[] $contentTokens
     * @param MDToken[] $endTokens
     * @param int $startIndex
     * @param int $contentIndex
     * @param int $endIndex
     * @param int $totalLength
     */
    public function __construct(array $startTokens, array $contentTokens,
        array $endTokens, int $startIndex, int $contentIndex, int $endIndex,
        int $totalLength) {
        $this->startTokens = $startTokens;
        $this->contentTokens = $contentTokens;
        $this->endTokens = $endTokens;
        $this->startIndex = $startIndex;
        $this->contentIndex = $contentIndex;
        $this->endIndex = $endIndex;
        $this->totalLength = $totalLength;
    }
}

/**
 * One lexical unit in inline markdown syntax parsing.
 */
class MDToken {
    /**
     * The original verbatim token string. Required as a plaintext fallback if
     * the token remains unresolved.
     */
    public string $original;
    public MDTokenType $type;
    public ?string $content = null;
    public ?string $extra = null;
    public ?MDHTMLTag $tag = null;
    public ?MDTagModifier $modifier = null;

    /**
     * Creates a token. The `content` argument is stored in `modifier`, `tag`,
     * or `content` depending on its runtime type.
     *
     * @param string $original verbatim token string
     * @param MDTokenType $type token type
     * @param string|MDTagModifier|MDHTMLTag|null $content primary content of
     *   the token
     * @param string|null $extra additional content
     */
    public function __construct(string $original, MDTokenType $type,
        string|MDTagModifier|MDHTMLTag|null $content=null,
        ?string $extra=null) {
        $this->original = $original;
        $this->type = $type;
        if ($content instanceof MDTagModifier) {
            $this->modifier = $content;
        } elseif ($content instanceof MDHTMLTag) {
            $this->tag = $content;
        } else {
            $this->content = $content;
        }
        $this->extra = $extra;
    }

    /**
     * Debug representation, e.g. `(MDToken type=Text content=hello)`.
     */
    public function __toString(): string {
        $classname = get_class($this);
        // A pure enum cannot be interpolated directly; use its case name.
        return "({$classname} type={$this->type->name} content={$this->content})";
    }

    /**
     * Attempts to parse a label token from the beginning of `line`. A label is
     * of the form `[content]`. Nested bracket pairs are allowed inside the
     * content, a backslash skips the following character, and an unbalanced
     * `)` aborts the match. If found, returns an array:
     * - `0`: the entire label including brackets
     * - `1`: the content of the label
     *
     * @param string $line
     * @return ?string[] match groups or null if not found
     */
    public static function tokenizeLabel(string $line): ?array {
        if (!str_starts_with($line, '[')) return null;
        $parenCount = 0;
        $bracketCount = 0;
        $l = mb_strlen($line);
        for ($p = 1; $p < $l; $p++) {
            $ch = mb_substr($line, $p, 1);
            if ($ch == '\\') {
                $p++; // skip the escaped character
            } elseif ($ch == '(') {
                $parenCount++;
            } elseif ($ch == ')') {
                $parenCount--;
                if ($parenCount < 0) return null;
            } elseif ($ch == '[') {
                $bracketCount++;
            } elseif ($ch == ']') {
                if ($bracketCount > 0) {
                    $bracketCount--;
                } else {
                    return [ mb_substr($line, 0, $p + 1), mb_substr($line, 1, $p - 1) ];
                }
            }
        }
        return null;
    }

    private static string $urlWithTitleRegex = '^\\((\\S+?)\\s+"(.*?)"\\)'; // 1=URL, 2=title
    private static string $urlRegex = '^\\((\\S+?)\\)'; // 1=URL

    /**
     * Attempts to parse a URL token from the beginning of `line`. A URL token
     * is of the form `(url)` or `(url "title")`. If found, returns an array:
     * - `0`: the entire URL token including parentheses
     * - `1`: the URL
     * - `2`: the optional title, or `null`
     *
     * Returns `null` when the text is better described as an email token
     * (see `tokenizeEmail`).
     *
     * @param string $line
     * @return ?array token tuple
     */
    public static function tokenizeURL(string $line): ?array {
        $groups = [];
        if (mb_eregi(self::$urlWithTitleRegex, $line, $groups)) {
            if (self::tokenizeEmail($line)) return null; // make sure it's not better described as an email address
            return $groups;
        }
        if (mb_eregi(self::$urlRegex, $line, $groups)) {
            if (self::tokenizeEmail($line)) return null;
            return [ $groups[0], $groups[1], null ];
        }
        return null;
    }

    /**
     * Attempts to parse an email address from the beginning of `line`. An
     * email address is of the form `(user@example.com)` or
     * `(user@example.com "link title")`. If found, returns an array:
     * - `0`: the entire token including parentheses
     * - `1`: the email address
     * - `2`: the optional link title, or `null`
     *
     * @param string $line
     * @return ?string[] token tuple
     */
    public static function tokenizeEmail(string $line): ?array {
        $groups = [];
        if (mb_eregi("^\\(\\s*(" . MDUtils::$baseEmailRegex . ")\\s+\"(.*?)\"\\s*\\)",
            $line, $groups)) {
            return $groups;
        }
        if (mb_eregi("^\\(\\s*(" . MDUtils::$baseEmailRegex . ")\\s*\\)", $line, $groups)) {
            return [ $groups[0], $groups[1], null ];
        }
        return null;
    }

    /**
     * Searches an array of `MDToken` for the given pattern of `MDTokenType`s.
     * If found, returns a `MDTokenMatch`, otherwise `null`.
     *
     * `META_AnyNonWhitespace` matches any element that is not a whitespace
     * token. `META_OptionalWhitespace` matches zero or one whitespace token,
     * so a result may contain fewer tokens than the pattern has elements.
     *
     * @param (MDToken|MDNode)[] $tokensToSearch - mixed array of `MDToken` and
     *   `MDNode` elements
     * @param MDTokenType[] $pattern - contiguous run of token types to find
     * @param int $startIndex - token index to begin searching (defaults to 0)
     * @return ?MDTokenMatch match object, or `null` if not found
     */
    public static function findFirstTokens(array $tokensToSearch, array $pattern, int $startIndex=0): ?MDTokenMatch {
        $tokenCount = sizeof($tokensToSearch);
        $patternLength = sizeof($pattern);
        for ($t = $startIndex; $t < $tokenCount; $t++) {
            $matchedAll = true;
            $matched = [];
            // Goes negative each time an optional pattern element consumes
            // no token, keeping pattern and token positions aligned.
            $patternOffset = 0;
            for ($p = 0; $p < $patternLength; $p++) {
                $t0 = $t + $p + $patternOffset;
                if ($t0 >= $tokenCount) return null;
                $token = $tokensToSearch[$t0];
                $elem = $pattern[$p];
                $isWhitespace = $token instanceof MDToken && $token->type == MDTokenType::Whitespace;
                if ($elem == MDTokenType::META_OptionalWhitespace) {
                    if ($isWhitespace) {
                        array_push($matched, $token);
                    } else {
                        $patternOffset--;
                    }
                } elseif ($elem == MDTokenType::META_AnyNonWhitespace) {
                    if ($isWhitespace) {
                        $matchedAll = false;
                        break;
                    }
                    array_push($matched, $token);
                } else {
                    if (!($token instanceof MDToken) || $token->type != $elem) {
                        $matchedAll = false;
                        break;
                    }
                    array_push($matched, $token);
                }
            }
            if ($matchedAll) {
                return new MDTokenMatch($matched, $t);
            }
        }
        return null;
    }

    /**
     * Searches an array of MDToken for a given starting pattern and ending
     * pattern and returns match info about both and the tokens in between.
     *
     * If `contentValidator` is specified, it will be called with the content
     * tokens of a potential match. If the validator returns `true`, the result
     * will be accepted and returned by this method. If the validator returns
     * `false`, this method will keep looking for another matching pair. If no
     * validator is given the first match will be returned regardless of content.
     * Pairs with no content tokens between them are never accepted.
     *
     * If a match is found, a `MDPairedTokenMatch` is returned with details
     * of the opening tokens, closing tokens, and content tokens between. Otherwise
     * `null` is returned.
     *
     * @param MDToken[] $tokensToSearch - array of `MDToken` to search in
     * @param MDTokenType[] $startPattern - array of `MDTokenType` to find first
     * @param MDTokenType[] $endPattern - array of `MDTokenType` to find
     *   positioned after `startPattern`
     * @param ?callable $contentValidator - optional validator function. If
     *   provided, will be passed an array of inner `MDToken`, and the function
     *   can return `true` to accept the contents or `false` to keep searching
     * @param int $startIndex - token index where searching should begin
     * @return ?MDPairedTokenMatch match, or `null`
     */
    public static function findPairedTokens(array $tokensToSearch,
        array $startPattern, array $endPattern,
        ?callable $contentValidator=null, int $startIndex=0): ?MDPairedTokenMatch {
        $tokenCount = sizeof($tokensToSearch);
        for ($s = $startIndex; $s < $tokenCount; $s++) {
            $startMatch = self::findFirstTokens($tokensToSearch, $startPattern, $s);
            if ($startMatch === null) return null;
            $contentStart = $startMatch->index + sizeof($startMatch->tokens);
            $endStart = $contentStart;
            while ($endStart < $tokenCount) {
                $endMatch = self::findFirstTokens($tokensToSearch, $endPattern, $endStart);
                if ($endMatch === null) break;
                $contentLength = $endMatch->index - $contentStart;
                $contents = array_slice($tokensToSearch, $contentStart, $contentLength);
                if (sizeof($contents) > 0 && ($contentValidator === null || $contentValidator($contents))) {
                    return new MDPairedTokenMatch($startMatch->tokens,
                        $contents,
                        $endMatch->tokens,
                        $startMatch->index,
                        $contentStart,
                        $endMatch->index,
                        $endMatch->index + sizeof($endMatch->tokens) - $startMatch->index);
                } else {
                    // Contents rejected. Try next end match.
                    $endStart = $endMatch->index + 1;
                }
            }
            // No end matches. Increment start match.
            $s = $startMatch->index;
        }
        return null;
    }

    /**
     * Tests whether `other` is an `MDToken` with equal field values. The
     * `tag` and `modifier` objects are compared by property values (`!=`),
     * not by identity, so separately constructed but equivalent objects
     * compare equal.
     *
     * @param mixed $other
     */
    public function equals($other): bool {
        if (!($other instanceof MDToken)) return false;
        if ($other->original !== $this->original) return false;
        if ($other->type != $this->type) return false;
        if ($other->content !== $this->content) return false;
        if ($other->extra !== $this->extra) return false;
        if ($other->tag != $this->tag) return false;
        if ($other->modifier != $this->modifier) return false;
        return true;
    }
}
/**
 * Parsing and rendering state. Passed around throughout the parsing process.
 *
 * States are hierarchical. A sub-state can be created by calling `copy()` with
 * a new array of lines. The sub-state points back to its parent state. This
 * is done to parse inner content of a syntax as its own standalone document.
 *
 * If a custom `MDReader` implementation wants to store data in this object,
 * always do so on `$state->root()` to ensure it's stored on the original state,
 * not a child state. Otherwise data may be lost when the sub-state is discarded.
 */
class MDState {
    /**
     * Ascends the parent chain to the root `MDState` instance. This should be
     * used when referencing most stored fields except `lines` and `p`.
     */
    public function root(): MDState {
        return $this->parent ? $this->parent->root() : $this;
    }

    /**
     * Lines of the markdown document. The current line index is pointed to by `p`.
     *
     * @var string[]
     */
    public array $lines;

    /**
     * The current line in `lines`, or `null` when `p` is past the last line.
     */
    public function currentLine(): ?string {
        return ($this->p < sizeof($this->lines)) ? $this->lines[$this->p] : null;
    }

    /**
     * Current line pointer into array `lines`.
     */
    public int $p = 0;

    private ?MDState $parent = null;

    /**
     * Array of `MDReader`s sorted by block reading priority.
     * @var MDReader[]
     */
    public array $readersByBlockPriority = [];

    /**
     * Array of `MDReader`s sorted by tokenization priority.
     * @var MDReader[]
     */
    public array $readersByTokenPriority = [];

    /**
     * Array of `[int $pass, MDReader $reader]` tuples sorted by substitution
     * priority.
     * @var array[]
     */
    public array $readersBySubstitutePriority = [];

    /**
     * Prefix to include in any generated `id` attributes on HTML elements.
     * Useful for keeping elements unique in multiple parsed documents in the
     * same HTML page.
     */
    public string $elementIdPrefix = '';

    /**
     * Filter for removing unapproved HTML tags, attributes, and values.
     */
    public MDHTMLFilter $tagFilter;

    private static string $textWhitespaceRegex = '^(\\s*)(?:(\\S|\\S.*\\S)(\\s*?))?$'; // 1=leading WS, 2=text, 3=trailing WS

    /**
     * @param string[] $lines - lines of markdown text
     */
    public function __construct(array $lines) {
        $this->lines = $lines;
    }

    /**
     * Creates a sub-state of this state with new lines. Useful for parsing
     * nested content. The new state's line pointer starts at 0 and its
     * parent is this state.
     *
     * @param string[] $lines
     * @return MDState copied sub-state
     */
    public function copy(array $lines): MDState {
        $cp = new MDState($lines);
        $cp->parent = $this;
        return $cp;
    }

    /**
     * Tests if there are at least `minCount` lines available to read. If `p`
     * is not provided it will be relative to `$this->p`.
     */
    public function hasLines(int $minCount, ?int $p=null): bool {
        $relativeTo = ($p === null) ? $this->p : $p;
        return $relativeTo + $minCount <= sizeof($this->lines);
    }

    /**
     * Reads and returns an array of blocks from the current line pointer.
     * Stops at the end of the lines or as soon as no block can be read.
     *
     * @return MDBlockNode[] parsed blocks
     */
    public function readBlocks(): array {
        $blocks = [];
        while ($this->hasLines(1)) {
            $block = $this->readNextBlock();
            if (!$block) break;
            array_push($blocks, $block);
        }
        return $blocks;
    }

    /**
     * Creates a simple `MDBlockNode` if no other registered blocks match.
     * Consumes all remaining lines.
     *
     * NOTE(review): the value returned comes from `inlineMarkdownToNode()`,
     * which is declared to return `MDInlineNode`. Confirm that type
     * satisfies the `?MDBlockNode` return type declared here.
     */
    private function readFallbackBlock(): ?MDBlockNode {
        if ($this->p >= sizeof($this->lines)) return null;
        // Held in a variable first because the helper may take its argument
        // by reference, which cannot be a function-call result.
        $remaining = array_slice($this->lines, $this->p);
        $lines = MDUtils::withoutTrailingBlankLines($remaining);
        if (sizeof($lines) == 0) return null;
        $this->p = sizeof($this->lines);
        return $this->inlineMarkdownToNode(implode("\n", $lines));
    }

    /**
     * Attempts to read one block from the current line pointer. The pointer
     * will be positioned just after the end of the block. Leading blank
     * lines are skipped first.
     *
     * @throws Error if a reader returns a block without advancing `p`
     */
    private function readNextBlock(): ?MDBlockNode {
        while ($this->hasLines(1) && mb_strlen(trim($this->lines[$this->p])) == 0) {
            $this->p++;
        }
        if (!$this->hasLines(1)) return null;
        foreach ($this->root()->readersByBlockPriority as $reader) {
            $startP = $this->p;
            $block = $reader->readBlock($this);
            if ($block) {
                if ($this->p == $startP) {
                    $readerClassName = get_class($reader);
                    $blockClassName = get_class($block);
                    throw new Error("{$readerClassName} returned an " .
                        "{$blockClassName} without incrementing MDState.p. " .
                        "This could lead to an infinite loop.");
                }
                return $block;
            }
        }
        return $this->readFallbackBlock();
    }

    /**
     * Splits a line of inline markdown into tokens. Always runs on the root
     * state. A backslash makes the next character literal text. At every
     * other position each reader in `readersByTokenPriority` is offered the
     * remainder of the line; the first non-null token wins. Characters no
     * reader claims accumulate into text/whitespace tokens.
     *
     * @param string $line
     * @return MDToken[]
     * @throws Error if a reader returns a token with an empty `original`
     */
    private function inlineMarkdownToTokens(string $line): array {
        if ($this->parent) return $this->parent->inlineMarkdownToTokens($line);
        $tokens = [];
        $text = '';
        $expectLiteral = false;

        // Flushes accumulated content in `text` to `tokens`, split into
        // leading whitespace, text, and trailing whitespace tokens.
        $endText = function() use (&$tokens, &$text): void {
            if (mb_strlen($text) == 0) return;
            $textGroups = [];
            if (mb_eregi(self::$textWhitespaceRegex, $text, $textGroups)) {
                $parts = [
                    [ $textGroups[1] ?? false, MDTokenType::Whitespace ],
                    [ $textGroups[2] ?? false, MDTokenType::Text ],
                    [ $textGroups[3] ?? false, MDTokenType::Whitespace ],
                ];
                foreach ($parts as [$part, $type]) {
                    // Unmatched groups are not strings. Test the length rather
                    // than truthiness so a text run of "0" is kept.
                    if (is_string($part) && mb_strlen($part) > 0) {
                        array_push($tokens, new MDToken($part, $type, $part));
                    }
                }
            } else {
                array_push($tokens, new MDToken($text, MDTokenType::Text, $text));
            }
            $text = '';
        };

        $length = mb_strlen($line);
        for ($p = 0; $p < $length; $p++) {
            $ch = mb_substr($line, $p, 1);
            $remainder = mb_substr($line, $p);
            if ($expectLiteral) {
                $text .= $ch;
                $expectLiteral = false;
                continue;
            }
            if ($ch == '\\') {
                $expectLiteral = true;
                continue;
            }
            $found = false;
            foreach ($this->root()->readersByTokenPriority as $reader) {
                $token = $reader->readToken($this, $remainder);
                if ($token === null) continue;
                $endText();
                array_push($tokens, $token);
                if ($token->original == null || mb_strlen($token->original) == 0) {
                    $readerClassName = get_class($reader);
                    throw new Error("{$readerClassName} returned a token with an empty .original. This would cause an infinite loop.");
                }
                // Skip past the consumed text (the loop adds the final +1).
                $p += mb_strlen($token->original) - 1;
                $found = true;
                break;
            }
            if (!$found) {
                $text .= $ch;
            }
        }
        $endText();
        return $tokens;
    }

    /**
     * Converts a line of markdown to an `MDInlineNode`. If parsing yields
     * exactly one node it is returned as-is; otherwise the nodes are wrapped
     * in a new `MDInlineNode`.
     *
     * @param string|string[] $line
     * @return MDInlineNode
     */
    public function inlineMarkdownToNode(string|array $line): MDInlineNode {
        $nodes = $this->inlineMarkdownToNodes($line);
        return (sizeof($nodes) == 1) ? $nodes[0] : new MDInlineNode($nodes);
    }

    /**
     * Converts a line of markdown to an array of `MDInlineNode`s. An array of
     * lines is joined with newlines first.
     *
     * @param string|string[] $line
     * @return MDInlineNode[]
     */
    public function inlineMarkdownToNodes(string|array $line): array {
        $tokens = $this->inlineMarkdownToTokens(is_array($line) ? implode("\n", $line) : $line);
        return $this->tokensToNodes($tokens);
    }

    /**
     * Converts a mixed array of `MDToken` and `MDInlineNode` elements into an array
     * of only `MDInlineNode` via repeated `MDReader` substitution.
     *
     * @param (MDToken|MDInlineNode)[] $tokens
     * @return MDInlineNode[]
     * @throws Error if an element is neither an `MDToken` nor an `MDNode`
     */
    public function tokensToNodes(array $tokens): array {
        $nodes = $tokens;
        $root = $this->root();
        // Perform repeated substitutions, converting sequences of tokens into
        // nodes, until no more substitutions can be made. After each
        // successful substitution the scan restarts from the highest
        // priority reader.
        do {
            $anyChanges = false;
            foreach ($root->readersBySubstitutePriority as $readerTuple) {
                /** @var int */
                $pass = $readerTuple[0];
                /** @var MDReader */
                $reader = $readerTuple[1];
                // NOTE(review): presumably substituteTokens takes $nodes by
                // reference and edits it in place - confirm against MDReader.
                $changed = $reader->substituteTokens($this, $pass, $nodes);
                if (!$changed) continue;
                $anyChanges = true;
                break;
            }
        } while ($anyChanges);

        // Convert any remaining tokens to text nodes. Also apply any inline
        // CSS modifiers to the non-text node immediately before them.
        $result = [];
        $lastNode = null;
        foreach ($nodes as $key => $node) {
            if ($node instanceof MDToken) {
                if ($node->type == MDTokenType::Modifier && $lastNode) {
                    // Tag name of the target is unknown here, so pass null.
                    $root->tagFilter->scrubModifier($node->modifier, null);
                    $node->modifier->applyTo($lastNode);
                    $result[$key] = new MDTextNode('');
                } else {
                    $result[$key] = new MDTextNode($node->original);
                }
                $lastNode = null;
            } elseif ($node instanceof MDNode) {
                $lastNode = ($node instanceof MDTextNode) ? null : $node;
                $result[$key] = $node;
            } else {
                $nodeClassName = get_debug_type($node);
                throw new Error("Unexpected node type {$nodeClassName}");
            }
        }
        return $result;
    }

    /**
     * Mapping of reference symbols to URLs. Used by `MDReferencedLinkReader`
     * and `MDReferencedImageReader`. Only the root state's copy is used.
     * @var array symbol -> URL
     */
    private array $referenceToURL = [];

    /**
     * Mapping of reference symbols to titles. Used by `MDReferencedLinkReader`
     * and `MDReferencedImageReader`. Only the root state's copy is used.
     * @var array symbol -> title string
     */
    private array $referenceToTitle = [];

    /**
     * Defines a URL by reference symbol. Symbols are case-insensitive. The
     * definition is stored on the root state.
     */
    public function defineURL(string $reference, string $url, ?string $title=null) {
        $root = $this->root();
        $key = mb_strtolower($reference);
        $root->referenceToURL[$key] = $url;
        if ($title !== null) $root->referenceToTitle[$key] = $title;
    }

    /**
     * Returns the URL associated with a reference symbol, or `null` if the
     * symbol is undefined.
     */
    public function urlForReference(string $reference): ?string {
        return $this->root()->referenceToURL[mb_strtolower($reference)] ?? null;
    }

    /**
     * Returns the link title associated with a reference symbol, or `null`
     * if none was defined.
     */
    public function urlTitleForReference(string $reference): ?string {
        return $this->root()->referenceToTitle[mb_strtolower($reference)] ?? null;
    }
}
/**
 * Defines a set of allowable HTML tags, attributes, and CSS.
 */
class MDHTMLFilter {
    /**
     * Mapping of permitted lowercase tag names to arrays containing allowable
     * attributes for those tags. Does not need to include those attributes
     * defined in `allowableGlobalAttributes`.
     *
     * Values are arrays with allowable lowercase attribute names mapped to
     * allowable value patterns. A `*` means any value is acceptable. Multiple
     * allowable values can be joined together with `|`. These special symbols
     * represent certain kinds of values and can be used in combination or in
     * place of literal values.
     *
     * - `{classlist}`: A list of legal CSS classnames, separated by spaces
     * - `{int}`: An integer
     * - `{none}`: No value (an attribute with no `=` or value, like `checked`)
     * - `{style}`: One or more CSS declarations, separated by semicolons (simple
     *   `key: value;` syntax only)
     * - `{url}`: A URL
     *
     * NOTE(review): `img` permits `href` but not `src`; confirm whether
     * `src` was intended.
     *
     * @var array<string, array<string, string>>
     */
    public array $allowableTags = [
        'address' => [
            'cite' => '{url}',
        ],
        'h1' => [],
        'h2' => [],
        'h3' => [],
        'h4' => [],
        'h5' => [],
        'h6' => [],
        'blockquote' => [],
        'dl' => [],
        'dt' => [],
        'dd' => [],
        'div' => [],
        'hr' => [],
        'ul' => [],
        'ol' => [
            'start' => '{int}',
            'type' => 'a|A|i|I|1',
        ],
        'li' => [
            'value' => '{int}',
        ],
        'p' => [],
        'pre' => [],
        'table' => [],
        'thead' => [],
        'tbody' => [],
        'tfoot' => [],
        'tr' => [],
        'td' => [],
        'th' => [],
        'a' => [
            'href' => '{url}',
            'target' => '*',
        ],
        'abbr' => [],
        'b' => [],
        'br' => [],
        'cite' => [],
        'code' => [],
        'data' => [
            'value' => '*',
        ],
        'dfn' => [],
        'em' => [],
        'i' => [],
        'kbd' => [],
        'mark' => [],
        'q' => [
            'cite' => '{url}',
        ],
        's' => [],
        'samp' => [],
        'small' => [],
        'span' => [],
        'strong' => [],
        'sub' => [],
        'sup' => [],
        'time' => [
            'datetime' => '*',
        ],
        'u' => [],
        'var' => [],
        'wbr' => [],
        'img' => [
            'alt' => '*',
            'href' => '{url}',
        ],
        'figure' => [],
        'figcaption' => [],
        'del' => [],
        'ins' => [],
        'details' => [],
        'summary' => [],
    ];

    /**
     * Mapping of allowable lowercase global attributes to their permitted
     * values. Uses same value pattern syntax as described in `allowableTags`.
     * A name ending in `*` permits any attribute starting with that prefix.
     *
     * @var array<string, string>
     */
    public array $allowableGlobalAttributes = [
        'class' => '{classlist}',
        'data-*' => '*',
        'dir' => 'ltr|rtl|auto',
        'id' => '*',
        'lang' => '*',
        'style' => '{style}',
        'title' => '*',
        'translate' => 'yes|no|{none}',
    ];

    /**
     * Mapping of allowable CSS style names to their allowable value patterns.
     * Multiple values can be delimited with `|` characters. Limited support
     * so far.
     *
     * Recognized special values:
     * - `{color}`: A hex or named color
     *
     * @var array<string, string>
     */
    public array $allowableStyleKeys = [
        'background-color' => '{color}',
        'color' => '{color}',
    ];

    /**
     * Scrubs all forbidden attributes from an HTML tag. Assumes the tag name
     * itself has already been whitelisted. An attribute is removed if either
     * its name or its value is not permitted.
     *
     * @param MDHTMLTag $tag - HTML tag; its `attributes` are edited in place
     */
    public function scrubTag(MDHTMLTag $tag) {
        foreach ($tag->attributes as $name => $value) {
            $name = (string) $name;
            if (!$this->isValidAttributeName($tag->tagName, $name)
                || !$this->isValidAttributeValue($tag->tagName, $name, $value)) {
                unset($tag->attributes[$name]);
            }
        }
    }

    /**
     * Scrubs all forbidden attributes from an HTML modifier. The modifier's
     * `cssClasses`, `cssId`, `cssStyles`, and `attributes` are edited in
     * place.
     *
     * @param MDTagModifier $modifier
     * @param ?string $tagName HTML tag name, if known, otherwise only
     *   global attributes will be permitted
     */
    public function scrubModifier(MDTagModifier $modifier, ?string $tagName=null) {
        if (sizeof($modifier->cssClasses) > 0) {
            // Class list is validated as a whole: one bad name clears all.
            $classList = implode(' ', $modifier->cssClasses);
            if (!$this->isValidAttributeValue($tagName, 'class', $classList)) {
                $modifier->cssClasses = [];
            }
        }
        if ($modifier->cssId !== null) {
            if (!$this->isValidAttributeValue($tagName, 'id', $modifier->cssId)) {
                $modifier->cssId = null;
            }
        }
        if (!$this->isValidAttributeName($tagName, 'style')) {
            $modifier->cssStyles = [];
        } else {
            foreach ($modifier->cssStyles as $key => $val) {
                if (!$this->isValidStyleValue($key, $val)) {
                    unset($modifier->cssStyles[$key]);
                }
            }
        }
        foreach ($modifier->attributes as $key => $val) {
            if (!$this->isValidAttributeValue($tagName, $key, $val)) {
                unset($modifier->attributes[$key]);
            }
        }
    }

    /**
     * Tests if an HTML tag name is permitted (case-insensitive).
     */
    public function isValidTagName(string $tagName): bool {
        return ($this->allowableTags[mb_strtolower($tagName)] ?? null) !== null;
    }

    /**
     * Tests if an HTML attribute name is permitted, either globally or for
     * the given tag. When `tagName` is `null` only global attributes are
     * considered.
     */
    public function isValidAttributeName(?string $tagName, string $attributeName): bool {
        $lcAttributeName = mb_strtolower($attributeName);
        if ($this->globalValuePattern($lcAttributeName) !== null) return true;
        if ($tagName === null) return false;
        $tagAttributes = $this->allowableTags[mb_strtolower($tagName)] ?? null;
        if ($tagAttributes === null) return false;
        return ($tagAttributes[$lcAttributeName] ?? null) !== null;
    }

    /**
     * Tests if an attribute value is allowable. A global attribute rule, if
     * one exists for the name, takes precedence over tag-specific rules.
     *
     * @param ?string $tagName tag the attribute belongs to, if known
     * @param string $attributeName attribute name (case-insensitive)
     * @param string|bool $attributeValue value, or `true` for a valueless
     *   attribute
     */
    public function isValidAttributeValue(?string $tagName, string $attributeName, $attributeValue): bool {
        $lcAttributeName = mb_strtolower($attributeName);
        $globalPattern = $this->globalValuePattern($lcAttributeName);
        if ($globalPattern !== null) {
            return $this->attributeValueMatchesPattern($attributeValue, $globalPattern);
        }
        if ($tagName === null) return false;
        $tagAttributes = $this->allowableTags[mb_strtolower($tagName)] ?? null;
        if ($tagAttributes === null) return false;
        $valuePattern = $tagAttributes[$lcAttributeName] ?? null;
        if ($valuePattern === null) return false;
        return $this->attributeValueMatchesPattern($attributeValue, $valuePattern);
    }

    /**
     * Looks up the value pattern of a global attribute: an exact name match
     * first, then the first wildcard entry (name ending in `*`) whose prefix
     * matches.
     *
     * @param string $lcAttributeName lowercase attribute name
     * @return ?string value pattern, or `null` if not a global attribute
     */
    private function globalValuePattern(string $lcAttributeName): ?string {
        $exact = $this->allowableGlobalAttributes[$lcAttributeName] ?? null;
        if ($exact !== null) return $exact;
        foreach ($this->allowableGlobalAttributes as $namePattern => $valuePattern) {
            if (!str_ends_with($namePattern, '*')) continue;
            $prefix = mb_substr($namePattern, 0, mb_strlen($namePattern) - 1);
            if (str_starts_with($lcAttributeName, $prefix)) return $valuePattern;
        }
        return null;
    }

    // NOTE(review): accepts any scheme, including `javascript:`. Confirm
    // that is acceptable for untrusted input.
    private static string $permissiveURLRegex = '^\\S+$';
    private static string $integerRegex = '^[\\-]?\\d+$';
    private static string $classListRegex = '^-?[_a-zA-Z]+[_a-zA-Z0-9-]*(?:\\s+-?[_a-zA-Z]+[_a-zA-Z0-9-]*)*$';

    /**
     * Tests a value against a `|`-delimited value pattern. Returns `true`
     * as soon as any option accepts the value.
     *
     * @param string|bool $value attribute value, or `true` for no value
     * @param string $pattern value pattern as described on `allowableTags`
     */
    private function attributeValueMatchesPattern(string|bool $value, string $pattern): bool {
        // Only real strings go to the regex/style checks; otherwise a
        // valueless attribute (`true`) would be coerced to "1".
        $isText = is_string($value);
        foreach (explode('|', $pattern) as $option) {
            switch ($option) {
                case '*':
                    return true;
                case '{classlist}':
                    if ($isText && mb_eregi(self::$classListRegex, $value)) return true;
                    break;
                case '{int}':
                    if ($isText && mb_eregi(self::$integerRegex, $value)) return true;
                    break;
                case '{none}':
                    if ($value === true) return true;
                    break;
                case '{style}':
                    if ($isText && $this->isValidStyleDeclaration($value)) return true;
                    break;
                case '{url}':
                    if ($isText && mb_eregi(self::$permissiveURLRegex, $value)) return true;
                    break;
                default:
                    if ($value === $option) return true;
                    break;
            }
        }
        return false;
    }

    /**
     * Tests if a string of one or more style `key: value;` declarations is
     * fully allowable. Empty declarations (e.g. from a trailing `;`) are
     * ignored. Any declaration without exactly one `:` is rejected.
     */
    public function isValidStyleDeclaration(string $styles): bool {
        foreach (explode(';', $styles) as $setting) {
            if (mb_strlen(trim($setting)) == 0) continue;
            $parts = explode(':', $setting);
            if (sizeof($parts) != 2) return false;
            $name = trim($parts[0]);
            if (!$this->isValidStyleKey($name)) return false;
            $value = trim($parts[1]);
            if (!$this->isValidStyleValue($name, $value)) return false;
        }
        return true;
    }

    /**
     * Tests if a CSS style key is allowable.
     */
    public function isValidStyleKey(string $key): bool {
        return ($this->allowableStyleKeys[$key] ?? null) !== null;
    }

    /**
     * Tests if a CSS style value is allowable for the given style key.
     */
    public function isValidStyleValue(string $key, string $value): bool {
        $pattern = $this->allowableStyleKeys[$key] ?? null;
        if ($pattern === null) return false;
        foreach (explode('|', $pattern) as $option) {
            switch ($option) {
                case '{color}':
                    if ($this->isValidCSSColor($value)) return true;
                    break;
                default:
                    if ($value === $option) return true;
                    break;
            }
        }
        return false;
    }

    private static string $styleColorRegex = '^#[0-9a-f]{3}(?:[0-9a-f]{3})?$|^[a-zA-Z]+$';

    /**
     * Tests if a value is a 3 or 6 digit hex color or a purely alphabetic
     * word (taken to be a named color).
     */
    private function isValidCSSColor(string $value): bool {
        return mb_eregi(self::$styleColorRegex, $value);
    }
}
*/ public function isValidStyleDeclaration(string $styles): bool { $settings = explode(';', $styles); foreach ($settings as $setting) { if (mb_strlen(trim($setting)) == 0) continue; $parts = explode(':', $setting); if (sizeof($parts) != 2) return false; $name = trim($parts[0]); if (!$this->isValidStyleKey($name)) return false; $value = trim($parts[1]); if (!$this->isValidStyleValue($name, $value)) return false; } return true; } /** * Tests if a CSS style key is allowable. */ public function isValidStyleKey(string $key): bool { return ($this->allowableStyleKeys[$key] ?? null) !== null; } /** * Tests if a CSS style value is allowable. */ public function isValidStyleValue(string $key, string $value): bool { $pattern = $this->allowableStyleKeys[$key] ?? null; if ($pattern === null) return false; $options = explode('|', $pattern); foreach ($options as $option) { switch ($option) { case '{color}': if ($this->isValidCSSColor($value)) return true; default: if ($value === $option) return true; } } return false; } private static string $styleColorRegex = '^#[0-9a-f]{3}(?:[0-9a-f]{3})?$|^[a-zA-Z]+$'; private function isValidCSSColor(string $value): bool { return mb_eregi(self::$styleColorRegex, $value); } } /** * Represents a single HTML tag. Paired tags are represented separately. */ class MDHTMLTag { /** * Verbatim string of the original parsed tag. Not modified. Should be * considered unsafe for inclusion in the final document. Use `toString()` * instead. */ public string $original; public string $tagName; public bool $isCloser; /** * Map of attribute names to value strings. 
*/ public array $attributes; /** * @param string $original * @param string $tagName * @param bool $isCloser * @param array $attributes */ public function __construct(string $original, string $tagName, bool $isCloser, array $attributes) { $this->original = $original; $this->tagName = $tagName; $this->isCloser = $isCloser; $this->attributes = $attributes; } public function __toString(): string { if ($this->isCloser) { return "tagName}>"; } $html = '<'; $html .= $this->tagName; foreach ($this->attributes as $key => $value) { $safeName = MDUtils::scrubAttributeName($key); if ($value === true) { $html .= " {$safeName}"; } else { $escapedValue = MDUtils::escapeHTML("{$value}"); $html .= " {$safeName}=\"{$escapedValue}\""; } } $html .= '>'; return $html; } public function equals($other): bool { if (!($other instanceof MDHTMLTag)) return false; if ($other->tagName != $this->tagName) return false; if ($other->isCloser != $this->isCloser) return false; return MDUtils::equal($other->attributes, $this->attributes); } private static string $htmlTagNameFirstRegex = '[a-z]'; private static string $htmlTagNameMedialRegex = '[a-z0-9]'; private static string $htmlAttributeNameFirstRegex = '[a-z]'; private static string $htmlAttributeNameMedialRegex = '[a-z0-9-]'; private static string $whitespaceCharRegex = '\\s'; /** * Checks the start of the given string for presence of an HTML tag. 
*/
    public static function fromLineStart(string $line): ?MDHTMLTag {
        // Scans $line one character at a time with a small state machine.
        // Returns the parsed tag when $line begins with a complete opening or
        // closing tag (text after the tag's '>' is ignored), or null when the
        // first character is not '<', the tag is malformed, or the line ends
        // before the closing '>'.

        // Parser states.
        $expectOpenBracket = 0;
        $expectCloserOrName = 1;
        $expectName = 2;
        $expectAttributeNameOrEnd = 3;
        $expectEqualsOrAttributeOrEnd = 4;
        $expectAttributeValue = 5;
        $expectCloseBracket = 6;

        $isCloser = false;
        $tagName = '';
        $attributeName = '';
        $attributeValue = '';
        // null = no value started yet, '' = unquoted value in progress,
        // otherwise the quote character that opened the value.
        $attributeQuote = null;
        $attributes = [];
        $fullTag = null;

        // Commits the attribute being built (if any) and resets the
        // accumulators. A name with no value at all is stored as `true`.
        // Pass $unescape=true to decode HTML entities in the value.
        $endAttribute = static function(bool $unescape=false) use (&$attributes, &$attributeName, &$attributeValue, &$attributeQuote): void {
            if (mb_strlen($attributeName) > 0) {
                if (mb_strlen($attributeValue) > 0 || $attributeQuote !== null) {
                    $attributes[$attributeName] = $unescape
                        ? html_entity_decode($attributeValue, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML401, 'UTF-8')
                        : $attributeValue;
                } else {
                    $attributes[$attributeName] = true;
                }
            }
            $attributeName = '';
            $attributeValue = '';
            $attributeQuote = null;
        };

        $expect = $expectOpenBracket;
        // The line never changes inside the loop, so measure it once.
        $length = mb_strlen($line);
        for ($p = 0; $p < $length && $fullTag === null; $p++) {
            $ch = mb_substr($line, $p, 1);
            $isWhitespace = mb_eregi(self::$whitespaceCharRegex, $ch);
            switch ($expect) {
                case $expectOpenBracket:
                    if ($ch !== '<') return null;
                    $expect = $expectCloserOrName;
                    break;

                case $expectCloserOrName:
                    if ($ch === '/') {
                        $isCloser = true;
                    } else {
                        // Not a slash; re-read this character as the name.
                        $p--;
                    }
                    $expect = $expectName;
                    break;

                case $expectName:
                    if ($tagName === '') {
                        if (!mb_eregi(self::$htmlTagNameFirstRegex, $ch)) return null;
                        $tagName .= $ch;
                    } elseif (mb_eregi(self::$htmlTagNameMedialRegex, $ch)) {
                        $tagName .= $ch;
                    } else {
                        // Name is over; re-read this character in the next
                        // state. Closing tags go straight to the bracket.
                        $p--;
                        $expect = ($isCloser) ? $expectCloseBracket : $expectAttributeNameOrEnd;
                    }
                    break;

                case $expectAttributeNameOrEnd:
                    if ($attributeName === '') {
                        if ($isWhitespace) {
                            // skip whitespace
                        } elseif ($ch === '/') {
                            $expect = $expectCloseBracket;
                        } elseif ($ch === '>') {
                            $fullTag = mb_substr($line, 0, $p + 1);
                        } elseif (mb_eregi(self::$htmlAttributeNameFirstRegex, $ch)) {
                            $attributeName .= $ch;
                        } else {
                            return null;
                        }
                    } elseif ($isWhitespace) {
                        $expect = $expectEqualsOrAttributeOrEnd;
                    } elseif ($ch === '/') {
                        $endAttribute();
                        $expect = $expectCloseBracket;
                    } elseif ($ch === '>') {
                        $endAttribute();
                        $fullTag = mb_substr($line, 0, $p + 1);
                    } elseif ($ch === '=') {
                        $expect = $expectAttributeValue;
                    } elseif (mb_eregi(self::$htmlAttributeNameMedialRegex, $ch)) {
                        $attributeName .= $ch;
                    } else {
                        return null;
                    }
                    break;

                case $expectEqualsOrAttributeOrEnd:
                    // An attribute name followed by whitespace has been read.
                    if ($ch === '=') {
                        $expect = $expectAttributeValue;
                    } elseif ($isWhitespace) {
                        // skip whitespace
                    } elseif ($ch === '/') {
                        // The pending valueless attribute is committed by the
                        // final $endAttribute() call after the loop.
                        $expect = $expectCloseBracket;
                    } elseif ($ch === '>') {
                        $fullTag = mb_substr($line, 0, $p + 1);
                    } elseif (mb_eregi(self::$htmlAttributeNameFirstRegex, $ch)) {
                        // Start of another attribute; commit the previous one
                        // as valueless and re-read this character as a name.
                        $endAttribute();
                        $expect = $expectAttributeNameOrEnd;
                        $p--;
                    }
                    // Any other character is ignored in this state.
                    break;

                case $expectAttributeValue:
                    if ($attributeValue === '') {
                        if ($attributeQuote === null) {
                            if ($isWhitespace) {
                                // skip whitespace
                            } elseif ($ch === '"' || $ch === "'") {
                                $attributeQuote = $ch;
                            } else {
                                $attributeQuote = ''; // explicitly unquoted
                                $p--;
                            }
                        } elseif ($ch === $attributeQuote) {
                            // Empty string
                            $endAttribute($attributeQuote != '');
                            $expect = $expectAttributeNameOrEnd;
                        } elseif ($attributeQuote === '' && ($ch === '/' || $ch === '>')) {
                            // `name=` with nothing after it is malformed.
                            return null;
                        } else {
                            $attributeValue .= $ch;
                        }
                    } elseif ($ch === $attributeQuote) {
                        $endAttribute($attributeQuote != '');
                        $expect = $expectAttributeNameOrEnd;
                    } elseif ($attributeQuote === '' && $isWhitespace) {
                        $endAttribute();
                        $expect = $expectAttributeNameOrEnd;
                    } elseif ($attributeQuote === '' && $ch === '>') {
                        // An unquoted value cannot contain '>'; it ends the
                        // value and the tag. Commit the attribute and re-read
                        // the '>' in the attribute state, which closes the
                        // tag. Without this, `<a href=foo>` was never
                        // recognised because '>' was appended to the value.
                        $endAttribute();
                        $expect = $expectAttributeNameOrEnd;
                        $p--;
                    } else {
                        $attributeValue .= $ch;
                    }
                    break;

                case $expectCloseBracket:
                    if ($isWhitespace) {
                        // ignore whitespace
                    } elseif ($ch === '>') {
                        $fullTag = mb_substr($line, 0, $p + 1);
                    }
                    // Any other character is ignored in this state.
                    break;
            }
        }

        // Ran out of input before the closing '>'.
        if ($fullTag === null) return null;
        // Commit an attribute still pending when the tag closed.
        $endAttribute();
        return new MDHTMLTag($fullTag, $tagName, $isCloser, $attributes);
    }
}

/**
 * Represents HTML modifications to a node, such as CSS classes to add or
 * additional attributes. See `MDHTMLFilter.scrubModifier()` to remove disallowed
 * values.
 */
class MDTagModifier {
    /**
     * Verbatim markdown syntax. Unmodified by changes to other properties.
     */
    public string $original;

    /** @var string[] */
    public array $cssClasses = [];

    public ?string $cssId = null;

    /** CSS property name => value, filled from a `style=` token. */
    public array $cssStyles = [];

    /** Attribute name => value for every `name=value` token except `style`. */
    public array $attributes = [];

    private static string $baseClassRegex = '\\.([a-z_\\-][a-z0-9_\\-]*?)';
    private static string $baseIdRegex = '#([a-z_\\-][a-z0-9_\\-]*?)';
    private static string $baseAttributeRegex = '([a-z0-9]+?)=([^\\s\\}]+?)';
    private static string $baseRegex = '\\{([^}]+?)}';
    private static string $leadingClassRegex = '^\\{([^}]+?)}';
    private static string $trailingClassRegex = '^(.*?)\\s*\\{([^}]+?)}\\s*$';
    private static string $classRegex = '^\\.([a-z_\\-][a-z0-9_\\-]*?)$'; // 1=classname
    private static string $idRegex = '^#([a-z_\\-][a-z0-9_\\-]*?)$'; // 1=id
    private static string $attributeRegex = '^([a-z0-9]+?)=([^\\s\\}]+?)$'; // 1=attribute name, 2=attribute value

    /**
     * Copies this modifier's CSS classes, id, attributes, and styles onto a
     * node. The id is only copied when it is truthy. Attributes and styles
     * already on the node under the same name are overwritten.
     */
    public function applyTo(MDNode $node) {
        // The parameter type already guarantees an MDNode, so no runtime
        // instanceof check is needed.
        foreach ($this->cssClasses as $cssClass) {
            $node->addClass($cssClass);
        }
        if ($this->cssId) $node->cssId = $this->cssId;
        foreach ($this->attributes as $name => $value) {
            $node->attributes[$name] = $value;
        }
        foreach ($this->cssStyles as $name => $value) {
            $node->cssStyles[$name] = $value;
        }
    }

    /**
     * Adds a CSS class. If already present it will not be duplicated.
     *
     * @return bool `true` if the class was added, `false` if it was already
     *   present
     */
    public function addClass(string $cssClass): bool {
        // Strict match: a loose comparison treats numeric-looking strings
        // such as "10" and "1e1" as the same class.
        if (in_array($cssClass, $this->cssClasses, true)) return false;
        $this->cssClasses[] = $cssClass;
        return true;
    }

    /**
     * Removes a CSS class.
*/
    public function removeClass(string $cssClass): bool {
        // Returns true if the class was present and has been removed, false
        // if the list was left unchanged.
        $beforeLength = sizeof($this->cssClasses);
        // array_diff preserves keys; re-index so `cssClasses` stays a list.
        $this->cssClasses = array_values(array_diff($this->cssClasses, [ $cssClass ]));
        return sizeof($this->cssClasses) !== $beforeLength;
    }

    /**
     * Tests whether another value is an `MDTagModifier` with the same CSS
     * classes, id, and attributes.
     *
     * NOTE(review): `cssStyles` is not compared, so two modifiers differing
     * only in `style=` are reported equal. `original` is not compared
     * either. Confirm whether omitting `cssStyles` is intended.
     *
     * @param mixed $other value to compare against
     */
    public function equals($other): bool {
        if (!($other instanceof MDTagModifier)) return false;
        if (!MDUtils::equal($other->cssClasses, $this->cssClasses)) return false;
        if ($other->cssId !== $this->cssId) return false;
        if (!MDUtils::equal($other->attributes, $this->attributes)) return false;
        return true;
    }

    /**
     * Returns the verbatim markdown stored in `original`.
     */
    public function __toString(): string {
        return $this->original;
    }

    /**
     * Splits a CSS declaration list such as `color:red;margin:0` into a
     * property => value array. Keys and values are not trimmed.
     *
     * NOTE(review): a declaration that does not split into exactly two parts
     * on `:` is skipped, which includes values that themselves contain a
     * colon (e.g. a `url(http://...)`). Confirm whether that is intended.
     *
     * @return array<string, string>
     */
    private static function styleToObject(string $styleValue): array {
        $styles = [];
        foreach (explode(';', $styleValue) as $pair) {
            $keyAndValue = explode(':', $pair);
            if (sizeof($keyAndValue) != 2) continue;
            $styles[$keyAndValue[0]] = $keyAndValue[1];
        }
        return $styles;
    }

    /**
     * Builds a modifier from the text between the curly braces. The text is
     * split on whitespace and every token must be a `.class`, an `#id`, or a
     * `name=value` pair; a `style=` value is parsed into `cssStyles` instead
     * of being stored as an attribute.
     *
     * @param string $contents modifier text without the enclosing braces
     * @return ?MDTagModifier the modifier, or `null` if any token is not
     *   recognized
     */
    private static function fromContents(string $contents): ?MDTagModifier {
        $modifierTokens = mb_split('\\s+', $contents);
        $mod = new MDTagModifier();
        $mod->original = "{{$contents}}";
        foreach ($modifierTokens as $token) {
            if (trim($token) == '') continue;
            if (mb_eregi(self::$classRegex, $token, $groups)) {
                $mod->addClass($groups[1]);
            } elseif (mb_eregi(self::$idRegex, $token, $groups)) {
                $mod->cssId = $groups[1];
            } elseif (mb_eregi(self::$attributeRegex, $token, $groups)) {
                if ($groups[1] == 'style') {
                    $mod->cssStyles = self::styleToObject($groups[2]);
                } else {
                    $mod->attributes[$groups[1]] = $groups[2];
                }
            } else {
                // One bad token invalidates the whole modifier.
                return null;
            }
        }
        return $mod;
    }

    /**
     * Extracts block modifier from end of a line. Always returns a 2-element
     * tuple array:
     * - `0`: the line without the modifier
     * - `1`: an `MDTagModifier` if found or `null` if not
     *
     * When a state is given and none of its root's block readers is an
     * `MDModifierReader`, the line is returned untouched with `null`.
     *
     * NOTE(review): when the line ends in a `{...}` group whose contents do
     * not parse as a modifier, element 0 still has the group stripped while
     * element 1 is `null`, so that text is dropped. Confirm whether the
     * untouched line should be returned in that case.
     *
     * @param string $line
     * @param ?MDState $state
     * @return array tuple with remaining line and `MDTagModifier` or `null`
     */
    public static function fromLine(string $line, ?MDState $state): array {
        if ($state) {
            $found = false;
            foreach ($state->root()->readersByBlockPriority as $reader) {
                if ($reader instanceof MDModifierReader) {
                    $found = true;
                    break;
                }
            }
            if (!$found) return [ $line, null ];
        }
        if (!mb_eregi(self::$trailingClassRegex, $line, $groups)) return [ $line, null ];
        $bareLine = $groups[1];
        $mod = self::fromContents($groups[2]);
        return [ $bareLine, $mod ];
    }

    /**
     * Attempts to extract modifier from head of string.
     *
     * @return ?MDTagModifier the modifier, or `null` if the string does not
     *   start with a `{...}` group or its contents are not a valid modifier
     */
    public static function fromStart(string $line): ?MDTagModifier {
        if (!mb_eregi(self::$leadingClassRegex, $line, $groups)) return null;
        return self::fromContents($groups[1]);
    }

    /**
     * Discards any modifiers from a line and returns what remains. A trailing
     * `{...}` group is removed whether or not its contents are a valid
     * modifier; a line without one is returned unchanged.
     */
    public static function strip(string $line): string {
        if (!mb_eregi(self::$trailingClassRegex, $line, $groups)) return $line;
        return $groups[1];
    }
}

// -- Readers ---------------------------------------------------------------

// The reader classes below are declared with empty bodies.

class MDReader {}

class MDUnderlinedHeadingReader extends MDReader {}

class MDHashHeadingReader extends MDReader {}

class MDSubtextReader extends MDReader {}

class MDBlockQuoteReader extends MDReader {}

class _MDListReader extends MDReader {}

class MDUnorderedListReader extends _MDListReader {}

class MDOrderedListReader extends _MDListReader {}

class MDFencedCodeBlockReader extends MDReader {}

class MDIndentedCodeBlockReader extends MDReader {}

class MDHorizontalRuleReader extends MDReader {}

class MDTableReader extends MDReader {}

class MDDefinitionListReader extends MDReader {}

class MDFootnoteReader extends MDReader {}

class MDAbbreviationReader extends MDReader {}

class MDParagraphReader extends MDReader {}

class MDSimplePairInlineReader extends MDReader {}

class MDEmphasisReader extends MDSimplePairInlineReader {}

class MDStrongReader extends MDSimplePairInlineReader {}

class MDStrikethroughReader extends MDSimplePairInlineReader {}

class MDUnderlineReader extends MDSimplePairInlineReader {}

class MDHighlightReader extends MDSimplePairInlineReader {}

class MDCodeSpanReader extends MDSimplePairInlineReader {}

class MDSubscriptReader extends MDSimplePairInlineReader {}

class MDSuperscriptReader extends MDSimplePairInlineReader {}

class MDLinkReader extends MDReader {}

class MDReferencedLinkReader extends MDLinkReader {}

class MDImageReader extends MDLinkReader {}

class MDReferencedImageReader extends MDReferencedLinkReader {}

class MDLineBreakReader extends MDReader {}

class MDHTMLTagReader extends MDReader {}

class MDModifierReader extends MDReader {}

// -- Nodes -----------------------------------------------------------------

// The node classes below are declared with empty bodies.

class MDNode {}

class MDBlockNode extends MDNode {}

class MDParagraphNode extends MDBlockNode {}

class MDHeadingNode extends MDBlockNode {}

class MDSubtextNode extends MDBlockNode {}

class MDHorizontalRuleNode extends MDBlockNode {}

class MDBlockquoteNode extends MDBlockNode {}

class MDUnorderedListNode extends MDBlockNode {}

class MDOrderedListNode extends MDBlockNode {}

class MDListItemNode extends MDBlockNode {}

class MDCodeBlockNode extends MDBlockNode {}

class MDTableNode extends MDBlockNode {}

class MDTableRowNode extends MDBlockNode {}

class MDTableCellNode extends MDBlockNode {}

class MDTableHeaderCellNode extends MDBlockNode {}

class MDDefinitionListNode extends MDBlockNode {}

class MDDefinitionListTermNode extends MDBlockNode {}

class MDDefinitionListDefinitionNode extends MDBlockNode {}

class MDFootnoteListNode extends MDBlockNode {}

class MDInlineNode extends MDNode {}

class MDTextNode extends MDInlineNode {}

class MDObfuscatedTextNode extends MDTextNode {}

class MDEmphasisNode extends MDInlineNode {}

class MDStrongNode extends MDInlineNode {}

class MDStrikethroughNode extends MDInlineNode {}

class MDUnderlineNode extends MDInlineNode {}

class MDHighlightNode extends MDInlineNode {}

class MDSuperscriptNode extends MDInlineNode {}

class MDSubscriptNode extends MDInlineNode {}

class MDCodeNode extends MDInlineNode {}

class MDFootnoteNode extends MDInlineNode {}

class MDLinkNode extends MDInlineNode {}

class MDReferencedLinkNode extends MDLinkNode {}

class MDImageNode extends MDInlineNode {}

class MDReferencedImageNode extends MDImageNode {}

class MDAbbreviationNode extends MDInlineNode {}

class MDLineBreakNode extends MDInlineNode {}

class MDHTMLTagNode extends MDInlineNode {}

// -- Main class ------------------------------------------------------------

class Markdown {}

?>