()\\[\\]\\\\.,;:\\s@"]+(?:\\.[^<>()\\[\\]\\\\.,;:\\s@"]+)*)|(?:".+"))@(?:(?:\\[[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}])|(?:(?:[a-z\\-0-9]+\\.)+[a-z]{2,}))';

    /**
     * Encodes characters as HTML numeric entities to make it marginally more
     * difficult for web scrapers to grab sensitive info. If `text` starts with
     * `mailto:` only the email address following it will be obfuscated.
     *
     * @param string $text plain text, or a `mailto:` URL
     * @return string `$text` with each character written as a decimal `&#N;` entity
     */
    public static function escapeObfuscated(string $text): string
    {
        if (str_starts_with($text, 'mailto:')) {
            // Keep the scheme readable; only what follows it is encoded.
            return 'mailto:' . self::escapeObfuscated(mb_substr($text, 7));
        }
        $html = '';
        foreach (mb_str_split($text) as $char) {
            // A numeric character reference is `&#` + code point + `;`.
            $html .= '&#' . mb_ord($char) . ';';
        }
        return $html;
    }

    /**
     * Removes illegal characters from an HTML attribute name.
     *
     * @param string $name candidate attribute name
     * @return string `$name` without tabs, newlines, form feeds, spaces,
     * slashes, `>`, quotes or `=`
     */
    public static function scrubAttributeName(string $name): string
    {
        return mb_ereg_replace('[\\t\\n\\f \\/>"\'=]+', '', $name);
    }

    /**
     * Strips one or more leading indents from a line or lines of markdown. An
     * indent is defined as 4 spaces or one tab. Incomplete indents (i.e. 1-3
     * spaces) are treated like one indent level.
     *
     * @param string|string[] $line one line, or an array of lines
     * @param int $levels number of indent levels to strip; values below 1
     * leave the input untouched
     * @return string|string[] same shape as `$line`
     */
    public static function stripIndent(string|array $line, int $levels = 1): string|array
    {
        if ($levels < 1) {
            // Nothing to strip, and a negative repeat count is not a valid pattern.
            return $line;
        }
        $regex = '^(?: {1,4}|\\t){' . $levels . '}';
        if (is_array($line)) {
            return array_map(
                static fn (string $l): string => mb_ereg_replace($regex, '', $l),
                $line,
            );
        }
        return mb_ereg_replace($regex, '', $line);
    }

    /**
     * Counts the number of indent levels in a line of text. Partial indents
     * (1 to 3 spaces) are counted as one indent level unless `fullIndentsOnly`
     * is `true`.
     *
     * @param string $line line to measure
     * @param bool $fullIndentsOnly when `true`, only 4 spaces or a tab count as an indent
     * @return int number of leading indent levels
     */
    public static function countIndents(string $line, bool $fullIndentsOnly = false): int
    {
        // Normalize every indent unit to a single tab.
        $indentUnit = $fullIndentsOnly ? '(?: {4}|\\t)' : '(?: {1,4}|\\t)';
        $normalized = mb_ereg_replace($indentUnit, "\t", $line);
        // Keep only the leading run of tabs, dropping the content after it.
        $leadingTabs = mb_ereg_replace('^(\\t*)(.*?)$', '\\1', $normalized);
        // One tab per indent level.
        return mb_strlen($leadingTabs);
    }

    /**
     * Returns a copy of an array without any whitespace-only lines at the end.
     *
     * @param string[] $lines
     * @return string[]
     */
    public static function withoutTrailingBlankLines(array $lines): array
    {
        $stripped = $lines;
        while ($stripped !== [] && mb_trim($stripped[array_key_last($stripped)]) === '') {
            array_pop($stripped);
        }
        return $stripped;
    }

    /**
     * Tests if an array of lines contains at least one blank. A blank line
     * can contain whitespace.
     *
     * @param string[] $lines
     * @return bool `true` if any line is empty after trimming
     */
    public static function containsBlankLine(array $lines): bool
    {
        foreach ($lines as $line) {
            if (mb_trim($line) === '') {
                return true;
            }
        }
        return false;
    }

    /**
     * Tests whether two associative arrays hold the same key/value pairs,
     * regardless of order.
     *
     * @param array $a
     * @param array $b
     * @return bool `true` if both arrays have the same keys with equal values
     */
    public static function equalAssocArrays(array $a, array $b): bool
    {
        // array_diff_assoc() only reports entries of $a missing from $b, so the
        // sizes must match as well or a subset would compare as equal.
        return count($a) === count($b) && array_diff_assoc($a, $b) === [];
    }
}

/**
 * Token type enum for `MDToken`.
 */
enum MDTokenType
{
    case Text;
    /**
     * Only used for the leading and trailing whitespace around a run of text,
     * not every single whitespace character.
     */
    case Whitespace;
    case Underscore;
    case Asterisk;
    case Slash;
    case Tilde;
    case Bang;
    case Backtick;
    case Equal;
    case Caret;
    case Label; // content=label
    case URL; // content=URL, extra=title
    case Email; // content=email address, extra=title
    case SimpleLink; // content=URL
    case SimpleEmail; // content=email address
    case Footnote; // content=symbol
    case Modifier; // modifier=MDTagModifier
    case HTMLTag; // tag=MDHTMLTag
    /** Wildcard for `MDToken.findFirstTokens` */
    case META_AnyNonWhitespace;
    /** Wildcard for `MDToken.findFirstTokens` */
    case META_OptionalWhitespace;
}

/**
 * Search results from `MDToken.findFirstTokens`.
 */
class MDTokenMatch
{
    /** @var MDToken[] matched elements, in pattern order */
    public array $tokens;
    /** Index in the searched array where the match begins. */
    public int $index;

    /**
     * @param MDToken[] $tokens
     * @param int $index
     */
    public function __construct(array $tokens, int $index)
    {
        $this->tokens = $tokens;
        $this->index = $index;
    }
}

/**
 * Search results from `MDToken.findPairedTokens`.
*/
class MDPairedTokenMatch
{
    /** @var MDToken[] tokens matched by the start pattern */
    public array $startTokens;
    /** @var MDToken[] tokens between the start and end matches */
    public array $contentTokens;
    /** @var MDToken[] tokens matched by the end pattern */
    public array $endTokens;
    /** Index of the first start token. */
    public int $startIndex;
    /** Index of the first content token. */
    public int $contentIndex;
    /** Index of the first end token. */
    public int $endIndex;
    /** Number of tokens from the first start token through the last end token. */
    public int $totalLength;

    /**
     * @param MDToken[] $startTokens
     * @param MDToken[] $contentTokens
     * @param MDToken[] $endTokens
     * @param int $startIndex
     * @param int $contentIndex
     * @param int $endIndex
     * @param int $totalLength
     */
    public function __construct(
        array $startTokens,
        array $contentTokens,
        array $endTokens,
        int $startIndex,
        int $contentIndex,
        int $endIndex,
        int $totalLength,
    ) {
        $this->startTokens = $startTokens;
        $this->contentTokens = $contentTokens;
        $this->endTokens = $endTokens;
        $this->startIndex = $startIndex;
        $this->contentIndex = $contentIndex;
        $this->endIndex = $endIndex;
        $this->totalLength = $totalLength;
    }
}

/**
 * One lexical unit in inline markdown syntax parsing.
 */
class MDToken
{
    /**
     * The original verbatim token string. Required as a plaintext fallback if
     * the token remains unresolved.
     */
    public string $original;
    public MDTokenType $type;
    public ?string $content = null;
    public ?string $extra = null;
    public ?MDHTMLTag $tag = null;
    public ?MDTagModifier $modifier = null;

    /**
     * Creates a token.
     *
     * @param string $original verbatim token string
     * @param MDTokenType $type token type
     * @param string|MDTagModifier|MDHTMLTag|null $content primary content of
     * the token; a modifier or tag object is stored in `$modifier` / `$tag`
     * instead of `$content`
     * @param string|null $extra additional content
     */
    public function __construct(
        string $original,
        MDTokenType $type,
        string|MDTagModifier|MDHTMLTag|null $content = null,
        ?string $extra = null,
    ) {
        $this->original = $original;
        $this->type = $type;
        if ($content instanceof MDTagModifier) {
            $this->modifier = $content;
        } elseif ($content instanceof MDHTMLTag) {
            $this->tag = $content;
        } else {
            $this->content = $content;
        }
        $this->extra = $extra;
    }

    /**
     * Debug representation: class name, token type name and content.
     */
    public function __toString(): string
    {
        $classname = static::class;
        // Enum cases cannot be cast to string; use the case name.
        return "({$classname} type={$this->type->name} content={$this->content})";
    }

    /**
     * Attempts to parse a label token from the beginning of `line`. A label is
     * of the form `[content]`. If found, returns an array:
     * - `0`: the entire label including brackets
     * - `1`: the content of the label
     *
     * Nested brackets are allowed inside the label, a backslash skips the
     * character after it, and an unbalanced `)` aborts the match.
     *
     * @param string $line
     * @return ?string[] match groups or null if not found
     */
    public static function tokenizeLabel(string $line): ?array
    {
        if (!str_starts_with($line, '[')) {
            return null;
        }
        $chars = mb_str_split($line);
        $length = count($chars);
        $parenCount = 0;
        $bracketCount = 0;
        for ($p = 1; $p < $length; $p++) {
            $ch = $chars[$p];
            if ($ch === '\\') {
                // Escaped character: skip whatever follows the backslash.
                $p++;
            } elseif ($ch === '(') {
                $parenCount++;
            } elseif ($ch === ')') {
                $parenCount--;
                if ($parenCount < 0) {
                    return null;
                }
            } elseif ($ch === '[') {
                $bracketCount++;
            } elseif ($ch === ']') {
                if ($bracketCount > 0) {
                    $bracketCount--;
                } else {
                    // mb_substr() takes a length, not an end offset: the
                    // content is everything between the outer brackets.
                    return [
                        mb_substr($line, 0, $p + 1),
                        mb_substr($line, 1, $p - 1),
                    ];
                }
            }
        }
        return null;
    }

    private static string $urlWithTitleRegex = '^\\((\\S+?)\\s+"(.*?)"\\)'; // 1=URL, 2=title
    private static string $urlRegex = '^\\((\\S+?)\\)'; // 1=URL

    /**
     * Attempts to parse a URL token from the beginning of `line`. A URL token
     * is of the form `(url)` or `(url "title")`. If found, returns an array:
     * - `0`: the entire URL token including parentheses
     * - `1`: the URL
     * - `2`: the optional title, or `null`
     *
     * @param string $line
     * @return ?array token tuple, or `null` if `line` does not start with a
     * URL token or is better described as an email token
     */
    public static function tokenizeURL(string $line): ?array
    {
        $groups = [];
        if (mb_eregi(self::$urlWithTitleRegex, $line, $groups)) {
            $token = $groups;
        } elseif (mb_eregi(self::$urlRegex, $line, $groups)) {
            $token = [$groups[0], $groups[1], null];
        } else {
            return null;
        }
        // Make sure it's not better described as an email address.
        return self::tokenizeEmail($line) === null ? $token : null;
    }

    /**
     * Attempts to parse an email address from the beginning of `line`. An
     * email address is of the form `(user@example.com)` or
     * `(user@example.com "link title")`. If found, returns an array:
     * - `0`: the entire token including parentheses
     * - `1`: the email address
     * - `2`: the optional link title, or `null`
     *
     * @param string $line
     * @return ?array token tuple, or `null` if not found
     */
    public static function tokenizeEmail(string $line): ?array
    {
        $groups = [];
        if (mb_eregi('^\\(\\s*(' . MDUtils::$baseEmailRegex . ')\\s+"(.*?)"\\s*\\)', $line, $groups)) {
            return $groups;
        }
        if (mb_eregi('^\\(\\s*(' . MDUtils::$baseEmailRegex . ')\\s*\\)', $line, $groups)) {
            return [$groups[0], $groups[1], null];
        }
        return null;
    }

    /**
     * Searches an array of `MDToken` for the given pattern of `MDTokenType`s.
     * If found, returns a `MDTokenMatch`, otherwise `null`.
     *
     * Special token types `META_AnyNonWhitespace` and `META_OptionalWhitespace`
     * are special supported token types. Note that `META_OptionalWhitespace`
     * may give a result with a variable number of tokens.
     *
     * @param (MDToken|MDNode)[] $tokensToSearch mixed array of `MDToken` and
     * `MDNode` elements
     * @param MDTokenType[] $pattern contiguous run of token types to find
     * @param int $startIndex token index to begin searching (defaults to 0)
     * @return ?MDTokenMatch match object, or `null` if not found
     */
    public static function findFirstTokens(array $tokensToSearch, array $pattern, int $startIndex = 0): ?MDTokenMatch
    {
        $tokenCount = count($tokensToSearch);
        $patternLength = count($pattern);
        for ($t = $startIndex; $t < $tokenCount; $t++) {
            $matchedAll = true;
            $matched = [];
            // Shifts the token cursor back for each optional element that
            // consumed nothing.
            $patternOffset = 0;
            for ($p = 0; $p < $patternLength; $p++) {
                $t0 = $t + $p + $patternOffset;
                if ($t0 >= $tokenCount) {
                    // NOTE(review): running out of tokens ends the whole
                    // search, even when the pending pattern element is
                    // META_OptionalWhitespace. Behavior kept as found;
                    // confirm this is intended.
                    return null;
                }
                $token = $tokensToSearch[$t0];
                $elem = $pattern[$p];
                $isWhitespace = $token instanceof MDToken
                    && $token->type === MDTokenType::Whitespace;
                if ($elem === MDTokenType::META_OptionalWhitespace) {
                    if ($isWhitespace) {
                        $matched[] = $token;
                    } else {
                        $patternOffset--;
                    }
                } elseif ($elem === MDTokenType::META_AnyNonWhitespace) {
                    if ($isWhitespace) {
                        $matchedAll = false;
                        break;
                    }
                    $matched[] = $token;
                } else {
                    if (!($token instanceof MDToken) || $token->type !== $elem) {
                        $matchedAll = false;
                        break;
                    }
                    $matched[] = $token;
                }
            }
            if ($matchedAll) {
                return new MDTokenMatch($matched, $t);
            }
        }
        return null;
    }

    /**
     * Searches an array of MDToken for a given starting pattern and ending
     * pattern and returns match info about both and the tokens in between.
     *
     * If `contentValidator` is specified, it will be called with the content
     * tokens of a potential match. If the validator returns `true`, the result
     * will be accepted and returned by this method. If the validator returns
     * `false`, this method will keep looking for another matching pair. If no
     * validator is given the first match will be returned regardless of content.
     * Pairs with no tokens between them are never accepted.
     *
     * If a match is found, a `MDPairedTokenMatch` is returned with details
     * of the opening tokens, closing tokens, and content tokens between. Otherwise
     * `null` is returned.
     *
     * @param MDToken[] $tokensToSearch - array of `MDToken` to search in
     * @param MDTokenType[] $startPattern - array of `MDTokenType` to find first
     * @param MDTokenType[] $endPattern - array of `MDTokenType` to find positioned after `startPattern`
     * @param ?callable $contentValidator - optional validator function. If
     * provided, will be passed an array of inner `MDToken`, and the function
     * can return `true` to accept the contents or `false` to keep searching
     * @param int $startIndex - token index where searching should begin
     * @return ?MDPairedTokenMatch match, or `null`
     */
    public static function findPairedTokens(
        array $tokensToSearch,
        array $startPattern,
        array $endPattern,
        ?callable $contentValidator = null,
        int $startIndex = 0,
    ): ?MDPairedTokenMatch {
        $tokenCount = count($tokensToSearch);
        for ($s = $startIndex; $s < $tokenCount; $s++) {
            $startMatch = self::findFirstTokens($tokensToSearch, $startPattern, $s);
            if ($startMatch === null) {
                return null;
            }
            $contentStart = $startMatch->index + count($startMatch->tokens);
            $endStart = $contentStart;
            while ($endStart < $tokenCount) {
                $endMatch = self::findFirstTokens($tokensToSearch, $endPattern, $endStart);
                if ($endMatch === null) {
                    break;
                }
                $contents = array_slice($tokensToSearch, $contentStart, $endMatch->index - $contentStart);
                if ($contents !== [] && ($contentValidator === null || $contentValidator($contents))) {
                    return new MDPairedTokenMatch(
                        $startMatch->tokens,
                        $contents,
                        $endMatch->tokens,
                        $startMatch->index,
                        $contentStart,
                        $endMatch->index,
                        $endMatch->index + count($endMatch->tokens) - $startMatch->index,
                    );
                }
                // Contents rejected. Try next end match.
                $endStart = $endMatch->index + 1;
            }
            // No end matches. Increment start match.
            $s = $startMatch->index;
        }
        return null;
    }

    /**
     * Tests whether `$other` is an `MDToken` with the same field values.
     *
     * @param mixed $other value to compare against
     * @return bool `true` if `$other` is an equivalent token
     */
    public function equals(mixed $other): bool
    {
        if (!($other instanceof MDToken)) {
            return false;
        }
        // `tag` is compared by identity, `modifier` by loose equality.
        return $other->original === $this->original
            && $other->type === $this->type
            && $other->content === $this->content
            && $other->extra === $this->extra
            && $other->tag === $this->tag
            && $other->modifier == $this->modifier;
    }
}

// Empty placeholder declarations; no members are defined for these yet.
class MDState {}
class MDHTMLFilter {}
class MDHTMLTag {}
class MDTagModifier {}

// -- Readers ---------------------------------------------------------------

class MDReader {}
class MDUnderlinedHeadingReader extends MDReader {}
class MDHashHeadingReader extends MDReader {}
class MDSubtextReader extends MDReader {}
class MDBlockQuoteReader extends MDReader {}
class _MDListReader extends MDReader {}
class MDUnorderedListReader extends _MDListReader {}
class MDOrderedListReader extends _MDListReader {}
class MDFencedCodeBlockReader extends MDReader {}
class MDIndentedCodeBlockReader extends MDReader {}
class MDHorizontalRuleReader extends MDReader {}
class MDTableReader extends MDReader {}
class MDDefinitionListReader extends MDReader {}
class MDFootnoteReader extends MDReader {}
class MDAbbreviationReader extends MDReader {}
class MDParagraphReader extends MDReader {}
class MDSimplePairInlineReader extends MDReader {}
class MDEmphasisReader extends MDSimplePairInlineReader {}
class MDStrongReader extends MDSimplePairInlineReader {}
class MDStrikethroughReader extends MDSimplePairInlineReader {}
class MDUnderlineReader extends MDSimplePairInlineReader {}
class MDHighlightReader extends MDSimplePairInlineReader {}
class MDCodeSpanReader extends MDSimplePairInlineReader {}
class MDSubscriptReader extends MDSimplePairInlineReader {}
class MDSuperscriptReader extends MDSimplePairInlineReader {}
class MDLinkReader extends MDReader {}
class MDReferencedLinkReader extends MDLinkReader {}
class MDImageReader extends MDLinkReader {}
// Remaining reader placeholders; bodies are intentionally empty here.
class MDReferencedImageReader extends MDReferencedLinkReader {}
class MDLineBreakReader extends MDReader {}
class MDHTMLTagReader extends MDReader {}
class MDModifierReader extends MDReader {}

// -- Nodes -----------------------------------------------------------------

// Root of the node hierarchy.
class MDNode {}

// Block-level node placeholders.
class MDBlockNode extends MDNode {}
class MDParagraphNode extends MDBlockNode {}
class MDHeadingNode extends MDBlockNode {}
class MDSubtextNode extends MDBlockNode {}
class MDHorizontalRuleNode extends MDBlockNode {}
class MDBlockquoteNode extends MDBlockNode {}
class MDUnorderedListNode extends MDBlockNode {}
class MDOrderedListNode extends MDBlockNode {}
class MDListItemNode extends MDBlockNode {}
class MDCodeBlockNode extends MDBlockNode {}
class MDTableNode extends MDBlockNode {}
class MDTableRowNode extends MDBlockNode {}
class MDTableCellNode extends MDBlockNode {}
class MDTableHeaderCellNode extends MDBlockNode {}
class MDDefinitionListNode extends MDBlockNode {}
class MDDefinitionListTermNode extends MDBlockNode {}
class MDDefinitionListDefinitionNode extends MDBlockNode {}
class MDFootnoteListNode extends MDBlockNode {}

// Inline node placeholders.
class MDInlineNode extends MDNode {}
class MDTextNode extends MDInlineNode {}
class MDObfuscatedTextNode extends MDTextNode {}
class MDEmphasisNode extends MDInlineNode {}
class MDStrongNode extends MDInlineNode {}
class MDStrikethroughNode extends MDInlineNode {}
class MDUnderlineNode extends MDInlineNode {}
class MDHighlightNode extends MDInlineNode {}
class MDSuperscriptNode extends MDInlineNode {}
class MDSubscriptNode extends MDInlineNode {}
class MDCodeNode extends MDInlineNode {}
class MDFootnoteNode extends MDInlineNode {}
class MDLinkNode extends MDInlineNode {}
class MDReferencedLinkNode extends MDLinkNode {}
class MDImageNode extends MDInlineNode {}
class MDReferencedImageNode extends MDImageNode {}
class MDAbbreviationNode extends MDInlineNode {}
class MDLineBreakNode extends MDInlineNode {}
class MDHTMLTagNode extends MDInlineNode {}

// -- Main class ------------------------------------------------------------

// Placeholder for the main class; no members are defined yet.
class Markdown {}
?>