| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581 |
- <?php
- declare(strict_types=1);
-
/**
 * Static helpers shared by the markdown parser: base regex fragments, HTML
 * entity obfuscation, attribute-name scrubbing, indent handling, and small
 * array utilities.
 *
 * All regular expressions here are written for the `mb_ereg*` family of
 * functions (no delimiters).
 */
class MDUtils
{
    // Modified from https://urlregex.com/ to remove capture groups. Matches fully qualified URLs only.
    public static $baseURLRegex = '(?:(?:(?:[a-z]{3,9}:(?:\\/\\/)?)(?:[\\-;:&=\\+\\$,\\w]+@)?[a-z0-9\\.\\-]+|(?:www\\.|[\\-;:&=\\+\\$,\\w]+@)[a-z0-9\\.\\-]+)(?:(?:\\/[\\+~%\\/\\.\\w\\-_]*)?\\??(?:[\\-\\+=&;%@\\.\\w_]*)#?(?:[\\.\\!\\/\\\\\\w]*))?)';
    // Modified from https://emailregex.com/ to remove capture groups.
    public static $baseEmailRegex = '(?:(?:[^<>()\\[\\]\\\\.,;:\\s@"]+(?:\\.[^<>()\\[\\]\\\\.,;:\\s@"]+)*)|(?:".+"))@(?:(?:\\[[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}])|(?:(?:[a-z\\-0-9]+\\.)+[a-z]{2,}))';

    /**
     * Encodes characters as HTML numeric entities to make it marginally more
     * difficult for web scrapers to grab sensitive info. If `text` starts with
     * `mailto:` only the email address following it will be obfuscated.
     *
     * @param string $text plain text to obfuscate
     * @return string one `&#NNN;` decimal entity per character of input
     */
    public static function escapeObfuscated(string $text): string
    {
        if (str_starts_with($text, 'mailto:')) {
            // Keep the scheme readable; only the address part is encoded.
            return 'mailto:' . self::escapeObfuscated(mb_substr($text, 7));
        }
        $html = '';
        // Split once instead of calling mb_substr() per index.
        foreach (mb_str_split($text) as $char) {
            $html .= '&#' . mb_ord($char) . ';';
        }
        return $html;
    }

    /**
     * Removes illegal characters from an HTML attribute name.
     *
     * Stripped characters: tab, newline, form feed, space, `/`, `>`, `"`,
     * `'` and `=`.
     *
     * @param string $name raw attribute name
     * @return string the name with every illegal character removed
     */
    public static function scrubAttributeName(string $name): string
    {
        return mb_ereg_replace('[\\t\\n\\f \\/>"\'=]+', '', $name);
    }

    /**
     * Strips leading indents from a line or lines of markdown. An indent is
     * defined as 4 spaces or one tab. Incomplete indents (i.e. 1-3 spaces)
     * are treated like one indent level. The pattern requires exactly
     * `$levels` indents at the start of a line; lines with fewer are left
     * unchanged.
     *
     * @param string|string[] $line a single line or an array of lines
     * @param int $levels number of indent levels to remove
     * @return string|string[] same shape as `$line`
     */
    public static function stripIndent(string|array $line, int $levels = 1): string|array
    {
        if ($levels < 1) {
            // Nothing to strip; also avoids building a malformed quantifier.
            return $line;
        }
        // The outer braces are literal, producing a `{N}` regex quantifier.
        $regex = "^(?: {1,4}|\\t){{$levels}}";
        $strip = static fn (string $l): string => mb_ereg_replace($regex, '', $l);
        return is_array($line) ? array_map($strip, $line) : $strip($line);
    }

    /**
     * Counts the number of indent levels in a line of text. Partial indents
     * (1 to 3 spaces) are counted as one indent level unless `fullIndentsOnly`
     * is `true`.
     *
     * @param string $line line to inspect
     * @param bool $fullIndentsOnly when `true`, only 4-space runs and tabs count
     * @return int number of leading indent levels
     */
    public static function countIndents(string $line, bool $fullIndentsOnly = false): int
    {
        $indentPattern = $fullIndentsOnly ? "(?: {4}|\\t)" : "(?: {1,4}|\\t)";
        // Normalize every indent unit to a single tab...
        $normalized = mb_ereg_replace($indentPattern, "\t", $line);
        // ...drop everything after the leading run of tabs...
        $leadingTabs = mb_ereg_replace("^(\\t*)(.*?)$", "\\1", $normalized);
        // ...and what is left is one character per indent level.
        return mb_strlen($leadingTabs);
    }

    /**
     * Returns a copy of an array without any whitespace-only lines at the end.
     *
     * Relies on `mb_trim()`, which is only available from PHP 8.4.
     *
     * @param string[] $lines
     * @return string[]
     */
    public static function withoutTrailingBlankLines(array $lines): array
    {
        $stripped = $lines;
        while ($stripped !== [] && self::isBlank($stripped[array_key_last($stripped)])) {
            array_pop($stripped);
        }
        return $stripped;
    }

    /**
     * Tests if an array of lines contains at least one blank. A blank line
     * can contain whitespace.
     *
     * @param string[] $lines
     * @return bool `true` if any line is empty or whitespace-only
     */
    public static function containsBlankLine(array $lines): bool
    {
        foreach ($lines as $line) {
            if (self::isBlank($line)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Tests whether two associative arrays hold the same key/value pairs.
     * Values are compared the way `array_diff_assoc()` compares them, i.e.
     * by their string representation.
     *
     * @param array $a
     * @param array $b
     * @return bool `true` if both arrays have identical keys with equal values
     */
    public static function equalAssocArrays(array $a, array $b): bool
    {
        // array_diff_assoc() is one-directional, so also require equal sizes;
        // otherwise extra entries in $b would go unnoticed.
        return count($a) === count($b) && array_diff_assoc($a, $b) === [];
    }

    /** Tests whether a line is empty or consists only of whitespace. */
    private static function isBlank(string $line): bool
    {
        return mb_strlen(mb_trim($line)) === 0;
    }
}
-
/**
 * Enumerates the kinds of token an `MDToken` can be.
 *
 * The trailing notes on some cases describe which `MDToken` fields carry
 * that token's payload.
 */
enum MDTokenType
{
    case Text;

    /**
     * Leading or trailing whitespace surrounding a run of text. Whitespace
     * characters inside a run do not each get their own token.
     */
    case Whitespace;

    // Single-character markup symbols.
    case Underscore;
    case Asterisk;
    case Slash;
    case Tilde;
    case Bang;
    case Backtick;
    case Equal;
    case Caret;

    // Compound tokens.
    /** content = label */
    case Label;
    /** content = URL, extra = title */
    case URL;
    /** content = email address, extra = title */
    case Email;
    /** content = URL */
    case SimpleLink;
    /** content = email address */
    case SimpleEmail;
    /** content = symbol */
    case Footnote;
    /** modifier = MDTagModifier */
    case Modifier;

    /** tag = MDHTMLTag */
    case HTMLTag;

    /** Search wildcard understood by `MDToken::findFirstTokens`. */
    case META_AnyNonWhitespace;
    /** Search wildcard understood by `MDToken::findFirstTokens`. */
    case META_OptionalWhitespace;
}
-
/**
 * Search results from `MDToken::findFirstTokens`.
 */
class MDTokenMatch
{
    /**
     * @param MDToken[] $tokens the tokens that matched the search pattern
     * @param int $index position in the searched array where the match begins
     */
    public function __construct(
        public array $tokens,
        public int $index,
    ) {
    }
}
-
/**
 * Search results from `MDToken::findPairedTokens`.
 */
class MDPairedTokenMatch
{
    /**
     * @param MDToken[] $startTokens tokens that matched the opening pattern
     * @param MDToken[] $contentTokens tokens between the opening and closing matches
     * @param MDToken[] $endTokens tokens that matched the closing pattern
     * @param int $startIndex index of the first opening token in the searched array
     * @param int $contentIndex index of the first content token
     * @param int $endIndex index of the first closing token
     * @param int $totalLength number of tokens spanned from the first opening
     *        token through the last closing token
     */
    public function __construct(
        public array $startTokens,
        public array $contentTokens,
        public array $endTokens,
        public int $startIndex,
        public int $contentIndex,
        public int $endIndex,
        public int $totalLength,
    ) {
    }
}
-
/**
 * One lexical unit in inline markdown syntax parsing.
 */
class MDToken
{
    /**
     * The original verbatim token string. Required as a plaintext fallback if
     * the token remains unresolved.
     */
    public string $original;
    public MDTokenType $type;
    public ?string $content = null;
    public ?string $extra = null;
    public ?MDHTMLTag $tag = null;
    public ?MDTagModifier $modifier = null;

    private static string $urlWithTitleRegex = '^\\((\\S+?)\\s+"(.*?)"\\)'; // 1=URL, 2=title
    private static string $urlRegex = '^\\((\\S+?)\\)'; // 1=URL

    /**
     * Creates a token.
     *
     * `$content` is routed by its runtime type: an `MDTagModifier` is stored
     * in `modifier`, an `MDHTMLTag` in `tag`, and a string (or null) in
     * `content`.
     *
     * @param string $original verbatim token string
     * @param MDTokenType $type token type
     * @param string|MDTagModifier|MDHTMLTag|null $content primary content of the token
     * @param string|null $extra additional content
     */
    public function __construct(
        string $original,
        MDTokenType $type,
        string|MDTagModifier|MDHTMLTag|null $content = null,
        ?string $extra = null,
    ) {
        $this->original = $original;
        $this->type = $type;
        if ($content instanceof MDTagModifier) {
            $this->modifier = $content;
        } elseif ($content instanceof MDHTMLTag) {
            $this->tag = $content;
        } else {
            $this->content = $content;
        }
        $this->extra = $extra;
    }

    /**
     * Debug representation showing the class name, type and string content.
     */
    public function __toString(): string
    {
        $classname = get_class($this);
        // Pure enums cannot be cast to string, so print the case name.
        return "({$classname} type={$this->type->name} content={$this->content})";
    }

    /**
     * Attempts to parse a label token from the beginning of `line`. A label is
     * of the form `[content]`. Nested bracket pairs are allowed inside the
     * content, a backslash escapes the character after it, and an unbalanced
     * `)` aborts the parse. If found, returns an array:
     * - `0`: the entire label including brackets
     * - `1`: the content of the label
     *
     * @param string $line
     * @return ?string[] match groups or null if not found
     */
    public static function tokenizeLabel(string $line): ?array
    {
        if (!str_starts_with($line, '[')) {
            return null;
        }
        // Split once so the scan does not call mb_substr() per character.
        $chars = mb_str_split($line);
        $length = count($chars);
        $parenCount = 0;
        $bracketCount = 0;
        for ($p = 1; $p < $length; $p++) {
            $ch = $chars[$p];
            if ($ch === '\\') {
                // Skip the escaped character.
                $p++;
            } elseif ($ch === '(') {
                $parenCount++;
            } elseif ($ch === ')') {
                $parenCount--;
                if ($parenCount < 0) {
                    return null;
                }
            } elseif ($ch === '[') {
                $bracketCount++;
            } elseif ($ch === ']') {
                if ($bracketCount > 0) {
                    $bracketCount--;
                } else {
                    // mb_substr() takes a length, not an end index: the
                    // content runs from index 1 up to but excluding $p.
                    return [mb_substr($line, 0, $p + 1), mb_substr($line, 1, $p - 1)];
                }
            }
        }
        return null;
    }

    /**
     * Attempts to parse a URL token from the beginning of `line`. A URL token
     * is of the form `(url)` or `(url "title")`. Returns `null` if the text is
     * better described as an email token. If found, returns an array:
     * - `0`: the entire URL token including parentheses
     * - `1`: the URL
     * - `2`: the optional title, or `null`
     *
     * @param string $line
     * @return ?array token tuple
     */
    public static function tokenizeURL(string $line): ?array
    {
        $groups = [];
        if (mb_eregi(self::$urlWithTitleRegex, $line, $groups)) {
            $result = $groups;
        } elseif (mb_eregi(self::$urlRegex, $line, $groups)) {
            $result = [$groups[0], $groups[1], null];
        } else {
            return null;
        }
        // Make sure it's not better described as an email address.
        return self::tokenizeEmail($line) !== null ? null : $result;
    }

    /**
     * Attempts to parse an email address from the beginning of `line`. An
     * email address is of the form `(user@example.com)` or
     * `(user@example.com "link title")`. If found, returns an array:
     * - `0`: the entire token including parentheses
     * - `1`: the email address
     * - `2`: the optional link title, or `null`
     *
     * @param string $line
     * @return ?array token tuple, or `null` if `line` does not start with one
     */
    public static function tokenizeEmail(string $line): ?array
    {
        $groups = [];
        $withTitle = "^\\(\\s*(" . MDUtils::$baseEmailRegex . ")\\s+\"(.*?)\"\\s*\\)";
        if (mb_eregi($withTitle, $line, $groups)) {
            return $groups;
        }
        $bare = "^\\(\\s*(" . MDUtils::$baseEmailRegex . ")\\s*\\)";
        if (mb_eregi($bare, $line, $groups)) {
            return [$groups[0], $groups[1], null];
        }
        return null;
    }

    /**
     * Searches an array of `MDToken` for the given pattern of `MDTokenType`s.
     * If found, returns a `MDTokenMatch`, otherwise `null`.
     *
     * `META_AnyNonWhitespace` and `META_OptionalWhitespace` are supported as
     * wildcard pattern elements. Note that `META_OptionalWhitespace` may give
     * a result with a variable number of tokens. Elements that are not
     * `MDToken` instances only ever match `META_AnyNonWhitespace`.
     *
     * @param (MDToken|MDNode)[] $tokensToSearch mixed array of `MDToken` and
     *        `MDNode` elements
     * @param MDTokenType[] $pattern contiguous run of token types to find
     * @param int $startIndex token index to begin searching (defaults to 0)
     * @return ?MDTokenMatch match object, or `null` if not found
     */
    public static function findFirstTokens(array $tokensToSearch, array $pattern, int $startIndex = 0): ?MDTokenMatch
    {
        $tokenCount = count($tokensToSearch);
        $patternLength = count($pattern);
        for ($t = $startIndex; $t < $tokenCount; $t++) {
            $matchedAll = true;
            $matched = [];
            // Negative shift applied each time an optional element is absent.
            $patternOffset = 0;
            for ($p = 0; $p < $patternLength; $p++) {
                $t0 = $t + $p + $patternOffset;
                $elem = $pattern[$p];
                if ($t0 >= $tokenCount) {
                    if ($elem === MDTokenType::META_OptionalWhitespace) {
                        // Optional whitespace may be absent at the very end.
                        continue;
                    }
                    // A required element ran off the end; later start
                    // positions have even fewer tokens, so give up.
                    return null;
                }
                $token = $tokensToSearch[$t0];
                $isWhitespace = $token instanceof MDToken && $token->type === MDTokenType::Whitespace;
                if ($elem === MDTokenType::META_OptionalWhitespace) {
                    if ($isWhitespace) {
                        $matched[] = $token;
                    } else {
                        // Not present: re-test this token against the next element.
                        $patternOffset--;
                    }
                } elseif ($elem === MDTokenType::META_AnyNonWhitespace) {
                    if ($isWhitespace) {
                        $matchedAll = false;
                        break;
                    }
                    $matched[] = $token;
                } else {
                    if (!($token instanceof MDToken) || $token->type !== $elem) {
                        $matchedAll = false;
                        break;
                    }
                    $matched[] = $token;
                }
            }
            if ($matchedAll) {
                return new MDTokenMatch($matched, $t);
            }
        }
        return null;
    }

    /**
     * Searches an array of MDToken for a given starting pattern and ending
     * pattern and returns match info about both and the tokens in between.
     *
     * If `contentValidator` is specified, it will be called with the content
     * tokens of a potential match. If the validator returns `true`, the result
     * will be accepted and returned by this method. If the validator returns
     * `false`, this method will keep looking for another matching pair. If no
     * validator is given the first match will be returned regardless of
     * content. A pair with no tokens between start and end is never accepted.
     *
     * If a match is found, a `MDPairedTokenMatch` is returned with details
     * of the opening tokens, closing tokens, and content tokens between.
     * Otherwise `null` is returned.
     *
     * @param MDToken[] $tokensToSearch array of `MDToken` to search in
     * @param MDTokenType[] $startPattern array of `MDTokenType` to find first
     * @param MDTokenType[] $endPattern array of `MDTokenType` to find positioned after `startPattern`
     * @param ?callable $contentValidator optional validator function. If provided, will be passed an array of inner `MDToken`, and the function can return `true` to accept the contents or `false` to keep searching
     * @param int $startIndex token index where searching should begin
     * @return ?MDPairedTokenMatch match, or `null`
     */
    public static function findPairedTokens(
        array $tokensToSearch,
        array $startPattern,
        array $endPattern,
        ?callable $contentValidator = null,
        int $startIndex = 0,
    ): ?MDPairedTokenMatch {
        $tokenCount = count($tokensToSearch);
        for ($s = $startIndex; $s < $tokenCount; $s++) {
            $startMatch = self::findFirstTokens($tokensToSearch, $startPattern, $s);
            if ($startMatch === null) {
                return null;
            }
            $contentStart = $startMatch->index + count($startMatch->tokens);
            $endStart = $contentStart;
            while ($endStart < $tokenCount) {
                $endMatch = self::findFirstTokens($tokensToSearch, $endPattern, $endStart);
                if ($endMatch === null) {
                    break;
                }
                $contents = array_slice($tokensToSearch, $contentStart, $endMatch->index - $contentStart);
                if ($contents !== [] && ($contentValidator === null || $contentValidator($contents))) {
                    return new MDPairedTokenMatch(
                        $startMatch->tokens,
                        $contents,
                        $endMatch->tokens,
                        $startMatch->index,
                        $contentStart,
                        $endMatch->index,
                        $endMatch->index + count($endMatch->tokens) - $startMatch->index,
                    );
                }
                // Contents empty or rejected. Try the next end match.
                $endStart = $endMatch->index + 1;
            }
            // No acceptable end match. Resume after this start match (the
            // loop's own increment moves one past it).
            $s = $startMatch->index;
        }
        return null;
    }

    /**
     * Tests whether `$other` is an `MDToken` with the same original text,
     * type, content, extra, tag and modifier.
     *
     * NOTE(review): `tag` is compared by identity while `modifier` is compared
     * by loose value equality, as in the original code — confirm whether that
     * asymmetry is intended.
     *
     * @param mixed $other value to compare against
     * @return bool
     */
    public function equals($other)
    {
        if (!($other instanceof MDToken)) {
            return false;
        }
        return $other->original === $this->original
            && $other->type === $this->type
            && $other->content === $this->content
            && $other->extra === $this->extra
            && $other->tag === $this->tag
            && $other->modifier == $this->modifier;
    }
}
-
// Placeholder declarations: each class below is an empty stub with no
// members. MDHTMLTag and MDTagModifier are already referenced as property
// and parameter types by MDToken.

class MDState
{
}

class MDHTMLFilter
{
}

class MDHTMLTag
{
}

class MDTagModifier
{
}
-
-
- // -- Readers ---------------------------------------------------------------
-
-
// Placeholder declarations: every reader below is an empty stub with no
// members; only the inheritance hierarchy rooted at MDReader is defined.

class MDReader
{
}

class MDUnderlinedHeadingReader extends MDReader
{
}

class MDHashHeadingReader extends MDReader
{
}

class MDSubtextReader extends MDReader
{
}

class MDBlockQuoteReader extends MDReader
{
}

// Shared parent of the two list readers.
class _MDListReader extends MDReader
{
}

class MDUnorderedListReader extends _MDListReader
{
}

class MDOrderedListReader extends _MDListReader
{
}

class MDFencedCodeBlockReader extends MDReader
{
}

class MDIndentedCodeBlockReader extends MDReader
{
}

class MDHorizontalRuleReader extends MDReader
{
}

class MDTableReader extends MDReader
{
}

class MDDefinitionListReader extends MDReader
{
}

class MDFootnoteReader extends MDReader
{
}

class MDAbbreviationReader extends MDReader
{
}

class MDParagraphReader extends MDReader
{
}

// Shared parent of the inline readers that follow.
class MDSimplePairInlineReader extends MDReader
{
}

class MDEmphasisReader extends MDSimplePairInlineReader
{
}

class MDStrongReader extends MDSimplePairInlineReader
{
}

class MDStrikethroughReader extends MDSimplePairInlineReader
{
}

class MDUnderlineReader extends MDSimplePairInlineReader
{
}

class MDHighlightReader extends MDSimplePairInlineReader
{
}

class MDCodeSpanReader extends MDSimplePairInlineReader
{
}

class MDSubscriptReader extends MDSimplePairInlineReader
{
}

class MDSuperscriptReader extends MDSimplePairInlineReader
{
}

class MDLinkReader extends MDReader
{
}

class MDReferencedLinkReader extends MDLinkReader
{
}

class MDImageReader extends MDLinkReader
{
}

class MDReferencedImageReader extends MDReferencedLinkReader
{
}

class MDLineBreakReader extends MDReader
{
}

class MDHTMLTagReader extends MDReader
{
}

class MDModifierReader extends MDReader
{
}
-
-
- // -- Nodes -----------------------------------------------------------------
-
-
// Placeholder declarations: every node below is an empty stub with no
// members; only the inheritance hierarchy rooted at MDNode is defined.

class MDNode
{
}

// Block-level nodes.

class MDBlockNode extends MDNode
{
}

class MDParagraphNode extends MDBlockNode
{
}

class MDHeadingNode extends MDBlockNode
{
}

class MDSubtextNode extends MDBlockNode
{
}

class MDHorizontalRuleNode extends MDBlockNode
{
}

class MDBlockquoteNode extends MDBlockNode
{
}

class MDUnorderedListNode extends MDBlockNode
{
}

class MDOrderedListNode extends MDBlockNode
{
}

class MDListItemNode extends MDBlockNode
{
}

class MDCodeBlockNode extends MDBlockNode
{
}

class MDTableNode extends MDBlockNode
{
}

class MDTableRowNode extends MDBlockNode
{
}

class MDTableCellNode extends MDBlockNode
{
}

class MDTableHeaderCellNode extends MDBlockNode
{
}

class MDDefinitionListNode extends MDBlockNode
{
}

class MDDefinitionListTermNode extends MDBlockNode
{
}

class MDDefinitionListDefinitionNode extends MDBlockNode
{
}

class MDFootnoteListNode extends MDBlockNode
{
}

// Inline nodes.

class MDInlineNode extends MDNode
{
}

class MDTextNode extends MDInlineNode
{
}

class MDObfuscatedTextNode extends MDTextNode
{
}

class MDEmphasisNode extends MDInlineNode
{
}

class MDStrongNode extends MDInlineNode
{
}

class MDStrikethroughNode extends MDInlineNode
{
}

class MDUnderlineNode extends MDInlineNode
{
}

class MDHighlightNode extends MDInlineNode
{
}

class MDSuperscriptNode extends MDInlineNode
{
}

class MDSubscriptNode extends MDInlineNode
{
}

class MDCodeNode extends MDInlineNode
{
}

class MDFootnoteNode extends MDInlineNode
{
}

class MDLinkNode extends MDInlineNode
{
}

class MDReferencedLinkNode extends MDLinkNode
{
}

class MDImageNode extends MDInlineNode
{
}

class MDReferencedImageNode extends MDImageNode
{
}

class MDAbbreviationNode extends MDInlineNode
{
}

class MDLineBreakNode extends MDInlineNode
{
}

class MDHTMLTagNode extends MDInlineNode
{
}
-
-
- // -- Main class ------------------------------------------------------------
-
-
// Placeholder declaration: empty stub with no members yet.
class Markdown
{
}
- ?>
|