<?php
// ialbert / www-lib-markdown
declare(strict_types=1);

/**
 * Static string and array helpers shared by the markdown parser: base regex
 * fragments for URLs and email addresses, HTML entity obfuscation, indent
 * handling, and blank-line checks.
 */
class MDUtils {
	// Modified from https://urlregex.com/ to remove capture groups. Matches fully qualified URLs only.
	public static $baseURLRegex = '(?:(?:(?:[a-z]{3,9}:(?:\\/\\/)?)(?:[\\-;:&=\\+\\$,\\w]+@)?[a-z0-9\\.\\-]+|(?:www\\.|[\\-;:&=\\+\\$,\\w]+@)[a-z0-9\\.\\-]+)(?:(?:\\/[\\+~%\\/\\.\\w\\-_]*)?\\??(?:[\\-\\+=&;%@\\.\\w_]*)#?(?:[\\.\\!\\/\\\\\\w]*))?)';
	// Modified from https://emailregex.com/ to remove capture groups.
	public static $baseEmailRegex = '(?:(?:[^<>()\\[\\]\\\\.,;:\\s@"]+(?:\\.[^<>()\\[\\]\\\\.,;:\\s@"]+)*)|(?:".+"))@(?:(?:\\[[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}])|(?:(?:[a-z\\-0-9]+\\.)+[a-z]{2,}))';

	/**
	 * Encodes every character as a decimal HTML numeric entity (`&#NNN;`) to
	 * make it marginally more difficult for web scrapers to grab sensitive
	 * info. If `text` starts with `mailto:`, that prefix is kept verbatim and
	 * only the remainder is obfuscated.
	 *
	 * @param string $text  plain text to obfuscate
	 * @return string HTML consisting solely of numeric entities (plus an
	 *   optional literal `mailto:` prefix)
	 */
	public static function escapeObfuscated(string $text): string {
		if (str_starts_with($text, 'mailto:')) {
			return 'mailto:' . self::escapeObfuscated(mb_substr($text, 7));
		}
		$html = '';
		foreach (mb_str_split($text) as $ch) {
			// Explicit concatenation: "{{$cp}}" inside a double-quoted string
			// would emit literal braces instead of a valid entity.
			$html .= '&#' . mb_ord($ch) . ';';
		}
		return $html;
	}

	/**
	 * Removes illegal characters (whitespace, `/`, `>`, quotes, `=`) from an
	 * HTML attribute name.
	 *
	 * @param string $name  candidate attribute name
	 * @return string the name with all illegal characters removed
	 */
	public static function scrubAttributeName(string $name): string {
		return mb_ereg_replace('[\\t\\n\\f \\/>"\'=]+', '', $name);
	}

	/**
	 * Strips one or more leading indents from a line or lines of markdown. An
	 * indent is defined as 4 spaces or one tab. Incomplete indents (i.e. 1-3
	 * spaces) are treated like one indent level.
	 *
	 * @param string|string[] $line  a single line or an array of lines
	 * @param int $levels  number of indent levels to remove; values below 1
	 *   leave the input unchanged
	 * @return string|string[] same shape as `$line`, with indents removed
	 */
	public static function stripIndent(string|array $line, int $levels=1): string|array {
		if ($levels < 1) return $line;
		// The doubled braces yield a literal quantifier, e.g. `{2}`.
		$regex = "^(?: {1,4}|\\t){{$levels}}";
		$strip = fn(string $l): string => mb_ereg_replace($regex, '', $l);
		return is_array($line) ? array_map($strip, $line) : $strip($line);
	}

	/**
	 * Counts the number of indent levels in a line of text. Partial indents
	 * (1 to 3 spaces) are counted as one indent level unless `fullIndentsOnly`
	 * is `true`.
	 *
	 * @param string $line  line to inspect
	 * @param bool $fullIndentsOnly  when `true`, only runs of exactly 4 spaces
	 *   (or a tab) count as an indent
	 * @return int number of leading indent levels
	 */
	public static function countIndents(string $line, bool $fullIndentsOnly=false): int {
		// normalize indents to tabs
		$indentPattern = $fullIndentsOnly ? "(?: {4}|\\t)" : "(?: {1,4}|\\t)";
		$t = mb_ereg_replace($indentPattern, "\t", $line);
		// remove content after indent, keeping only the leading tabs
		$t = mb_ereg_replace("^(\\t*)(.*?)$", "\\1", $t);
		// count tabs
		return mb_strlen($t);
	}

	/**
	 * Returns a copy of an array without any whitespace-only lines at the end.
	 *
	 * @param string[] $lines
	 * @return string[]
	 */
	public static function withoutTrailingBlankLines(array $lines): array {
		$stripped = $lines;
		while ($stripped !== [] && self::isBlankLine($stripped[array_key_last($stripped)])) {
			array_pop($stripped);
		}
		return $stripped;
	}

	/**
	 * Tests if an array of lines contains at least one blank. A blank line
	 * can contain whitespace.
	 *
	 * @param string[] $lines
	 * @return bool `true` if any line is empty or whitespace-only
	 */
	public static function containsBlankLine(array $lines): bool {
		foreach ($lines as $line) {
			if (self::isBlankLine($line)) return true;
		}
		return false;
	}

	/**
	 * Tests whether two associative arrays hold the same key/value pairs.
	 * Values are compared by their string representation (the semantics of
	 * `array_diff_assoc`).
	 *
	 * @param array $a
	 * @param array $b
	 * @return bool `true` if both arrays have identical keys and values
	 */
	public static function equalAssocArrays(array $a, array $b): bool {
		// array_diff_assoc only reports entries of $a that are missing from
		// $b, so the size check is needed to reject $b having extra keys.
		return count($a) === count($b) && empty(array_diff_assoc($a, $b));
	}

	/**
	 * Whether a line is empty or consists only of (Unicode) whitespace.
	 * Uses PCRE rather than `mb_trim`, which is only available in PHP 8.4+.
	 */
	private static function isBlankLine(string $line): bool {
		return preg_match('/\\A\\s*\\z/u', $line) === 1;
	}
}

/**
 * Token type enum for `MDToken`. A pure (non-backed) enum: cases have no
 * scalar value, so use `->name` when a printable label is needed.
 */
enum MDTokenType {
	case Text;
	/**
	 * Only used for the leading and trailing whitespace around a run of text,
	 * not every single whitespace character.
	 */
	case Whitespace;

	// Single punctuation characters that may act as inline markup delimiters.
	case Underscore;
	case Asterisk;
	case Slash;
	case Tilde;
	case Bang;
	case Backtick;
	case Equal;
	case Caret;

	// Compound tokens. The trailing notes say which `MDToken` fields carry
	// the payload for each type.
	case Label; // content=label
	case URL; // content=URL, extra=title
	case Email; // content=email address, extra=title
	case SimpleLink; // content=URL
	case SimpleEmail; // content=email address
	case Footnote; // content=symbol
	case Modifier; // modifier=MDTagModifier

	case HTMLTag; // tag=MDHTMLTag

	/** Wildcard for `MDToken::findFirstTokens`: matches any element that is not a whitespace token. */
	case META_AnyNonWhitespace;
	/** Wildcard for `MDToken::findFirstTokens`: matches zero or one whitespace token. */
	case META_OptionalWhitespace;
}

/**
 * Search results from `MDToken::findFirstTokens`.
 */
class MDTokenMatch {
	/**
	 * @param (MDToken|MDNode)[] $tokens  the matched elements, in input order.
	 *   May hold fewer elements than the searched pattern when optional
	 *   whitespace was absent.
	 * @param int $index  index in the searched array where the match begins
	 */
	public function __construct(
		public array $tokens,
		public int $index,
	) {}
}

/**
 * Search results from `MDToken::findPairedTokens`.
 */
class MDPairedTokenMatch {
	/**
	 * @param MDToken[] $startTokens  tokens that matched the start pattern
	 * @param MDToken[] $contentTokens  tokens between the start and end matches
	 * @param MDToken[] $endTokens  tokens that matched the end pattern
	 * @param int $startIndex  index of the first start token in the searched array
	 * @param int $contentIndex  index of the first content token
	 * @param int $endIndex  index of the first end token
	 * @param int $totalLength  number of array elements spanned from the first
	 *   start token through the last end token
	 */
	public function __construct(
		public array $startTokens,
		public array $contentTokens,
		public array $endTokens,
		public int $startIndex,
		public int $contentIndex,
		public int $endIndex,
		public int $totalLength,
	) {}
}

/**
 * One lexical unit in inline markdown syntax parsing.
 */
class MDToken {
	/**
	 * The original verbatim token string. Required as a plaintext fallback if
	 * the token remains unresolved.
	 */
	public string $original;
	public MDTokenType $type;
	public ?string $content = null;
	public ?string $extra = null;
	public ?MDHTMLTag $tag = null;
	public ?MDTagModifier $modifier = null;

	private static $urlWithTitleRegex = '^\\((\\S+?)\\s+"(.*?)"\\)';  // 1=URL, 2=title
	private static $urlRegex = '^\\((\\S+?)\\)';  // 1=URL

	/**
	 * Creates a token. The `$content` argument is routed by its type: a
	 * `MDTagModifier` is stored in `modifier`, a `MDHTMLTag` in `tag`, and a
	 * string (or `null`) in `content`.
	 *
	 * @param string $original  verbatim token string
	 * @param MDTokenType $type  token type
	 * @param string|MDTagModifier|MDHTMLTag|null $content  primary content of the token
	 * @param string|null $extra  additional content
	 */
	public function __construct(string $original, MDTokenType $type,
			string|MDTagModifier|MDHTMLTag|null $content=null,
			?string $extra=null) {
		$this->original = $original;
		$this->type = $type;
		if ($content instanceof MDTagModifier) {
			$this->modifier = $content;
		} elseif ($content instanceof MDHTMLTag) {
			$this->tag = $content;
		} else {
			$this->content = $content;
		}
		$this->extra = $extra;
	}

	/**
	 * Debug representation of the form `(ClassName type=CaseName content=...)`.
	 */
	public function __toString(): string {
		$classname = static::class;
		// MDTokenType is a pure enum and cannot be interpolated directly;
		// use the case name instead.
		return "({$classname} type={$this->type->name} content={$this->content})";
	}

	/**
	 * Attempts to parse a label token from the beginning of `line`. A label is
	 * of the form `[content]`. Nested bracket pairs are allowed inside the
	 * content, a backslash skips the character after it, and an unbalanced
	 * `)` aborts the parse. If found, returns an array:
	 * - `0`: the entire label including brackets
	 * - `1`: the content of the label
	 *
	 * @param string $line
	 * @return ?string[] match groups or null if not found
	 */
	public static function tokenizeLabel(string $line): ?array {
		if (!str_starts_with($line, '[')) return null;
		// Split once so the scan is linear instead of calling mb_substr per character.
		$chars = mb_str_split($line);
		$length = count($chars);
		$parenCount = 0;
		$bracketCount = 0;
		for ($p = 1; $p < $length; $p++) {
			$ch = $chars[$p];
			if ($ch === '\\') {
				// Skip the escaped character.
				$p++;
			} elseif ($ch === '(') {
				$parenCount++;
			} elseif ($ch === ')') {
				$parenCount--;
				if ($parenCount < 0) return null;
			} elseif ($ch === '[') {
				$bracketCount++;
			} elseif ($ch === ']') {
				if ($bracketCount > 0) {
					$bracketCount--;
				} else {
					// $p is the index of the closing bracket, so the content
					// is the $p - 1 characters between the two brackets.
					return [
						implode('', array_slice($chars, 0, $p + 1)),
						implode('', array_slice($chars, 1, $p - 1)),
					];
				}
			}
		}
		return null;
	}

	/**
	 * Attempts to parse a URL token from the beginning of `line`. A URL token
	 * is of the form `(url)` or `(url "title")`. Input that also parses as an
	 * email token (see `tokenizeEmail`) is rejected. If found, returns an array:
	 * - `0`: the entire URL token including parentheses
	 * - `1`: the URL
	 * - `2`: the optional title, or `null`
	 *
	 * @param string $line
	 * @return ?array token tuple, or `null` if not found
	 */
	public static function tokenizeURL(string $line): ?array {
		$groups = [];
		if (mb_eregi(self::$urlWithTitleRegex, $line, $groups)) {
			$result = [ $groups[0], $groups[1], $groups[2] ];
		} elseif (mb_eregi(self::$urlRegex, $line, $groups)) {
			$result = [ $groups[0], $groups[1], null ];
		} else {
			return null;
		}
		// make sure it's not better described as an email address
		return self::tokenizeEmail($line) === null ? $result : null;
	}

	/**
	 * Attempts to parse an email address from the beginning of `line`. An
	 * email address is of the form `(user@example.com)` or
	 * `(user@example.com "link title")`. If found, returns an array:
	 * - `0`: the entire token including parentheses
	 * - `1`: the email address
	 * - `2`: the optional link title, or `null`
	 *
	 * @param string $line
	 * @return ?array token tuple, or `null` if not found
	 */
	public static function tokenizeEmail(string $line): ?array {
		$groups = [];
		if (mb_eregi("^\\(\\s*(" . MDUtils::$baseEmailRegex . ")\\s+\"(.*?)\"\\s*\\)",
				$line, $groups)) {
			return [ $groups[0], $groups[1], $groups[2] ];
		}
		if (mb_eregi("^\\(\\s*(" . MDUtils::$baseEmailRegex . ")\\s*\\)", $line, $groups)) {
			return [ $groups[0], $groups[1], null ];
		}
		return null;
	}

	/**
	 * Searches an array of `MDToken` for the given pattern of `MDTokenType`s.
	 * If found, returns a `MDTokenMatch`, otherwise `null`.
	 *
	 * `META_AnyNonWhitespace` and `META_OptionalWhitespace` are supported as
	 * wildcard pattern elements. Note that `META_OptionalWhitespace` may give
	 * a result with a variable number of tokens, and it matches nothing when
	 * the input ends before it.
	 *
	 * @param (MDToken|MDNode)[] $tokensToSearch - mixed array of `MDToken` and
	 *   `MDNode` elements
	 * @param MDTokenType[] $pattern - contiguous run of token types to find
	 * @param int $startIndex - token index to begin searching (defaults to 0)
	 * @return ?MDTokenMatch match object, or `null` if not found
	 */
	public static function findFirstTokens(array $tokensToSearch, array $pattern, int $startIndex=0): ?MDTokenMatch {
		$tokenCount = count($tokensToSearch);
		$patternLength = count($pattern);
		for ($t = max(0, $startIndex); $t < $tokenCount; $t++) {
			$matchedAll = true;
			$matched = [];
			// Becomes negative each time an optional whitespace element
			// consumes no token, keeping pattern and token positions aligned.
			$patternOffset = 0;
			for ($p = 0; $p < $patternLength; $p++) {
				$elem = $pattern[$p];
				$t0 = $t + $p + $patternOffset;
				if ($t0 >= $tokenCount) {
					if ($elem === MDTokenType::META_OptionalWhitespace) {
						// Optional element at end of input: matches nothing.
						$patternOffset--;
						continue;
					}
					// A required element cannot be satisfied here or at any later start.
					return null;
				}
				$token = $tokensToSearch[$t0];
				$isWhitespace = $token instanceof MDToken && $token->type === MDTokenType::Whitespace;
				if ($elem === MDTokenType::META_OptionalWhitespace) {
					if ($isWhitespace) {
						$matched[] = $token;
					} else {
						$patternOffset--;
					}
				} elseif ($elem === MDTokenType::META_AnyNonWhitespace) {
					if ($isWhitespace) {
						$matchedAll = false;
						break;
					}
					$matched[] = $token;
				} else {
					if (!($token instanceof MDToken) || $token->type !== $elem) {
						$matchedAll = false;
						break;
					}
					$matched[] = $token;
				}
			}
			if ($matchedAll) {
				return new MDTokenMatch($matched, $t);
			}
		}
		return null;
	}

	/**
	 * Searches an array of MDToken for a given starting pattern and ending
	 * pattern and returns match info about both and the tokens in between.
	 *
	 * If `contentValidator` is specified, it will be called with the content
	 * tokens of a potential match. If the validator returns `true`, the result
	 * will be accepted and returned by this method. If the validator returns
	 * `false`, this method will keep looking for another matching pair. If no
	 * validator is given the first match will be returned regardless of content.
	 * Pairs with no tokens between them are never accepted.
	 *
	 * If a match is found, a `MDPairedTokenMatch` is returned with details
	 * of the opening tokens, closing tokens, and content tokens between. Otherwise
	 * `null` is returned.
	 *
	 * @param MDToken[] $tokensToSearch - array of `MDToken` to search in
	 * @param MDTokenType[] $startPattern - array of `MDTokenType` to find first
	 * @param MDTokenType[] $endPattern - array of `MDTokenType` to find positioned after `startPattern`
	 * @param ?callable $contentValidator - optional validator function. If provided, will be passed an array of inner `MDToken`, and the function can return `true` to accept the contents or `false` to keep searching
	 * @param int $startIndex - token index where searching should begin
	 * @return ?MDPairedTokenMatch match, or `null`
	 */
	public static function findPairedTokens(array $tokensToSearch,
			array $startPattern, array $endPattern, ?callable $contentValidator=null,
			int $startIndex=0): ?MDPairedTokenMatch {
		$tokenCount = count($tokensToSearch);
		for ($s = $startIndex; $s < $tokenCount; $s++) {
			$startMatch = self::findFirstTokens($tokensToSearch, $startPattern, $s);
			if ($startMatch === null) return null;
			$contentStart = $startMatch->index + count($startMatch->tokens);
			$endStart = $contentStart;
			while ($endStart < $tokenCount) {
				$endMatch = self::findFirstTokens($tokensToSearch, $endPattern, $endStart);
				if ($endMatch === null) break;
				$contentLength = $endMatch->index - $contentStart;
				$contents = array_slice($tokensToSearch, $contentStart, $contentLength);
				if (count($contents) > 0 && ($contentValidator === null || $contentValidator($contents))) {
					return new MDPairedTokenMatch($startMatch->tokens,
						$contents,
						$endMatch->tokens,
						$startMatch->index,
						$contentStart,
						$endMatch->index,
						$endMatch->index + count($endMatch->tokens) - $startMatch->index);
				}
				// Contents rejected. Try next end match.
				$endStart = $endMatch->index + 1;
			}
			// No end matches. Resume after this start match (the loop's $s++ steps past it).
			$s = $startMatch->index;
		}
		return null;
	}

	/**
	 * Compares this token with another value field by field.
	 *
	 * NOTE(review): `tag` is compared by identity (`!==`) while `modifier` is
	 * compared loosely (`!=`, i.e. by property values). This is preserved
	 * as-is; confirm which comparison is intended for each.
	 *
	 * @param mixed $other  value to compare against
	 * @return bool `true` if `$other` is a `MDToken` with equal fields
	 */
	public function equals($other) {
		if (!($other instanceof MDToken)) return false;
		if ($other->original !== $this->original) return false;
		if ($other->type != $this->type) return false;
		if ($other->content !== $this->content) return false;
		if ($other->extra !== $this->extra) return false;
		if ($other->tag !== $this->tag) return false;
		if ($other->modifier != $this->modifier) return false;
		return true;
	}
}

// The four classes below are declared with empty bodies in this file.

class MDState {}

class MDHTMLFilter {}

/** Payload type accepted by `MDToken` and stored in its `tag` property. */
class MDHTMLTag {}

/** Payload type accepted by `MDToken` and stored in its `modifier` property. */
class MDTagModifier {}


// -- Readers ---------------------------------------------------------------


/**
 * Root of the reader class hierarchy. Every reader class in this section
 * extends it directly or indirectly; all are declared with empty bodies here.
 */
class MDReader {}

class MDUnderlinedHeadingReader extends MDReader {}

class MDHashHeadingReader extends MDReader {}

class MDSubtextReader extends MDReader {}

class MDBlockQuoteReader extends MDReader {}

/** Common parent of the unordered and ordered list readers. */
class _MDListReader extends MDReader {}

class MDUnorderedListReader extends _MDListReader {}

class MDOrderedListReader extends _MDListReader {}

class MDFencedCodeBlockReader extends MDReader {}

class MDIndentedCodeBlockReader extends MDReader {}

class MDHorizontalRuleReader extends MDReader {}

class MDTableReader extends MDReader {}

class MDDefinitionListReader extends MDReader {}

class MDFootnoteReader extends MDReader {}

class MDAbbreviationReader extends MDReader {}

class MDParagraphReader extends MDReader {}

/** Common parent of the inline readers declared below, up to `MDSuperscriptReader`. */
class MDSimplePairInlineReader extends MDReader {}

class MDEmphasisReader extends MDSimplePairInlineReader {}

class MDStrongReader extends MDSimplePairInlineReader {}

class MDStrikethroughReader extends MDSimplePairInlineReader {}

class MDUnderlineReader extends MDSimplePairInlineReader {}

class MDHighlightReader extends MDSimplePairInlineReader {}

class MDCodeSpanReader extends MDSimplePairInlineReader {}

class MDSubscriptReader extends MDSimplePairInlineReader {}

class MDSuperscriptReader extends MDSimplePairInlineReader {}

/** Common parent of the referenced-link and image readers. */
class MDLinkReader extends MDReader {}

class MDReferencedLinkReader extends MDLinkReader {}

class MDImageReader extends MDLinkReader {}

// Note: extends the referenced-link reader, not MDImageReader.
class MDReferencedImageReader extends MDReferencedLinkReader {}

class MDLineBreakReader extends MDReader {}

class MDHTMLTagReader extends MDReader {}

class MDModifierReader extends MDReader {}


// -- Nodes -----------------------------------------------------------------


/**
 * Root of the node class hierarchy. Node classes in this section derive from
 * it via `MDBlockNode` or `MDInlineNode`; all are declared with empty bodies
 * here.
 */
class MDNode {}

/** Common parent of the block-level node classes below. */
class MDBlockNode extends MDNode {}

class MDParagraphNode extends MDBlockNode {}

class MDHeadingNode extends MDBlockNode {}

class MDSubtextNode extends MDBlockNode {}

class MDHorizontalRuleNode extends MDBlockNode {}

class MDBlockquoteNode extends MDBlockNode {}

class MDUnorderedListNode extends MDBlockNode {}

class MDOrderedListNode extends MDBlockNode {}

class MDListItemNode extends MDBlockNode {}

class MDCodeBlockNode extends MDBlockNode {}

class MDTableNode extends MDBlockNode {}

class MDTableRowNode extends MDBlockNode {}

class MDTableCellNode extends MDBlockNode {}

// Note: extends MDBlockNode directly, not MDTableCellNode.
class MDTableHeaderCellNode extends MDBlockNode {}

class MDDefinitionListNode extends MDBlockNode {}

class MDDefinitionListTermNode extends MDBlockNode {}

class MDDefinitionListDefinitionNode extends MDBlockNode {}

class MDFootnoteListNode extends MDBlockNode {}

/** Common parent of the inline node classes below. */
class MDInlineNode extends MDNode {}

class MDTextNode extends MDInlineNode {}

class MDObfuscatedTextNode extends MDTextNode {}

class MDEmphasisNode extends MDInlineNode {}

class MDStrongNode extends MDInlineNode {}

class MDStrikethroughNode extends MDInlineNode {}

class MDUnderlineNode extends MDInlineNode {}

class MDHighlightNode extends MDInlineNode {}

class MDSuperscriptNode extends MDInlineNode {}

class MDSubscriptNode extends MDInlineNode {}

class MDCodeNode extends MDInlineNode {}

class MDFootnoteNode extends MDInlineNode {}

class MDLinkNode extends MDInlineNode {}

class MDReferencedLinkNode extends MDLinkNode {}

// Note: extends MDInlineNode directly, not MDLinkNode.
class MDImageNode extends MDInlineNode {}

class MDReferencedImageNode extends MDImageNode {}

class MDAbbreviationNode extends MDInlineNode {}

class MDLineBreakNode extends MDInlineNode {}

class MDHTMLTagNode extends MDInlineNode {}


// -- Main class ------------------------------------------------------------


/** Declared with an empty body in this file. */
class Markdown {}
?>