PHP and Javascript implementations of a simple markdown parser
您最多选择25个主题 主题必须以字母或数字开头,可以包含连字符 (-),并且长度不得超过35个字符

markdown.php 17KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581
  1. <?php
  2. declare(strict_types=1);
  /**
   * Static helper routines shared by the markdown parser: HTML entity
   * obfuscation, attribute name scrubbing, indent handling, and blank line
   * detection.
   */
  class MDUtils {
      // Modified from https://urlregex.com/ to remove capture groups. Matches fully qualified URLs only.
      public static $baseURLRegex = '(?:(?:(?:[a-z]{3,9}:(?:\\/\\/)?)(?:[\\-;:&=\\+\\$,\\w]+@)?[a-z0-9\\.\\-]+|(?:www\\.|[\\-;:&=\\+\\$,\\w]+@)[a-z0-9\\.\\-]+)(?:(?:\\/[\\+~%\\/\\.\\w\\-_]*)?\\??(?:[\\-\\+=&;%@\\.\\w_]*)#?(?:[\\.\\!\\/\\\\\\w]*))?)';

      // Modified from https://emailregex.com/ to remove capture groups.
      public static $baseEmailRegex = '(?:(?:[^<>()\\[\\]\\\\.,;:\\s@"]+(?:\\.[^<>()\\[\\]\\\\.,;:\\s@"]+)*)|(?:".+"))@(?:(?:\\[[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}])|(?:(?:[a-z\\-0-9]+\\.)+[a-z]{2,}))';

      /**
       * Encodes characters as HTML numeric entities to make it marginally more
       * difficult for web scrapers to grab sensitive info. If `text` starts with
       * `mailto:` only the email address following it will be obfuscated.
       *
       * Every character is written as `&#N;` where `N` is its decimal code
       * point.
       *
       * @param string $text text to obfuscate
       * @return string HTML numeric entity encoding of `text`
       */
      public static function escapeObfuscated(string $text): string {
          if (str_starts_with($text, 'mailto:')) {
              // Keep the scheme readable; obfuscate only the address after it.
              return 'mailto:' . self::escapeObfuscated(mb_substr($text, 7));
          }
          $html = '';
          foreach (mb_str_split($text) as $char) {
              $html .= '&#' . mb_ord($char) . ';';
          }
          return $html;
      }

      /**
       * Removes illegal characters from an HTML attribute name.
       *
       * Tabs, newlines, form feeds, spaces, `/`, `>`, quotes, and `=` are
       * dropped.
       *
       * @param string $name raw attribute name
       * @return string attribute name without the illegal characters
       */
      public static function scrubAttributeName(string $name): string {
          return mb_ereg_replace('[\\t\\n\\f \\/>"\'=]+', '', $name);
      }

      /**
       * Strips one or more leading indents from a line or lines of markdown. An
       * indent is defined as 4 spaces or one tab. Incomplete indents (i.e. 1-3
       * spaces) are treated like one indent level.
       *
       * @param string|string[] $line one line, or an array of lines
       * @param int $levels number of indent levels to remove
       * @return string|string[] same shape as `$line`, with indents removed
       */
      public static function stripIndent(string|array $line, int $levels=1): string|array {
          // Quantifier is built by concatenation so the regex braces are not
          // confused with PHP's "{$var}" interpolation syntax.
          $regex = '^(?: {1,4}|\\t){' . $levels . '}';
          if (is_array($line)) {
              return array_map(
                  fn(string $l): string => mb_ereg_replace($regex, '', $l),
                  $line
              );
          }
          return mb_ereg_replace($regex, '', $line);
      }

      /**
       * Counts the number of indent levels in a line of text. Partial indents
       * (1 to 3 spaces) are counted as one indent level unless `fullIndentsOnly`
       * is `true`.
       *
       * @param string $line line to inspect
       * @param bool $fullIndentsOnly when `true`, only 4 spaces or a tab count
       * @return int number of leading indent levels
       */
      public static function countIndents(string $line, bool $fullIndentsOnly=false): int {
          // Normalize indents to tabs. Space runs inside the content are
          // converted as well, but they are thrown away in the next step.
          $indentPattern = $fullIndentsOnly ? '(?: {4}|\\t)' : '(?: {1,4}|\\t)';
          $normalized = mb_ereg_replace($indentPattern, "\t", $line);
          // Keep only the leading run of tabs.
          $leadingTabs = mb_ereg_replace('^(\\t*)(.*?)$', '\\1', $normalized);
          // One tab per indent level.
          return mb_strlen($leadingTabs);
      }

      /**
       * Returns a copy of an array without any whitespace-only lines at the end.
       *
       * @param string[] $lines
       * @return string[]
       */
      public static function withoutTrailingBlankLines(array $lines): array {
          $stripped = $lines;
          // mb_trim() returns a string, so compare against '' rather than
          // taking its size.
          while (count($stripped) > 0 && mb_trim($stripped[count($stripped) - 1]) === '') {
              array_pop($stripped);
          }
          return $stripped;
      }

      /**
       * Tests if an array of lines contains at least one blank. A blank line
       * can contain whitespace.
       *
       * @param string[] $lines
       * @return bool `true` if any line is empty or whitespace-only
       */
      public static function containsBlankLine(array $lines): bool {
          foreach ($lines as $line) {
              if (mb_trim($line) === '') return true;
          }
          return false;
      }

      /**
       * Tests whether two associative arrays hold the same key/value pairs,
       * regardless of key order.
       *
       * `array_diff_assoc` only reports entries of `$a` that are missing from
       * `$b`, so the element counts are compared as well to catch extra
       * entries in `$b`. Values are compared by their string representation,
       * as `array_diff_assoc` does.
       *
       * @param array $a
       * @param array $b
       * @return bool `true` if both arrays contain the same pairs
       */
      public static function equalAssocArrays(array $a, array $b): bool {
          return count($a) === count($b) && empty(array_diff_assoc($a, $b));
      }
  }
  /**
   * Enumerates the kinds of token an `MDToken` can be.
   *
   * Trailing notes on a case name which `MDToken` field carries its data.
   */
  enum MDTokenType {
      // Plain text and surrounding whitespace

      case Text;

      /**
       * Marks only the whitespace leading or trailing a run of text. Whitespace
       * characters inside the run do not each get their own token.
       */
      case Whitespace;

      // Single punctuation characters

      case Underscore;
      case Asterisk;
      case Slash;
      case Tilde;
      case Bang;
      case Backtick;
      case Equal;
      case Caret;

      // Compound tokens

      case Label;       // content = label
      case URL;         // content = URL, extra = title
      case Email;       // content = email address, extra = title
      case SimpleLink;  // content = URL
      case SimpleEmail; // content = email address
      case Footnote;    // content = symbol
      case Modifier;    // modifier = MDTagModifier
      case HTMLTag;     // tag = MDHTMLTag

      // Search wildcards

      /** Wildcard understood by `MDToken.findFirstTokens`. */
      case META_AnyNonWhitespace;

      /** Wildcard understood by `MDToken.findFirstTokens`. */
      case META_OptionalWhitespace;
  }
  /**
   * Search results from `MDToken.findFirstTokens`.
   */
  class MDTokenMatch {
      /**
       * The elements that matched the search pattern, in order.
       *
       * @var MDToken[]
       */
      public array $tokens;

      /** Index in the searched array where the match begins. */
      public int $index;

      /**
       * @param MDToken[] $tokens matched elements
       * @param int $index index in the searched array where the match begins
       */
      public function __construct(array $tokens, int $index) {
          $this->tokens = $tokens;
          $this->index = $index;
      }
  }
  /**
   * Search results from `MDToken.findPairedTokens`.
   */
  class MDPairedTokenMatch {
      /**
       * Tokens that matched the opening pattern.
       *
       * @var MDToken[]
       */
      public array $startTokens;

      /**
       * Tokens found between the opening and closing patterns.
       *
       * @var MDToken[]
       */
      public array $contentTokens;

      /**
       * Tokens that matched the closing pattern.
       *
       * @var MDToken[]
       */
      public array $endTokens;

      /** Index in the searched array of the first opening token. */
      public int $startIndex;

      /** Index in the searched array of the first content token. */
      public int $contentIndex;

      /** Index in the searched array of the first closing token. */
      public int $endIndex;

      /** Number of elements spanned from the first opening token through the last closing token. */
      public int $totalLength;

      /**
       * @param MDToken[] $startTokens tokens matching the opening pattern
       * @param MDToken[] $contentTokens tokens between the two patterns
       * @param MDToken[] $endTokens tokens matching the closing pattern
       * @param int $startIndex index of the first opening token
       * @param int $contentIndex index of the first content token
       * @param int $endIndex index of the first closing token
       * @param int $totalLength element count of the whole match
       */
      public function __construct(array $startTokens, array $contentTokens,
              array $endTokens, int $startIndex, int $contentIndex,
              int $endIndex, int $totalLength) {
          $this->startTokens = $startTokens;
          $this->contentTokens = $contentTokens;
          $this->endTokens = $endTokens;
          $this->startIndex = $startIndex;
          $this->contentIndex = $contentIndex;
          $this->endIndex = $endIndex;
          $this->totalLength = $totalLength;
      }
  }
  /**
   * One lexical unit in inline markdown syntax parsing.
   */
  class MDToken {
      /**
       * The original verbatim token string. Required as a plaintext fallback if
       * the token remains unresolved.
       */
      public string $original;

      public MDTokenType $type;

      /** Primary string content, when the token carries any. */
      public ?string $content = null;

      /** Additional string content (e.g. a link title). */
      public ?string $extra = null;

      /** Set instead of `content` when the token was built from an `MDHTMLTag`. */
      public ?MDHTMLTag $tag = null;

      /** Set instead of `content` when the token was built from an `MDTagModifier`. */
      public ?MDTagModifier $modifier = null;

      /**
       * Creates a token.
       *
       * The `$content` argument is routed by its type: an `MDTagModifier` is
       * stored in `modifier`, an `MDHTMLTag` in `tag`, anything else in
       * `content`.
       *
       * @param string $original verbatim token string
       * @param MDTokenType $type token type
       * @param string|MDTagModifier|MDHTMLTag|null $content primary content of the token
       * @param string|null $extra additional content
       */
      public function __construct(string $original, MDTokenType $type,
              string|MDTagModifier|MDHTMLTag|null $content=null,
              ?string $extra=null) {
          $this->original = $original;
          $this->type = $type;
          if ($content instanceof MDTagModifier) {
              $this->modifier = $content;
          } elseif ($content instanceof MDHTMLTag) {
              $this->tag = $content;
          } else {
              $this->content = $content;
          }
          $this->extra = $extra;
      }

      /**
       * Returns a short debugging description of the token.
       */
      public function __toString(): string {
          $classname = get_class($this);
          // A pure enum cannot be converted to a string; use its case name.
          return "({$classname} type={$this->type->name} content={$this->content})";
      }

      /**
       * Attempts to parse a label token from the beginning of `line`. A label is
       * of the form `[content]`. If found, returns an array:
       * - `0`: the entire label including brackets
       * - `1`: the content of the label
       *
       * Nested brackets are allowed inside the label, a backslash escapes the
       * character after it, and a `)` without a preceding `(` aborts the parse.
       *
       * @param string $line
       * @return ?string[] match groups or null if not found
       */
      public static function tokenizeLabel(string $line): ?array {
          if (!str_starts_with($line, '[')) return null;
          $parenCount = 0;
          $bracketCount = 0;
          $chars = mb_str_split($line);
          $l = count($chars);
          for ($p = 1; $p < $l; $p++) {
              $ch = $chars[$p];
              if ($ch === '\\') {
                  // Skip the escaped character.
                  $p++;
              } elseif ($ch === '(') {
                  $parenCount++;
              } elseif ($ch === ')') {
                  $parenCount--;
                  if ($parenCount < 0) return null;
              } elseif ($ch === '[') {
                  $bracketCount++;
              } elseif ($ch === ']') {
                  if ($bracketCount > 0) {
                      $bracketCount--;
                  } else {
                      // $p is the index of the closing bracket. mb_substr takes
                      // a length, so the content spans $p - 1 characters.
                      return [ mb_substr($line, 0, $p + 1), mb_substr($line, 1, $p - 1) ];
                  }
              }
          }
          return null;
      }

      private static $urlWithTitleRegex = '^\\((\\S+?)\\s+"(.*?)"\\)'; // 1=URL, 2=title
      private static $urlRegex = '^\\((\\S+?)\\)'; // 1=URL

      /**
       * Attempts to parse a URL token from the beginning of `line`. A URL token
       * is of the form `(url)` or `(url "title")`. If found, returns an array:
       * - `0`: the entire URL token including parentheses
       * - `1`: the URL
       * - `2`: the optional title, or `null`
       *
       * @param string $line
       * @return ?array token tuple, or `null` if not found
       */
      public static function tokenizeURL(string $line): ?array {
          $groups = [];
          if (mb_eregi(self::$urlWithTitleRegex, $line, $groups)) {
              // Make sure it's not better described as an email address.
              if (self::tokenizeEmail($line) !== null) return null;
              return $groups;
          }
          if (mb_eregi(self::$urlRegex, $line, $groups)) {
              if (self::tokenizeEmail($line) !== null) return null;
              return [ $groups[0], $groups[1], null ];
          }
          return null;
      }

      /**
       * Attempts to parse an email address from the beginning of `line`. An
       * email address is of the form `(user@example.com)` or
       * `(user@example.com "link title")`. If found, returns an array:
       * - `0`: the entire token including parentheses
       * - `1`: the email address
       * - `2`: the optional link title, or `null`
       *
       * @param string $line
       * @return ?array token tuple, or `null` if not found
       */
      public static function tokenizeEmail(string $line): ?array {
          $groups = [];
          if (mb_eregi("^\\(\\s*(" . MDUtils::$baseEmailRegex . ")\\s+\"(.*?)\"\\s*\\)",
                  $line, $groups)) {
              return $groups;
          }
          if (mb_eregi("^\\(\\s*(" . MDUtils::$baseEmailRegex . ")\\s*\\)", $line, $groups)) {
              return [ $groups[0], $groups[1], null ];
          }
          return null;
      }

      /**
       * Searches an array of `MDToken` for the given pattern of `MDTokenType`s.
       * If found, returns a `MDTokenMatch`, otherwise `null`.
       *
       * `META_AnyNonWhitespace` and `META_OptionalWhitespace` are supported as
       * special pattern elements. Note that `META_OptionalWhitespace` may give
       * a result with a variable number of tokens.
       *
       * @param (MDToken|MDNode)[] $tokensToSearch mixed array of `MDToken` and
       * `MDNode` elements
       * @param MDTokenType[] $pattern contiguous run of token types to find
       * @param int $startIndex token index to begin searching (defaults to 0)
       * @return ?MDTokenMatch match object, or `null` if not found
       */
      public static function findFirstTokens(array $tokensToSearch, array $pattern, int $startIndex=0): ?MDTokenMatch {
          $tokenCount = sizeof($tokensToSearch);
          $patternLength = sizeof($pattern);
          for ($t = $startIndex; $t < $tokenCount; $t++) {
              $matchedAll = true;
              $matched = [];
              // Goes negative each time an optional whitespace element
              // consumes no token, keeping later elements aligned.
              $patternOffset = 0;
              for ($p = 0; $p < $patternLength; $p++) {
                  $t0 = $t + $p + $patternOffset;
                  // NOTE(review): this gives up as soon as the tokens run out,
                  // even if only META_OptionalWhitespace elements remain in
                  // the pattern. Left as-is to preserve existing behavior.
                  if ($t0 >= $tokenCount) return null;
                  $token = $tokensToSearch[$t0];
                  $elem = $pattern[$p];
                  if ($elem === MDTokenType::META_OptionalWhitespace) {
                      if ($token instanceof MDToken && $token->type === MDTokenType::Whitespace) {
                          array_push($matched, $token);
                      } else {
                          // Nothing consumed; re-test this token against the
                          // next pattern element.
                          $patternOffset--;
                      }
                  } elseif ($elem === MDTokenType::META_AnyNonWhitespace) {
                      if ($token instanceof MDToken && $token->type === MDTokenType::Whitespace) {
                          $matchedAll = false;
                          break;
                      }
                      array_push($matched, $token);
                  } else {
                      if (!($token instanceof MDToken) || $token->type !== $elem) {
                          $matchedAll = false;
                          break;
                      }
                      array_push($matched, $token);
                  }
              }
              if ($matchedAll) {
                  return new MDTokenMatch($matched, $t);
              }
          }
          return null;
      }

      /**
       * Searches an array of MDToken for a given starting pattern and ending
       * pattern and returns match info about both and the tokens in between.
       *
       * If `contentValidator` is specified, it will be called with the content
       * tokens of a potential match. If the validator returns `true`, the result
       * will be accepted and returned by this method. If the validator returns
       * `false`, this method will keep looking for another matching pair. If no
       * validator is given the first match will be returned regardless of content.
       * A pair with no tokens between its start and end is never accepted.
       *
       * If a match is found, a `MDPairedTokenMatch` is returned with details
       * of the opening tokens, closing tokens, and content tokens between. Otherwise
       * `null` is returned.
       *
       * @param MDToken[] $tokensToSearch array of `MDToken` to search in
       * @param MDTokenType[] $startPattern array of `MDTokenType` to find first
       * @param MDTokenType[] $endPattern array of `MDTokenType` to find positioned after `startPattern`
       * @param ?callable $contentValidator optional validator function. If provided, will be passed an array of inner `MDToken`, and the function can return `true` to accept the contents or `false` to keep searching
       * @param int $startIndex token index where searching should begin
       * @return ?MDPairedTokenMatch match, or `null`
       */
      public static function findPairedTokens(array $tokensToSearch,
              array $startPattern, array $endPattern, ?callable $contentValidator=null,
              int $startIndex=0): ?MDPairedTokenMatch {
          $tokenCount = sizeof($tokensToSearch);
          for ($s = $startIndex; $s < $tokenCount; $s++) {
              $startMatch = self::findFirstTokens($tokensToSearch, $startPattern, $s);
              if ($startMatch === null) return null;
              $contentStart = $startMatch->index + sizeof($startMatch->tokens);
              $endStart = $contentStart;
              while ($endStart < $tokenCount) {
                  $endMatch = self::findFirstTokens($tokensToSearch, $endPattern, $endStart);
                  if ($endMatch === null) break;
                  $contentLength = $endMatch->index - $contentStart;
                  $contents = array_slice($tokensToSearch, $contentStart, $contentLength);
                  if (sizeof($contents) > 0 && ($contentValidator === null || $contentValidator($contents))) {
                      return new MDPairedTokenMatch($startMatch->tokens,
                          $contents,
                          $endMatch->tokens,
                          $startMatch->index,
                          $contentStart,
                          $endMatch->index,
                          $endMatch->index + sizeof($endMatch->tokens) - $startMatch->index);
                  }
                  // Contents rejected. Try next end match.
                  $endStart = $endMatch->index + 1;
              }
              // No end matches. Resume after this start match (the loop
              // increment moves past it).
              $s = $startMatch->index;
          }
          return null;
      }

      /**
       * Tests whether `other` is an `MDToken` with the same field values.
       *
       * `tag` is compared by identity; `modifier` is compared with loose
       * equality.
       *
       * @param mixed $other value to compare against
       * @return bool `true` if `other` is an equivalent token
       */
      public function equals(mixed $other): bool {
          if (!($other instanceof MDToken)) return false;
          if ($other->original !== $this->original) return false;
          if ($other->type != $this->type) return false;
          if ($other->content !== $this->content) return false;
          if ($other->extra !== $this->extra) return false;
          if ($other->tag !== $this->tag) return false;
          if ($other->modifier != $this->modifier) return false;
          return true;
      }
  }
  // -- Core support types ----------------------------------------------------
  // Every class in this group is declared here with an empty body.

  class MDState {}

  class MDHTMLFilter {}

  // Type of the `MDToken::$tag` property.
  class MDHTMLTag {}

  // Type of the `MDToken::$modifier` property.
  class MDTagModifier {}
  // -- Readers ---------------------------------------------------------------
  // Every reader below is declared with an empty body; only the inheritance
  // hierarchy is established here.

  // Root of the reader hierarchy.
  class MDReader {}

  // Readers deriving directly from MDReader, plus the list readers that share
  // the _MDListReader base.
  class MDUnderlinedHeadingReader extends MDReader {}
  class MDHashHeadingReader extends MDReader {}
  class MDSubtextReader extends MDReader {}
  class MDBlockQuoteReader extends MDReader {}
  class _MDListReader extends MDReader {}
  class MDUnorderedListReader extends _MDListReader {}
  class MDOrderedListReader extends _MDListReader {}
  class MDFencedCodeBlockReader extends MDReader {}
  class MDIndentedCodeBlockReader extends MDReader {}
  class MDHorizontalRuleReader extends MDReader {}
  class MDTableReader extends MDReader {}
  class MDDefinitionListReader extends MDReader {}
  class MDFootnoteReader extends MDReader {}
  class MDAbbreviationReader extends MDReader {}
  class MDParagraphReader extends MDReader {}

  // Readers sharing the MDSimplePairInlineReader base.
  class MDSimplePairInlineReader extends MDReader {}
  class MDEmphasisReader extends MDSimplePairInlineReader {}
  class MDStrongReader extends MDSimplePairInlineReader {}
  class MDStrikethroughReader extends MDSimplePairInlineReader {}
  class MDUnderlineReader extends MDSimplePairInlineReader {}
  class MDHighlightReader extends MDSimplePairInlineReader {}
  class MDCodeSpanReader extends MDSimplePairInlineReader {}
  class MDSubscriptReader extends MDSimplePairInlineReader {}
  class MDSuperscriptReader extends MDSimplePairInlineReader {}

  // Link and image readers.
  class MDLinkReader extends MDReader {}
  class MDReferencedLinkReader extends MDLinkReader {}
  class MDImageReader extends MDLinkReader {}
  class MDReferencedImageReader extends MDReferencedLinkReader {}

  // Remaining readers deriving directly from MDReader.
  class MDLineBreakReader extends MDReader {}
  class MDHTMLTagReader extends MDReader {}
  class MDModifierReader extends MDReader {}
  // -- Nodes -----------------------------------------------------------------
  // Every node below is declared with an empty body; only the inheritance
  // hierarchy is established here.

  // Root of the node hierarchy.
  class MDNode {}

  // Nodes deriving from MDBlockNode.
  class MDBlockNode extends MDNode {}
  class MDParagraphNode extends MDBlockNode {}
  class MDHeadingNode extends MDBlockNode {}
  class MDSubtextNode extends MDBlockNode {}
  class MDHorizontalRuleNode extends MDBlockNode {}
  class MDBlockquoteNode extends MDBlockNode {}
  class MDUnorderedListNode extends MDBlockNode {}
  class MDOrderedListNode extends MDBlockNode {}
  class MDListItemNode extends MDBlockNode {}
  class MDCodeBlockNode extends MDBlockNode {}
  class MDTableNode extends MDBlockNode {}
  class MDTableRowNode extends MDBlockNode {}
  class MDTableCellNode extends MDBlockNode {}
  class MDTableHeaderCellNode extends MDBlockNode {}
  class MDDefinitionListNode extends MDBlockNode {}
  class MDDefinitionListTermNode extends MDBlockNode {}
  class MDDefinitionListDefinitionNode extends MDBlockNode {}
  class MDFootnoteListNode extends MDBlockNode {}

  // Nodes deriving from MDInlineNode.
  class MDInlineNode extends MDNode {}
  class MDTextNode extends MDInlineNode {}
  class MDObfuscatedTextNode extends MDTextNode {}
  class MDEmphasisNode extends MDInlineNode {}
  class MDStrongNode extends MDInlineNode {}
  class MDStrikethroughNode extends MDInlineNode {}
  class MDUnderlineNode extends MDInlineNode {}
  class MDHighlightNode extends MDInlineNode {}
  class MDSuperscriptNode extends MDInlineNode {}
  class MDSubscriptNode extends MDInlineNode {}
  class MDCodeNode extends MDInlineNode {}
  class MDFootnoteNode extends MDInlineNode {}
  class MDLinkNode extends MDInlineNode {}
  class MDReferencedLinkNode extends MDLinkNode {}
  class MDImageNode extends MDInlineNode {}
  class MDReferencedImageNode extends MDImageNode {}
  class MDAbbreviationNode extends MDInlineNode {}
  class MDLineBreakNode extends MDInlineNode {}
  class MDHTMLTagNode extends MDInlineNode {}
  // -- Main class ------------------------------------------------------------
  // Declared here with an empty body.
  class Markdown {}
  470. ?>