PHP and JavaScript implementations of a simple markdown parser
Вы не можете выбрать более 25 тем Темы должны начинаться с буквы или цифры, могут содержать дефисы(-) и должны содержать не более 35 символов.

markdown.php 46KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611
  1. <?php
  2. declare(strict_types=1);
  /**
   * Static helper functions shared by the markdown parser: regex building
   * blocks, HTML escaping helpers, and indentation/blank-line utilities.
   */
  class MDUtils {
      // Modified from https://urlregex.com/ to remove capture groups. Matches fully qualified URLs only.
      public static $baseURLRegex = '(?:(?:(?:[a-z]{3,9}:(?:\\/\\/)?)(?:[\\-;:&=\\+\\$,\\w]+@)?[a-z0-9\\.\\-]+|(?:www\\.|[\\-;:&=\\+\\$,\\w]+@)[a-z0-9\\.\\-]+)(?:(?:\\/[\\+~%\\/\\.\\w\\-_]*)?\\??(?:[\\-\\+=&;%@\\.\\w_]*)#?(?:[\\.\\!\\/\\\\\\w]*))?)';

      // Modified from https://emailregex.com/ to remove capture groups.
      public static $baseEmailRegex = '(?:(?:[^<>()\\[\\]\\\\.,;:\\s@"]+(?:\\.[^<>()\\[\\]\\\\.,;:\\s@"]+)*)|(?:".+"))@(?:(?:\\[[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}])|(?:(?:[a-z\\-0-9]+\\.)+[a-z]{2,}))';

      /**
       * Encodes every character as an HTML numeric entity (`&#NN;`) to make it
       * marginally more difficult for web scrapers to grab sensitive info. If
       * `text` starts with `mailto:` that prefix is kept verbatim and only the
       * remainder is obfuscated.
       *
       * @param string $text text to obfuscate
       * @return string HTML-safe sequence of numeric character references
       */
      public static function escapeObfuscated(string $text): string {
          if (str_starts_with($text, 'mailto:')) {
              return 'mailto:' . self::escapeObfuscated(mb_substr($text, 7));
          }
          $html = '';
          foreach (mb_str_split($text) as $char) {
              // A numeric character reference is "&#" + code point + ";".
              $html .= '&#' . mb_ord($char) . ';';
          }
          return $html;
      }

      /**
       * Removes illegal characters (whitespace, `/`, `>`, quotes, `=`) from an
       * HTML attribute name.
       *
       * @param string $name raw attribute name
       * @return string attribute name with the illegal characters removed
       */
      public static function scrubAttributeName(string $name): string {
          return mb_ereg_replace('[\\t\\n\\f \\/>"\'=]+', '', $name);
      }

      /**
       * Strips one or more leading indents from a line or lines of markdown. An
       * indent is defined as 4 spaces or one tab. Incomplete indents (i.e. 1-3
       * spaces) are treated like one indent level. The argument is not modified.
       *
       * @param string|string[] $line line or array of lines to unindent
       * @param int $levels number of indent levels to remove
       * @return string|string[] unindented copy, same shape as `$line`
       */
      public static function stripIndent(string|array $line, int $levels=1): string|array {
          $regex = '^(?: {1,4}|\\t){' . $levels . '}';
          if (!is_array($line)) {
              return mb_ereg_replace($regex, '', $line);
          }
          return array_map(
              fn(string $l): string => mb_ereg_replace($regex, '', $l),
              $line
          );
      }

      /**
       * Counts the number of indent levels in a line of text. Partial indents
       * (1 to 3 spaces) are counted as one indent level unless `fullIndentsOnly`
       * is `true`.
       *
       * @param string $line line to inspect (not modified)
       * @param bool $fullIndentsOnly only count tabs and complete 4-space runs
       * @return int number of leading indent levels
       */
      public static function countIndents(string $line, bool $fullIndentsOnly=false): int {
          // Normalize every indent-like run to a single tab.
          $indentRegex = $fullIndentsOnly ? '(?: {4}|\\t)' : '(?: {1,4}|\\t)';
          $normalized = mb_ereg_replace($indentRegex, "\t", $line);
          // Keep only the leading tabs, dropping the content after them.
          $leadingTabs = mb_ereg_replace('^(\\t*)(.*?)$', '\\1', $normalized);
          return mb_strlen($leadingTabs);
      }

      /**
       * Returns a copy of an array without any whitespace-only lines at the end.
       *
       * @param string[] $lines
       * @return string[]
       */
      public static function withoutTrailingBlankLines(array $lines): array {
          // `$lines` is a by-value copy, so popping here never affects the caller.
          while ($lines !== [] && trim(end($lines)) === '') {
              array_pop($lines);
          }
          return $lines;
      }

      /**
       * Tests if an array of lines contains at least one blank. A blank line
       * can contain whitespace.
       *
       * @param string[] $lines
       * @return bool `true` if any line is empty or whitespace-only
       */
      public static function containsBlankLine(array $lines): bool {
          foreach ($lines as $line) {
              if (trim($line) === '') return true;
          }
          return false;
      }

      /**
       * Tests whether two associative arrays hold the same key/value pairs.
       * Values are compared the way `array_diff_assoc` compares them (by string
       * representation).
       *
       * @param array $a
       * @param array $b
       * @return bool `true` if both arrays have identical keys and equal values
       */
      public static function equalAssocArrays(array $a, array $b) {
          // array_diff_assoc only reports entries of $a missing from $b, so the
          // size check is needed to catch extra entries in $b.
          return count($a) === count($b) && empty(array_diff_assoc($a, $b));
      }
  }
  /**
   * The kinds of lexical token an `MDToken` can be.
   */
  enum MDTokenType {
      // --- Text runs ---

      case Text;

      /**
       * Leading or trailing whitespace around a run of text. Whitespace inside
       * a text run is not tokenized separately.
       */
      case Whitespace;

      // --- Single punctuation symbols ---

      case Underscore;
      case Asterisk;
      case Slash;
      case Tilde;
      case Bang;
      case Backtick;
      case Equal;
      case Caret;

      // --- Compound tokens ---

      /** `content` holds the label. */
      case Label;

      /** `content` holds the URL, `extra` the title. */
      case URL;

      /** `content` holds the email address, `extra` the title. */
      case Email;

      /** `content` holds the URL. */
      case SimpleLink;

      /** `content` holds the email address. */
      case SimpleEmail;

      /** `content` holds the footnote symbol. */
      case Footnote;

      /** `modifier` holds an `MDTagModifier`. */
      case Modifier;

      /** `tag` holds an `MDHTMLTag`. */
      case HTMLTag;

      // --- Search wildcards ---

      /** Wildcard for `MDToken::findFirstTokens` */
      case META_AnyNonWhitespace;

      /** Wildcard for `MDToken::findFirstTokens` */
      case META_OptionalWhitespace;
  }
  /**
   * Result object produced by `MDToken::findFirstTokens`.
   */
  class MDTokenMatch {
      /**
       * @param MDToken[] $tokens the tokens that satisfied the pattern
       * @param int $index position in the searched array where the match begins
       */
      public function __construct(
          public array $tokens,
          public int $index,
      ) {}
  }
  /**
   * Result object produced by `MDToken::findPairedTokens`.
   */
  class MDPairedTokenMatch {
      /**
       * @param MDToken[] $startTokens tokens that matched the opening pattern
       * @param MDToken[] $contentTokens tokens between the opening and closing matches
       * @param MDToken[] $endTokens tokens that matched the closing pattern
       * @param int $startIndex index of the first opening token
       * @param int $contentIndex index of the first content token
       * @param int $endIndex index of the first closing token
       * @param int $totalLength number of tokens spanned from the first opening
       * token through the last closing token
       */
      public function __construct(
          public array $startTokens,
          public array $contentTokens,
          public array $endTokens,
          public int $startIndex,
          public int $contentIndex,
          public int $endIndex,
          public int $totalLength,
      ) {}
  }
  /**
   * One lexical unit in inline markdown syntax parsing.
   */
  class MDToken {
      /**
       * The original verbatim token string. Required as a plaintext fallback if
       * the token remains unresolved.
       */
      public string $original;

      public MDTokenType $type;

      /** Primary string content, when the token carries any. */
      public ?string $content = null;

      /** Additional string content (e.g. a link title). */
      public ?string $extra = null;

      /** Set when the token was constructed with an `MDHTMLTag` as content. */
      public ?MDHTMLTag $tag = null;

      /** Set when the token was constructed with an `MDTagModifier` as content. */
      public ?MDTagModifier $modifier = null;

      private static $urlWithTitleRegex = '^\\((\\S+?)\\s+"(.*?)"\\)'; // 1=URL, 2=title
      private static $urlRegex = '^\\((\\S+?)\\)'; // 1=URL

      /**
       * Creates a token.
       *
       * @param string $original verbatim token string
       * @param MDTokenType $type token type
       * @param string|MDTagModifier|MDHTMLTag|null $content primary content of
       * the token. Stored in `modifier`, `tag`, or `content` depending on its type.
       * @param string|null $extra additional content
       */
      public function __construct(string $original, MDTokenType $type,
          string|MDTagModifier|MDHTMLTag|null $content=null,
          ?string $extra=null) {
          $this->original = $original;
          $this->type = $type;
          if ($content instanceof MDTagModifier) {
              $this->modifier = $content;
          } elseif ($content instanceof MDHTMLTag) {
              $this->tag = $content;
          } else {
              $this->content = $content;
          }
          $this->extra = $extra;
      }

      /**
       * Debug representation showing the class name, token type, and content.
       */
      public function __toString(): string {
          $classname = get_class($this);
          // Enum cases cannot be cast to string; use the case name instead.
          return "({$classname} type={$this->type->name} content={$this->content})";
      }

      /**
       * Attempts to parse a label token from the beginning of `line`. A label is
       * of the form `[content]`. Nested brackets are balanced, backslash-escaped
       * characters are skipped, and an unbalanced `)` aborts the parse. If found,
       * returns an array:
       * - `0`: the entire label including brackets
       * - `1`: the content of the label
       *
       * @param string $line
       * @return ?string[] match groups or null if not found
       */
      public static function tokenizeLabel(string $line): ?array {
          if (!str_starts_with($line, '[')) return null;
          $parenCount = 0;
          $bracketCount = 0;
          $l = mb_strlen($line);
          for ($p = 1; $p < $l; $p++) {
              $ch = mb_substr($line, $p, 1);
              if ($ch === '\\') {
                  // Skip the escaped character that follows.
                  $p++;
              } elseif ($ch === '(') {
                  $parenCount++;
              } elseif ($ch === ')') {
                  $parenCount--;
                  if ($parenCount < 0) return null;
              } elseif ($ch === '[') {
                  $bracketCount++;
              } elseif ($ch === ']') {
                  if ($bracketCount > 0) {
                      $bracketCount--;
                  } else {
                      // Closing bracket of the outermost label.
                      return [ mb_substr($line, 0, $p + 1), mb_substr($line, 1, $p - 1) ];
                  }
              }
          }
          return null;
      }

      /**
       * Attempts to parse a URL token from the beginning of `line`. A URL token
       * is of the form `(url)` or `(url "title")`. Returns `null` if the text is
       * better described as an email token. If found, returns an array:
       * - `0`: the entire URL token including parentheses
       * - `1`: the URL
       * - `2`: the optional title, or `null`
       *
       * @param string $line
       * @return ?array token tuple
       */
      public static function tokenizeURL(string $line): ?array {
          $groups = [];
          if (mb_eregi(self::$urlWithTitleRegex, $line, $groups)) {
              if (self::tokenizeEmail($line)) return null; // make sure it's not better described as an email address
              return $groups;
          }
          if (mb_eregi(self::$urlRegex, $line, $groups)) {
              if (self::tokenizeEmail($line)) return null;
              return [ $groups[0], $groups[1], null ];
          }
          return null;
      }

      /**
       * Attempts to parse an email address from the beginning of `line`. An
       * email address is of the form `(user@example.com)` or
       * `(user@example.com "link title")`. If found, returns an array:
       * - `0`: the entire token including parentheses
       * - `1`: the email address
       * - `2`: the optional link title, or `null`
       *
       * @param string $line
       * @return ?string[] token tuple
       */
      public static function tokenizeEmail(string $line): ?array {
          $groups = [];
          if (mb_eregi("^\\(\\s*(" . MDUtils::$baseEmailRegex . ")\\s+\"(.*?)\"\\s*\\)",
              $line, $groups)) {
              return $groups;
          }
          if (mb_eregi("^\\(\\s*(" . MDUtils::$baseEmailRegex . ")\\s*\\)", $line, $groups)) {
              return [ $groups[0], $groups[1], null ];
          }
          return null;
      }

      /**
       * Searches an array of `MDToken` for the given pattern of `MDTokenType`s.
       * If found, returns a `MDTokenMatch`, otherwise `null`.
       *
       * `META_AnyNonWhitespace` and `META_OptionalWhitespace` are supported as
       * wildcard pattern elements. Note that `META_OptionalWhitespace` may give
       * a result with a variable number of tokens.
       *
       * @param (MDToken|MDNode)[] $tokensToSearch - mixed array of `MDToken` and
       * `MDNode` elements
       * @param MDTokenType[] $pattern - contiguous run of token types to find
       * @param int $startIndex - token index to begin searching (defaults to 0)
       * @return ?MDTokenMatch match object, or `null` if not found
       */
      public static function findFirstTokens(array $tokensToSearch, array $pattern, int $startIndex=0): ?MDTokenMatch {
          $tokenCount = count($tokensToSearch);
          $patternLength = count($pattern);
          for ($t = $startIndex; $t < $tokenCount; $t++) {
              $matchedAll = true;
              $matched = [];
              // Shifts the token cursor back each time an optional whitespace
              // element is skipped without consuming a token.
              $patternOffset = 0;
              for ($p = 0; $p < $patternLength; $p++) {
                  $t0 = $t + $p + $patternOffset;
                  if ($t0 >= $tokenCount) return null;
                  $token = $tokensToSearch[$t0];
                  $elem = $pattern[$p];
                  $isWhitespace = $token instanceof MDToken && $token->type === MDTokenType::Whitespace;
                  if ($elem === MDTokenType::META_OptionalWhitespace) {
                      if ($isWhitespace) {
                          $matched[] = $token;
                      } else {
                          $patternOffset--;
                      }
                  } elseif ($elem === MDTokenType::META_AnyNonWhitespace) {
                      if ($isWhitespace) {
                          $matchedAll = false;
                          break;
                      }
                      $matched[] = $token;
                  } else {
                      if (!($token instanceof MDToken) || $token->type !== $elem) {
                          $matchedAll = false;
                          break;
                      }
                      $matched[] = $token;
                  }
              }
              if ($matchedAll) {
                  return new MDTokenMatch($matched, $t);
              }
          }
          return null;
      }

      /**
       * Searches an array of MDToken for a given starting pattern and ending
       * pattern and returns match info about both and the tokens in between.
       *
       * If `contentValidator` is specified, it will be called with the content
       * tokens of a potential match. If the validator returns `true`, the result
       * will be accepted and returned by this method. If the validator returns
       * `false`, this method will keep looking for another matching pair. If no
       * validator is given the first match with non-empty content is returned.
       *
       * If a match is found, a `MDPairedTokenMatch` is returned with details
       * of the opening tokens, closing tokens, and content tokens between. Otherwise
       * `null` is returned.
       *
       * @param MDToken[] $tokensToSearch - array of `MDToken` to search in
       * @param MDTokenType[] $startPattern - array of `MDTokenType` to find first
       * @param MDTokenType[] $endPattern - array of `MDTokenType` to find positioned after `startPattern`
       * @param ?callable $contentValidator - optional validator function. If provided, will be passed an array of inner `MDToken`, and the function can return `true` to accept the contents or `false` to keep searching
       * @param int $startIndex - token index where searching should begin
       * @return ?MDPairedTokenMatch match, or `null`
       */
      public static function findPairedTokens(array $tokensToSearch,
          array $startPattern, array $endPattern, ?callable $contentValidator=null,
          int $startIndex=0): ?MDPairedTokenMatch {
          $tokenCount = count($tokensToSearch);
          for ($s = $startIndex; $s < $tokenCount; $s++) {
              $startMatch = self::findFirstTokens($tokensToSearch, $startPattern, $s);
              if ($startMatch === null) return null;
              $contentStart = $startMatch->index + count($startMatch->tokens);
              $endStart = $contentStart;
              while ($endStart < $tokenCount) {
                  $endMatch = self::findFirstTokens($tokensToSearch, $endPattern, $endStart);
                  if ($endMatch === null) break;
                  $contentLength = $endMatch->index - $contentStart;
                  $contents = array_slice($tokensToSearch, $contentStart, $contentLength);
                  if (count($contents) > 0 && ($contentValidator === null || $contentValidator($contents))) {
                      return new MDPairedTokenMatch($startMatch->tokens,
                          $contents,
                          $endMatch->tokens,
                          $startMatch->index,
                          $contentStart,
                          $endMatch->index,
                          $endMatch->index + count($endMatch->tokens) - $startMatch->index);
                  }
                  // Contents rejected. Try next end match.
                  $endStart = $endMatch->index + 1;
              }
              // No end matches. Resume searching just after this start match
              // (the loop increment moves past it).
              $s = $startMatch->index;
          }
          return null;
      }

      /**
       * Tests whether another value is an `MDToken` with the same original
       * text, type, content, extra, tag (same instance), and modifier.
       *
       * @param mixed $other value to compare against
       * @return bool
       */
      public function equals($other) {
          if (!($other instanceof MDToken)) return false;
          if ($other->original !== $this->original) return false;
          if ($other->type != $this->type) return false;
          if ($other->content !== $this->content) return false;
          if ($other->extra !== $this->extra) return false;
          if ($other->tag !== $this->tag) return false;
          if ($other->modifier != $this->modifier) return false;
          return true;
      }
  }
  398. /**
  399. * Parsing and rendering state. Passed around throughout the parsing process.
  400. *
  401. * States are hierarchical. A sub-state can be created by calling `.copy()` with
  402. * a new array of lines. The sub-state points back to its parent state. This
  403. * is done to parse inner content of a syntax as its own standalone document.
  404. *
  405. * If a custom `MDReader` implementation wants to store data in this object,
  406. * always do so on `state.root` to ensure it's stored on the original state,
  407. * not a child state. Otherwise data may be lost when the sub-state is discarded.
  408. */
  409. class MDState {
  410. /**
  411. * Ascends the parent chain to the root `MDState` instance. This should be
  412. * used when referencing most stored fields except `lines` and `p`.
  413. */
  414. public function root(): MDState {
  415. return $this->parent ? $this->parent->root() : $this;
  416. }
  417. /**
  418. * Lines of the markdown document. The current line index is pointed to by `p`.
  419. *
  420. * @var string[]
  421. */
  422. public array $lines;
  423. /**
  424. * The current line in `lines`.
  425. */
  426. public function currentLine(): ?string {
  427. return ($this->p < sizeof($this->lines)) ? $this->lines[$this->p] : null;
  428. }
  429. /**
  430. * Current line pointer into array `lines`.
  431. */
  432. public int $p = 0;
  433. private ?MDState $parent = null;
  434. /**
  435. * Array of `MDReader`s sorted by block reading priority.
  436. * @var MDReader[]
  437. */
  438. public array $readersByBlockPriority = [];
  439. /**
  440. * Array of `MDReader`s sorted by tokenization priority.
  441. * @var MDReader[]
  442. */
  443. public array $readersByTokenPriority = [];
  444. /**
  445. * Array of tuples of `pass:number` and `MDReader` sorted by substitution
  446. * priority.
  447. * @var array[]
  448. */
  449. public array $readersBySubstitutePriority = [];
  450. /**
  451. * Prefix to include in any generated `id` attributes on HTML elements.
  452. * Useful for keeping elements unique in multiple parsed documents in the
  453. * same HTML page.
  454. */
  455. public string $elementIdPrefix = '';
  456. /**
  457. * Filter for removing unapproved HTML tags, attributes, and values.
  458. */
  459. public MDHTMLFilter $tagFilter;
  460. private static string $textWhitespaceRegex = '^(\\s*)(?:(\\S|\\S.*\\S)(\\s*?))?$'; // 1=leading WS, 2=text, 3=trailing WS
  461. /**
  462. * @param string[] $lines - lines of markdown text
  463. */
  464. public function __construct(array $lines) {
  465. $this->lines = $lines;
  466. }
  467. /**
  468. * Creates a copy of this state with new lines. Useful for parsing nested
  469. * content.
  470. *
  471. * @param string[] $lines
  472. * @return MDState copied sub-state
  473. */
  474. public function copy(array $lines) {
  475. $cp = new MDState($lines);
  476. $cp->parent = $this;
  477. return $cp;
  478. }
  479. /**
  480. * Tests if there are at least `minCount` lines available to read. If `p`
  481. * is not provided it will be relative to `this.p`.
  482. */
  483. public function hasLines(int $minCount, ?int $p=null): bool {
  484. $relativeTo = ($p === null) ? $this->p : $p;
  485. return $relativeTo + $minCount <= sizeof($this->lines);
  486. }
  487. /**
  488. * Reads and returns an array of blocks from the current line pointer.
  489. *
  490. * @return MDBlockNode[] parsed blocks
  491. */
  492. public function readBlocks(): array {
  493. $blocks = [];
  494. while ($this->hasLines(1)) {
  495. $block = $this->readNextBlock();
  496. if ($block) {
  497. array_push($blocks, $block);
  498. } else {
  499. break;
  500. }
  501. }
  502. return $blocks;
  503. }
  504. /**
  505. * Creates a simple `MDBlockNode` if no other registered blocks match.
  506. */
  507. private function readFallbackBlock(): ?MDBlockNode {
  508. if ($this->p >= sizeof($this->lines)) return null;
  509. $lines = MDUtils::withoutTrailingBlankLines(array_slice($this->lines, $this->p));
  510. if (sizeof($lines) == 0) return null;
  511. $this->p = sizeof($this->lines);
  512. return $this->inlineMarkdownToNode(implode("\n", $lines));
  513. }
  514. /**
  515. * Attempts to read one block from the current line pointer. The pointer
  516. * will be positioned just after the end of the block.
  517. */
  518. private function readNextBlock(): ?MDBlockNode {
  519. while ($this->hasLines(1) && mb_strlen(trim($this->lines[$this->p])) == 0) {
  520. $this->p++;
  521. }
  522. if (!$this->hasLines(1)) return null;
  523. foreach ($this->root()->readersByBlockPriority as $reader) {
  524. $startP = $this->p;
  525. $block = $reader->readBlock($this);
  526. if ($block) {
  527. if ($this->p == $startP) {
  528. $readerClassName = get_class($reader);
  529. $blockClassName = get_class($block);
  530. throw new Error("{$readerClassName} returned an " +
  531. "{$blockClassName} without incrementing MDState.p. " +
  532. "This could lead to an infinite loop.");
  533. }
  534. return $block;
  535. }
  536. }
  537. $fallback = $this->readFallbackBlock();
  538. return $fallback;
  539. }
  540. /**
  541. * @param string $line
  542. * @return MDToken[]
  543. */
  544. private function inlineMarkdownToTokens(string $line): array {
  545. if ($this->parent) return $this->parent->inlineMarkdownToTokens($line);
  546. $tokens = [];
  547. $text = '';
  548. $expectLiteral = false;
  549. /**
  550. * Flushes accumulated content in `text` to `tokens`.
  551. */
  552. function endText() {
  553. if (mb_strlen($text) == 0) return;
  554. $textGroups = null;
  555. if (mb_eregi(MDState::$textWhitespaceRegex, $text, $textGroups)) {
  556. if (mb_strlen($textGroups[1]) > 0) {
  557. array_push($tokens, new MDToken($textGroups[1], MDTokenType::Whitespace, $textGroups[1]));
  558. }
  559. if ($textGroups[2] && mb_strlen($textGroups[2]) > 0) {
  560. $tokens.push(new MDToken($textGroups[2], MDTokenType::Text, $textGroups[2]));
  561. }
  562. if ($textGroups[3] && mb_strlen($textGroups[3]) > 0) {
  563. $tokens.push(new MDToken($textGroups[3], MDTokenType::Whitespace, $textGroups[3]));
  564. }
  565. } else {
  566. array_push($tokens, new MDToken($text, MDTokenType::Text, $text));
  567. }
  568. $text = '';
  569. }
  570. for ($p = 0; $p < mb_strlen(line); $p++) {
  571. $ch = mb_substr($line, p, 1);
  572. $remainder = mb_substr($line, $p);
  573. if ($expectLiteral) {
  574. $text .= $ch;
  575. $expectLiteral = false;
  576. continue;
  577. }
  578. if ($ch == '\\') {
  579. $expectLiteral = true;
  580. continue;
  581. }
  582. $found = false;
  583. foreach ($this->root()->readersByTokenPriority as $reader) {
  584. $token = $reader->readToken($this, $remainder);
  585. if ($token === null) continue;
  586. endText();
  587. array_push($tokens, $token);
  588. if ($token->original == null || mb_strlen($token->original) == 0) {
  589. $readerClassName = get_class($reader);
  590. throw new Error(`{$readerClassName} returned a token with an empty .original. This would cause an infinite loop.`);
  591. }
  592. $p += mb_strlen($token->original) - 1;
  593. $found = true;
  594. break;
  595. }
  596. if (!$found) {
  597. $text += $ch;
  598. }
  599. }
  600. endText();
  601. return $tokens;
  602. }
  603. /**
  604. * Converts a line of markdown to an `MDInlineNode`.
  605. *
  606. * @param string|string[] $line
  607. * @return MDInlineNode
  608. */
  609. public function inlineMarkdownToNode(string|array $line): MDInlineNode {
  610. $nodes = $this->inlineMarkdownToNodes($line);
  611. return (sizeof($nodes) == 1) ? $nodes[0] : new MDInlineNode($nodes);
  612. }
  613. /**
  614. * Converts a line of markdown to an array of `MDInlineNode`s.
  615. *
  616. * @param string|string[] $line
  617. * @return MDInlineNode[]
  618. */
  619. public function inlineMarkdownToNodes(string|array $line): array {
  620. $tokens = $this->inlineMarkdownToTokens(is_array($line) ? implode("\n", $line) : $line);
  621. return $this->tokensToNodes($tokens);
  622. }
  623. /**
  624. * Converts a mixed array of `MDToken` and `MDInlineNode` elements into an array
  625. * of only `MDInlineNode` via repeated `MDReader` substition.
  626. *
  627. * @param (MDToken|MDInlineNode)[] $tokens
  628. * @return MDInlineNode[]
  629. */
  630. public function tokensToNodes(array $tokens): array {
  631. $nodes = $tokens;
  632. // Perform repeated substitutions, converting sequences of tokens into
  633. // nodes, until no more substitutions can be made.
  634. $anyChanges = false;
  635. do {
  636. $anyChanges = false;
  637. foreach ($this->root->readersBySubstitutePriority as $readerTuple) {
  638. /** @var int */
  639. $pass = $readerTuple[0];
  640. /** @var MDReader */
  641. $reader = $readerTuple[1];
  642. $changed = $reader->substituteTokens($this, $pass, $nodes);
  643. if (!$changed) continue;
  644. $anyChanges = true;
  645. break;
  646. }
  647. } while ($anyChanges);
  648. // Convert any remaining tokens to text nodes. Also apply any inline
  649. // CSS modifiers.
  650. $lastNode = null;
  651. $me = $this;
  652. $nodes = array_map(function($node) use ($lastNode, $me) {
  653. if ($node instanceof MDToken) {
  654. /** @var MDToken */
  655. $token = $node;
  656. if ($token->type == MDTokenType::Modifier && $lastNode) {
  657. $me->root()->tagFilter->scrubModifier($token->modifier);
  658. $token->modifier->applyTo($lastNode);
  659. $lastNode = null;
  660. return new MDTextNode('');
  661. }
  662. $lastNode = null;
  663. return new MDTextNode($token->original);
  664. } elseif ($node instanceof MDNode) {
  665. $lastNode = ($node instanceof MDTextNode) ? null : $node;
  666. return $node;
  667. } else {
  668. $nodeClassName = get_class($node);
  669. throw new Error("Unexpected node type {$nodeClassName}");
  670. }
  671. }, $nodes);
  672. return $nodes;
  673. }
  674. /**
  675. * Mapping of reference symbols to URLs. Used by `MDReferencedLinkReader`
  676. * and `MDReferencedImageReader`.
  677. * @var array symbol -> URL
  678. */
  679. private array $referenceToURL = [];
  680. /**
  681. * Mapping of reference symbols to titles. Used by `MDReferencedLinkReader`
  682. * and `MDReferencedImageReader`.
  683. * @var array symbol -> title string
  684. */
  685. private array $referenceToTitle = [];
  /**
   * Registers a URL, and optionally a title, under a reference symbol.
   *
   * The symbol is lowercased before it is used as a key, and both mappings
   * are stored on the state returned by `root()` — the same place
   * `urlForReference()` and `urlTitleForReference()` read from.
   *
   * @param string $reference reference symbol
   * @param string $url URL to associate with the symbol
   * @param ?string $title link title; when `null` no title is stored and any
   * previously stored title is left as it is
   */
  public function defineURL(string $reference, string $url, ?string $title=null) {
      $root = $this->root();
      $key = mb_strtolower($reference);
      $root->referenceToURL[$key] = $url;
      if ($title !== null) {
          $root->referenceToTitle[$key] = $title;
      }
  }
  /**
   * Looks up the URL stored for a reference symbol on the root state.
   *
   * @param string $reference reference symbol; lowercased before lookup
   * @return ?string the stored URL, or `null` when none is stored
   */
  public function urlForReference(string $reference): ?string {
      $key = mb_strtolower($reference);
      $rootState = $this->root();
      return $rootState->referenceToURL[$key] ?? null;
  }
  /**
   * Looks up the link title stored for a reference symbol on the root state.
   *
   * @param string $reference reference symbol; lowercased before lookup
   * @return ?string the stored title, or `null` when none is stored
   */
  public function urlTitleForReference(string $reference): ?string {
      $key = mb_strtolower($reference);
      $rootState = $this->root();
      return $rootState->referenceToTitle[$key] ?? null;
  }
  705. }
  /**
   * Defines a set of allowable HTML tags, attributes, and CSS, and offers
   * methods to test names/values against those whitelists and to scrub
   * disallowed attributes from tags and modifiers.
   */
  class MDHTMLFilter {
      /**
       * Mapping of permitted lowercase tag names to arrays containing allowable
       * attributes for those tags. Does not need to include those attributes
       * defined in `allowableGlobalAttributes`.
       *
       * Values are arrays with allowable lowercase attribute names mapped to
       * allowable value patterns. A `*` means any value is acceptable. Multiple
       * allowable values can be joined together with `|`. These special symbols
       * represent certain kinds of values and can be used in combination or in
       * place of literal values.
       *
       * - `{classlist}`: A list of legal CSS classnames, separated by spaces
       * - `{int}`: An integer
       * - `{none}`: No value (an attribute with no `=` or value, like `checked`)
       * - `{style}`: One or more CSS declarations, separated by semicolons (simple
       *   `key: value;` syntax only)
       * - `{url}`: A URL
       *
       * @var array<string, array<string, string>>
       */
      public array $allowableTags = [
          'address' => [
              'cite' => '{url}',
          ],
          'h1' => [],
          'h2' => [],
          'h3' => [],
          'h4' => [],
          'h5' => [],
          'h6' => [],
          'blockquote' => [],
          'dl' => [],
          'dt' => [],
          'dd' => [],
          'div' => [],
          'hr' => [],
          'ul' => [],
          'ol' => [
              'start' => '{int}',
              'type' => 'a|A|i|I|1',
          ],
          'li' => [
              'value' => '{int}',
          ],
          'p' => [],
          'pre' => [],
          'table' => [],
          'thead' => [],
          'tbody' => [],
          'tfoot' => [],
          'tr' => [],
          'td' => [],
          'th' => [],
          'a' => [
              'href' => '{url}',
              'target' => '*',
          ],
          'abbr' => [],
          'b' => [],
          'br' => [],
          'cite' => [],
          'code' => [],
          'data' => [
              'value' => '*',
          ],
          'dfn' => [],
          'em' => [],
          'i' => [],
          'kbd' => [],
          'mark' => [],
          'q' => [
              'cite' => '{url}',
          ],
          's' => [],
          'samp' => [],
          'small' => [],
          'span' => [],
          'strong' => [],
          'sub' => [],
          'sup' => [],
          'time' => [
              'datetime' => '*',
          ],
          'u' => [],
          'var' => [],
          'wbr' => [],
          // NOTE(review): `img` whitelists `href` but not `src` — confirm intended.
          'img' => [
              'alt' => '*',
              'href' => '{url}',
          ],
          'figure' => [],
          'figcaption' => [],
          'del' => [],
          'ins' => [],
          'details' => [],
          'summary' => [],
      ];

      /**
       * Mapping of allowable lowercase global attributes to their permitted
       * values. Uses same value pattern syntax as described in `allowableTags`.
       * A key ending in `*` matches any attribute name starting with the text
       * before the `*`.
       *
       * @var array<string, string>
       */
      public array $allowableGlobalAttributes = [
          'class' => '{classlist}',
          'data-*' => '*',
          'dir' => 'ltr|rtl|auto',
          'id' => '*',
          'lang' => '*',
          'style' => '{style}',
          'title' => '*',
          'translate' => 'yes|no|{none}',
      ];

      /**
       * Mapping of allowable CSS style names to their allowable value patterns.
       * Multiple values can be delimited with `|` characters. Limited support
       * so far.
       *
       * Recognized special values:
       * - `{color}`: A hex or named color
       *
       * @var array<string, string>
       */
      public array $allowableStyleKeys = [
          'background-color' => '{color}',
          'color' => '{color}',
      ];

      // NOTE(review): this accepts any run of non-whitespace characters,
      // including e.g. `javascript:` URLs — confirm that is acceptable for
      // untrusted input.
      private static string $permissiveURLRegex = '^\\S+$';
      private static string $integerRegex = '^[\\-]?\\d+$';
      private static string $classListRegex = '^-?[_a-zA-Z]+[_a-zA-Z0-9-]*(?:\\s+-?[_a-zA-Z]+[_a-zA-Z0-9-]*)*$';
      private static string $styleColorRegex = '^#[0-9a-f]{3}(?:[0-9a-f]{3})?$|^[a-zA-Z]+$';

      /**
       * Scrubs all forbidden attributes from an HTML tag. Assumes the tag name
       * itself has already been whitelisted. An attribute is removed when
       * either its name or its value is not permitted for the tag.
       *
       * @param MDHTMLTag $tag HTML tag; its `attributes` array is modified in
       * place
       */
      public function scrubTag(MDHTMLTag $tag) {
          foreach ($tag->attributes as $name => $value) {
              $attributeName = (string)$name;
              if (!$this->isValidAttributeName($tag->tagName, $attributeName)
                  || !$this->isValidAttributeValue($tag->tagName, $attributeName, $value)) {
                  unset($tag->attributes[$name]);
              }
          }
      }

      /**
       * Scrubs all forbidden attributes from an HTML modifier. The modifier is
       * changed in place: a disallowed class list empties `cssClasses`, a
       * disallowed id is reset to `null`, and disallowed styles and attributes
       * are removed individually.
       *
       * @param MDTagModifier $modifier
       * @param ?string $tagName HTML tag name, if known, otherwise only
       * global attributes will be permitted
       */
      public function scrubModifier(MDTagModifier $modifier, ?string $tagName = null) {
          if (count($modifier->cssClasses) > 0) {
              $classList = implode(' ', $modifier->cssClasses);
              if (!$this->isValidAttributeValue($tagName, 'class', $classList)) {
                  $modifier->cssClasses = [];
              }
          }
          if ($modifier->cssId !== null
              && !$this->isValidAttributeValue($tagName, 'id', $modifier->cssId)) {
              $modifier->cssId = null;
          }
          if (!$this->isValidAttributeName($tagName, 'style')) {
              $modifier->cssStyles = [];
          } else {
              foreach ($modifier->cssStyles as $key => $val) {
                  if (!$this->isValidStyleValue($key, $val)) {
                      unset($modifier->cssStyles[$key]);
                  }
              }
          }
          foreach ($modifier->attributes as $key => $val) {
              if (!$this->isValidAttributeValue($tagName, $key, $val)) {
                  unset($modifier->attributes[$key]);
              }
          }
      }

      /**
       * Tests if an HTML tag name is permitted. Comparison is case-insensitive.
       */
      public function isValidTagName(string $tagName): bool {
          return isset($this->allowableTags[mb_strtolower($tagName)]);
      }

      /**
       * Tests if an HTML attribute name is permitted, either globally or for
       * the given tag.
       *
       * @param ?string $tagName tag the attribute belongs to, or `null` to
       * consider global attributes only
       * @param string $attributeName attribute name (case-insensitive)
       */
      public function isValidAttributeName(?string $tagName, string $attributeName): bool {
          return $this->valuePatternFor($tagName, $attributeName) !== null;
      }

      /**
       * Tests if an attribute value is allowable.
       *
       * @param ?string $tagName tag the attribute belongs to, or `null` to
       * consider global attributes only
       * @param string $attributeName attribute name (case-insensitive)
       * @param string|bool $attributeValue attribute value; `true` stands for
       * an attribute given without a value
       */
      public function isValidAttributeValue(?string $tagName, string $attributeName, $attributeValue): bool {
          $valuePattern = $this->valuePatternFor($tagName, $attributeName);
          if ($valuePattern === null) return false;
          return $this->attributeValueMatchesPattern($attributeValue, $valuePattern);
      }

      /**
       * Finds the value pattern that governs an attribute. Lookup order:
       * exact global attribute, wildcard global attribute (key ending in `*`),
       * then the tag's own attribute list.
       *
       * @return ?string the value pattern, or `null` if the attribute name is
       * not permitted
       */
      private function valuePatternFor(?string $tagName, string $attributeName): ?string {
          $lcAttributeName = mb_strtolower($attributeName);
          if (isset($this->allowableGlobalAttributes[$lcAttributeName])) {
              return $this->allowableGlobalAttributes[$lcAttributeName];
          }
          foreach ($this->allowableGlobalAttributes as $namePattern => $valuePattern) {
              if (!str_ends_with($namePattern, '*')) continue;
              $prefix = mb_substr($namePattern, 0, mb_strlen($namePattern) - 1);
              if (str_starts_with($lcAttributeName, $prefix)) {
                  return $valuePattern;
              }
          }
          if ($tagName === null) return null;
          // `?? null` keeps unknown tags/attributes from raising warnings
          $tagAttributes = $this->allowableTags[mb_strtolower($tagName)] ?? null;
          if ($tagAttributes === null) return null;
          return $tagAttributes[$lcAttributeName] ?? null;
      }

      /**
       * Tests a value against a `|`-delimited value pattern. The value matches
       * if it satisfies any one option.
       *
       * @param string|bool $value attribute value; `true` means "no value"
       * @param string $pattern value pattern as described on `allowableTags`
       */
      private function attributeValueMatchesPattern(string|bool $value, string $pattern): bool {
          foreach (explode('|', $pattern) as $option) {
              if ($option === '*') return true;
              if ($option === '{none}') {
                  if ($value === true) return true;
                  continue;
              }
              // Every remaining option describes a string value. A valueless
              // attribute (`true`) must not be coerced to "1" and slip through
              // `{int}` or `{url}`.
              if (!is_string($value)) continue;
              $matched = match ($option) {
                  '{classlist}' => mb_eregi(self::$classListRegex, $value),
                  '{int}' => mb_eregi(self::$integerRegex, $value),
                  '{style}' => $this->isValidStyleDeclaration($value),
                  '{url}' => mb_eregi(self::$permissiveURLRegex, $value),
                  default => $value === $option,
              };
              if ($matched) return true;
          }
          return false;
      }

      /**
       * Tests if a string of one or more style `key: value;` declarations is
       * fully allowable. Empty declarations (e.g. from a trailing `;`) are
       * skipped; any declaration that does not split into exactly one key and
       * one value on `:` makes the whole string invalid.
       */
      public function isValidStyleDeclaration(string $styles): bool {
          foreach (explode(';', $styles) as $setting) {
              if (trim($setting) === '') continue;
              $parts = explode(':', $setting);
              if (count($parts) !== 2) return false;
              $name = trim($parts[0]);
              if (!$this->isValidStyleKey($name)) return false;
              if (!$this->isValidStyleValue($name, trim($parts[1]))) return false;
          }
          return true;
      }

      /**
       * Tests if a CSS style key is allowable.
       */
      public function isValidStyleKey(string $key): bool {
          return isset($this->allowableStyleKeys[$key]);
      }

      /**
       * Tests if a CSS style value is allowable for the given style key.
       */
      public function isValidStyleValue(string $key, string $value): bool {
          $pattern = $this->allowableStyleKeys[$key] ?? null;
          if ($pattern === null) return false;
          foreach (explode('|', $pattern) as $option) {
              if ($option === '{color}') {
                  // Special symbol: never compared literally against the value
                  if ($this->isValidCSSColor($value)) return true;
              } elseif ($value === $option) {
                  return true;
              }
          }
          return false;
      }

      /**
       * Tests if a value is a 3- or 6-digit hex color or a purely alphabetic
       * word (taken to be a color name). Matching is case-insensitive.
       */
      private function isValidCSSColor(string $value): bool {
          return mb_eregi(self::$styleColorRegex, $value);
      }
  }
  /**
   * Represents a single HTML tag. Paired tags are represented separately.
   */
  class MDHTMLTag {
      /**
       * Verbatim string of the original parsed tag. Not modified. Should be
       * considered unsafe for inclusion in the final document. Use
       * `__toString()` instead.
       */
      public string $original;
      public string $tagName;
      public bool $isCloser;
      /**
       * Map of attribute names to value strings. An attribute written without
       * a value is stored as `true`.
       */
      public array $attributes;

      // Parser states used by `fromLineStart()`
      private const EXPECT_OPEN_BRACKET = 0;
      private const EXPECT_CLOSER_OR_NAME = 1;
      private const EXPECT_NAME = 2;
      private const EXPECT_ATTRIBUTE_NAME_OR_END = 3;
      private const EXPECT_EQUALS_OR_ATTRIBUTE_OR_END = 4;
      private const EXPECT_ATTRIBUTE_VALUE = 5;
      private const EXPECT_CLOSE_BRACKET = 6;

      private static string $htmlTagNameFirstRegex = '[a-z]';
      private static string $htmlTagNameMedialRegex = '[a-z0-9]';
      private static string $htmlAttributeNameFirstRegex = '[a-z]';
      private static string $htmlAttributeNameMedialRegex = '[a-z0-9-]';
      private static string $whitespaceCharRegex = '\\s';

      /**
       * @param string $original verbatim source text of the tag
       * @param string $tagName
       * @param bool $isCloser whether this is a closing tag (`</x>`)
       * @param array $attributes map of attribute names to string values, or
       * `true` for valueless attributes
       */
      public function __construct(string $original, string $tagName, bool $isCloser,
          array $attributes) {
          $this->original = $original;
          $this->tagName = $tagName;
          $this->isCloser = $isCloser;
          $this->attributes = $attributes;
      }

      /**
       * Rebuilds the tag as HTML from its parsed parts rather than echoing
       * `original`. Attribute names are passed through
       * `MDUtils::scrubAttributeName()` and values through
       * `MDUtils::escapeHTML()`. Closing tags are rendered without attributes.
       */
      public function __toString(): string {
          if ($this->isCloser) {
              return "</{$this->tagName}>";
          }
          $html = '<' . $this->tagName;
          foreach ($this->attributes as $key => $value) {
              $safeName = MDUtils::scrubAttributeName($key);
              if ($value === true) {
                  $html .= " {$safeName}";
              } else {
                  $escapedValue = MDUtils::escapeHTML("{$value}");
                  $html .= " {$safeName}=\"{$escapedValue}\"";
              }
          }
          return $html . '>';
      }

      /**
       * Tests whether another value is an `MDHTMLTag` with the same tag name,
       * closer flag, and attributes. `original` is not compared.
       */
      public function equals($other): bool {
          if (!($other instanceof MDHTMLTag)) return false;
          if ($other->tagName !== $this->tagName) return false;
          if ($other->isCloser !== $this->isCloser) return false;
          return MDUtils::equal($other->attributes, $this->attributes);
      }

      /**
       * Checks the start of the given string for presence of an HTML tag.
       *
       * The line is scanned one character at a time by a small state machine.
       * Parsing stops at the first `>` that ends the tag; anything after it is
       * ignored. Quoted attribute values are entity-decoded; unquoted values
       * are stored verbatim and end at whitespace or `>`.
       *
       * @param string $line text that may begin with a tag
       * @return ?MDHTMLTag the parsed tag, or `null` if the line does not begin
       * with a complete, well-formed tag
       */
      public static function fromLineStart(string $line): ?MDHTMLTag {
          $isCloser = false;
          $tagName = '';
          $attributeName = '';
          $attributeValue = '';
          // null = no value seen yet, '' = unquoted value, otherwise the
          // opening quote character
          $attributeQuote = null;
          $attributes = [];
          $fullTag = null;

          // Commits the attribute collected so far (if any) and resets the
          // per-attribute state.
          $endAttribute = static function (bool $unescape = false) use (&$attributes, &$attributeName, &$attributeValue, &$attributeQuote): void {
              if ($attributeName !== '') {
                  if ($attributeValue !== '' || $attributeQuote !== null) {
                      $attributes[$attributeName] = $unescape
                          ? html_entity_decode($attributeValue, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML401, 'UTF-8')
                          : $attributeValue;
                  } else {
                      $attributes[$attributeName] = true;
                  }
              }
              $attributeName = '';
              $attributeValue = '';
              $attributeQuote = null;
          };

          $expect = self::EXPECT_OPEN_BRACKET;
          $length = mb_strlen($line);
          for ($p = 0; $p < $length && $fullTag === null; $p++) {
              $ch = mb_substr($line, $p, 1);
              $isWhitespace = mb_eregi(self::$whitespaceCharRegex, $ch);
              switch ($expect) {
                  case self::EXPECT_OPEN_BRACKET:
                      if ($ch !== '<') return null;
                      $expect = self::EXPECT_CLOSER_OR_NAME;
                      break;

                  case self::EXPECT_CLOSER_OR_NAME:
                      if ($ch === '/') {
                          $isCloser = true;
                      } else {
                          $p--; // reprocess this character as part of the name
                      }
                      $expect = self::EXPECT_NAME;
                      break;

                  case self::EXPECT_NAME:
                      if ($tagName === '') {
                          if (!mb_eregi(self::$htmlTagNameFirstRegex, $ch)) return null;
                          $tagName .= $ch;
                      } elseif (mb_eregi(self::$htmlTagNameMedialRegex, $ch)) {
                          $tagName .= $ch;
                      } else {
                          $p--; // name is over; reprocess in the next state
                          $expect = $isCloser
                              ? self::EXPECT_CLOSE_BRACKET
                              : self::EXPECT_ATTRIBUTE_NAME_OR_END;
                      }
                      break;

                  case self::EXPECT_ATTRIBUTE_NAME_OR_END:
                      if ($attributeName === '') {
                          if ($isWhitespace) {
                              // skip whitespace
                          } elseif ($ch === '/') {
                              $expect = self::EXPECT_CLOSE_BRACKET;
                          } elseif ($ch === '>') {
                              $fullTag = mb_substr($line, 0, $p + 1);
                          } elseif (mb_eregi(self::$htmlAttributeNameFirstRegex, $ch)) {
                              $attributeName .= $ch;
                          } else {
                              return null;
                          }
                      } elseif ($isWhitespace) {
                          $expect = self::EXPECT_EQUALS_OR_ATTRIBUTE_OR_END;
                      } elseif ($ch === '/') {
                          $endAttribute();
                          $expect = self::EXPECT_CLOSE_BRACKET;
                      } elseif ($ch === '>') {
                          $endAttribute();
                          $fullTag = mb_substr($line, 0, $p + 1);
                      } elseif ($ch === '=') {
                          $expect = self::EXPECT_ATTRIBUTE_VALUE;
                      } elseif (mb_eregi(self::$htmlAttributeNameMedialRegex, $ch)) {
                          $attributeName .= $ch;
                      } else {
                          return null;
                      }
                      break;

                  case self::EXPECT_EQUALS_OR_ATTRIBUTE_OR_END:
                      if ($ch === '=') {
                          $expect = self::EXPECT_ATTRIBUTE_VALUE;
                      } elseif ($isWhitespace) {
                          // skip whitespace
                      } elseif ($ch === '/') {
                          $expect = self::EXPECT_CLOSE_BRACKET;
                      } elseif ($ch === '>') {
                          $fullTag = mb_substr($line, 0, $p + 1);
                      } elseif (mb_eregi(self::$htmlAttributeNameFirstRegex, $ch)) {
                          // Previous attribute had no value; a new one starts
                          $endAttribute();
                          $expect = self::EXPECT_ATTRIBUTE_NAME_OR_END;
                          $p--;
                      }
                      // any other character is ignored in this state
                      break;

                  case self::EXPECT_ATTRIBUTE_VALUE:
                      if ($attributeValue === '') {
                          if ($attributeQuote === null) {
                              if ($isWhitespace) {
                                  // skip whitespace
                              } elseif ($ch === '"' || $ch === "'") {
                                  $attributeQuote = $ch;
                              } else {
                                  $attributeQuote = ''; // explicitly unquoted
                                  $p--;
                              }
                          } elseif ($ch === $attributeQuote) {
                              // Empty quoted string
                              $endAttribute($attributeQuote !== '');
                              $expect = self::EXPECT_ATTRIBUTE_NAME_OR_END;
                          } elseif ($attributeQuote === '' && ($ch === '/' || $ch === '>')) {
                              // `=` followed directly by `/` or `>`: no value
                              return null;
                          } else {
                              $attributeValue .= $ch;
                          }
                      } elseif ($ch === $attributeQuote) {
                          $endAttribute($attributeQuote !== '');
                          $expect = self::EXPECT_ATTRIBUTE_NAME_OR_END;
                      } elseif ($attributeQuote === '' && $isWhitespace) {
                          $endAttribute();
                          $expect = self::EXPECT_ATTRIBUTE_NAME_OR_END;
                      } elseif ($attributeQuote === '' && $ch === '>') {
                          // An unquoted value cannot contain `>`; it ends both
                          // the value and the tag (e.g. `<a href=foo>`).
                          $endAttribute();
                          $fullTag = mb_substr($line, 0, $p + 1);
                      } else {
                          $attributeValue .= $ch;
                      }
                      break;

                  case self::EXPECT_CLOSE_BRACKET:
                      if ($ch === '>') {
                          $fullTag = mb_substr($line, 0, $p + 1);
                      }
                      // whitespace and any other characters are skipped
                      break;
              }
          }
          if ($fullTag === null) return null;
          $endAttribute();
          return new MDHTMLTag($fullTag, $tagName, $isCloser, $attributes);
      }
  }
  /**
   * Represents HTML modifications to a node, such as CSS classes to add or
   * additional attributes. See `MDHTMLFilter.scrubModifier()` to remove disallowed
   * values.
   */
  class MDTagModifier {
      /**
       * Verbatim markdown syntax. Unmodified by changes to other properties.
       */
      public string $original;
      /** @var string[] */
      public array $cssClasses = [];
      public ?string $cssId = null;
      /** @var array style name -> style value */
      public array $cssStyles = [];
      /** @var array attribute name -> attribute value */
      public array $attributes = [];

      // The four `base*` patterns are not referenced within this class.
      private static $baseClassRegex = '\\.([a-z_\\-][a-z0-9_\\-]*?)';
      private static $baseIdRegex = '#([a-z_\\-][a-z0-9_\\-]*?)';
      private static $baseAttributeRegex = '([a-z0-9]+?)=([^\\s\\}]+?)';
      private static $baseRegex = '\\{([^}]+?)}';
      private static $leadingClassRegex = '^\\{([^}]+?)}';
      private static $trailingClassRegex = '^(.*?)\\s*\\{([^}]+?)}\\s*$';
      private static $classRegex = '^\\.([a-z_\\-][a-z0-9_\\-]*?)$'; // 1=classname
      private static $idRegex = '^#([a-z_\\-][a-z0-9_\\-]*?)$'; // 1=id
      private static $attributeRegex = '^([a-z0-9]+?)=([^\\s\\}]+?)$'; // 1=attribute name, 2=attribute value

      /**
       * Copies this modifier's classes, id, attributes, and styles onto a node.
       * Existing attributes and styles with the same name are overwritten.
       */
      public function applyTo(MDNode $node) {
          foreach ($this->cssClasses as $cssClass) {
              $node->addClass($cssClass);
          }
          // Explicit checks rather than truthiness so an id of '0' is applied
          if ($this->cssId !== null && $this->cssId !== '') {
              $node->cssId = $this->cssId;
          }
          foreach ($this->attributes as $name => $value) {
              $node->attributes[$name] = $value;
          }
          foreach ($this->cssStyles as $name => $value) {
              $node->cssStyles[$name] = $value;
          }
      }

      /**
       * Adds a CSS class. If already present it will not be duplicated.
       *
       * @return bool `true` if the class was added, `false` if it was already
       * present
       */
      public function addClass(string $cssClass): bool {
          if (in_array($cssClass, $this->cssClasses, true)) return false;
          $this->cssClasses[] = $cssClass;
          return true;
      }

      /**
       * Removes a CSS class.
       *
       * @return bool `true` if the class was present and removed
       */
      public function removeClass(string $cssClass): bool {
          $beforeLength = count($this->cssClasses);
          // array_diff preserves keys; reindex so the class list stays a list
          $this->cssClasses = array_values(array_diff($this->cssClasses, [ $cssClass ]));
          return count($this->cssClasses) !== $beforeLength;
      }

      /**
       * Tests whether another value is an `MDTagModifier` with the same CSS
       * classes, id, and attributes.
       *
       * NOTE(review): `cssStyles` and `original` are not compared — confirm
       * that is intended.
       */
      public function equals($other): bool {
          if (!($other instanceof MDTagModifier)) return false;
          if (!MDUtils::equal($other->cssClasses, $this->cssClasses)) return false;
          if ($other->cssId !== $this->cssId) return false;
          if (!MDUtils::equal($other->attributes, $this->attributes)) return false;
          return true;
      }

      public function __toString(): string {
          return $this->original;
      }

      /**
       * Splits a `key:value;key:value` string into a map. Pairs that do not
       * contain exactly one `:` are skipped. Keys and values are not trimmed.
       */
      private static function styleToObject(string $styleValue): array {
          $styles = [];
          foreach (explode(';', $styleValue) as $pair) {
              $keyAndValue = explode(':', $pair);
              if (count($keyAndValue) !== 2) continue;
              $styles[$keyAndValue[0]] = $keyAndValue[1];
          }
          return $styles;
      }

      /**
       * Parses the text between `{` and `}` into a modifier. The contents are
       * split on whitespace; each token must be `.class`, `#id`, or
       * `name=value`. A `style=...` token is parsed into `cssStyles`.
       *
       * @return ?MDTagModifier the modifier, or `null` if any token is not
       * recognized
       */
      private static function fromContents(string $contents): ?MDTagModifier {
          $mod = new MDTagModifier();
          $mod->original = '{' . $contents . '}';
          foreach (mb_split('\\s+', $contents) as $token) {
              if (trim($token) === '') continue;
              if (mb_eregi(self::$classRegex, $token, $groups)) {
                  $mod->addClass($groups[1]);
              } elseif (mb_eregi(self::$idRegex, $token, $groups)) {
                  $mod->cssId = $groups[1];
              } elseif (mb_eregi(self::$attributeRegex, $token, $groups)) {
                  if ($groups[1] === 'style') {
                      $mod->cssStyles = self::styleToObject($groups[2]);
                  } else {
                      $mod->attributes[$groups[1]] = $groups[2];
                  }
              } else {
                  return null;
              }
          }
          return $mod;
      }

      /**
       * Extracts block modifier from end of a line. Always returns a 2-element
       * tuple array:
       * - `0`: the line without the modifier
       * - `1`: an `MDTagModifier` if found or `null` if not
       *
       * When a state is given, the line is returned untouched unless an
       * `MDModifierReader` is among the root state's `readersByBlockPriority`.
       * If the braces are found but their contents do not parse, element `0`
       * is still the line with the braces removed and element `1` is `null`.
       *
       * @param string $line
       * @param ?MDState $state
       * @return array tuple with remaining line and `MDTagModifier` or `null`
       */
      public static function fromLine(string $line, ?MDState $state): array {
          if ($state) {
              $found = false;
              foreach ($state->root()->readersByBlockPriority as $reader) {
                  if ($reader instanceof MDModifierReader) {
                      $found = true;
                      break;
                  }
              }
              if (!$found) return [ $line, null ];
          }
          if (!mb_eregi(self::$trailingClassRegex, $line, $groups)) return [ $line, null ];
          return [ $groups[1], self::fromContents($groups[2]) ];
      }

      /**
       * Attempts to extract modifier from head of string.
       *
       * @return ?MDTagModifier the modifier, or `null` if the string does not
       * start with a parseable `{...}` group
       */
      public static function fromStart(string $line): ?MDTagModifier {
          if (!mb_eregi(self::$leadingClassRegex, $line, $groups)) return null;
          return self::fromContents($groups[1]);
      }

      /**
       * Discards any trailing `{...}` modifier from a line and returns what
       * remains. The braces' contents are not validated.
       */
      public static function strip(string $line): string {
          if (!mb_eregi(self::$trailingClassRegex, $line, $groups)) return $line;
          return $groups[1];
      }
  }
  1363. // -- Readers ---------------------------------------------------------------
  /**
   * Base class of every reader declared in this section. Declared without
   * members here.
   */
  class MDReader
  {
  }

  class MDUnderlinedHeadingReader extends MDReader
  {
  }

  class MDHashHeadingReader extends MDReader
  {
  }

  class MDSubtextReader extends MDReader
  {
  }

  class MDBlockQuoteReader extends MDReader
  {
  }

  /**
   * Shared parent of the unordered and ordered list readers.
   */
  class _MDListReader extends MDReader
  {
  }

  class MDUnorderedListReader extends _MDListReader
  {
  }

  class MDOrderedListReader extends _MDListReader
  {
  }

  class MDFencedCodeBlockReader extends MDReader
  {
  }

  class MDIndentedCodeBlockReader extends MDReader
  {
  }

  class MDHorizontalRuleReader extends MDReader
  {
  }

  class MDTableReader extends MDReader
  {
  }

  class MDDefinitionListReader extends MDReader
  {
  }

  class MDFootnoteReader extends MDReader
  {
  }

  class MDAbbreviationReader extends MDReader
  {
  }

  class MDParagraphReader extends MDReader
  {
  }

  /**
   * Shared parent of the emphasis, strong, strikethrough, underline,
   * highlight, code span, subscript, and superscript readers.
   */
  class MDSimplePairInlineReader extends MDReader
  {
  }

  class MDEmphasisReader extends MDSimplePairInlineReader
  {
  }

  class MDStrongReader extends MDSimplePairInlineReader
  {
  }

  class MDStrikethroughReader extends MDSimplePairInlineReader
  {
  }

  class MDUnderlineReader extends MDSimplePairInlineReader
  {
  }

  class MDHighlightReader extends MDSimplePairInlineReader
  {
  }

  class MDCodeSpanReader extends MDSimplePairInlineReader
  {
  }

  class MDSubscriptReader extends MDSimplePairInlineReader
  {
  }

  class MDSuperscriptReader extends MDSimplePairInlineReader
  {
  }

  class MDLinkReader extends MDReader
  {
  }

  class MDReferencedLinkReader extends MDLinkReader
  {
  }

  class MDImageReader extends MDLinkReader
  {
  }

  class MDReferencedImageReader extends MDReferencedLinkReader
  {
  }

  class MDLineBreakReader extends MDReader
  {
  }

  class MDHTMLTagReader extends MDReader
  {
  }

  class MDModifierReader extends MDReader
  {
  }
  1396. // -- Nodes -----------------------------------------------------------------
  /**
   * Base class of every node declared in this section. Declared without
   * members here.
   */
  class MDNode
  {
  }

  /**
   * Parent of the block-level node classes below.
   */
  class MDBlockNode extends MDNode
  {
  }

  class MDParagraphNode extends MDBlockNode
  {
  }

  class MDHeadingNode extends MDBlockNode
  {
  }

  class MDSubtextNode extends MDBlockNode
  {
  }

  class MDHorizontalRuleNode extends MDBlockNode
  {
  }

  class MDBlockquoteNode extends MDBlockNode
  {
  }

  class MDUnorderedListNode extends MDBlockNode
  {
  }

  class MDOrderedListNode extends MDBlockNode
  {
  }

  class MDListItemNode extends MDBlockNode
  {
  }

  class MDCodeBlockNode extends MDBlockNode
  {
  }

  class MDTableNode extends MDBlockNode
  {
  }

  class MDTableRowNode extends MDBlockNode
  {
  }

  class MDTableCellNode extends MDBlockNode
  {
  }

  class MDTableHeaderCellNode extends MDBlockNode
  {
  }

  class MDDefinitionListNode extends MDBlockNode
  {
  }

  class MDDefinitionListTermNode extends MDBlockNode
  {
  }

  class MDDefinitionListDefinitionNode extends MDBlockNode
  {
  }

  class MDFootnoteListNode extends MDBlockNode
  {
  }

  /**
   * Parent of the inline node classes below.
   */
  class MDInlineNode extends MDNode
  {
  }

  class MDTextNode extends MDInlineNode
  {
  }

  class MDObfuscatedTextNode extends MDTextNode
  {
  }

  class MDEmphasisNode extends MDInlineNode
  {
  }

  class MDStrongNode extends MDInlineNode
  {
  }

  class MDStrikethroughNode extends MDInlineNode
  {
  }

  class MDUnderlineNode extends MDInlineNode
  {
  }

  class MDHighlightNode extends MDInlineNode
  {
  }

  class MDSuperscriptNode extends MDInlineNode
  {
  }

  class MDSubscriptNode extends MDInlineNode
  {
  }

  class MDCodeNode extends MDInlineNode
  {
  }

  class MDFootnoteNode extends MDInlineNode
  {
  }

  class MDLinkNode extends MDInlineNode
  {
  }

  class MDReferencedLinkNode extends MDLinkNode
  {
  }

  class MDImageNode extends MDInlineNode
  {
  }

  class MDReferencedImageNode extends MDImageNode
  {
  }

  class MDAbbreviationNode extends MDInlineNode
  {
  }

  class MDLineBreakNode extends MDInlineNode
  {
  }

  class MDHTMLTagNode extends MDInlineNode
  {
  }
  1435. // -- Main class ------------------------------------------------------------
  /**
   * Main class. Declared without members here.
   */
  class Markdown
  {
  }
  1437. ?>