
711 lines
20 KiB
Raw Normal View History

2024-02-14 14:10:47 +00:00
/**
 * @typedef {import('hast').Comment} Comment
 * @typedef {import('hast').Doctype} Doctype
 * @typedef {import('hast').Element} Element
 * @typedef {import('hast').Nodes} Nodes
 * @typedef {import('hast').Root} Root
 * @typedef {import('hast').RootContent} RootContent
 * @typedef {import('hast').Text} Text
 * @typedef {import('mdast-util-to-hast').Raw} Raw
 * @typedef {import('parse5').DefaultTreeAdapterMap} DefaultTreeAdapterMap
 * @typedef {import('parse5').ParserOptions<DefaultTreeAdapterMap>} ParserOptions
 * @typedef {import('parse5').Token.CharacterToken} CharacterToken
 * @typedef {import('parse5').Token.CommentToken} CommentToken
 * @typedef {import('parse5').Token.DoctypeToken} DoctypeToken
 * @typedef {import('parse5').Token.Location} Location
 * @typedef {import('parse5').Token.TagToken} TagToken
 * @typedef {import('unist').Point} Point
 * @typedef {import('vfile').VFile} VFile
 */

/**
 * @typedef Options
 *   Configuration.
 * @property {VFile | null | undefined} [file]
 *   Corresponding virtual file representing the input document (optional).
 * @property {Array<Nodes['type']> | null | undefined} [passThrough]
 *   List of custom hast node types to pass through (as in, keep) (optional).
 *
 *   If the passed through nodes have children, those children are expected to
 *   be hast again and will be handled.
 *
 * @typedef State
 *   Info passed around about the current state.
 * @property {(node: Nodes) => undefined} handle
 *   Add a hast node to the parser.
 * @property {Options} options
 *   User configuration.
 * @property {Parser<DefaultTreeAdapterMap>} parser
 *   Current parser.
 * @property {boolean} stitches
 *   Whether there are stitches.
 *
 * @typedef {{type: 'comment', value: {stitch: Nodes}}} Stitch
 *   Custom comment-like value we pass through parse5, which contains a
 *   replacement node that we'll swap back in afterwards.
 */

import structuredClone from '@ungap/structured-clone'
import {fromParse5} from 'hast-util-from-parse5'
import {toParse5} from 'hast-util-to-parse5'
import {htmlVoidElements} from 'html-void-elements'
import {Parser, Token, TokenizerMode, html} from 'parse5'
import {pointEnd, pointStart} from 'unist-util-position'
import {visit} from 'unist-util-visit'
import {webNamespaces} from 'web-namespaces'
import {zwitch} from 'zwitch'
// Node types associated with MDX.
// Only used to extend the error message for unhandled nodes with a hint.
// (The reference link that followed this comment is empty in this copy.)
const knownMdxNames = new Set([
  'mdxFlowExpression',
  'mdxJsxFlowElement',
  'mdxJsxTextElement',
  'mdxTextExpression',
  'mdxjsEsm'
])

// Options handed to every `parse5` parser that `raw` creates: source location
// info is requested and scripting is flagged as disabled.
/** @type {ParserOptions} */
const parseOptions = {sourceCodeLocationInfo: true, scriptingEnabled: false}
/**
 * Pass a hast tree through an HTML parser, which will fix nesting, and turn
 * raw nodes into actual nodes.
 *
 * @param {Nodes} tree
 *   Original hast tree to transform.
 * @param {Options | null | undefined} [options]
 *   Configuration (optional).
 * @returns {Nodes}
 *   Parsed again tree.
 */
export function raw(tree, options) {
  // Decides between a full document parser and a fragment parser below.
  const document = documentMode(tree)
  /** @type {(node: Nodes, state: State) => undefined} */
  const one = zwitch('type', {
    handlers: {root, element, text, comment, doctype, raw: handleRaw},
    // Node types without a dedicated handler are either passed through as
    // stitches or rejected with an error.
    unknown
  })

  /** @type {State} */
  const state = {
    parser: document
      ? new Parser(parseOptions)
      : Parser.getFragmentParser(undefined, parseOptions),
    handle(node) {
      one(node, state)
    },
    stitches: false,
    options: options || {}
  }

  one(tree, state)
  // Reset once more after the last node (no node, so no position is passed).
  resetTokenizer(state, pointStart())

  const p5 = document ? state.parser.document : state.parser.getFragment()
  const result = fromParse5(p5, {
    // To do: support `space`?
    file: state.options.file
  })

  if (state.stitches) {
    // Swap the comment-like placeholders back for the nodes they carry.
    visit(result, 'comment', function (node, index, parent) {
      const stitch = /** @type {Stitch} */ (/** @type {unknown} */ (node))
      if (stitch.value.stitch && parent && index !== undefined) {
        /** @type {Array<RootContent>} */
        const siblings = parent.children
        // @ts-expect-error: assume the stitch is allowed.
        siblings[index] = stitch.value.stitch
        // Hand the index back to `visit` (see `unist-util-visit` for how a
        // returned number steers traversal).
        return index
      }
    })
  }

  // Unpack if possible and when not given a `root`.
  if (
    result.type === 'root' &&
    result.children.length === 1 &&
    result.children[0].type === tree.type
  ) {
    return result.children[0]
  }

  return result
}
/**
 * Transform all nodes
 *
 * Each node is handed to `state.handle`, in order.
 *
 * @param {Array<RootContent>} nodes
 *   hast content.
 * @param {State} state
 *   Info passed around about the current state.
 * @returns {undefined}
 *   Nothing.
 */
function all(nodes, state) {
  let index = -1

  // Guard against a missing `children` list on malformed input nodes.
  /* istanbul ignore else - invalid nodes, see rehypejs/rehype-raw#7. */
  if (nodes) {
    while (++index < nodes.length) {
      state.handle(nodes[index])
    }
  }
}
/**
 * Transform a root.
 *
 * A root emits no token of its own; only its children are handled.
 *
 * @param {Root} node
 *   hast root node.
 * @param {State} state
 *   Info passed around about the current state.
 * @returns {undefined}
 *   Nothing.
 */
function root(node, state) {
  all(node.children, state)
}
/**
 * Transform an element.
 *
 * Emits a start tag, then handles the children, then emits an end tag.
 *
 * @param {Element} node
 *   hast element node.
 * @param {State} state
 *   Info passed around about the current state.
 * @returns {undefined}
 *   Nothing.
 */
function element(node, state) {
  startTag(node, state)
  all(node.children, state)
  endTag(node, state)
}
/**
 * Transform a text.
 *
 * @param {Text} node
 *   hast text node.
 * @param {State} state
 *   Info passed around about the current state.
 * @returns {undefined}
 *   Nothing.
 */
function text(node, state) {
// NOTE(review): this copy of the function is truncated. The closing brace of
// the token literal, the closing brace of the function, and the statement
// that the last pair of suppression comments refers to are all absent.
// Restore them from version control before relying on this block.
// Build a character token from the text value and the node's position.
/** @type {CharacterToken} */
const token = {
type: Token.TokenType.CHARACTER,
chars: node.value,
location: createParse5Location(node)
// Reset the tokenizer and move it to where this node starts.
resetTokenizer(state, pointStart(node))
// Install the token as the parser's current token.
// @ts-expect-error: private.
// type-coverage:ignore-next-line
state.parser.currentToken = token
// NOTE(review): presumably the token is handed to the parser here through a
// private API; the statement itself is missing from this copy.
// @ts-expect-error: private.
// type-coverage:ignore-next-line
/**
 * Transform a doctype.
 *
 * @param {Doctype} node
 *   hast doctype node.
 * @param {State} state
 *   Info passed around about the current state.
 * @returns {undefined}
 *   Nothing.
 */
function doctype(node, state) {
// NOTE(review): this copy of the function is truncated. The closing brace of
// the token literal, the closing brace of the function, and the statement
// that the last pair of suppression comments refers to are all absent.
// Restore them from version control before relying on this block.
// The token is always an `html` doctype with empty identifiers and quirks
// mode not forced; only the position is taken from `node`.
/** @type {DoctypeToken} */
const token = {
type: Token.TokenType.DOCTYPE,
name: 'html',
forceQuirks: false,
publicId: '',
systemId: '',
location: createParse5Location(node)
// Reset the tokenizer and move it to where this node starts.
resetTokenizer(state, pointStart(node))
// Install the token as the parser's current token.
// @ts-expect-error: private.
// type-coverage:ignore-next-line
state.parser.currentToken = token
// NOTE(review): presumably the token is handed to the parser here through a
// private API; the statement itself is missing from this copy.
// @ts-expect-error: private.
// type-coverage:ignore-next-line
/**
 * Transform a stitch.
 *
 * The node is cloned, wrapped in a comment-like placeholder, and sent through
 * the parser; `raw` swaps the clone back in afterwards.
 *
 * @param {Nodes} node
 *   unknown node.
 * @param {State} state
 *   Info passed around about the current state.
 * @returns {undefined}
 *   Nothing.
 */
function stitch(node, state) {
  // Mark that there are stitches, so we need to walk the tree and revert them.
  state.stitches = true

  /** @type {Nodes} */
  const clone = cloneWithoutChildren(node)

  // Recurse, because to somewhat handle `[<x>]</x>` (where `[]` denotes the
  // passed through node).
  if ('children' in node && 'children' in clone) {
    // Root in root out.
    const fakeRoot = /** @type {Root} */ (
      raw({type: 'root', children: node.children}, state.options)
    )
    clone.children = fakeRoot.children
  }

  // Hack: `value` is supposed to be a string, but as none of the tools
  // (`parse5` or `hast-util-from-parse5`) looks at it, we can pass nodes
  // through.
  comment({type: 'comment', value: {stitch: clone}}, state)
}
/**
 * Transform a comment (or stitch).
 *
 * @param {Comment | Stitch} node
 *   hast comment node.
 * @param {State} state
 *   Info passed around about the current state.
 * @returns {undefined}
 *   Nothing.
 */
function comment(node, state) {
// NOTE(review): this copy of the function is truncated. The closing brace of
// the token literal, the closing brace of the function, and the statement
// that the last pair of suppression comments refers to are all absent.
// Restore them from version control before relying on this block.
// For a stitch, `node.value` is an object rather than a string; it is passed
// along untouched.
/** @type {string} */
// @ts-expect-error: we pass stitches through.
const data = node.value
// NOTE(review): `data` is not referenced in the visible lines; presumably it
// is a field of the token literal below and that line was lost — confirm.
/** @type {CommentToken} */
const token = {
type: Token.TokenType.COMMENT,
location: createParse5Location(node)
// Reset the tokenizer and move it to where this node starts.
resetTokenizer(state, pointStart(node))
// Install the token as the parser's current token.
// @ts-expect-error: private.
// type-coverage:ignore-next-line
state.parser.currentToken = token
// NOTE(review): presumably the token is handed to the parser here through a
// private API; the statement itself is missing from this copy.
// @ts-expect-error: private.
// type-coverage:ignore-next-line
/**
 * Transform a raw node.
 *
 * @param {Raw} node
 *   hast raw node.
 * @param {State} state
 *   Info passed around about the current state.
 * @returns {undefined}
 *   Nothing.
 */
function handleRaw(node, state) {
// NOTE(review): this copy of the function is truncated. The closing braces
// and two statements are absent (each flagged below). Restore them from
// version control before relying on this block.
// Unlike the other handlers, a raw node carries an HTML string, so it goes
// through the real tokenizer instead of a hand-built token.
// Reset preprocessor:
// See: <>.
state.parser.tokenizer.preprocessor.html = ''
// @ts-expect-error: private.
// type-coverage:ignore-next-line
state.parser.tokenizer.preprocessor.pos = -1
// @ts-expect-error: private.
// type-coverage:ignore-next-line
state.parser.tokenizer.preprocessor.lastGapPos = -2
// @ts-expect-error: private.
// type-coverage:ignore-next-line
state.parser.tokenizer.preprocessor.gapStack = []
// @ts-expect-error: private.
// type-coverage:ignore-next-line
state.parser.tokenizer.preprocessor.skipNextNewLine = false
// @ts-expect-error: private.
// type-coverage:ignore-next-line
state.parser.tokenizer.preprocessor.lastChunkWritten = false
state.parser.tokenizer.preprocessor.endOfChunkHit = false
// @ts-expect-error: private.
// type-coverage:ignore-next-line
state.parser.tokenizer.preprocessor.isEol = false
// Now pass `node.value`.
setPoint(state, pointStart(node))
state.parser.tokenizer.write(node.value, false)
// NOTE(review): the statement these two suppression comments apply to is
// missing from this copy; presumably a private tokenizer call that processes
// what was just written — confirm.
// @ts-expect-error: private.
// type-coverage:ignore-next-line
// Character references hang, so if we ended there, we need to flush
// those too.
// We reset the preprocessor as if the document ends here.
// Then one single call to the relevant state does the trick, parse5
// consumes the whole token.
// Note: `State` is not exposed by `parse5`, so these numbers are fragile.
// See: <>
// Note: a change to `parse5`, which breaks this, was merged but not released.
// Investigate when it is.
if (
state.parser.tokenizer.state === 72 /* NAMED_CHARACTER_REFERENCE */ ||
state.parser.tokenizer.state === 78 /* NUMERIC_CHARACTER_REFERENCE_END */
) {
// Pretend the last chunk was written, so the hanging reference can finish.
// @ts-expect-error: private.
// type-coverage:ignore-next-line
state.parser.tokenizer.preprocessor.lastChunkWritten = true
/** @type {number} */
// @ts-expect-error: private.
// type-coverage:ignore-next-line
const cp = state.parser.tokenizer._consume()
// NOTE(review): `cp` is not used in the visible lines. The "one single call
// to the relevant state" described above is missing from this copy.
// @ts-expect-error: private.
// type-coverage:ignore-next-line
* Crash on an unknown node.
* @param {unknown} node_
* unknown node.
* @param {State} state
* Info passed around about the current state.
* @returns {undefined}
* Never.
function unknown(node_, state) {
const node = /** @type {Nodes} */ (node_)
if (
state.options.passThrough &&
) {
stitch(node, state)
} else {
let extra = ''
if (knownMdxNames.has(node.type)) {
extra =
". It looks like you are using MDX nodes with `hast-util-raw` (or `rehype-raw`). If you use this because you are using remark or rehype plugins that inject `'html'` nodes, then please raise an issue with that plugin, as its a bad and slow idea. If you use this because you are using markdown syntax, then you have to configure this utility (or plugin) to pass through these nodes (see `passThrough` in docs), but you can also migrate to use the MDX syntax"
throw new Error('Cannot compile `' + node.type + '` node' + extra)
/**
 * Reset the tokenizer of a parser.
 *
 * @param {State} state
 *   Info passed around about the current state.
 * @param {Point | undefined} point
 *   Point.
 * @returns {undefined}
 *   Nothing.
 */
function resetTokenizer(state, point) {
// NOTE(review): this copy of the function is truncated. The closing braces
// and at least two statements are absent (each flagged below). Restore them
// from version control before relying on this block.
// Move the tokenizer's position bookkeeping to `point` (if given).
setPoint(state, point)
// Process final characters if they're still there after hibernating.
/** @type {CharacterToken} */
// @ts-expect-error: private.
// type-coverage:ignore-next-line
const token = state.parser.tokenizer.currentCharacterToken
if (token && token.location) {
// Close off the pending character token at the preprocessor's position.
token.location.endLine = state.parser.tokenizer.preprocessor.line
token.location.endCol = state.parser.tokenizer.preprocessor.col + 1
token.location.endOffset = state.parser.tokenizer.preprocessor.offset + 1
// @ts-expect-error: private.
// type-coverage:ignore-next-line
state.parser.currentToken = token
// NOTE(review): presumably the pending token is handed to the parser here
// through a private API; the statement is missing from this copy, as is the
// brace closing this `if`.
// @ts-expect-error: private.
// type-coverage:ignore-next-line
// Reset tokenizer:
// See: <>.
// Especially putting it back in the `data` state is useful: some elements,
// like textareas and iframes, change the state.
// See GH-7.
// But also if broken HTML is in `raw`, and then a correct element is given.
// See GH-11.
// @ts-expect-error: private.
// type-coverage:ignore-next-line
state.parser.tokenizer.paused = false
// @ts-expect-error: private.
// type-coverage:ignore-next-line
state.parser.tokenizer.inLoop = false
// Note: don't reset `state`, `inForeignNode`, or `lastStartTagName`, we
// manually update those when needed.
// NOTE(review): in this copy the comment line above ended with a stray
// "= false", which looks like the tail of an assignment that was merged into
// the comment. Its target cannot be recovered from here — confirm against
// version control.
// @ts-expect-error: private.
// type-coverage:ignore-next-line
state.parser.tokenizer.returnState = TokenizerMode.DATA
// @ts-expect-error: private.
// type-coverage:ignore-next-line
state.parser.tokenizer.charRefCode = -1
// @ts-expect-error: private.
// type-coverage:ignore-next-line
state.parser.tokenizer.consumedAfterSnapshot = -1
// @ts-expect-error: private.
// type-coverage:ignore-next-line
state.parser.tokenizer.currentLocation = null
// @ts-expect-error: private.
// type-coverage:ignore-next-line
state.parser.tokenizer.currentCharacterToken = null
// @ts-expect-error: private.
// type-coverage:ignore-next-line
state.parser.tokenizer.currentToken = null
// @ts-expect-error: private.
// type-coverage:ignore-next-line
state.parser.tokenizer.currentAttr = {name: '', value: ''}
/**
 * Set current location.
 *
 * Does nothing when `point` is missing or has no `offset`.
 *
 * @param {State} state
 *   Info passed around about the current state.
 * @param {Point | undefined} point
 *   Point.
 * @returns {undefined}
 *   Nothing.
 */
function setPoint(state, point) {
  if (point && point.offset !== undefined) {
    // Only the start is known here, so the end fields get `-1` placeholders.
    /** @type {Location} */
    const location = {
      startLine: point.line,
      startCol: point.column,
      startOffset: point.offset,
      endLine: -1,
      endCol: -1,
      endOffset: -1
    }

    // @ts-expect-error: private.
    // type-coverage:ignore-next-line
    state.parser.tokenizer.preprocessor.lineStartPos = -point.column + 1 // Looks weird, but ensures we get correct positional info.
    state.parser.tokenizer.preprocessor.droppedBufferSize = point.offset
    state.parser.tokenizer.preprocessor.line = point.line
    // @ts-expect-error: private.
    // type-coverage:ignore-next-line
    state.parser.tokenizer.currentLocation = location
  }
}
/**
 * Emit a start tag.
 *
 * @param {Element} node
 *   Element.
 * @param {State} state
 *   Info passed around about the current state.
 * @returns {undefined}
 *   Nothing.
 */
function startTag(node, state) {
// NOTE(review): this copy of the function is truncated. Closing braces and
// parentheses are absent, and at least one statement is missing (flagged
// below). Restore them from version control before relying on this block.
// Ignore tags if we're in plain text.
if (state.parser.tokenizer.state === TokenizerMode.PLAINTEXT) return
resetTokenizer(state, pointStart(node))
// Take the namespace from the currently open element (HTML when it has
// none), and switch to SVG when an `svg` tag opens inside HTML.
const current = state.parser.openElements.current
let ns = 'namespaceURI' in current ? current.namespaceURI : webNamespaces.html
if (ns === webNamespaces.html && node.tagName === 'svg') {
ns = webNamespaces.svg
// Convert the element (without children) to get its attributes.
const result = toParse5(
// Shallow clone to not delve into `children`: we only need the attributes.
{...node, children: []},
{space: ns === webNamespaces.svg ? 'svg' : 'html'}
// Always element.
/* c8 ignore next */
const attrs = 'attrs' in result ? result.attrs : []
// NOTE(review): `attrs` is not referenced in the visible lines; presumably
// it is a field of the tag literal below and that line was lost — confirm.
/** @type {TagToken} */
const tag = {
type: Token.TokenType.START_TAG,
tagName: node.tagName,
tagID: html.getTagID(node.tagName),
// We always send start and end tags.
selfClosing: false,
ackSelfClosing: false,
location: createParse5Location(node)
// The HTML parsing algorithm works by doing half of the state management in
// the tokenizer and half in the parser.
// We can't use the tokenizer here, as we don't have strings.
// So we act *as if* the tokenizer emits tokens:
// @ts-expect-error: private.
// type-coverage:ignore-next-line
state.parser.currentToken = tag
// NOTE(review): the statement these two suppression comments apply to is
// missing from this copy; presumably the tag is handed to the parser here.
// @ts-expect-error: private.
// type-coverage:ignore-next-line
// …but then we still need a bunch of work that the tokenizer would normally
// do, such as:
// Set a tag name, similar to how the tokenizer would do it.
state.parser.tokenizer.lastStartTagName = node.tagName
// `inForeignNode` is correctly set by the parser.
/**
 * Emit an end tag.
 *
 * @param {Element} node
 *   Element.
 * @param {State} state
 *   Info passed around about the current state.
 * @returns {undefined}
 *   Nothing.
 */
function endTag(node, state) {
// NOTE(review): this copy of the function is truncated. Closing braces are
// absent and several statements are missing (each flagged below). Restore
// them from version control before relying on this block.
// Do not emit closing tags for HTML void elements.
// NOTE(review): the condition below ends in a dangling `&&` and the body of
// the `if` is absent. Presumably the second operand checks the tag name
// against the imported `htmlVoidElements` list and the body returns early —
// confirm.
if (
!state.parser.tokenizer.inForeignNode &&
) {
// Ignore tags if we're in plain text.
if (state.parser.tokenizer.state === TokenizerMode.PLAINTEXT) return
// Unlike the start tag, position the tokenizer at the END of the element.
resetTokenizer(state, pointEnd(node))
/** @type {TagToken} */
const tag = {
type: Token.TokenType.END_TAG,
tagName: node.tagName,
tagID: html.getTagID(node.tagName),
selfClosing: false,
ackSelfClosing: false,
attrs: [],
location: createParse5Location(node)
// The HTML parsing algorithm works by doing half of the state management in
// the tokenizer and half in the parser.
// We can't use the tokenizer here, as we don't have strings.
// So we act *as if* the tokenizer emits tokens:
// @ts-expect-error: private.
// type-coverage:ignore-next-line
state.parser.currentToken = tag
// NOTE(review): the statement these two suppression comments apply to is
// missing from this copy; presumably the tag is handed to the parser here.
// @ts-expect-error: private.
// type-coverage:ignore-next-line
// …but then we still need a bunch of work that the tokenizer would normally
// do, such as:
// Switch back to the data state after alternative states that don't accept
// tags:
if (
// Current element is closed.
tag.tagName === state.parser.tokenizer.lastStartTagName &&
// `<textarea>` and `<title>`
(state.parser.tokenizer.state === TokenizerMode.RCDATA ||
// `<iframe>`, `<noembed>`, `<style>`, `<xmp>`
state.parser.tokenizer.state === TokenizerMode.RAWTEXT ||
// `<script>`
state.parser.tokenizer.state === TokenizerMode.SCRIPT_DATA)
) {
state.parser.tokenizer.state = TokenizerMode.DATA
/**
 * Check if `node` represents a whole document or a fragment.
 *
 * A whole document is assumed when the first relevant node (the first child
 * of a root, or the node itself otherwise) is a doctype or an `html` element.
 *
 * @param {Nodes} node
 *   hast node.
 * @returns {boolean}
 *   Whether this represents a whole document or a fragment.
 */
function documentMode(node) {
  const head = node.type === 'root' ? node.children[0] : node
  return Boolean(
    head &&
      (head.type === 'doctype' ||
        (head.type === 'element' && head.tagName === 'html'))
  )
}
/**
 * Get a `parse5` location from a node.
 *
 * Fields are `undefined` when the node has no start or end point.
 *
 * @param {Nodes | Stitch} node
 *   hast node.
 * @returns {Location}
 *   `parse5` location.
 */
function createParse5Location(node) {
  const start = pointStart(node) || {
    line: undefined,
    column: undefined,
    offset: undefined
  }
  const end = pointEnd(node) || {
    line: undefined,
    column: undefined,
    offset: undefined
  }

  /** @type {Record<keyof Location, number | undefined>} */
  const location = {
    startLine: start.line,
    startCol: start.column,
    startOffset: start.offset,
    endLine: end.line,
    endCol: end.column,
    endOffset: end.offset
  }

  // @ts-expect-error: unist point values can be `undefined` in hast, which
  // `parse5` types don't want.
  return location
}
/**
 * Clone a node, leaving its children out.
 *
 * The input node is not modified. A node with `children` yields a clone whose
 * `children` is an empty array.
 *
 * @template {Nodes} NodeType
 *   Node type.
 * @param {NodeType} node
 *   Node to clone.
 * @returns {NodeType}
 *   Cloned node, without children.
 */
function cloneWithoutChildren(node) {
  return 'children' in node
    ? structuredClone({...node, children: []})
    : structuredClone(node)
}