/**
 * @typedef {import('micromark-util-types').Chunk} Chunk
 * @typedef {import('micromark-util-types').Code} Code
 * @typedef {import('micromark-util-types').Construct} Construct
 * @typedef {import('micromark-util-types').ConstructRecord} ConstructRecord
 * @typedef {import('micromark-util-types').Effects} Effects
 * @typedef {import('micromark-util-types').InitialConstruct} InitialConstruct
 * @typedef {import('micromark-util-types').ParseContext} ParseContext
 * @typedef {import('micromark-util-types').Point} Point
 * @typedef {import('micromark-util-types').State} State
 * @typedef {import('micromark-util-types').Token} Token
 * @typedef {import('micromark-util-types').TokenType} TokenType
 * @typedef {import('micromark-util-types').TokenizeContext} TokenizeContext
 */

/**
 * @callback Restore
 *   Reset the tokenizer to a previously stored state.
 * @returns {undefined}
 *
 * @typedef Info
 *   Snapshot handle produced by `store`.
 * @property {Restore} restore
 *   Roll the tokenizer back to the moment the snapshot was taken.
 * @property {number} from
 *   Length of `context.events` at the moment the snapshot was taken.
 *
 * @callback ReturnHandle
 *   Handle a successful run.
 * @param {Construct} construct
 * @param {Info} info
 * @returns {undefined}
 */

import createDebug from 'debug'
import {markdownLineEnding} from 'micromark-util-character'
import {push, splice} from 'micromark-util-chunked'
import {resolveAll} from 'micromark-util-resolve-all'
import {codes, values} from 'micromark-util-symbol'
import {ok as assert} from 'devlop'

const debug = createDebug('micromark')

/**
 * Create a tokenizer.
 * Tokenizers deal with one type of data (e.g., containers, flow, text).
 * The parser is the object dealing with it all.
 * `initialize` works like other constructs, except that its `tokenize`
 * function is called once, up front, and doesn't receive an `ok` or `nok`;
 * its resolvers are applied when the final (`eof`) chunk has been written.
 * `from` can be given to set the point before the first character, although
 * when further lines are indented, they must be set with `defineSkip`.
 *
 * @param {ParseContext} parser
 *   Parser, exposed as `context.parser`.
 * @param {InitialConstruct} initialize
 *   Construct whose `tokenize` yields the first state.
 * @param {Omit<Point, '_bufferIndex' | '_index'> | undefined} [from]
 *   Place to start at (default: line 1, column 1, offset 0); the chunk
 *   indices are always reset to the start of the first chunk.
 * @returns {TokenizeContext}
 *   Context; feed it chunks with `write`.
 */
export function createTokenizer(parser, initialize, from) {
  /** @type {Point} */
  let point = Object.assign(
    from ? Object.assign({}, from) : {line: 1, column: 1, offset: 0},
    {_index: 0, _bufferIndex: -1}
  )
  // Column at which each line (keyed by line number) starts, as registered
  // through `defineSkip`.
  /** @type {Record<string, number>} */
  const columnStart = {}
  // Constructs whose `resolveAll` must run once everything is tokenized.
  /** @type {Array<Construct>} */
  const resolveAllConstructs = []
  // Everything written so far.
  /** @type {Array<Chunk>} */
  let chunks = []
  // Tokens that are entered but not yet exited.
  /** @type {Array<Token>} */
  let stack = []
  // `true` once the current code is consumed (or handed back through
  // `ok`/`nok`), `undefined` while a state function still has to deal with it.
  /** @type {boolean | undefined} */
  let consumed = true

  /**
   * Tools used for tokenizing.
   *
   * @type {Effects}
   */
  const effects = {
    consume,
    enter,
    exit,
    attempt: constructFactory(onsuccessfulconstruct),
    check: constructFactory(onsuccessfulcheck),
    interrupt: constructFactory(onsuccessfulcheck, {interrupt: true})
  }

  /**
   * State and tools for resolving and serializing.
   *
   * @type {TokenizeContext}
   */
  const context = {
    previous: codes.eof,
    code: codes.eof,
    containerState: {},
    events: [],
    parser,
    sliceStream,
    sliceSerialize,
    now,
    defineSkip,
    write
  }

  /**
   * The state function.
   *
   * @type {State | undefined}
   */
  let state = initialize.tokenize.call(context, effects)

  /**
   * Track which character we expect to be consumed, to catch bugs.
   *
   * @type {Code}
   */
  let expectedCode

  if (initialize.resolveAll) {
    resolveAllConstructs.push(initialize)
  }

  return context

  /**
   * Add chunks and tokenize them; yields the resolved events once the last
   * written chunk is `eof`, and an empty list before that.
   *
   * @type {TokenizeContext['write']}
   */
  function write(slice) {
    chunks = push(chunks, slice)

    main()

    // Exit if we're not done, resolve might change stuff.
    if (chunks[chunks.length - 1] !== codes.eof) {
      return []
    }

    addResult(initialize, 0)

    // Otherwise, resolve, and exit.
    context.events = resolveAll(resolveAllConstructs, context.events, context)

    return context.events
  }

  //
  // Tools.
  //

  /**
   * Serialize the chunks spanned by `token` to a string.
   *
   * @type {TokenizeContext['sliceSerialize']}
   */
  function sliceSerialize(token, expandTabs) {
    return serializeChunks(sliceStream(token), expandTabs)
  }

  /**
   * Get the chunks spanned by `token`.
   *
   * @type {TokenizeContext['sliceStream']}
   */
  function sliceStream(token) {
    return sliceChunks(chunks, token)
  }

  /**
   * Get a copy of the current point.
   *
   * @type {TokenizeContext['now']}
   */
  function now() {
    // This is a hot path, so we clone manually instead of `Object.assign({}, point)`
    const {line, column, offset, _index, _bufferIndex} = point
    return {line, column, offset, _index, _bufferIndex}
  }

  /**
   * Register the column at which the line of `value` starts, and apply it
   * right away if the current point sits at the start of that line.
   *
   * @type {TokenizeContext['defineSkip']}
   */
  function defineSkip(value) {
    columnStart[value.line] = value.column
    accountForPotentialSkip()
    debug('position: define skip: `%j`', point)
  }

  //
  // State management.
  //

  /**
   * Main loop (note that `_index` and `_bufferIndex` in `point` are modified by
   * `consume`).
   * Here is where we walk through the chunks, which either include strings of
   * several characters, or numerical character codes.
   * The reason to do this in a loop instead of a call is so the stack can
   * drain.
   *
   * @returns {undefined}
   */
  function main() {
    /** @type {number} */
    let chunkIndex

    while (point._index < chunks.length) {
      const chunk = chunks[point._index]

      // If we're in a buffer chunk, loop through it.
      if (typeof chunk === 'string') {
        chunkIndex = point._index

        if (point._bufferIndex < 0) {
          point._bufferIndex = 0
        }

        // `consume` (or a `restore`) can move the point to another chunk,
        // hence the check on `_index` next to the one on `_bufferIndex`.
        while (
          point._index === chunkIndex &&
          point._bufferIndex < chunk.length
        ) {
          go(chunk.charCodeAt(point._bufferIndex))
        }
      } else {
        go(chunk)
      }
    }
  }

  /**
   * Deal with one code: pass it to the current state and keep whatever that
   * returns as the next state.
   *
   * @param {Code} code
   * @returns {undefined}
   */
  function go(code) {
    assert(consumed === true, 'expected character to be consumed')
    consumed = undefined
    debug('main: passing `%s` to %s', code, state && state.name)
    expectedCode = code
    assert(typeof state === 'function', 'expected state')
    state = state(code)
  }

  /**
   * Consume the current code: move the point past it and expose it as
   * `context.previous`.
   *
   * @type {Effects['consume']}
   */
  function consume(code) {
    assert(code === expectedCode, 'expected given code to equal expected code')

    debug('consume: `%s`', code)

    assert(
      consumed === undefined,
      'expected code to not have been consumed: this might be because `return x(code)` instead of `return x` was used'
    )
    assert(
      code === null
        ? context.events.length === 0 ||
            context.events[context.events.length - 1][0] === 'exit'
        : context.events[context.events.length - 1][0] === 'enter',
      'expected last token to be open'
    )

    if (markdownLineEnding(code)) {
      point.line++
      point.column = 1
      // The combined CR+LF code stands for two characters.
      point.offset += code === codes.carriageReturnLineFeed ? 2 : 1
      accountForPotentialSkip()
      debug('position: after eol: `%j`', point)
    } else if (code !== codes.virtualSpace) {
      // Virtual spaces take up no place: column and offset stay put.
      point.column++
      point.offset++
    }

    // Not in a string chunk.
    if (point._bufferIndex < 0) {
      point._index++
    } else {
      point._bufferIndex++

      // At end of string chunk.
      // @ts-expect-error Points w/ non-negative `_bufferIndex` reference
      // strings.
      if (point._bufferIndex === chunks[point._index].length) {
        point._bufferIndex = -1
        point._index++
      }
    }

    // Expose the previous character.
    context.previous = code

    // Mark as consumed.
    consumed = true
  }

  /**
   * Open a token of `type` at the current point; `fields`, when given, is
   * patched in place and used as the token.
   *
   * @type {Effects['enter']}
   */
  function enter(type, fields) {
    /** @type {Token} */
    // @ts-expect-error Patch instead of assign required fields to help GC.
    const token = fields || {}
    token.type = type
    token.start = now()

    assert(typeof type === 'string', 'expected string type')
    assert(type.length > 0, 'expected non-empty string')
    debug('enter: `%s`', type)

    context.events.push(['enter', token, context])

    stack.push(token)

    return token
  }

  /**
   * Close the most recently opened token, which must be of `type`, at the
   * current point.
   *
   * @type {Effects['exit']}
   */
  function exit(type) {
    assert(typeof type === 'string', 'expected string type')
    assert(type.length > 0, 'expected non-empty string')

    const token = stack.pop()
    assert(token, 'cannot close w/o open tokens')
    token.end = now()

    assert(type === token.type, 'expected exit token to match current token')

    assert(
      !(
        token.start._index === token.end._index &&
        token.start._bufferIndex === token.end._bufferIndex
      ),
      'expected non-empty token (`' + type + '`)'
    )

    debug('exit: `%s`', token.type)
    context.events.push(['exit', token, context])

    return token
  }

  /**
   * Use results.
   *
   * @type {ReturnHandle}
   */
  function onsuccessfulconstruct(construct, info) {
    addResult(construct, info.from)
  }

  /**
   * Discard results.
   *
   * @type {ReturnHandle}
   */
  function onsuccessfulcheck(_, info) {
    info.restore()
  }

  /**
   * Factory to attempt/check/interrupt.
   * Yields the `hook` function that is exposed on `effects`.
   *
   * @param {ReturnHandle} onreturn
   *   Called when a construct succeeds.
   * @param {{interrupt?: boolean | undefined} | undefined} [fields]
   *   Fields to expose, on top of `context`, to the constructs' tokenizers.
   */
  function constructFactory(onreturn, fields) {
    return hook

    /**
     * Handle either an object mapping codes to constructs, a list of
     * constructs, or a single construct.
     *
     * @param {Array<Construct> | Construct | ConstructRecord} constructs
     * @param {State} returnState
     *   State to continue with when a construct succeeds.
     * @param {State | undefined} [bogusState]
     *   State to continue with when no construct succeeds.
     * @returns {State}
     */
    function hook(constructs, returnState, bogusState) {
      /** @type {Array<Construct>} */
      let listOfConstructs
      /** @type {number} */
      let constructIndex
      /** @type {Construct} */
      let currentConstruct
      /** @type {Info} */
      let info

      return Array.isArray(constructs)
        ? /* c8 ignore next 1 */
          handleListOfConstructs(constructs)
        : 'tokenize' in constructs
          ? // @ts-expect-error Looks like a construct.
            handleListOfConstructs([constructs])
          : handleMapOfConstructs(constructs)

      /**
       * Handle a map of constructs: on the first code, pick the constructs
       * registered for that code followed by those registered under `null`.
       *
       * @param {ConstructRecord} map
       * @returns {State}
       */
      function handleMapOfConstructs(map) {
        return start

        /** @type {State} */
        function start(code) {
          // At `eof` (`null`) nothing is looked up, so the list is empty and
          // the bogus state takes over.
          const def = code !== null && map[code]
          const all = code !== null && map.null
          const list = [
            // To do: add more extension tests.
            /* c8 ignore next 2 */
            ...(Array.isArray(def) ? def : def ? [def] : []),
            ...(Array.isArray(all) ? all : all ? [all] : [])
          ]

          return handleListOfConstructs(list)(code)
        }
      }

      /**
       * Handle a list of construct.
       *
       * @param {Array<Construct>} list
       * @returns {State}
       */
      function handleListOfConstructs(list) {
        listOfConstructs = list
        constructIndex = 0

        if (list.length === 0) {
          assert(bogusState, 'expected `bogusState` to be given')
          return bogusState
        }

        return handleConstruct(list[constructIndex])
      }

      /**
       * Handle a single construct.
       *
       * @param {Construct} construct
       * @returns {State}
       */
      function handleConstruct(construct) {
        return start

        /** @type {State} */
        function start(code) {
          // To do: not needed to store if there is no bogus state, probably?
          // Currently doesn't work because `inspect` in document does a check
          // w/o a bogus, which doesn't make sense. But it does seem to help perf
          // by not storing.
          info = store()
          currentConstruct = construct

          if (!construct.partial) {
            context.currentConstruct = construct
          }

          // Always populated by defaults.
          assert(
            context.parser.constructs.disable.null,
            'expected `disable.null` to be populated'
          )

          // Disabled constructs fail without being tried.
          if (
            construct.name &&
            context.parser.constructs.disable.null.includes(construct.name)
          ) {
            return nok(code)
          }

          return construct.tokenize.call(
            // If we do have fields, create an object w/ `context` as its
            // prototype.
            // This allows a "live binding", which is needed for `interrupt`.
            fields
              ? Object.assign(Object.create(context), fields)
              : context,
            effects,
            ok,
            nok
          )(code)
        }
      }

      /**
       * The construct succeeded: hand its result to `onreturn` and continue
       * with `returnState`.
       *
       * @type {State}
       */
      function ok(code) {
        assert(code === expectedCode, 'expected code')
        consumed = true
        onreturn(currentConstruct, info)
        return returnState
      }

      /**
       * The construct failed: roll back and try the next construct in the
       * list, or continue with `bogusState` when there are none left.
       *
       * @type {State}
       */
      function nok(code) {
        assert(code === expectedCode, 'expected code')
        consumed = true
        info.restore()

        if (++constructIndex < listOfConstructs.length) {
          return handleConstruct(listOfConstructs[constructIndex])
        }

        return bogusState
      }
    }
  }

  /**
   * Apply the resolvers of `construct` to the events.
   * `resolveAll` is only registered here (it runs at the end of `write`),
   * `resolve` receives the events from `from` on, and `resolveTo` receives
   * all events.
   *
   * @param {Construct} construct
   * @param {number} from
   *   Index in `context.events` of the first event made by `construct`.
   * @returns {undefined}
   */
  function addResult(construct, from) {
    if (construct.resolveAll && !resolveAllConstructs.includes(construct)) {
      resolveAllConstructs.push(construct)
    }

    if (construct.resolve) {
      splice(
        context.events,
        from,
        context.events.length - from,
        construct.resolve(context.events.slice(from), context)
      )
    }

    if (construct.resolveTo) {
      context.events = construct.resolveTo(context.events, context)
    }

    assert(
      construct.partial ||
        context.events.length === 0 ||
        context.events[context.events.length - 1][0] === 'exit',
      'expected last token to end'
    )
  }

  /**
   * Store state: the point, previous code, current construct, number of
   * events, and a copy of the stack of open tokens.
   *
   * @returns {Info}
   */
  function store() {
    const startPoint = now()
    const startPrevious = context.previous
    const startCurrentConstruct = context.currentConstruct
    const startEventsIndex = context.events.length
    const startStack = Array.from(stack)

    return {restore, from: startEventsIndex}

    /**
     * Restore state.
     *
     * @returns {undefined}
     */
    function restore() {
      point = startPoint
      context.previous = startPrevious
      context.currentConstruct = startCurrentConstruct
      // Drop the events that were added after the snapshot.
      context.events.length = startEventsIndex
      stack = startStack
      accountForPotentialSkip()
      debug('position: restore: `%j`', point)
    }
  }

  /**
   * Move the current point a bit forward in the line when it's on a column
   * skip.
   * Only applies while the point is still at the first column of a line for
   * which a skip was defined.
   *
   * @returns {undefined}
   */
  function accountForPotentialSkip() {
    if (point.line in columnStart && point.column < 2) {
      point.column = columnStart[point.line]
      point.offset += columnStart[point.line] - 1
    }
  }
}

/**
 * Get the chunks from a slice of chunks in the range of a token.
 *
 * @param {Array<Chunk>} chunks
 *   All chunks.
 * @param {Pick<Token, 'end' | 'start'>} token
 *   Range to take; only the `_index` and `_bufferIndex` of `start` and `end`
 *   are used.
 * @returns {Array<Chunk>}
 *   New list with the chunks (string chunks trimmed where needed) in range.
 */
function sliceChunks(chunks, token) {
  const startIndex = token.start._index
  const startBufferIndex = token.start._bufferIndex
  const endIndex = token.end._index
  const endBufferIndex = token.end._bufferIndex
  /** @type {Array<Chunk>} */
  let view

  if (startIndex === endIndex) {
    // Start and end are in the same chunk, which thus must be a string.
    assert(endBufferIndex > -1, 'expected non-negative end buffer index')
    assert(startBufferIndex > -1, 'expected non-negative start buffer index')
    // @ts-expect-error `_bufferIndex` is used on string chunks.
    view = [chunks[startIndex].slice(startBufferIndex, endBufferIndex)]
  } else {
    view = chunks.slice(startIndex, endIndex)

    // Starts somewhere in a string chunk: drop what comes before.
    if (startBufferIndex > -1) {
      const head = view[0]
      if (typeof head === 'string') {
        view[0] = head.slice(startBufferIndex)
      } else {
        assert(startBufferIndex === 0, 'expected `startBufferIndex` to be `0`')
        view.shift()
      }
    }

    // Ends somewhere in a string chunk: add the part before the end.
    if (endBufferIndex > 0) {
      // @ts-expect-error `_bufferIndex` is used on string chunks.
      view.push(chunks[endIndex].slice(0, endBufferIndex))
    }
  }

  return view
}

/**
 * Get the string value of a slice of chunks.
 *
 * @param {Array<Chunk>} chunks
 *   Chunks to serialize.
 * @param {boolean | undefined} [expandTabs=false]
 *   Whether to turn tabs, and the virtual spaces after them, into spaces.
 * @returns {string}
 *   Serialized chunks.
 */
function serializeChunks(chunks, expandTabs) {
  let index = -1
  /** @type {Array<string>} */
  const result = []
  /** @type {boolean | undefined} */
  let atTab

  while (++index < chunks.length) {
    const chunk = chunks[index]
    /** @type {string} */
    let value

    if (typeof chunk === 'string') {
      value = chunk
    } else
      switch (chunk) {
        case codes.carriageReturn: {
          value = values.cr

          break
        }

        case codes.lineFeed: {
          value = values.lf

          break
        }

        case codes.carriageReturnLineFeed: {
          value = values.cr + values.lf

          break
        }

        case codes.horizontalTab: {
          value = expandTabs ? values.space : values.ht

          break
        }

        case codes.virtualSpace: {
          // When tabs are kept, the virtual spaces right after one are
          // dropped; `atTab` is left untouched by this `continue`, so a whole
          // run of them is skipped.
          if (!expandTabs && atTab) continue
          value = values.space

          break
        }

        default: {
          assert(typeof chunk === 'number', 'expected number')
          // Currently only replacement character.
          value = String.fromCharCode(chunk)
        }
      }

    atTab = chunk === codes.horizontalTab
    result.push(value)
  }

  return result.join('')
}