| 1 | /** |
| 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. |
| 3 | * |
| 4 | * This source code is licensed under the MIT license found in the |
| 5 | * LICENSE file in the root directory of this source tree. |
| 6 | */ |
| 7 | |
| 8 | import type {IGrammar} from 'vscode-textmate'; |
| 9 | |
| 10 | import {INITIAL} from 'vscode-textmate'; |
| 11 | |
// The following values come from the MetadataConsts enum in vscode-textmate.
// Although they are declared in the main.d.ts file, our TypeScript/Webpack
// setup does not appear to be able to inline them properly.
// FOREGROUND_MASK is 0b00000000011111111100000000000000: the 9 bits of a
// token's metadata word that encode the foreground color index, which starts
// at bit FOREGROUND_OFFSET.
const FOREGROUND_MASK = 8372224;
const FOREGROUND_OFFSET = 14;

/**
 * Specify a timeout when tokenizing a line to prevent a long line from locking
 * up the main thread. Note this is used in VS Code:
 * https://github.com/microsoft/vscode/blob/504c5a768a001b2099dd2b44e9dc39e10ccdfb56/src/vs/workbench/services/textMate/common/TMTokenization.ts#L39
 */
const DEFAULT_TOKENIZE_TIMEOUT_MS = 500;
| 24 | |
/**
 * A single syntax-highlighted span within one line of text. The span covers
 * the half-open index range [start, end) of the line and maps to an entry in
 * a theme's color map via `color`.
 */
export type HighlightedToken = {
  /** Start index within a line, inclusive. */
  start: number;

  /** End index within a line, exclusive. */
  end: number;

  /** Index into a color map. */
  color: number;
};
| 35 | |
| 36 | export function tokenizeFileContents( |
| 37 | fileContents: string, |
| 38 | grammar: IGrammar, |
| 39 | timeLimit: number = DEFAULT_TOKENIZE_TIMEOUT_MS, |
| 40 | ): Array<Array<HighlightedToken>> { |
| 41 | // As fileContents could be quite large, we are assuming that, even though |
| 42 | // split() generates a potentially large array, because it is one native |
| 43 | // call, it is likely to be more efficient than us doing our own bookkeeping |
| 44 | // to slice off one substring at a time (though that would avoid the array |
| 45 | // allocation). |
| 46 | return tokenizeLines(fileContents.split('\n'), grammar, timeLimit); |
| 47 | } |
| 48 | |
| 49 | export function tokenizeLines( |
| 50 | lines: ReadonlyArray<string>, |
| 51 | grammar: IGrammar, |
| 52 | timeLimit: number = DEFAULT_TOKENIZE_TIMEOUT_MS, |
| 53 | ): Array<Array<HighlightedToken>> { |
| 54 | let state = INITIAL; |
| 55 | return lines.map((line: string) => { |
| 56 | // Line-processing logic taken from: |
| 57 | // https://github.com/microsoft/vscode-textmate/blob/cc8ae321cfb47940470bd82c87a8ac61366fbd80/src/tests/themedTokenizer.ts#L20-L41 |
| 58 | const result = grammar.tokenizeLine2(line, state, timeLimit); |
| 59 | |
| 60 | // Note that even if `result.stoppedEarly` is true, we still use the list of |
| 61 | // tokens that were returned to tokenize as much of the line as possible. |
| 62 | // eslint-disable-next-line no-bitwise |
| 63 | const tokensLength = result.tokens.length >> 1; |
| 64 | const singleLine = []; |
| 65 | for (let j = 0; j < tokensLength; j++) { |
| 66 | const startIndex = result.tokens[2 * j]; |
| 67 | const nextStartIndex = j + 1 < tokensLength ? result.tokens[2 * j + 2] : line.length; |
| 68 | const tokenText = line.substring(startIndex, nextStartIndex); |
| 69 | if (tokenText === '') { |
| 70 | continue; |
| 71 | } |
| 72 | |
| 73 | const metaData = result.tokens[2 * j + 1]; |
| 74 | |
| 75 | // Get foreground index from metaData so that we can index into TokensCSS |
| 76 | // (a map from className to styles). Note this code comes from: |
| 77 | // https://github.com/microsoft/vscode-textmate/blob/cc8ae321cfb47940470bd82c87a8ac61366fbd80/src/grammar.ts#L1032-L1034 |
| 78 | // We have to inline it here because StackElementMetadata does not appear |
| 79 | // to be exported as part of the vscode-textmate npm module. |
| 80 | // eslint-disable-next-line no-bitwise |
| 81 | const foregroundIdx = (metaData & FOREGROUND_MASK) >>> FOREGROUND_OFFSET; |
| 82 | |
| 83 | singleLine.push({ |
| 84 | start: startIndex, |
| 85 | end: nextStartIndex, |
| 86 | color: foregroundIdx, |
| 87 | }); |
| 88 | } |
| 89 | |
| 90 | // If we get result.stoppedEarly, continue tokenizing using the state used |
| 91 | // to tokenize this line as a "best guess" of what state tokenizing this |
| 92 | // line would have left us in had it completed. |
| 93 | state = result.stoppedEarly ? state : result.ruleStack; |
| 94 | return singleLine; |
| 95 | }); |
| 96 | } |
| 97 | |