createTokenizedIntralineDiff.tsx

7.7 KB261 lines

1	/**
2	* Copyright (c) Meta Platforms, Inc. and affiliates.
3	*
4	* This source code is licensed under the MIT license found in the
5	* LICENSE file in the root directory of this source tree.
6	*/
7
8	import type {HighlightedToken} from './textmate-lib/tokenize';
9
10	import {diffWordsWithSpace} from 'diff';
11	import {CSS_CLASS_PREFIX} from './textmate-lib/textmateStyles';
12
13	/** Type of Chunk: Added, Removed, Unmodified. */
14	type ChunkType = 'A' \| 'R' \| 'U';
15
16	/**
17	* The Myers difference algorithm used by the `diff` Node module is O(ND) where
18	* N is the sum of the lengths of the two inputs and D is size of the minimum
19	* edit script for the two inputs. As such, large values of N can result in
20	* extremely long running times while likely providing little value for the
21	* user. For example, a large blob of JSON on a single line compared with the
22	* first line of the pretty-printed version (containing only `{`) could be an
23	* expensive diff to compute while telling the user nothing of interest.
24	*
25	* To defend against such pathological cases, we should not bother to compute
26	* the intraline diff in certain cases. As an initial heuristic, we impose a
27	* threshold for the "maximum input" (i.e., the sum of the lengths of the
28	* strings being compared) for which an intraline diff should be computed.
29	*
30	* Incidentally, tokenization for syntax highlighting has similar issues. At
31	* least as of Oct 2022, rather than impose a "max line length," VS Code imposes
32	* a "time spent" threshold:
33	*
34	* https://github.com/microsoft/vscode/blob/504c5a768a001b2099dd2b44e9dc39e10ccdfb56/src/vs/workbench/services/textMate/common/TMTokenization.ts#L39
35	*
36	* It might be worth considering something similar for intraline diffs.
37	*/
38	export const MAX_INPUT_LENGTH_FOR_INTRALINE_DIFF = 300;
39
40	/**
41	* Normalized version of a `diff.Change` returned by `diffWords()` that is
42	* easier to work with when interleaving with syntax highlight information.
43	*/
44	type Chunk = {
45	type: ChunkType;
46	start: number;
47	end: number;
48	};
49
50	/**
51	* Takes a modified line in the form of `beforeLine` and `afterLine` along with
52	* syntax highlighting information that covers each respective line and
53	* returns a ReactFragment to display the before/after versions of a line in a
54	* side-by-side diff.
55	*/
56	export function createTokenizedIntralineDiff(
57	beforeLine: string,
58	beforeTokens: HighlightedToken[],
59	afterLine: string,
60	afterTokens: HighlightedToken[],
61	): [React.ReactFragment \| null, React.ReactFragment \| null] {
62	if (beforeLine.length + afterLine.length > MAX_INPUT_LENGTH_FOR_INTRALINE_DIFF) {
63	return [
64	applyTokenizationToLine(beforeLine, beforeTokens),
65	applyTokenizationToLine(afterLine, afterTokens),
66	];
67	}
68
69	const changes = diffWordsWithSpace(beforeLine, afterLine);
70	const beforeChunks: Chunk[] = [];
71	const afterChunks: Chunk[] = [];
72	let beforeLength = 0;
73	let afterLength = 0;
74
75	changes.forEach(change => {
76	const {added, removed, value} = change;
77	const len = value.length;
78	if (added) {
79	const end = afterLength + len;
80	addOrExtend(afterChunks, 'A', afterLength, end);
81	afterLength = end;
82	} else if (removed) {
83	const end = beforeLength + len;
84	addOrExtend(beforeChunks, 'R', beforeLength, end);
85	beforeLength = end;
86	} else {
87	const beforeEnd = beforeLength + len;
88	addOrExtend(beforeChunks, 'U', beforeLength, beforeEnd);
89	beforeLength = beforeEnd;
90
91	const afterEnd = afterLength + len;
92	addOrExtend(afterChunks, 'U', afterLength, afterEnd);
93	afterLength = afterEnd;
94	}
95	});
96
97	// Note that the logic in mergeChunksAndTokens() could be done as part of this
98	// function to avoid an additional pass over the chunks, but the bookkeeping
99	// might get messy.
100	return [
101	mergeChunksAndTokens(beforeLine, beforeChunks, beforeTokens),
102	mergeChunksAndTokens(afterLine, afterChunks, afterTokens),
103	];
104	}
105
106	function addOrExtend(chunks: Chunk[], type: ChunkType, start: number, end: number) {
107	const lastEntry = chunks[chunks.length - 1];
108	if (lastEntry?.type === type && lastEntry?.end === start) {
109	lastEntry.end = end;
110	} else {
111	chunks.push({type, start, end});
112	}
113	}
114
115	/** TODO: Create proper machinery to strip assertions from production builds. */
116	const ENABLE_ASSERTIONS = false;
117
118	type ChunkSpanProps = {
119	key: number;
120	className: string;
121	content: string;
122	isChunkStart: boolean;
123	isChunkEnd: boolean;
124	};
125
126	function ChunkSpan({
127	key,
128	className,
129	content,
130	isChunkStart,
131	isChunkEnd,
132	}: ChunkSpanProps): React.ReactNode {
133	let fullClassName = className;
134	if (isChunkStart) {
135	fullClassName += ' patch-word-begin';
136	}
137	if (isChunkEnd) {
138	fullClassName += ' patch-word-end';
139	}
140	return (
141	<span key={key} className={fullClassName}>
142	{content}
143	</span>
144	);
145	}
146
147	/**
148	* Interleave chunks and tokens to produce a properly styled intraline diff.
149	*/
150	function mergeChunksAndTokens(
151	line: string,
152	chunks: Chunk[],
153	tokens: HighlightedToken[],
154	): React.ReactFragment \| null {
155	if (tokens.length == 0) {
156	return null;
157	}
158
159	if (ENABLE_ASSERTIONS) {
160	// We expect the following invariants to hold, by construction.
161	// eslint-disable-next-line no-console
162	console.assert(
163	chunks.length !== 0,
164	'chunks is never empty, even if the line is the empty string',
165	);
166	// eslint-disable-next-line no-console
167	console.assert(
168	chunks[chunks.length - 1].end === tokens[tokens.length - 1].end,
169	'the final chunk and token must have the same end index to ensure the loop breaks properly',
170	);
171	}
172
173	const spans: ChunkSpanProps[] = [];
174	let chunkIndex = 0;
175	let tokenIndex = 0;
176	let lastEndIndex = 0;
177	let lastChunkType: ChunkType = 'U';
178	let isChunkStart = false;
179	const maxChunkIndex = chunks.length;
180	const maxTokenIndex = tokens.length;
181	while (chunkIndex < maxChunkIndex && tokenIndex < maxTokenIndex) {
182	const chunk = chunks[chunkIndex];
183	const token = tokens[tokenIndex];
184	const start = lastEndIndex;
185	if (chunk.end === token.end) {
186	lastEndIndex = chunk.end;
187	++chunkIndex;
188	++tokenIndex;
189	} else if (chunk.end < token.end) {
190	lastEndIndex = chunk.end;
191	++chunkIndex;
192	} else {
193	lastEndIndex = token.end;
194	++tokenIndex;
195	}
196
197	const chunkType = chunk.type;
198	if (lastChunkType !== 'U' && chunkType !== lastChunkType) {
199	spans[spans.length - 1].isChunkEnd = true;
200	}
201	isChunkStart = chunkType !== 'U' && (chunkType !== lastChunkType \|\| spans.length === 0);
202	spans.push(createSpan(line, start, lastEndIndex, token.color, chunkType, isChunkStart));
203	lastChunkType = chunkType;
204	}
205
206	// Check if last span needs to have isChunkEnd set.
207	if (lastChunkType !== 'U') {
208	spans[spans.length - 1].isChunkEnd = true;
209	}
210
211	return spans.map(ChunkSpan);
212	}
213
214	/**
215	* Creates the <span> with the appropriate CSS classes to display a portion of
216	* an intraline diff with syntax highlighting.
217	*/
218	function createSpan(
219	line: string,
220	start: number,
221	end: number,
222	color: number,
223	type: ChunkType,
224	isChunkStart: boolean,
225	): ChunkSpanProps {
226	let patchClass;
227	switch (type) {
228	case 'U':
229	patchClass = '';
230	break;
231	case 'A':
232	patchClass = ' patch-add-word';
233	break;
234	case 'R':
235	patchClass = ' patch-remove-word';
236	break;
237	}
238
239	const className = `${CSS_CLASS_PREFIX}${color}${patchClass}`;
240	return {
241	key: start,
242	className,
243	content: line.slice(start, end),
244	isChunkStart,
245	isChunkEnd: false,
246	};
247	}
248
249	export function applyTokenizationToLine(
250	line: string,
251	tokenization: readonly HighlightedToken[],
252	): React.ReactFragment {
253	return tokenization.map(({start, end, color}) => {
254	return (
255	<span key={start} className={`${CSS_CLASS_PREFIX}${color}`}>
256	{line.slice(start, end)}
257	</span>
258	);
259	});
260	}
261