| 1 | /** |
| 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. |
| 3 | * |
| 4 | * This source code is licensed under the MIT license found in the |
| 5 | * LICENSE file in the root directory of this source tree. |
| 6 | */ |
| 7 | |
| 8 | import type {HighlightedToken} from './textmate-lib/tokenize'; |
| 9 | |
| 10 | import {diffWordsWithSpace} from 'diff'; |
| 11 | import {CSS_CLASS_PREFIX} from './textmate-lib/textmateStyles'; |
| 12 | |
| 13 | /** Type of Chunk: Added, Removed, Unmodified. */ |
| 14 | type ChunkType = 'A' | 'R' | 'U'; |
| 15 | |
| 16 | /** |
| 17 | * The Myers difference algorithm used by the `diff` Node module is O(ND) where |
| 18 | * N is the sum of the lengths of the two inputs and D is size of the minimum |
| 19 | * edit script for the two inputs. As such, large values of N can result in |
| 20 | * extremely long running times while likely providing little value for the |
| 21 | * user. For example, a large blob of JSON on a single line compared with the |
| 22 | * first line of the pretty-printed version (containing only `{`) could be an |
| 23 | * expensive diff to compute while telling the user nothing of interest. |
| 24 | * |
| 25 | * To defend against such pathological cases, we should not bother to compute |
| 26 | * the intraline diff in certain cases. As an initial heuristic, we impose a |
| 27 | * threshold for the "maximum input" (i.e., the sum of the lengths of the |
| 28 | * strings being compared) for which an intraline diff should be computed. |
| 29 | * |
| 30 | * Incidentally, tokenization for syntax highlighting has similar issues. At |
| 31 | * least as of Oct 2022, rather than impose a "max line length," VS Code imposes |
| 32 | * a "time spent" threshold: |
| 33 | * |
| 34 | * https://github.com/microsoft/vscode/blob/504c5a768a001b2099dd2b44e9dc39e10ccdfb56/src/vs/workbench/services/textMate/common/TMTokenization.ts#L39 |
| 35 | * |
| 36 | * It might be worth considering something similar for intraline diffs. |
| 37 | */ |
| 38 | export const MAX_INPUT_LENGTH_FOR_INTRALINE_DIFF = 300; |
| 39 | |
| 40 | /** |
| 41 | * Normalized version of a `diff.Change` returned by `diffWords()` that is |
| 42 | * easier to work with when interleaving with syntax highlight information. |
| 43 | */ |
| 44 | type Chunk = { |
| 45 | type: ChunkType; |
| 46 | start: number; |
| 47 | end: number; |
| 48 | }; |
| 49 | |
| 50 | /** |
| 51 | * Takes a modified line in the form of `beforeLine` and `afterLine` along with |
| 52 | * syntax highlighting information that covers each respective line and |
| 53 | * returns a ReactFragment to display the before/after versions of a line in a |
| 54 | * side-by-side diff. |
| 55 | */ |
| 56 | export function createTokenizedIntralineDiff( |
| 57 | beforeLine: string, |
| 58 | beforeTokens: HighlightedToken[], |
| 59 | afterLine: string, |
| 60 | afterTokens: HighlightedToken[], |
| 61 | ): [React.ReactFragment | null, React.ReactFragment | null] { |
| 62 | if (beforeLine.length + afterLine.length > MAX_INPUT_LENGTH_FOR_INTRALINE_DIFF) { |
| 63 | return [ |
| 64 | applyTokenizationToLine(beforeLine, beforeTokens), |
| 65 | applyTokenizationToLine(afterLine, afterTokens), |
| 66 | ]; |
| 67 | } |
| 68 | |
| 69 | const changes = diffWordsWithSpace(beforeLine, afterLine); |
| 70 | const beforeChunks: Chunk[] = []; |
| 71 | const afterChunks: Chunk[] = []; |
| 72 | let beforeLength = 0; |
| 73 | let afterLength = 0; |
| 74 | |
| 75 | changes.forEach(change => { |
| 76 | const {added, removed, value} = change; |
| 77 | const len = value.length; |
| 78 | if (added) { |
| 79 | const end = afterLength + len; |
| 80 | addOrExtend(afterChunks, 'A', afterLength, end); |
| 81 | afterLength = end; |
| 82 | } else if (removed) { |
| 83 | const end = beforeLength + len; |
| 84 | addOrExtend(beforeChunks, 'R', beforeLength, end); |
| 85 | beforeLength = end; |
| 86 | } else { |
| 87 | const beforeEnd = beforeLength + len; |
| 88 | addOrExtend(beforeChunks, 'U', beforeLength, beforeEnd); |
| 89 | beforeLength = beforeEnd; |
| 90 | |
| 91 | const afterEnd = afterLength + len; |
| 92 | addOrExtend(afterChunks, 'U', afterLength, afterEnd); |
| 93 | afterLength = afterEnd; |
| 94 | } |
| 95 | }); |
| 96 | |
| 97 | // Note that the logic in mergeChunksAndTokens() could be done as part of this |
| 98 | // function to avoid an additional pass over the chunks, but the bookkeeping |
| 99 | // might get messy. |
| 100 | return [ |
| 101 | mergeChunksAndTokens(beforeLine, beforeChunks, beforeTokens), |
| 102 | mergeChunksAndTokens(afterLine, afterChunks, afterTokens), |
| 103 | ]; |
| 104 | } |
| 105 | |
| 106 | function addOrExtend(chunks: Chunk[], type: ChunkType, start: number, end: number) { |
| 107 | const lastEntry = chunks[chunks.length - 1]; |
| 108 | if (lastEntry?.type === type && lastEntry?.end === start) { |
| 109 | lastEntry.end = end; |
| 110 | } else { |
| 111 | chunks.push({type, start, end}); |
| 112 | } |
| 113 | } |
| 114 | |
| 115 | /** TODO: Create proper machinery to strip assertions from production builds. */ |
| 116 | const ENABLE_ASSERTIONS = false; |
| 117 | |
| 118 | type ChunkSpanProps = { |
| 119 | key: number; |
| 120 | className: string; |
| 121 | content: string; |
| 122 | isChunkStart: boolean; |
| 123 | isChunkEnd: boolean; |
| 124 | }; |
| 125 | |
| 126 | function ChunkSpan({ |
| 127 | key, |
| 128 | className, |
| 129 | content, |
| 130 | isChunkStart, |
| 131 | isChunkEnd, |
| 132 | }: ChunkSpanProps): React.ReactNode { |
| 133 | let fullClassName = className; |
| 134 | if (isChunkStart) { |
| 135 | fullClassName += ' patch-word-begin'; |
| 136 | } |
| 137 | if (isChunkEnd) { |
| 138 | fullClassName += ' patch-word-end'; |
| 139 | } |
| 140 | return ( |
| 141 | <span key={key} className={fullClassName}> |
| 142 | {content} |
| 143 | </span> |
| 144 | ); |
| 145 | } |
| 146 | |
| 147 | /** |
| 148 | * Interleave chunks and tokens to produce a properly styled intraline diff. |
| 149 | */ |
| 150 | function mergeChunksAndTokens( |
| 151 | line: string, |
| 152 | chunks: Chunk[], |
| 153 | tokens: HighlightedToken[], |
| 154 | ): React.ReactFragment | null { |
| 155 | if (tokens.length == 0) { |
| 156 | return null; |
| 157 | } |
| 158 | |
| 159 | if (ENABLE_ASSERTIONS) { |
| 160 | // We expect the following invariants to hold, by construction. |
| 161 | // eslint-disable-next-line no-console |
| 162 | console.assert( |
| 163 | chunks.length !== 0, |
| 164 | 'chunks is never empty, even if the line is the empty string', |
| 165 | ); |
| 166 | // eslint-disable-next-line no-console |
| 167 | console.assert( |
| 168 | chunks[chunks.length - 1].end === tokens[tokens.length - 1].end, |
| 169 | 'the final chunk and token must have the same end index to ensure the loop breaks properly', |
| 170 | ); |
| 171 | } |
| 172 | |
| 173 | const spans: ChunkSpanProps[] = []; |
| 174 | let chunkIndex = 0; |
| 175 | let tokenIndex = 0; |
| 176 | let lastEndIndex = 0; |
| 177 | let lastChunkType: ChunkType = 'U'; |
| 178 | let isChunkStart = false; |
| 179 | const maxChunkIndex = chunks.length; |
| 180 | const maxTokenIndex = tokens.length; |
| 181 | while (chunkIndex < maxChunkIndex && tokenIndex < maxTokenIndex) { |
| 182 | const chunk = chunks[chunkIndex]; |
| 183 | const token = tokens[tokenIndex]; |
| 184 | const start = lastEndIndex; |
| 185 | if (chunk.end === token.end) { |
| 186 | lastEndIndex = chunk.end; |
| 187 | ++chunkIndex; |
| 188 | ++tokenIndex; |
| 189 | } else if (chunk.end < token.end) { |
| 190 | lastEndIndex = chunk.end; |
| 191 | ++chunkIndex; |
| 192 | } else { |
| 193 | lastEndIndex = token.end; |
| 194 | ++tokenIndex; |
| 195 | } |
| 196 | |
| 197 | const chunkType = chunk.type; |
| 198 | if (lastChunkType !== 'U' && chunkType !== lastChunkType) { |
| 199 | spans[spans.length - 1].isChunkEnd = true; |
| 200 | } |
| 201 | isChunkStart = chunkType !== 'U' && (chunkType !== lastChunkType || spans.length === 0); |
| 202 | spans.push(createSpan(line, start, lastEndIndex, token.color, chunkType, isChunkStart)); |
| 203 | lastChunkType = chunkType; |
| 204 | } |
| 205 | |
| 206 | // Check if last span needs to have isChunkEnd set. |
| 207 | if (lastChunkType !== 'U') { |
| 208 | spans[spans.length - 1].isChunkEnd = true; |
| 209 | } |
| 210 | |
| 211 | return spans.map(ChunkSpan); |
| 212 | } |
| 213 | |
| 214 | /** |
| 215 | * Creates the <span> with the appropriate CSS classes to display a portion of |
| 216 | * an intraline diff with syntax highlighting. |
| 217 | */ |
| 218 | function createSpan( |
| 219 | line: string, |
| 220 | start: number, |
| 221 | end: number, |
| 222 | color: number, |
| 223 | type: ChunkType, |
| 224 | isChunkStart: boolean, |
| 225 | ): ChunkSpanProps { |
| 226 | let patchClass; |
| 227 | switch (type) { |
| 228 | case 'U': |
| 229 | patchClass = ''; |
| 230 | break; |
| 231 | case 'A': |
| 232 | patchClass = ' patch-add-word'; |
| 233 | break; |
| 234 | case 'R': |
| 235 | patchClass = ' patch-remove-word'; |
| 236 | break; |
| 237 | } |
| 238 | |
| 239 | const className = `${CSS_CLASS_PREFIX}${color}${patchClass}`; |
| 240 | return { |
| 241 | key: start, |
| 242 | className, |
| 243 | content: line.slice(start, end), |
| 244 | isChunkStart, |
| 245 | isChunkEnd: false, |
| 246 | }; |
| 247 | } |
| 248 | |
| 249 | export function applyTokenizationToLine( |
| 250 | line: string, |
| 251 | tokenization: readonly HighlightedToken[], |
| 252 | ): React.ReactFragment { |
| 253 | return tokenization.map(({start, end, color}) => { |
| 254 | return ( |
| 255 | <span key={start} className={`${CSS_CLASS_PREFIX}${color}`}> |
| 256 | {line.slice(start, end)} |
| 257 | </span> |
| 258 | ); |
| 259 | }); |
| 260 | } |
| 261 | |