Process unicode results from ripgrep correctly

2024-09-20 07:28:08 +03:00 · 2019-05-22 21:32:19 +02:00 · 2019-05-22 21:32:19 +02:00 · 6748b84060
commit 6748b84060
parent 411e2a95fb
2 changed files with 53 additions and 1 deletions
--- a/spec/workspace-spec.js
+++ b/spec/workspace-spec.js
@ -2647,6 +2647,28 @@ describe('Workspace', () => {
          })
        }

+        it('returns results on lines with unicode strings', async () => {
+          const results = []
+
+          await scan(
+            /line with unico/,
+            {},
+            result => results.push(result)
+          )
+          expect(results.length).toBe(1)
+          const { filePath, matches } = results[0]
+          expect(filePath).toBe(atom.project.getDirectories()[0].resolve('file-with-unicode'))
+          expect(matches).toHaveLength(1)
+          expect(matches[0]).toEqual({
+            matchText: 'line with unico',
+            lineText: 'ДДДДДДДДДДДДДДДДДД line with unicode',
+            lineTextOffset: 0,
+            range: [[0, 19], [0, 34]],
+            leadingContextLines: [],
+            trailingContextLines: []
+          })
+        })
+
        describe('when the core.excludeVcsIgnoredPaths config is truthy', () => {
          let projectPath
          let ignoredPath
--- a/src/ripgrep-directory-searcher.js
+++ b/src/ripgrep-directory-searcher.js
@ -92,6 +92,35 @@ function getPositionFromColumn (lines, column) {
  return [currentLine - 1, column - previousLength]
 }

// Rewrites, in place, the `start` and `end` of every submatch of a ripgrep
// match from UTF-8 byte offsets (which is what ripgrep reports) to string
// character offsets (UTF-16 code units, i.e. what `String#length` counts).
//
// * `match` The `data` object of a ripgrep `match` message. It must provide
//   `lines.text` (the matched line(s) as a string) and `submatches` (an array
//   of objects with numeric `start` and `end` byte offsets into that text).
//
// Returns nothing; the submatches are mutated.
function processUnicodeMatch (match) {
  const text = match.lines.text

  if (text.length === Buffer.byteLength(text)) {
    // Fast codepath for lines that only contain characters of 1 byte length:
    // byte offsets and character offsets are identical.
    return
  }

  const lineBuffer = Buffer.from(text)
  let convertedBytes = 0 // Byte offset up to which the line has been decoded.
  let convertedChars = 0 // Number of characters found in those bytes.

  // Converts an absolute byte offset into a character offset. Only the bytes
  // between the previously converted offset and `position` are decoded, and
  // they are always read from the full line buffer so the byte cursor can
  // never drift away from the absolute offsets ripgrep gives us.
  function convertPosition (position) {
    convertedChars += lineBuffer.toString('utf8', convertedBytes, position).length
    convertedBytes = position

    return convertedChars
  }

  // Iterate over all the submatches to convert the start and end values
  // (which come as bytes from ripgrep) to character positions.
  // A single forward pass is enough because submatches come ordered by
  // position.
  for (const submatch of match.submatches) {
    submatch.start = convertPosition(submatch.start)
    submatch.end = convertPosition(submatch.end)
  }
}
+
 // This function processes a ripgrep submatch to create the correct
 // range. This is mostly needed for multi-line results, since the range
 // will have different start and end rows and we need to calculate these
@ -247,7 +276,6 @@ module.exports = class RipgrepDirectorySearcher {
        buffer = lines.pop()
        for (const line of lines) {
          const message = JSON.parse(line)
-
          updateTrailingContexts(message, pendingTrailingContexts, options)

          if (message.type === 'begin') {
@ -261,6 +289,8 @@ module.exports = class RipgrepDirectorySearcher {
            const trailingContextLines = []
            pendingTrailingContexts.add(trailingContextLines)

+            processUnicodeMatch(message.data)
+
            for (const submatch of message.data.submatches) {
              const { lineText, range } = processSubmatch(
                submatch,