Skip unique items before computing Myer's diff on text

This substantially improves performance on text files where there are few lines in common. For example, 10,000 line files with no lines in common is more than 10x faster (8.5 seconds to 0.49 seconds on my machine), and sample_files/huge_cpp_before.cpp is nearly 2% faster. Fixes the case mentioned by @quackenbush in #236. This is inspired by the heuristics discussions at mitsuhiko/similar#15
Wilfred · Jan 15, 2023 · 0e3c57c · 0e3c57c
1 parent c08eefb
commit 0e3c57c
Show file tree

Hide file tree

Showing 5 changed files with 140 additions and 9 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,11 @@
 
 Improved CSS parsing and HTML sublanguage parsing.
 
+### Diffing
+
+Improved textual diffing performance, particularly when the two files
+have few lines in common.
+
 ### Display
 
 Fixed an issue with unwanted underlines with textual diffing when

diff --git a/sample_files/compare.expected b/sample_files/compare.expected
@@ -71,7 +71,7 @@ sample_files/html_simple_before.html sample_files/html_simple_after.html
 ce3bfa12bc21d0eb5528766e18387e86  -
 
 sample_files/huge_cpp_before.cpp sample_files/huge_cpp_after.cpp
-8910dbf7dae13b1a7229b0497602b414  -
+a85613f8c3cad686d592a276cad8d883  -
 
 sample_files/identical_before.scala sample_files/identical_after.scala
 9c7319f61833e46a0a8cb6c01cc997c9  -

diff --git a/src/diff/myers_diff.rs b/src/diff/myers_diff.rs
@@ -1,6 +1,6 @@
 //! A fast diff for linear content, using Myer's diff algorithm.
 
-use rustc_hash::FxHashMap;
+use rustc_hash::{FxHashMap, FxHashSet};
 use std::hash::Hash;
 
 #[derive(Debug, PartialEq)]
@@ -10,6 +10,8 @@ pub enum DiffResult<T> {
     Right(T),
 }
 
+/// Compute a linear diff between `lhs` and `rhs`. This is the
+/// traditional Myer's diff algorithm.
 pub fn slice<'a, T: PartialEq + Clone>(lhs: &'a [T], rhs: &'a [T]) -> Vec<DiffResult<&'a T>> {
     wu_diff::diff(lhs, rhs)
         .into_iter()
@@ -25,12 +27,18 @@ pub fn slice<'a, T: PartialEq + Clone>(lhs: &'a [T], rhs: &'a [T]) -> Vec<DiffRe
         .collect::<Vec<_>>()
 }
 
-/// Compute a unique numeric value for each item, use that for
-/// diffing, then return diff results in terms of the original type.
+/// Compute a linear diff between `lhs` and `rhs`, but use hashed
+/// values internally.
 ///
-/// This is the decorate-sort-undecorate pattern, or Schwartzian
-/// transform, for diffing.
+/// This is faster when equality checks on `T` are expensive, such as
+/// large strings.
 pub fn slice_by_hash<'a, T: Eq + Hash>(lhs: &'a [T], rhs: &'a [T]) -> Vec<DiffResult<&'a T>> {
+    // Compute a unique numeric value for each item, use that for
+    // diffing, then return diff results in terms of the original
+    // type.
+    //
+    // This is the decorate-sort-undecorate pattern, or Schwartzian
+    // transform, for diffing.
     let mut value_ids: FxHashMap<&T, u32> = FxHashMap::default();
     let mut id_values: FxHashMap<u32, &T> = FxHashMap::default();
 
@@ -75,6 +83,101 @@ pub fn slice_by_hash<'a, T: Eq + Hash>(lhs: &'a [T], rhs: &'a [T]) -> Vec<DiffRe
         .collect::<Vec<_>>()
 }
 
+/// Compute the linear diff between `lhs` and `rhs`. If there are
+/// items that only occur on a single side, mark them as novel without
+/// processing them with Myer's diff.
+///
+/// This is substantially faster than `slice`, when `lhs` and `rhs`
+/// have few items in common.
+///
+/// (This heuristic is used in traditional diff tools too, such as GNU
+/// diff.)
+pub fn slice_unique_by_hash<'a, T: Eq + Clone + Hash>(
+    lhs: &'a [T],
+    rhs: &'a [T],
+) -> Vec<DiffResult<&'a T>> {
+    let mut lhs_set = FxHashSet::default();
+    for item in lhs {
+        lhs_set.insert(item);
+    }
+    let mut rhs_set = FxHashSet::default();
+    for item in rhs {
+        rhs_set.insert(item);
+    }
+
+    let lhs_without_unique: Vec<&'a T> = lhs.iter().filter(|n| rhs_set.contains(n)).collect();
+    let rhs_without_unique: Vec<&'a T> = rhs.iter().filter(|n| lhs_set.contains(n)).collect();
+
+    let mut res: Vec<DiffResult<&'a T>> = Vec::with_capacity(lhs.len());
+    let mut lhs_i = 0;
+    let mut rhs_i = 0;
+
+    for item in slice_by_hash(&lhs_without_unique, &rhs_without_unique) {
+        match item {
+            DiffResult::Left(lhs_item) => {
+                while lhs_i < lhs.len() {
+                    if &lhs[lhs_i] != *lhs_item {
+                        res.push(DiffResult::Left(&lhs[lhs_i]));
+                        lhs_i += 1;
+                    } else {
+                        break;
+                    }
+                }
+
+                res.push(DiffResult::Left(*lhs_item));
+                lhs_i += 1;
+            }
+            DiffResult::Both(lhs_item, rhs_item) => {
+                while lhs_i < lhs.len() {
+                    if &lhs[lhs_i] != *lhs_item {
+                        res.push(DiffResult::Left(&lhs[lhs_i]));
+                        lhs_i += 1;
+                    } else {
+                        break;
+                    }
+                }
+
+                while rhs_i < rhs.len() {
+                    if &rhs[rhs_i] != *rhs_item {
+                        res.push(DiffResult::Right(&rhs[rhs_i]));
+                        rhs_i += 1;
+                    } else {
+                        break;
+                    }
+                }
+
+                res.push(DiffResult::Both(*lhs_item, *rhs_item));
+                lhs_i += 1;
+                rhs_i += 1;
+            }
+            DiffResult::Right(rhs_item) => {
+                while rhs_i < rhs.len() {
+                    if &rhs[rhs_i] != *rhs_item {
+                        res.push(DiffResult::Right(&rhs[rhs_i]));
+                        rhs_i += 1;
+                    } else {
+                        break;
+                    }
+                }
+
+                res.push(DiffResult::Right(*rhs_item));
+                rhs_i += 1;
+            }
+        }
+    }
+
+    while lhs_i < lhs.len() {
+        res.push(DiffResult::Left(&lhs[lhs_i]));
+        lhs_i += 1;
+    }
+    while rhs_i < rhs.len() {
+        res.push(DiffResult::Right(&rhs[rhs_i]));
+        rhs_i += 1;
+    }
+
+    res
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -124,4 +227,27 @@ mod tests {
             ]
         );
     }
+
+    #[test]
+    fn test_slice_unique_same_items() {
+        let diff_items = slice_unique_by_hash(&["a", "b"], &["a", "b"]);
+        assert_eq!(
+            diff_items,
+            vec![DiffResult::Both(&"a", &"a"), DiffResult::Both(&"b", &"b")]
+        );
+    }
+
+    #[test]
+    fn test_slice_unique_different_items() {
+        let diff_items = slice_unique_by_hash(&["a", "b"], &["c", "d"]);
+        assert_eq!(
+            diff_items,
+            vec![
+                DiffResult::Left(&"a"),
+                DiffResult::Left(&"b"),
+                DiffResult::Right(&"c"),
+                DiffResult::Right(&"d"),
+            ]
+        );
+    }
 }
diff --git a/src/line_parser.rs b/src/line_parser.rs
@@ -78,7 +78,7 @@ fn changed_parts<'a>(
     let opposite_src_lines = split_lines_keep_newline(opposite_src);
 
     let mut res: Vec<(TextChangeKind, Vec<&'a str>, Vec<&'a str>)> = vec![];
-    for diff_res in myers_diff::slice_by_hash(&src_lines, &opposite_src_lines) {
+    for diff_res in myers_diff::slice_unique_by_hash(&src_lines, &opposite_src_lines) {
         match diff_res {
             myers_diff::DiffResult::Left(line) => {
                 res.push((TextChangeKind::Novel, vec![line], vec![]));
@@ -141,7 +141,7 @@ pub fn change_positions(lhs_src: &str, rhs_src: &str) -> Vec<MatchedPos> {
                 let lhs_part = lhs_lines.join("");
                 let rhs_part = rhs_lines.join("");
 
-                for diff_res in myers_diff::slice(&split_words(&lhs_part), &split_words(&rhs_part))
+                for diff_res in myers_diff::slice_unique_by_hash(&split_words(&lhs_part), &split_words(&rhs_part))
                 {
                     match diff_res {
                         myers_diff::DiffResult::Left(lhs_word) => {

diff --git a/src/parse/syntax.rs b/src/parse/syntax.rs
@@ -674,7 +674,7 @@ fn split_comment_words(
     let mut opposite_offset = 0;
 
     let mut res = vec![];
-    for diff_res in myers_diff::slice(&content_parts, &other_parts) {
+    for diff_res in myers_diff::slice_by_hash(&content_parts, &other_parts) {
         match diff_res {
             myers_diff::DiffResult::Left(word) => {
                 // This word is novel to this side.