Skip to content

Commit

Permalink
Skip unique items before computing the Myers diff on text
Browse files Browse the repository at this point in the history
This substantially improves performance on text files where there are
few lines in common.

For example, diffing two 10,000-line files with no lines in common is
more than 10x faster (8.5 seconds down to 0.49 seconds on my machine),
and diffing sample_files/huge_cpp_before.cpp is nearly 2% faster.

Fixes the case mentioned by @quackenbush in #236.

This is inspired by the heuristics discussions at
mitsuhiko/similar#15
  • Loading branch information
Wilfred committed Jan 15, 2023
1 parent c08eefb commit 0e3c57c
Show file tree
Hide file tree
Showing 5 changed files with 140 additions and 9 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,11 @@

Improved CSS parsing and HTML sublanguage parsing.

### Diffing

Improved textual diffing performance, particularly when the two files
have few lines in common.

### Display

Fixed an issue with unwanted underlines with textual diffing when
Expand Down
2 changes: 1 addition & 1 deletion sample_files/compare.expected
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ sample_files/html_simple_before.html sample_files/html_simple_after.html
ce3bfa12bc21d0eb5528766e18387e86 -

sample_files/huge_cpp_before.cpp sample_files/huge_cpp_after.cpp
8910dbf7dae13b1a7229b0497602b414 -
a85613f8c3cad686d592a276cad8d883 -

sample_files/identical_before.scala sample_files/identical_after.scala
9c7319f61833e46a0a8cb6c01cc997c9 -
Expand Down
136 changes: 131 additions & 5 deletions src/diff/myers_diff.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
//! A fast diff for linear content, using the Myers diff algorithm.

use rustc_hash::FxHashMap;
use rustc_hash::{FxHashMap, FxHashSet};
use std::hash::Hash;

#[derive(Debug, PartialEq)]
Expand All @@ -10,6 +10,8 @@ pub enum DiffResult<T> {
Right(T),
}

/// Compute a linear diff between `lhs` and `rhs`. This is the
/// traditional Myers diff algorithm.
pub fn slice<'a, T: PartialEq + Clone>(lhs: &'a [T], rhs: &'a [T]) -> Vec<DiffResult<&'a T>> {
wu_diff::diff(lhs, rhs)
.into_iter()
Expand All @@ -25,12 +27,18 @@ pub fn slice<'a, T: PartialEq + Clone>(lhs: &'a [T], rhs: &'a [T]) -> Vec<DiffRe
.collect::<Vec<_>>()
}

/// Compute a unique numeric value for each item, use that for
/// diffing, then return diff results in terms of the original type.
/// Compute a linear diff between `lhs` and `rhs`, but use hashed
/// values internally.
///
/// This is the decorate-sort-undecorate pattern, or Schwartzian
/// transform, for diffing.
/// This is faster when equality checks on `T` are expensive, such as
/// large strings.
pub fn slice_by_hash<'a, T: Eq + Hash>(lhs: &'a [T], rhs: &'a [T]) -> Vec<DiffResult<&'a T>> {
// Compute a unique numeric value for each item, use that for
// diffing, then return diff results in terms of the original
// type.
//
// This is the decorate-sort-undecorate pattern, or Schwartzian
// transform, for diffing.
let mut value_ids: FxHashMap<&T, u32> = FxHashMap::default();
let mut id_values: FxHashMap<u32, &T> = FxHashMap::default();

Expand Down Expand Up @@ -75,6 +83,101 @@ pub fn slice_by_hash<'a, T: Eq + Hash>(lhs: &'a [T], rhs: &'a [T]) -> Vec<DiffRe
.collect::<Vec<_>>()
}

/// Compute the linear diff between `lhs` and `rhs`. If there are
/// items that only occur on a single side, mark them as novel without
/// processing them with Myer's diff.
///
/// This is substantially faster than `slice`, when `lhs` and `rhs`
/// have few items in common.
///
/// (This heuristic is used in traditional diff tools too, such as GNU
/// diff.)
pub fn slice_unique_by_hash<'a, T: Eq + Clone + Hash>(
lhs: &'a [T],
rhs: &'a [T],
) -> Vec<DiffResult<&'a T>> {
let mut lhs_set = FxHashSet::default();
for item in lhs {
lhs_set.insert(item);
}
let mut rhs_set = FxHashSet::default();
for item in rhs {
rhs_set.insert(item);
}

let lhs_without_unique: Vec<&'a T> = lhs.iter().filter(|n| rhs_set.contains(n)).collect();
let rhs_without_unique: Vec<&'a T> = rhs.iter().filter(|n| lhs_set.contains(n)).collect();

let mut res: Vec<DiffResult<&'a T>> = Vec::with_capacity(lhs.len());
let mut lhs_i = 0;
let mut rhs_i = 0;

for item in slice_by_hash(&lhs_without_unique, &rhs_without_unique) {
match item {
DiffResult::Left(lhs_item) => {
while lhs_i < lhs.len() {
if &lhs[lhs_i] != *lhs_item {
res.push(DiffResult::Left(&lhs[lhs_i]));
lhs_i += 1;
} else {
break;
}
}

res.push(DiffResult::Left(*lhs_item));
lhs_i += 1;
}
DiffResult::Both(lhs_item, rhs_item) => {
while lhs_i < lhs.len() {
if &lhs[lhs_i] != *lhs_item {
res.push(DiffResult::Left(&lhs[lhs_i]));
lhs_i += 1;
} else {
break;
}
}

while rhs_i < rhs.len() {
if &rhs[rhs_i] != *rhs_item {
res.push(DiffResult::Right(&rhs[rhs_i]));
rhs_i += 1;
} else {
break;
}
}

res.push(DiffResult::Both(*lhs_item, *rhs_item));
lhs_i += 1;
rhs_i += 1;
}
DiffResult::Right(rhs_item) => {
while rhs_i < rhs.len() {
if &rhs[rhs_i] != *rhs_item {
res.push(DiffResult::Right(&rhs[rhs_i]));
rhs_i += 1;
} else {
break;
}
}

res.push(DiffResult::Right(*rhs_item));
rhs_i += 1;
}
}
}

while lhs_i < lhs.len() {
res.push(DiffResult::Left(&lhs[lhs_i]));
lhs_i += 1;
}
while rhs_i < rhs.len() {
res.push(DiffResult::Right(&rhs[rhs_i]));
rhs_i += 1;
}

res
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down Expand Up @@ -124,4 +227,27 @@ mod tests {
]
);
}

#[test]
fn test_slice_unique_same_items() {
    // Identical inputs: every item should be reported as present on
    // both sides, in order.
    let lhs = ["a", "b"];
    let rhs = ["a", "b"];

    let expected = vec![DiffResult::Both(&"a", &"a"), DiffResult::Both(&"b", &"b")];
    assert_eq!(slice_unique_by_hash(&lhs, &rhs), expected);
}

#[test]
fn test_slice_unique_different_items() {
    // Nothing in common: all left-hand items come out first as
    // `Left`, followed by all right-hand items as `Right`.
    let lhs = ["a", "b"];
    let rhs = ["c", "d"];

    let expected = vec![
        DiffResult::Left(&"a"),
        DiffResult::Left(&"b"),
        DiffResult::Right(&"c"),
        DiffResult::Right(&"d"),
    ];
    assert_eq!(slice_unique_by_hash(&lhs, &rhs), expected);
}
}
4 changes: 2 additions & 2 deletions src/line_parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ fn changed_parts<'a>(
let opposite_src_lines = split_lines_keep_newline(opposite_src);

let mut res: Vec<(TextChangeKind, Vec<&'a str>, Vec<&'a str>)> = vec![];
for diff_res in myers_diff::slice_by_hash(&src_lines, &opposite_src_lines) {
for diff_res in myers_diff::slice_unique_by_hash(&src_lines, &opposite_src_lines) {
match diff_res {
myers_diff::DiffResult::Left(line) => {
res.push((TextChangeKind::Novel, vec![line], vec![]));
Expand Down Expand Up @@ -141,7 +141,7 @@ pub fn change_positions(lhs_src: &str, rhs_src: &str) -> Vec<MatchedPos> {
let lhs_part = lhs_lines.join("");
let rhs_part = rhs_lines.join("");

for diff_res in myers_diff::slice(&split_words(&lhs_part), &split_words(&rhs_part))
for diff_res in myers_diff::slice_unique_by_hash(&split_words(&lhs_part), &split_words(&rhs_part))
{
match diff_res {
myers_diff::DiffResult::Left(lhs_word) => {
Expand Down
2 changes: 1 addition & 1 deletion src/parse/syntax.rs
Original file line number Diff line number Diff line change
Expand Up @@ -674,7 +674,7 @@ fn split_comment_words(
let mut opposite_offset = 0;

let mut res = vec![];
for diff_res in myers_diff::slice(&content_parts, &other_parts) {
for diff_res in myers_diff::slice_by_hash(&content_parts, &other_parts) {
match diff_res {
myers_diff::DiffResult::Left(word) => {
// This word is novel to this side.
Expand Down

0 comments on commit 0e3c57c

Please sign in to comment.