Skip to content

Commit 9104ca6

Browse files
committed
cleanup semantic lossless
1 parent b717ef4 commit 9104ca6

1 file changed

Lines changed: 120 additions & 97 deletions

File tree

DiffMatchPatch.zig

Lines changed: 120 additions & 97 deletions
Original file line number | Diff line number | Diff line change
@@ -594,96 +594,13 @@ fn diffCharsToLines(allocator: std.mem.Allocator, diffs: []Diff, line_array: []c
594594
for (diffs) |d| {
595595
text.items.len = 0;
596596
var j: usize = 0;
597-
while (j < diff.text.Length) : (j += 1) {
597+
while (j < diff.text.len) : (j += 1) {
598598
try text.append(allocator, line_array[d.text[j]]);
599599
}
600600
d.text = text;
601601
}
602602
}
603603

604-
//
605-
// Do a quick line-level diff on both strings, then rediff the parts for
606-
// greater accuracy.
607-
// This speedup can produce non-minimal diffs.
608-
// @param text1 Old string to be diffed.
609-
// @param text2 New string to be diffed.
610-
// @param deadline Time when the diff should be complete by.
611-
// @return List of Diff objects.
612-
//
613-
fn diff_lineMode(
614-
text1: []const u8,
615-
text2: []const u8,
616-
deadline: u64,
617-
) DiffError!ArrayListUnmanaged(Diff) {
618-
// Scan the text on a line-by-line basis first.
619-
var a = diff_linesToChars(text1, text2);
620-
text1 = a[0];
621-
text2 = a[1];
622-
var linearray = a[2];
623-
624-
var diffs: std.ArrayListUnmanaged(Diff) =
625-
diff_main(text1, text2, false, deadline);
626-
627-
// Convert the diff back to original text.
628-
diff_charsToLines(diffs, linearray);
629-
// Eliminate freak matches (e.g. blank lines)
630-
diff_cleanupSemantic(diffs);
631-
632-
// Rediff any replacement blocks, this time character-by-character.
633-
// Add a dummy entry at the end.
634-
try diffs.append(allocator, Diff(.equal, ""));
635-
var pointer: usize = 0;
636-
var count_delete: usize = 0;
637-
var count_insert: usize = 0;
638-
var text_delete: ArrayListUnmanaged(u8) = .{};
639-
var text_insert: ArrayListUnmanaged(u8) = .{};
640-
defer {
641-
text_delete.deinit(allocator);
642-
text_insert.deinit(allocator);
643-
}
644-
while (pointer < diffs.len) {
645-
switch (diffs[pointer].operation) {
646-
.insert => {
647-
count_insert += 1;
648-
// text_insert += diffs[pointer].text;
649-
text_insert.append(allocator, diffs[pointer].text);
650-
},
651-
.delete => {
652-
count_delete += 1;
653-
// text_delete += diffs[pointer].text;
654-
text_delete.append(allocator, diffs[pointer].text);
655-
},
656-
.equal => {
657-
// Upon reaching an equality, check for prior redundancies.
658-
if (count_delete >= 1 and count_insert >= 1) {
659-
// Delete the offending records and add the merged ones.
660-
// diffs.RemoveRange(pointer - count_delete - count_insert, count_delete + count_insert);
661-
diffs.replaceRange(
662-
allocator,
663-
pointer - count_delete - count_insert,
664-
count_delete + count_insert,
665-
&.{},
666-
);
667-
pointer = pointer - count_delete - count_insert;
668-
var subDiff = this.diff_main(text_delete, text_insert, false, deadline);
669-
// diffs.InsertRange(pointer, subDiff);
670-
try diffs.insertSlice(allocator, pointer, subDiff);
671-
pointer = pointer + subDiff.items.len;
672-
}
673-
count_insert = 0;
674-
count_delete = 0;
675-
text_delete.items.len = 0;
676-
text_insert.items.len = 0;
677-
},
678-
}
679-
pointer += 1;
680-
}
681-
// diffs.RemoveAt(diffs.Count - 1); // Remove the dummy entry at the end.
682-
diffs.items.len -= 1;
683-
684-
return diffs;
685-
}
686-
687604
//
688605
// Reorder and merge like edit sections. Merge equalities.
689606
// Any edit section can move as long as it doesn't cross an equality.
@@ -752,7 +669,7 @@ fn diffCleanupMerge(diffs: std.ArrayListUnmanaged(Diff), allocator: mem.Allocato
752669
try diffs.replaceRange(allocator, pointer, 0, &.{Diff{ .operation = .delete, .text = text_delete }});
753670
pointer += 1;
754671
}
755-
if (text_insert.Length != 0) {
672+
if (text_insert.len != 0) {
756673
try diffs.replaceRange(allocator, pointer, 0, &.{Diff{ .operation = .insert, .text = text_insert }});
757674
pointer += 1;
758675
}
@@ -789,16 +706,16 @@ fn diffCleanupMerge(diffs: std.ArrayListUnmanaged(Diff), allocator: mem.Allocato
789706
if (mem.endsWith(u8, diffs[pointer].text.items, diffs[pointer - 1].text.items)) {
790707
// Shift the edit over the previous equality.
791708
diffs[pointer].text = diffs[pointer - 1].text +
792-
diffs[pointer].text.Substring(0, diffs[pointer].text.Length -
793-
diffs[pointer - 1].text.Length);
709+
diffs[pointer].text[0 .. diffs[pointer].text.len -
710+
diffs[pointer - 1].text.len];
794711
diffs[pointer + 1].text = diffs[pointer - 1].text + diffs[pointer + 1].text;
795712
try diffs.replaceRange(allocator, pointer - 1, 1, &.{});
796713
changes = true;
797714
} else if (mem.startsWith(u8, diffs[pointer].text.items, diffs[pointer + 1].text.items)) {
798715
// Shift the edit over the next equality.
799716
diffs[pointer - 1].text += diffs[pointer + 1].text;
800717
diffs[pointer].text =
801-
diffs[pointer].text.Substring(diffs[pointer + 1].text.Length) + diffs[pointer + 1].text;
718+
diffs[pointer].text[diffs[pointer + 1].text.len..] + diffs[pointer + 1].text;
802719
try diffs.replaceRange(allocator, pointer + 1, 1, &.{});
803720
changes = true;
804721
}
@@ -834,13 +751,13 @@ fn diffCleanupSemantic(allocator: std.mem.Allocator, diffs: ArrayListUnmanaged(D
834751
lastEquality = diffs.items[pointer].text;
835752
} else { // an insertion or deletion
836753
if (diffs.items[pointer].operation == .equal) {
837-
length_insertions2 += diffs.items[pointer].text.Length;
754+
length_insertions2 += diffs.items[pointer].text.len;
838755
} else {
839-
length_deletions2 += diffs.items[pointer].text.Length;
756+
length_deletions2 += diffs.items[pointer].text.len;
840757
}
841758
// Eliminate an equality that is smaller or equal to the edits on both
842759
// sides of it.
843-
if (lastEquality != null and (lastEquality.Length <= std.math.max(length_insertions1, length_deletions1)) and (lastEquality.length <= std.math.max(length_insertions2, length_deletions2))) {
760+
if (lastEquality != null and (lastEquality.len <= std.math.max(length_insertions1, length_deletions1)) and (lastEquality.len <= std.math.max(length_insertions2, length_deletions2))) {
844761
// Duplicate record.
845762
diffs.Insert(equalities.Peek(), Diff{ .operation = .delete, .text = lastEquality });
846763
// Change second copy to insert.
@@ -884,27 +801,27 @@ fn diffCleanupSemantic(allocator: std.mem.Allocator, diffs: ArrayListUnmanaged(D
884801
var overlap_length1: isize = diff_commonOverlap(deletion, insertion);
885802
var overlap_length2: isize = diff_commonOverlap(insertion, deletion);
886803
if (overlap_length1 >= overlap_length2) {
887-
if (overlap_length1 >= deletion.Length / 2.0 or
888-
overlap_length1 >= insertion.Length / 2.0)
804+
if (overlap_length1 >= deletion.len / 2.0 or
805+
overlap_length1 >= insertion.len / 2.0)
889806
{
890807
// Overlap found.
891808
// Insert an equality and trim the surrounding edits.
892809
diffs.Insert(pointer, Diff{ .operation = .equal, .text = insertion.Substring(0, overlap_length1) });
893810
diffs.items[pointer - 1].text =
894-
deletion.Substring(0, deletion.Length - overlap_length1);
811+
deletion.Substring(0, deletion.len - overlap_length1);
895812
diffs.items[pointer + 1].text = insertion.Substring(overlap_length1);
896813
pointer += 1;
897814
}
898815
} else {
899-
if (overlap_length2 >= deletion.Length / 2.0 or
900-
overlap_length2 >= insertion.Length / 2.0)
816+
if (overlap_length2 >= deletion.len / 2.0 or
817+
overlap_length2 >= insertion.len / 2.0)
901818
{
902819
// Reverse overlap found.
903820
// Insert an equality and swap and trim the surrounding edits.
904821
diffs.Insert(pointer, Diff{ .operation = .equal, .text = deletion.Substring(0, overlap_length2) });
905822
diffs.items[pointer - 1].operation = Operation.INSERT;
906823
diffs.items[pointer - 1].text =
907-
insertion.Substring(0, insertion.Length - overlap_length2);
824+
insertion.Substring(0, insertion.len - overlap_length2);
908825
diffs.items[pointer + 1].operation = Operation.DELETE;
909826
diffs.items[pointer + 1].text = deletion.Substring(overlap_length2);
910827
pointer += 1;
@@ -915,3 +832,109 @@ fn diffCleanupSemantic(allocator: std.mem.Allocator, diffs: ArrayListUnmanaged(D
915832
pointer += 1;
916833
}
917834
}
835+
836+
/// Look for single edits surrounded on both sides by equalities
/// which can be shifted sideways to align the edit to a word boundary.
/// e.g: The c<ins>at c</ins>ame. -> The <ins>cat </ins>came.
///
/// `diffs` is rewritten in place. When a better alignment is found, the
/// texts of the edit and of its neighbouring equalities are replaced by
/// fresh copies made with `allocator`, and an equality that becomes empty
/// is removed from the list. The texts being replaced are not freed here.
/// NOTE(review): this presumes the caller reclaims diff texts in bulk
/// (e.g. with an arena) — confirm against the owner of `diffs`.
///
/// Returns `error.OutOfMemory` if any allocation fails.
pub fn diffCleanupSemanticLossless(
    dmp: DiffMatchPatch,
    allocator: std.mem.Allocator,
    diffs: *ArrayListUnmanaged(Diff),
) error{OutOfMemory}!void {
    var pointer: usize = 1;
    // Intentionally ignore the first and last element (don't need checking).
    // Written as `pointer + 1 < len` so that an empty list does not
    // underflow the unsigned `len - 1`.
    while (pointer + 1 < diffs.items.len) {
        if (diffs.items[pointer - 1].operation == .equal and
            diffs.items[pointer + 1].operation == .equal)
        {
            // This is a single edit surrounded by equalities.
            // Work on private, growable copies of the three texts.
            var equality_1 = std.ArrayListUnmanaged(u8){};
            defer equality_1.deinit(allocator);
            try equality_1.appendSlice(allocator, diffs.items[pointer - 1].text);

            var edit = std.ArrayListUnmanaged(u8){};
            defer edit.deinit(allocator);
            try edit.appendSlice(allocator, diffs.items[pointer].text);

            var equality_2 = std.ArrayListUnmanaged(u8){};
            defer equality_2.deinit(allocator);
            try equality_2.appendSlice(allocator, diffs.items[pointer + 1].text);

            // First, shift the edit as far left as possible.
            // NOTE(review): assumes diffCommonSuffix and
            // diffCleanupSemanticScore take byte slices — confirm.
            const common_offset = dmp.diffCommonSuffix(equality_1.items, edit.items);
            if (common_offset > 0) {
                // TODO: Use buffer
                // Copy the shared tail before truncating `edit`, since the
                // inserts below may reallocate or overwrite its storage.
                const common_string = try allocator.dupe(u8, edit.items[edit.items.len - common_offset ..]);
                defer allocator.free(common_string);

                equality_1.shrinkRetainingCapacity(equality_1.items.len - common_offset);

                // Move the shared tail from the end of the edit to its front.
                edit.shrinkRetainingCapacity(edit.items.len - common_offset);
                try edit.insertSlice(allocator, 0, common_string);

                try equality_2.insertSlice(allocator, 0, common_string);
            }

            // Second, step character by character right,
            // looking for the best fit.
            var best_equality_1 = std.ArrayListUnmanaged(u8){};
            defer best_equality_1.deinit(allocator);
            try best_equality_1.appendSlice(allocator, equality_1.items);

            var best_edit = std.ArrayListUnmanaged(u8){};
            defer best_edit.deinit(allocator);
            try best_edit.appendSlice(allocator, edit.items);

            var best_equality_2 = std.ArrayListUnmanaged(u8){};
            defer best_equality_2.deinit(allocator);
            try best_equality_2.appendSlice(allocator, equality_2.items);

            var best_score = diffCleanupSemanticScore(equality_1.items, edit.items) +
                diffCleanupSemanticScore(edit.items, equality_2.items);

            while (edit.items.len != 0 and
                equality_2.items.len != 0 and
                edit.items[0] == equality_2.items[0])
            {
                // Slide the edit window one byte to the right: its first
                // byte joins the left equality, and the first byte of the
                // right equality joins the end of the edit.
                try equality_1.append(allocator, edit.items[0]);

                _ = edit.orderedRemove(0);
                try edit.append(allocator, equality_2.items[0]);

                _ = equality_2.orderedRemove(0);

                const score = diffCleanupSemanticScore(equality_1.items, edit.items) +
                    diffCleanupSemanticScore(edit.items, equality_2.items);
                // The >= encourages trailing rather than leading whitespace on
                // edits.
                if (score >= best_score) {
                    best_score = score;

                    best_equality_1.clearRetainingCapacity();
                    try best_equality_1.appendSlice(allocator, equality_1.items);

                    best_edit.clearRetainingCapacity();
                    try best_edit.appendSlice(allocator, edit.items);

                    best_equality_2.clearRetainingCapacity();
                    try best_equality_2.appendSlice(allocator, equality_2.items);
                }
            }

            // Compare contents, not pointers: the candidates live in
            // separate buffers from the texts stored in `diffs`.
            if (!std.mem.eql(u8, diffs.items[pointer - 1].text, best_equality_1.items)) {
                // We have an improvement, save it back to the diff.
                if (best_equality_1.items.len != 0) {
                    diffs.items[pointer - 1].text = try allocator.dupe(u8, best_equality_1.items);
                } else {
                    _ = diffs.orderedRemove(pointer - 1);
                    // The edit moved down one slot; keep `pointer` on it.
                    pointer -= 1;
                }
                // Store a copy: `best_edit` itself is freed when this scope
                // ends, so its buffer must not be kept in the diff.
                diffs.items[pointer].text = try allocator.dupe(u8, best_edit.items);
                if (best_equality_2.items.len != 0) {
                    diffs.items[pointer + 1].text = try allocator.dupe(u8, best_equality_2.items);
                } else {
                    _ = diffs.orderedRemove(pointer + 1);
                    pointer -= 1;
                }
            }
        }
        pointer += 1;
    }
}

0 commit comments

Comments
 (0)