Skip to content

Commit b0838cb

Browse files
authored
Add Bitap pattern matching (#458)
1 parent 327af3d commit b0838cb

File tree

3 files changed

+363
-0
lines changed

3 files changed

+363
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
using System;
2+
using Algorithms.Strings.PatternMatching;
3+
using NUnit.Framework;
4+
5+
namespace Algorithms.Tests.Strings.PatternMatching;
6+
7+
/// <summary>
/// Unit tests for <see cref="Bitap"/>, covering both the exact search
/// (<see cref="Bitap.FindExactPattern"/>) and the mismatch-tolerant search
/// (<see cref="Bitap.FindFuzzyPattern"/>).
/// </summary>
[TestFixture]
public class BitapTests
{
    // ---------------------------------------------------------------
    // Exact matching
    // ---------------------------------------------------------------

    [Test]
    public void FindExactPattern_EmptyTextReturnsError()
    {
        var position = Bitap.FindExactPattern("", "abc");

        Assert.That(position, Is.EqualTo(-1));
    }

    [Test]
    public void FindExactPattern_EmptyPatternReturnsZero()
    {
        var position = Bitap.FindExactPattern("abc", "");

        Assert.That(position, Is.EqualTo(0));
    }

    [Test]
    public void FindExactPattern_PatternFoundAtBeginning()
    {
        var position = Bitap.FindExactPattern("hello world", "hello");

        Assert.That(position, Is.EqualTo(0));
    }

    [Test]
    public void FindExactPattern_PatternFoundInTheMiddle()
    {
        var position = Bitap.FindExactPattern("abcabc", "cab");

        Assert.That(position, Is.EqualTo(2));
    }

    [Test]
    public void FindExactPattern_PatternFoundAtEnd()
    {
        var position = Bitap.FindExactPattern("the end", "end");

        Assert.That(position, Is.EqualTo(4));
    }

    [Test]
    public void FindExactPattern_PatternNotFound()
    {
        var position = Bitap.FindExactPattern("abcdefg", "xyz");

        Assert.That(position, Is.EqualTo(-1));
    }

    [Test]
    public void FindExactPattern_PatternLongerThanText()
    {
        var position = Bitap.FindExactPattern("short", "longerpattern");

        Assert.That(position, Is.EqualTo(-1));
    }

    [Test]
    public void FindExactPattern_OverlappingPatterns()
    {
        // Two overlapping occurrences exist (at 0 and 2); the first one wins.
        var position = Bitap.FindExactPattern("ababab", "abab");

        Assert.That(position, Is.EqualTo(0));
    }

    [Test]
    public void FindExactPattern_PatternTooLongThrowsException()
    {
        // 32 characters is one more than the exact search accepts.
        var tooLong = new string('a', 32);
        TestDelegate search = () => Bitap.FindExactPattern("some text", tooLong);

        Assert.Throws<ArgumentException>(search);
    }

    [Test]
    public void FindExactPattern_SpecialCharactersInPattern()
    {
        var position = Bitap.FindExactPattern("hello, world!", ", wo");

        Assert.That(position, Is.EqualTo(5));
    }

    // ---------------------------------------------------------------
    // Fuzzy matching
    // ---------------------------------------------------------------

    [Test]
    public void FindFuzzyPattern_EmptyTextReturnsZero()
    {
        var position = Bitap.FindFuzzyPattern("", "abc", 1);

        Assert.That(position, Is.EqualTo(0));
    }

    [Test]
    public void FindFuzzyPattern_EmptyPatternReturnsZero()
    {
        var position = Bitap.FindFuzzyPattern("def", "", 1);

        Assert.That(position, Is.EqualTo(0));
    }

    [Test]
    public void FindFuzzyPattern_ExactMatchFound()
    {
        var position = Bitap.FindFuzzyPattern("hello world", "hello", 0);

        Assert.That(position, Is.EqualTo(0));
    }

    [Test]
    public void FindFuzzyPattern_FuzzyMatchWithOneMismatch()
    {
        var position = Bitap.FindFuzzyPattern("hello world", "hellp", 1);

        Assert.That(position, Is.EqualTo(0));
    }

    [Test]
    public void FindFuzzyPattern_FuzzyMatchWithMultipleMismatches()
    {
        var position = Bitap.FindFuzzyPattern("abcde", "xbcdz", 2);

        Assert.That(position, Is.EqualTo(0));
    }

    [Test]
    public void FindFuzzyPattern_FuzzyMatchAtEnd()
    {
        var position = Bitap.FindFuzzyPattern("abcdefg", "efx", 1);

        Assert.That(position, Is.EqualTo(4));
    }

    [Test]
    public void FindFuzzyPattern_FuzzyMatchNotFound()
    {
        var position = Bitap.FindFuzzyPattern("abcdefg", "xyz", 2);

        Assert.That(position, Is.EqualTo(-1));
    }

    [Test]
    public void FindFuzzyPattern_PatternTooLongReturnsNegativeOne()
    {
        // Unlike the exact search, the fuzzy search reports an over-long pattern with -1.
        var tooLong = new string('a', 32);

        var position = Bitap.FindFuzzyPattern("some text", tooLong, 1);

        Assert.That(position, Is.EqualTo(-1));
    }
}
+243
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,243 @@
1+
using System;
using System.Collections.Generic;
2+
3+
namespace Algorithms.Strings.PatternMatching;
4+
5+
/// <summary>
/// The Bitap (shift-or) algorithm is a string matching technique that can also find approximate matches of a
/// pattern within a text, allowing for a bounded number of mismatched characters (e.g., mistypes). It is known
/// for its efficiency, because the whole matching state fits in a machine word and is updated with a few bitwise
/// operations per text character.
///
/// <para>
/// <b>How it works:</b>
/// <list type="number">
/// <item>
/// <term>Initialization</term>
/// <description>
/// A bitmask is created for each distinct character of the pattern. Bit <c>i</c> of a character's mask is 0 when
/// the pattern has that character at position <c>i</c> and 1 otherwise; characters that do not occur in the
/// pattern use a mask of all 1s. The state register <c>R</c> starts as <c>~1</c>: only bit 0 is clear, meaning
/// that only the empty prefix of the pattern has been matched so far.
/// </description>
/// </item>
/// <item>
/// <term>Iteration</term>
/// <description>
/// For each character of the text, <c>R</c> is ORed with that character's mask and shifted left by one.
/// Afterwards, bit <c>j</c> of <c>R</c> is 0 exactly when the first <c>j</c> pattern characters match the text
/// ending at the current position.
/// </description>
/// </item>
/// <item>
/// <term>Matching</term>
/// <description>
/// After each iteration the bit at index <c>pattern.Length</c> is tested. When it is 0, the whole pattern ends at
/// the current text position and the start index of that occurrence is returned.
/// </description>
/// </item>
/// </list>
/// </para>
/// <para>
/// <b>Fuzzy matching</b>
/// </para>
/// <para>
/// The fuzzy search keeps one register per allowed number of mismatches (<c>0..threshold</c>). Register <c>d</c>
/// tracks pattern prefixes that match with at most <c>d</c> substituted characters; insertions and deletions are
/// not considered.
/// </para>
/// </summary>
public static class Bitap
{
    /// <summary>
    /// Longest supported pattern. The match flag lives at bit index <c>pattern.Length</c> of a 32-bit
    /// <see cref="int"/>, so that index must not exceed 31.
    /// </summary>
    private const int MaxPatternLength = 31;

    /// <summary>
    /// Finds the first exact occurrence of <paramref name="pattern"/> in <paramref name="text"/> using the
    /// Bitap algorithm. Any UTF-16 character may appear in either string.
    /// </summary>
    /// <param name="text">The text to search in. A null or empty text contains no non-empty pattern.</param>
    /// <param name="pattern">The pattern to search for. A null or empty pattern matches at index 0.</param>
    /// <returns>The index of the first occurrence of the pattern in the text, or -1 if not found.</returns>
    /// <exception cref="ArgumentException">The pattern is longer than 31 characters.</exception>
    public static int FindExactPattern(string text, string pattern)
    {
        // The pattern is validated before anything is dereferenced, so a null pattern is treated as empty.
        if (string.IsNullOrEmpty(pattern))
        {
            return 0;
        }

        if (pattern.Length > MaxPatternLength)
        {
            throw new ArgumentException("The pattern is longer than 31 characters.");
        }

        if (string.IsNullOrEmpty(text))
        {
            return -1;
        }

        var patternLength = pattern.Length;
        var masks = BuildPatternMasks(pattern);

        // A zero at this bit means "all patternLength characters matched".
        var matchBit = 1 << patternLength;

        // Only bit 0 is clear: nothing but the empty prefix is matched yet.
        var state = ~1;

        for (var index = 0; index < text.Length; ++index)
        {
            // A surviving zero at bit j means the prefix of length j can be extended by this character;
            // the shift then records it as a matched prefix of length j + 1 and re-opens bit 0.
            state |= MaskFor(masks, text[index]);
            state <<= 1;

            if ((state & matchBit) == 0)
            {
                // index is the last character of the occurrence; convert to its first character.
                return index - patternLength + 1;
            }
        }

        return -1;
    }

    /// <summary>
    /// Finds the first position in <paramref name="text"/> where <paramref name="pattern"/> matches with at most
    /// <paramref name="threshold"/> substituted characters. Any UTF-16 character may appear in either string.
    /// </summary>
    /// <param name="text">The text to search in. If it is null or empty, 0 is returned.</param>
    /// <param name="pattern">The pattern to search for. If it is null or empty, 0 is returned.</param>
    /// <param name="threshold">
    /// The maximum number of mismatched characters allowed. Values larger than the pattern length behave like
    /// the pattern length, since no window can have more mismatches than it has characters.
    /// </param>
    /// <returns>
    /// The index of the first character of the earliest-ending match, or -1 if there is no match or the pattern
    /// is longer than 31 characters.
    /// </returns>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="threshold"/> is negative and a search actually has to be performed.
    /// </exception>
    public static int FindFuzzyPattern(string text, string pattern, int threshold)
    {
        // These early exits run before the threshold is inspected so that every call that used to
        // return a value keeps returning the same value.
        if (string.IsNullOrEmpty(text))
        {
            return 0;
        }

        if (string.IsNullOrEmpty(pattern))
        {
            return 0;
        }

        // Unlike FindExactPattern, an over-long pattern is reported as "not found" here.
        if (pattern.Length > MaxPatternLength)
        {
            return -1;
        }

        if (threshold < 0)
        {
            throw new ArgumentOutOfRangeException(
                nameof(threshold),
                threshold,
                "The threshold must not be negative.");
        }

        var patternLength = pattern.Length;

        // More allowed mismatches than pattern characters cannot change the result; capping the value
        // keeps the register array small and avoids integer overflow when sizing it.
        var maxMismatches = Math.Min(threshold, patternLength);

        var masks = BuildPatternMasks(pattern);
        var matchBit = 1 << patternLength;

        // states[d] tracks prefixes matched with at most d mismatches; each starts with only bit 0 clear.
        var states = new int[maxMismatches + 1];
        for (var d = 0; d < states.Length; ++d)
        {
            states[d] = ~1;
        }

        for (var index = 0; index < text.Length; ++index)
        {
            var charMask = MaskFor(masks, text[index]);

            // Value of the previous level's register before this character was processed.
            var previousLevel = states[0];

            states[0] = (states[0] | charMask) << 1;

            for (var d = 1; d <= maxMismatches; ++d)
            {
                var beforeUpdate = states[d];

                // A prefix survives at level d either by matching this character at level d, or by
                // spending one substitution on top of a prefix that was alive at level d - 1.
                states[d] = (previousLevel & (beforeUpdate | charMask)) << 1;
                previousLevel = beforeUpdate;
            }

            if ((states[maxMismatches] & matchBit) == 0)
            {
                // index is the last character of the match; convert to its first character.
                return index - patternLength + 1;
            }
        }

        return -1;
    }

    /// <summary>
    /// Builds the per-character bitmasks for <paramref name="pattern"/>: bit <c>i</c> of a character's mask is 0
    /// exactly when <c>pattern[i]</c> is that character.
    /// </summary>
    /// <param name="pattern">A non-empty pattern of at most 31 characters.</param>
    /// <returns>The masks of all characters that occur in the pattern.</returns>
    private static Dictionary<char, int> BuildPatternMasks(string pattern)
    {
        var masks = new Dictionary<char, int>();

        for (var position = 0; position < pattern.Length; ++position)
        {
            var symbol = pattern[position];

            if (!masks.TryGetValue(symbol, out var mask))
            {
                // First time this character is seen: start from "matches nowhere".
                mask = ~0;
            }

            masks[symbol] = mask & ~(1 << position);
        }

        return masks;
    }

    /// <summary>
    /// Returns the bitmask for <paramref name="symbol"/>, or all 1s when the character does not occur in the
    /// pattern the masks were built from.
    /// </summary>
    /// <param name="masks">Masks produced by <see cref="BuildPatternMasks"/>.</param>
    /// <param name="symbol">The text character being processed.</param>
    /// <returns>The mask to OR into the state register.</returns>
    private static int MaskFor(Dictionary<char, int> masks, char symbol)
    {
        return masks.TryGetValue(symbol, out var mask) ? mask : ~0;
    }
}

README.md

+1
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,7 @@ find more than one implementation for the same objective but using different alg
182182
* [Jaro Similarity](./Algorithms/Strings/Similarity/JaroSimilarity.cs)
183183
* [Jaro-Winkler Distance](./Algorithms/Strings/Similarity/JaroWinklerDistance.cs)
184184
* [Pattern Matching](./Algorithms/Strings/PatternMatching/)
185+
* [Bitap Pattern Matching](./Algorithms/Strings/PatternMatching/Bitap.cs)
185186
* [Naive String Search](./Algorithms/Strings/PatternMatching/NaiveStringSearch.cs)
186187
* [Rabin Karp](./Algorithms/Strings/PatternMatching/RabinKarp.cs)
187188
* [Boyer Moore](./Algorithms/Strings/PatternMatching/BoyerMoore.cs)

0 commit comments

Comments
 (0)