Pattern matching is a cornerstone of computer science, enabling efficient search, validation, and transformation of strings. This tutorial walks you through the most widely used pattern‑matching algorithms, their theoretical foundations, practical implementations, and guidance on choosing the right tool for your application.
Understanding Pattern Matching
At its core, pattern matching seeks to locate occurrences of a pattern (also called a needle) inside a larger text (the haystack). While the problem sounds simple, the naïve approach can be prohibitively slow for large inputs, motivating the development of sophisticated algorithms.
Why Pattern Matching Matters
- Search engines indexing billions of web pages
- Intrusion detection systems scanning network traffic
- Compilers performing lexical analysis
- Bioinformatics tools aligning DNA sequences
- Data validation in form inputs and configuration files
Fundamental Algorithms
1️⃣ Naïve String Search
The simplest method slides the pattern across the text one character at a time, comparing characters sequentially. Its worst‑case time complexity is O(m·n), where m is the pattern length and n is the text length.
def naive_search(text, pattern):
    """Brute-force scan: return every index where ``pattern`` starts in ``text``.

    Tries each alignment of the pattern against the text in turn,
    so worst-case time is O(m*n).
    """
    text_len, pat_len = len(text), len(pattern)
    hits = []
    for start in range(text_len - pat_len + 1):
        # all() over an empty generator is True, so an empty pattern
        # matches at every position -- same as the character-loop version.
        if all(text[start + k] == pattern[k] for k in range(pat_len)):
            hits.append(start)
    return hits
/**
 * Brute-force search: returns every start index of {@code pattern}
 * inside {@code text}. Worst-case time is O(m*n).
 */
public static List<Integer> naiveSearch(String text, String pattern) {
    final int textLen = text.length();
    final int patLen = pattern.length();
    List<Integer> hits = new ArrayList<>();
    // Try each alignment of the pattern against the text.
    for (int start = 0; start + patLen <= textLen; start++) {
        boolean matched = true;
        for (int k = 0; k < patLen && matched; k++) {
            if (text.charAt(start + k) != pattern.charAt(k)) {
                matched = false;
            }
        }
        if (matched) {
            hits.add(start);
        }
    }
    return hits;
}
2️⃣ Rabin‑Karp Algorithm
Rabin‑Karp uses a rolling hash to compare the pattern with substrings of the text, achieving an average time complexity of O(n + m). In the worst case (hash collisions), it degrades to O(n·m).
#include <bits/stdc++.h>
using namespace std;
// Modulus and alphabet base for the rolling polynomial hash.
const long long MOD = 1000000007LL;
const long long BASE = 256LL;

/// Returns every start index where `pattern` occurs in `text`.
/// Rabin-Karp rolling hash: average O(n + m); degrades to O(n*m) when
/// hash collisions force character-by-character verification.
std::vector<int> rabinKarp(const std::string& text, const std::string& pattern) {
    int n = text.size(), m = pattern.size();
    if (m > n) return {};
    long long patHash = 0, winHash = 0, power = 1;
    for (int i = 0; i < m; ++i) {
        // Hash through unsigned char: on platforms where plain char is
        // signed, bytes >= 0x80 would otherwise make the hash negative
        // and break the window/pattern comparison.
        patHash = (patHash * BASE + (unsigned char)pattern[i]) % MOD;
        winHash = (winHash * BASE + (unsigned char)text[i]) % MOD;
        if (i > 0) power = (power * BASE) % MOD; // power = BASE^(m-1)
    }
    std::vector<int> occ;
    for (int i = 0; i <= n - m; ++i) {
        // Hash equality is necessary but not sufficient: verify the
        // substring (without allocating) to rule out collisions.
        if (patHash == winHash && text.compare(i, m, pattern) == 0) {
            occ.push_back(i);
        }
        if (i < n - m) {
            // Slide the window: drop text[i], append text[i + m].
            winHash = (winHash - (unsigned char)text[i] * power % MOD + MOD) % MOD;
            winHash = (winHash * BASE + (unsigned char)text[i + m]) % MOD;
        }
    }
    return occ;
}
int main() {
    // Demo: print every match position, space-separated.
    const std::string text = "abedabcabcabf";
    const std::string pat = "abc";
    for (int idx : rabinKarp(text, pat)) {
        std::cout << idx << " ";
    }
    return 0;
}
def rabin_karp(text, pattern):
    """Rolling-hash (Rabin-Karp) search.

    Returns every start index of ``pattern`` in ``text``.  Average time
    is O(n + m); collisions are resolved by a direct slice comparison.
    """
    n, m = len(text), len(pattern)
    if m > n:
        return []
    base, mod = 256, 101
    # Weight of the window's leading character: base^(m-1) mod `mod`.
    high = pow(base, m - 1, mod) if m > 0 else 1
    pat_hash = 0
    win_hash = 0
    for ch_p, ch_t in zip(pattern, text):
        pat_hash = (pat_hash * base + ord(ch_p)) % mod
        win_hash = (win_hash * base + ord(ch_t)) % mod
    matches = []
    for start in range(n - m + 1):
        # Equal hashes only suggest a match; the slice check confirms it.
        if pat_hash == win_hash and text[start:start + m] == pattern:
            matches.append(start)
        if start < n - m:
            # Slide: remove the leading char, append the next one.
            # Python's % always yields a non-negative result, so no
            # separate normalization step is needed.
            win_hash = (win_hash - ord(text[start]) * high) % mod
            win_hash = (win_hash * base + ord(text[start + m])) % mod
    return matches
3️⃣ Knuth‑Morris‑Pratt (KMP) Algorithm
KMP eliminates backtracking by pre‑computing a prefix function (also called the "failure function"). This yields a guaranteed linear time O(n + m).
public class KMP {
private static int[] computeLPS(String pattern) {
int m = pattern.length();
int[] lps = new int[m];
int len = 0; // length of the previous longest prefix suffix
int i = 1;
while (i < m) {
if (pattern.charAt(i) == pattern.charAt(len)) {
len++;
lps[i] = len;
i++;
} else {
if (len != 0) {
len = lps[len - 1];
} else {
lps[i] = 0;
i++;
}
}
}
return lps;
}
public static List<Integer> search(String text, String pattern) {
int n = text.length();
int m = pattern.length();
int[] lps = computeLPS(pattern);
List<Integer> occ = new ArrayList<>();
int i = 0, j = 0; // i -> text, j -> pattern
while (i < n) {
if (text.charAt(i) == pattern.charAt(j)) {
i++; j++;
if (j == m) {
occ.add(i - j);
j = lps[j - 1];
}
} else {
if (j != 0) {
j = lps[j - 1];
} else {
i++;
}
}
}
return occ;
}
public static void main(String[] args) {
String text = "ababcabcabababd";
String pat = "ababd";
System.out.println(search(text, pat)); // [10]
}
}
4️⃣ Boyer‑Moore Algorithm
Boyer‑Moore processes the pattern from right to left and uses two heuristics—bad character and good suffix—to skip sections of the text. Its average performance is sub‑linear, especially for long patterns.
#include <bits/stdc++.h>
using namespace std;
// Bad-character table for Boyer-Moore: maps each byte value to the
// rightmost index where it occurs in `pat`, or -1 if it never occurs.
std::vector<int> buildBadCharTable(const std::string& pat) {
    constexpr int kAlphabet = 256;
    std::vector<int> last(kAlphabet, -1);
    for (std::size_t pos = 0; pos < pat.size(); ++pos) {
        // Later positions overwrite earlier ones -> rightmost occurrence.
        last[static_cast<unsigned char>(pat[pos])] = static_cast<int>(pos);
    }
    return last;
}
vector<int> boyerMoore(const string& text, const string& pat) {
int n = text.size();
int m = pat.size();
if (m == 0) return {};
vector<int> bad = buildBadCharTable(pat);
vector<int> occ;
int s = 0; // shift of the pattern with respect to text
while (s <= n - m) {
int j = m - 1;
while (j >= 0 && pat[j] == text[s + j])
j--;
if (j < 0) {
occ.push_back(s);
s += (s + m < n) ? m - bad[(unsigned char)text[s + m]] : 1;
} else {
int shift = max(1, j - bad[(unsigned char)text[s + j]]);
s += shift;
}
}
return occ;
}
int main() {
    // Demo: single match at index 4, printed space-separated.
    const std::string txt = "ABAAABCD";
    const std::string pat = "ABC";
    for (int pos : boyerMoore(txt, pat)) {
        std::cout << pos << " ";
    }
    return 0;
}
| Algorithm | Time Complexity (Avg) | Time Complexity (Worst) | Space Complexity | Typical Use‑Case |
|---|---|---|---|---|
| Naïve | O(m·n) | O(m·n) | O(1) | Very short texts |
| Rabin‑Karp | O(n+m) | O(n·m) | O(1) | Multiple pattern searches |
| KMP | O(n+m) | O(n+m) | O(m) | Repeated searches on same pattern |
| Boyer‑Moore | Sub‑linear | O(n·m) | O(m+\|Σ\|) | Large alphabets & long patterns |
Choosing the Right Algorithm
- Determine the size of the pattern and the text.
- If you need to search many patterns simultaneously, prefer Rabin‑Karp.
- For a single pattern that will be reused, pre‑compute the KMP LPS table.
- When the alphabet is large and patterns are long, Boyer‑Moore often wins.
- In resource‑constrained environments, the naïve method may be acceptable for tiny inputs.
The best algorithm is the one that fits the constraints of your problem, not the one that is theoretically fastest on paper.
Q: Can Rabin‑Karp be used for exact matching only?
A: Rabin‑Karp is primarily an exact‑matching algorithm, but with a carefully chosen hash function it can be extended to approximate matching or plagiarism detection.
Q: Why does Boyer‑Moore work better on large alphabets?
A: A larger alphabet increases the chance that a mismatched character will cause a large bad‑character shift, allowing the algorithm to skip more characters.
Q: Is KMP always faster than the naïve method?
A: KMP has linear worst‑case time, so for large texts it outperforms the naïve algorithm. However, for very short patterns the overhead of building the LPS table may outweigh the benefit.
Q. What is the worst‑case time complexity of the Boyer‑Moore algorithm?
- O(n)
- O(m)
- O(n·m)
- O(log n)
Answer: O(n·m)
In pathological cases (e.g., repeated characters), Boyer‑Moore may degrade to O(n·m).
Q. Which data structure does the KMP algorithm rely on?
- Suffix tree
- Prefix (LPS) array
- Hash table
- Trie
Answer: Prefix (LPS) array
KMP pre‑computes the longest proper prefix which is also a suffix (LPS) for the pattern.