Find the Longest Common Substring using Binary search and Rolling Hash - Coding

Given two strings X and Y, the task is to find the length of the longest common substring.

Examples:

Input: X = “GeeksforGeeks”, y = “GeeksQuiz”
Output: 5
Explanation: The longest common substring is “Geeks” and is of length 5.

Input: X = “abcdxyz”, y = “xyzabcd”
Output: 4
Explanation: The longest common substring is “abcd” and is of length 4.

Input: X = “zxabcdezy”, y = “yzabcdezx”
Output: 6
Explanation: The longest common substring is “abcdez” and is of length 6.

Longest Common Substring using Dynamic Programming:

This problem can be solved using dynamic programming in O(len(X) * len(Y)), see this. In this article we are going to discuss about an efficient approach.

Longest Common Substring using Binary Search and Rolling Hash

Pre-requisites:

Observation:

If there is a common substring of length K in both the strings, then there will be common substrings of length 0, 1, …, K – 1. Hence, binary search on answer can be applied.\

Follow the below steps to implement the idea:

Smallest possible answer(low) = 0 and largest possible answer(high) = min(len(X), len(Y)), Range of binary search will be [0, min(len(X), len(Y))].
- For every mid, check if there exists a common substring of length mid, if exists then update low, else update high.
- To check the existence of a common substring of length K, Polynomial rolling hash function can be used.
  - Iterate over all the windows of size K in string X and string Y and get the hash.
  - If there is a common hash return True, else return False.

Below is the implementation of this approach.

C++

#include <iostream>
#include <cmath>
#include <vector>
 
class ComputeHash {
private:
    std::vector<long> hash;
    std::vector<long> invMod;
    long mod;
    long p;
 
public:
    ComputeHash(std::string s, long p, long mod) {
        int n = s.length();
        this->hash.resize(n);
        this->invMod.resize(n);
        this->mod = mod;
        this->p = p;
 
        long pPow = 1;
        long hashValue = 0;
 
        for (int i = 0; i < n; i++) {
            char c = s[i];
            c = static_cast<char>(c - 'A' + 1);
            hashValue = (hashValue + c * pPow) % this->mod;
            this->hash[i] = hashValue;
            this->invMod[i] = static_cast<long>(std::pow(pPow, this->mod - 2)) % this->mod;
            pPow = (pPow * this->p) % this->mod;
        }
    }
 
    long getHash(int l, int r) {
        if (l == 0) {
            return this->hash[r];
        }
 
        long window = (this->hash[r] - this->hash[l - 1] + this->mod) % this->mod;
        return (window * this->invMod[l]) % this->mod;
    }
};
 
bool exists(int k, std::string X, std::string Y, ComputeHash &hashX1,
            ComputeHash &hashX2, ComputeHash &hashY1, ComputeHash &hashY2) {
    for (int i = 0; i <= X.length() - k; i++) {
        for (int j = 0; j <= Y.length() - k; j++) {
            if (X.substr(i, k) == Y.substr(j, k)) {
                return true;
            }
        }
    }
    return false;
}
 
int longestCommonSubstr(std::string X, std::string Y) {
    int n = X.length();
    int m = Y.length();
 
    long p1 = 31;
    long p2 = 37;
    long m1 = static_cast<long>(std::pow(10, 9) + 9);
    long m2 = static_cast<long>(std::pow(10, 9) + 7);
 
    // Initialize two hash objects
    // with different p1, p2, m1, m2
    // to reduce collision
    ComputeHash hashX1(X, p1, m1);
    ComputeHash hashX2(X, p2, m2);
 
    ComputeHash hashY1(Y, p1, m1);
    ComputeHash hashY2(Y, p2, m2);
 
    // Function that returns the existence
    // of a common substring of length k
    int low = 0, high = std::min(n, m);
    int answer = 0;
 
    while (low <= high) {
        int mid = (low + high) / 2;
        if (exists(mid, X, Y, hashX1, hashX2, hashY1, hashY2)) {
            answer = mid;
            low = mid + 1;
        } else {
            high = mid - 1;
        }
    }
 
    return answer;
}
 
 
 
int main() {
    std::string X = "GeeksforGeeks";
    std::string Y = "GeeksQuiz";
    std::cout << longestCommonSubstr(X, Y) << std::endl;
 
    return 0;
}

Java

import java.util.HashSet;
import java.util.Set;
 
class ComputeHash {
    private long[] hash;
    private long[] invMod;
    private long mod;
    private long p;
 
    // Generates hash in O(n(log(n)))
    public ComputeHash(String s, long p, long mod) {
        int n = s.length();
        this.hash = new long[n];
        this.invMod = new long[n];
        this.mod = mod;
        this.p = p;
 
        long pPow = 1;
        long hashValue = 0;
 
        for (int i = 0; i < n; i++) {
            char c = s.charAt(i);
            c = (char) (c - 'A' + 1);
            hashValue = (hashValue + c * pPow) % this.mod;
            this.hash[i] = hashValue;
            this.invMod[i] = (long)(Math.pow(pPow, this.mod - 2) % this.mod);
            pPow = (pPow * this.p) % this.mod;
        }
    }
 
    // Return hash of a window in O(1)
    public long getHash(int l, int r) {
        if (l == 0) {
            return this.hash[r];
        }
 
        long window = (this.hash[r] - this.hash[l - 1]) % this.mod;
        return (window * this.invMod[l]) % this.mod;
    }
}
 
public class Main {
    // Function to get the longest common substring
    public static int longestCommonSubstr(String X, String Y) {
    int n = X.length();
    int m = Y.length();
 
    long p1 = 31;
    long p2 = 37;
    long m1 = (long) (Math.pow(10, 9) + 9);
    long m2 = (long) (Math.pow(10, 9) + 7);
 
    // Initialize two hash objects
    // with different p1, p2, m1, m2
    // to reduce collision
    ComputeHash hashX1 = new ComputeHash(X, p1, m1);
    ComputeHash hashX2 = new ComputeHash(X, p2, m2);
 
    ComputeHash hashY1 = new ComputeHash(Y, p1, m1);
    ComputeHash hashY2 = new ComputeHash(Y, p2, m2);
 
    // Function that returns the existence
    // of a common substring of length k
    int low = 0, high = Math.min(n, m);
    int answer = 0;
    while (low <= high) {
        int mid = (low + high) / 2;
        if (exists(mid, X, Y)) {
        answer = mid;
        low = mid + 1;
    } else {
        high = mid - 1;
    }
 
    }
    return answer;
}
    private static boolean exists(int k, String X, String Y) {
    for (int i = 0; i <= X.length() - k; i++) {
        for (int j = 0; j <= Y.length() - k; j++) {
            if (X.substring(i, i + k).equals(Y.substring(j, j + k))) {
                return true;
            }
        }
    }
    return false;
}
 
    public static void main(String[] args) {
    String X = "GeeksforGeeks";
    String Y = "GeeksQuiz";
    System.out.println(longestCommonSubstr(X, Y));
}
}

Python3

# Python code to implement the approach
 
# Function to implement rolling hash
class ComputeHash:
 
    # Generates hash in O(n(log(n)))
    def __init__(self, s, p, mod):
        n = len(s)
        self.hash = [0] * n
        self.inv_mod = [0] * n
        self.mod = mod
        self.p = p
 
        p_pow = 1
        hash_value = 0
 
        for i in range(n):
            c = ord(s[i]) - 65 + 1
            hash_value = (hash_value + c * p_pow) % self.mod
            self.hash[i] = hash_value
            self.inv_mod[i] = pow(p_pow, self.mod - 2, self.mod)
            p_pow = (p_pow * self.p) % self.mod
 
    # Return hash of a window in O(1)
    def get_hash(self, l, r):
 
        if l == 0:
            return self.hash[r]
 
        window = (self.hash[r] - self.hash[l - 1]) % self.mod
        return (window * self.inv_mod[l]) % self.mod
 
# Function to get the longest common substring
def longestCommonSubstr(X, Y, n, m):
 
    p1, p2 = 31, 37
    m1, m2 = pow(10, 9) + 9, pow(10, 9) + 7
 
    # Initialize two hash objects
    # with different p1, p2, m1, m2
    # to reduce collision
    hash_X1 = ComputeHash(X, p1, m1)
    hash_X2 = ComputeHash(X, p2, m2)
 
    hash_Y1 = ComputeHash(Y, p1, m1)
    hash_Y2 = ComputeHash(Y, p2, m2)
 
    # Function that returns the existence
    # of a common substring of length k
    def exists(k):
 
        if k == 0:
            return True
 
        st = set()
         
        # Iterate on X and get hash tuple
        # of all windows of size k
        for i in range(n - k + 1):
            h1 = hash_X1.get_hash(i, i + k - 1)
            h2 = hash_X2.get_hash(i, i + k - 1)
 
            cur_window_hash = (h1, h2)
             
            # Put the hash tuple in the set
            st.add(cur_window_hash)
 
        # Iterate on Y and get hash tuple
        # of all windows of size k
        for i in range(m - k + 1):
            h1 = hash_Y1.get_hash(i, i + k - 1)
            h2 = hash_Y2.get_hash(i, i + k - 1)
 
            cur_window_hash = (h1, h2)
             
            # If hash exists in st return True
            if cur_window_hash in st:
                return True
        return False
 
    # Binary Search on length
    answer = 0
    low, high = 0, min(n, m)
 
    while low <= high:
        mid = (low + high) // 2
 
        if exists(mid):
            answer = mid
            low = mid + 1
        else:
            high = mid - 1
 
    return answer
 
 
# Driver Code
if __name__ == '__main__':
    X = 'GeeksforGeeks'
    Y = 'GeeksQuiz'
    print(longestCommonSubstr(X, Y, len(X), len(Y)))

C#

using System;
using System.Collections.Generic;
 
class ComputeHash
{
    private List<long> hash;
    private List<long> invMod;
    private long mod;
    private long p;
 
    // Constructor to initialize hash values for the given string
    public ComputeHash(string s, long p, long mod)
    {
        int n = s.Length;
        this.hash = new List<long>(n);
        this.invMod = new List<long>(n);
        this.mod = mod;
        this.p = p;
 
        long pPow = 1;
        long hashValue = 0;
 
        for (int i = 0; i < n; i++)
        {
            char c = s[i];
            // Convert character to numeric value
            c = (char)(c - 'A' + 1);
            hashValue = (hashValue + c * pPow) % this.mod;
            this.hash.Add(hashValue);
            // Compute modular inverse
            this.invMod.Add(ModularInverse(pPow, this.mod));
            pPow = (pPow * this.p) % this.mod;
        }
    }
 
    // Helper function to compute modular inverse using extended Euclidean algorithm
    private long ModularInverse(long a, long m)
    {
        long m0 = m, t, q;
        long x0 = 0, x1 = 1;
 
        if (m == 1)
            return 0;
 
        while (a > 1)
        {
            q = a / m;
            t = m;
            m = a % m;
            a = t;
            t = x0;
            x0 = x1 - q * x0;
            x1 = t;
        }
 
        if (x1 < 0)
            x1 += m0;
 
        return x1;
    }
 
    // Function to get hash value for a substring [l, r]
    public long GetHash(int l, int r)
    {
        if (l == 0)
        {
            return this.hash[r];
        }
 
        long window = (this.hash[r] - this.hash[l - 1] + this.mod) % this.mod;
        return (window * this.invMod[l]) % this.mod;
    }
}
 
class LongestCommonSubstring
{
    // Function to check if a common substring of length k exists
    private static bool Exists(int k, string X, string Y, 
                               ComputeHash hashX1, ComputeHash hashX2, 
                               ComputeHash hashY1, ComputeHash hashY2)
    {
        for (int i = 0; i <= X.Length - k; i++)
        {
            for (int j = 0; j <= Y.Length - k; j++)
            {
                if (X.Substring(i, k) == Y.Substring(j, k))
                {
                    return true;
                }
            }
        }
        return false;
    }
 
    // Function to find the length of the longest common substring of X and Y
    private static int LongestCommonSubstr(string X, string Y)
    {
        int n = X.Length;
        int m = Y.Length;
 
        long p1 = 31;
        long p2 = 37;
        long m1 = (long)Math.Pow(10, 9) + 9;
        long m2 = (long)Math.Pow(10, 9) + 7;
 
        // Initialize two hash objects with different p1, p2, m1, m2 to reduce collision
        ComputeHash hashX1 = new ComputeHash(X, p1, m1);
        ComputeHash hashX2 = new ComputeHash(X, p2, m2);
 
        ComputeHash hashY1 = new ComputeHash(Y, p1, m1);
        ComputeHash hashY2 = new ComputeHash(Y, p2, m2);
 
        // Binary search to find the length of the longest common substring
        int low = 0, high = Math.Min(n, m);
        int answer = 0;
 
        while (low <= high)
        {
            int mid = (low + high) / 2;
            if (Exists(mid, X, Y, hashX1, hashX2, hashY1, hashY2))
            {
                answer = mid;
                low = mid + 1;
            }
            else
            {
                high = mid - 1;
            }
        }
 
        return answer;
    }
 
    static void Main()
    {
        string X = "GeeksforGeeks";
        string Y = "GeeksQuiz";
        Console.WriteLine(LongestCommonSubstr(X, Y));
 
        // Output: 5
    }
}

Javascript

// javascript code to implement the approach
class ComputeHash {
  constructor(mod) {
    this.mod = mod;
  }
  compute(s) {
    let hashValue = 0n;
    let pPow = 1n;
    for (let i = 0; i < s.length; i++) {
      let c = BigInt(s.charCodeAt(i) - "A".charCodeAt(0) + 1);
      hashValue = (hashValue + c * pPow) % this.mod;
      pPow = pPow * 26n % this.mod;
    }
    return hashValue;
  }
}
 
function longestCommonSubstr(s, t) {
  const mod = BigInt(10**9+7);
  const p1 = new ComputeHash(mod);
  const p2 = new ComputeHash(mod);
  let left = 0, right = Math.min(s.length, t.length);
  while (left < right) {
    const mid = Math.floor((left + right + 1) / 2);
    let found = false;
    const set = new Set();
    for (let i = 0; i + mid <= s.length; i++) {
      const hashValue = p1.compute(s.substring(i, i + mid));
      set.add(hashValue);
    }
    for (let i = 0; i + mid <= t.length; i++) {
      const hashValue = p2.compute(t.substring(i, i + mid));
      if (set.has(hashValue)) {
        found = true;
        break;
      }
    }
    if (found) {
      left = mid;
    } else {
      right = mid - 1;
    }
  }
  return left;
}
 
console.log(longestCommonSubstr("ABABAB", "BABABA"));  // expected output: 3
 
//code is implemented by chetanbargal

Output

Time Complexity: O(n * log(m1)) + O(m * log((m1)) + O((n + m) * log(min(n, m)))

Generating hash object takes O(n*log(m1)), where n is the length of string and m1 = pow(10, 9) + 7.
Binary search takes O(log(min(n, m))), where n, m are the lengths of both strings.
Hash of a window takes O(1) time.
Exist function takes O(n + m) time.

Auxiliary Space: O(n + m)

Reffered: https://www.geeksforgeeks.org

Strings

Related
Pattern of Strings
Display the Longest Name
Minimum number of increment operations required to make two Strings equal
Minimize String length by deleting Substring of same character when K flips are allowed
Minimize removal from front or end to make the Binary String at equilibrium

Type:	Geek
Category:	Coding
Sub Category:	Tutorial
Uploaded by:	Admin
Views:	15