You are given a string of length n. If all of its substrings (not necessarily distinct) are ordered lexicographically, what is the kth smallest of them?
Examples:
Input: String = “agctagctagct” , k = 7 Output: AGC Explanation: The 7 smallest substrings in order are a, a, a, ag, ag, ag and agc.
Input: String = “bacbaa” , k = 5 Output: ac Explanation: The 5 smallest substrings in order are a, a, a, aa and ac.
Approach: To solve the problem, follow the below idea:
The problem can be solved by constructing the suffix array and longest common prefix (LCP) array of the input string. Then, we build a segment tree to efficiently count the number of substrings lexicographically smaller than or equal to a given substring. Finally, we iterate through the suffix array in reverse order, updating the segment tree with substring counts and using binary search to find the longest common prefix that satisfies the condition of having at least k substrings lexicographically smaller than or equal to the current substring, thereby identifying the kth smallest substring.
Step-by-step algorithm:
- Construct the suffix array of the given string S.
- Build the longest common prefix (LCP) array using the constructed suffix array.
- For each suffix in the suffix array, calculate the number of substrings starting from that suffix by updating a segment tree.
- Starting from the last suffix in the suffix array, iterate through each suffix:
- Update the segment tree with the count of substrings starting from the current suffix.
- Use binary search to find the longest common prefix that satisfies the condition of having at least k substrings lexicographically smaller than or equal to the current substring.
- Once the longest common prefix is found, print the substring and terminate the loop.
Below is the implementation of the algorithm:
C++
#include <bits/stdc++.h>
using namespace std;
// Define int as long long data type
#define int long long
// Define endl as newline character
#define endl '\n'
// Alias for first element in a pair
#define F first
// Alias for second element in a pair
#define S second
// Define a constant for maximum size
const int mxN = 1e5 + 5;
// Arrays for suffix array, position, temporary, and longest
// common prefix
int sa[mxN], pos[mxN], tmp[mxN], lcp[mxN];
// Variables for gap and size of the string
int gap, N;
// Input string
string S;
// Function to compare two positions in the suffix array
bool comp(int x, int y)
{
// Compare positions based on their values in the pos
// array
if (pos[x] != pos[y])
return pos[x] < pos[y];
// If pos[x] == pos[y], compare their positions with a
// gap
x += gap;
y += gap;
// If within bounds, return comparison result based on
// pos array; otherwise, return comparison of x and y
return (x < N && y < N) ? pos[x] < pos[y] : x > y;
}
// Function to construct the suffix array
void suffix()
{
// Initialize sa and pos arrays with initial values
for (int i = 0; i < N; i++)
sa[i] = i, pos[i] = S[i];
// Loop for different gap values
for (gap = 1;; gap <<= 1) {
// Sort the suffix array based on the comparison
// function comp
sort(sa, sa + N, comp);
// Update temporary array tmp
for (int i = 0; i < N - 1; i++)
tmp[i + 1] = tmp[i] + comp(sa[i], sa[i + 1]);
// Update pos array
for (int i = 0; i < N; i++)
pos[sa[i]] = tmp[i];
// If all suffixes are sorted, break the loop
if (tmp[N - 1] == N - 1)
break;
}
}
// Function to construct the longest common prefix array
void build_lcp()
{
for (int i = 0, k = 0; i < N; i++)
if (pos[i] != N - 1) {
int j = sa[pos[i] + 1];
while (S[i + k] == S[j + k])
k++;
lcp[pos[i]] = k;
if (k)
k--;
}
}
// Segment tree related functions for efficient range
// queries and updates Segment tree array
pair<int, int> seg[mxN * 10];
// Array to mark updates
int mark[mxN * 10];
// Function to push updates down the segment tree
void push(int k)
{
if (mark[k]) {
mark[k] = 0;
seg[2 * k].F = seg[2 * k + 1].F = seg[k].F / 2;
seg[2 * k].S = seg[2 * k + 1].S = 0;
mark[2 * k] = mark[2 * k + 1] = 1;
}
}
// Function to update values in the segment tree
void update(int v, int a, int b, int k, int x, int y)
{
if (b < x || a > y)
return;
if (a <= x && b >= y) {
seg[k].S += v;
return;
}
int h = min(b, y) - max(a, x) + 1;
push(k);
seg[k].F += h * v;
int d = (x + y) / 2;
update(v, a, b, 2 * k, x, d);
update(v, a, b, 2 * k + 1, d + 1, y);
}
// Function to assign values in the segment tree
int assign(int v, int a, int b, int k, int x, int y)
{
if (b < x || a > y)
return seg[k].F + (y - x + 1) * seg[k].S;
if (a <= x && b >= y) {
seg[k].F = (y - x + 1) * v;
seg[k].S = 0;
mark[k] = 1;
return seg[k].F;
}
push(k);
int d = (x + y) / 2;
seg[2 * k].S += seg[k].S, seg[2 * k + 1].S += seg[k].S;
seg[k].S = 0;
seg[k].F = assign(v, a, b, 2 * k, x, d)
+ assign(v, a, b, 2 * k + 1, d + 1, y);
return seg[k].F;
}
// Function to calculate sum in a range using segment tree
int sum(int a, int b, int k, int x, int y)
{
if (b < x || a > y)
return 0;
if (a <= x && b >= y) {
return seg[k].F + (y - x + 1) * seg[k].S;
}
push(k);
seg[k].F += (y - x + 1) * seg[k].S;
seg[2 * k].S += seg[k].S, seg[2 * k + 1].S += seg[k].S;
seg[k].S = 0;
int d = (x + y) / 2;
return sum(a, b, 2 * k, x, d)
+ sum(a, b, 2 * k + 1, d + 1, y);
}
signed main()
{
// Input string
cin >> S;
// Size of the string
N = S.size();
// Construct suffix array
suffix();
// Construct longest common prefix array
build_lcp();
// Given k value for substring extraction
int k;
cin >> k;
// Calculate the k-th smallest lexicographically suffix
// Calculate the position from the last
k = N * (N + 1) / 2 - k + 1;
// Construct a segment tree
// Calculate the size of the segment tree array
int K = 1 << (int)ceil(log2(N + 1));
// Initialize current position
int cur = 0;
for (int i = N - 1; i >= 0; i--) {
// Update the segment tree
update(1, 1, N - sa[i], 1, 0, K - 1);
// Get the previous longest common prefix
int prev = (i ? lcp[i - 1] : 0);
// Set the search range for the binary search
int l = prev + 1, r = N - sa[i];
// Initialize answer variable
int ans = -1;
// Binary search to find the longest common prefix
// that satisfies the condition
while (l <= r) {
int m = l + (r - l) / 2;
if (cur + sum(m, N - sa[i], 1, 0, K - 1) >= k) {
ans = m;
l = m + 1;
}
else
r = m - 1;
}
// If answer is found, print the substring and exit
// the loop
if (ans != -1) {
return cout << S.substr(sa[i], ans), 0;
}
// Update the current position and mark the segment
// tree
cur += sum(prev + 1, N - sa[i], 1, 0, K - 1);
assign(0, prev + 1, N - sa[i], 1, 0, K - 1);
}
}
Python
import math
# Define a constant for maximum size
mxN = 10**5 + 5
# Arrays for suffix array, position, temporary, and longest common prefix
sa = [0] * mxN
pos = [0] * mxN
tmp = [0] * mxN
lcp = [0] * mxN
# Variables for gap and size of the string
gap = 0
N = 0
# Input string
S = ""
# Function to compare two positions in the suffix array
def comp(x, y):
global pos, gap, N
# Compare positions based on their values in the pos array
if pos[x] != pos[y]:
return pos[x] < pos[y]
# If pos[x] == pos[y], compare their positions with a gap
x += gap
y += gap
# If within bounds, return comparison result based on pos array;
# otherwise, return comparison of x and y
return pos[x] < pos[y] if (x < N and y < N) else x > y
# Function to construct the suffix array
def suffix():
global sa, pos, tmp, gap, N
# Initialize sa and pos arrays with initial values
for i in range(N):
sa[i] = i
pos[i] = ord(S[i])
# Loop for different gap values
gap = 1
while True:
# Sort the suffix array based on the comparison function comp
sa.sort(key=lambda x: (pos[x], pos[x + gap] if x + gap < N else -1))
# Update temporary array tmp
tmp[0] = 0
for i in range(1, N):
tmp[i] = tmp[i - 1] + comp(sa[i - 1], sa[i])
# Update pos array
for i in range(N):
pos[sa[i]] = tmp[i]
# If all suffixes are sorted, break the loop
if tmp[N - 1] == N - 1:
break
gap *= 2
# Function to construct the longest common prefix array
def build_lcp():
global lcp, pos, N
k = 0
for i in range(N):
if pos[i] != N - 1:
j = sa[pos[i] + 1]
while i + k < N and j + k < N and S[i + k] == S[j + k]:
k += 1
lcp[pos[i]] = k
if k > 0:
k -= 1
# Segment tree related functions for efficient range queries and updates Segment tree array
seg = [(0, 0)] * (mxN * 10)
# Array to mark updates
mark = [0] * (mxN * 10)
# Function to push updates down the segment tree
def push(k):
global seg, mark
if mark[k]:
mark[k] = 0
seg[2 * k] = seg[2 * k + 1] = (seg[k][0] // 2, 0)
mark[2 * k] = mark[2 * k + 1] = 1
# Function to update values in the segment tree
def update(v, a, b, k, x, y):
global seg
if b < x or a > y:
return
if a <= x and b >= y:
seg[k] = (seg[k][0], seg[k][1] + v)
return
h = min(b, y) - max(a, x) + 1
push(k)
seg[k] = (seg[k][0] + h * v, seg[k][1])
d = (x + y) // 2
update(v, a, b, 2 * k, x, d)
update(v, a, b, 2 * k + 1, d + 1, y)
# Function to assign values in the segment tree
def assign(v, a, b, k, x, y):
global seg, mark
if b < x or a > y:
return seg[k][0] + (y - x + 1) * seg[k][1]
if a <= x and b >= y:
seg[k] = ((y - x + 1) * v, 0)
mark[k] = 1
return seg[k][0]
push(k)
d = (x + y) // 2
seg[2 * k] = (seg[k][0], seg[2 * k][1] + seg[k][1])
seg[2 * k + 1] = (seg[k][0], seg[2 * k + 1][1] + seg[k][1])
seg[k] = assign(v, a, b, 2 * k, x, d) + assign(v, a, b, 2 * k + 1, d + 1, y)
return seg[k][0]
# Function to calculate sum in a range using segment tree
def _sum(a, b, k, x, y):
global seg
if b < x or a > y:
return 0
if a <= x and b >= y:
return seg[k][0] + (y - x + 1) * seg[k][1]
push(k)
seg[k] = (seg[k][0] + (y - x + 1) * seg[k][1], seg[k][1])
seg[2 * k] = (seg[2 * k][0], seg[2 * k][1] + seg[k][1])
seg[2 * k + 1] = (seg[2 * k + 1][0], seg[2 * k + 1][1] + seg[k][1])
d = (x + y) // 2
return _sum(a, b, 2 * k, x, d) + _sum(a, b, 2 * k + 1, d + 1, y)
# Input string
S = input().strip()
# Size of the string
N = len(S)
# Construct suffix array
suffix()
# Construct longest common prefix array
build_lcp()
# Given k value for substring extraction
k = int(input())
# Calculate the k-th smallest lexicographically suffix
# Calculate the position from the last
k = N * (N + 1) // 2 - k + 1
# Construct a segment tree
# Calculate the size of the segment tree array
K = 1 << (math.ceil(math.log2(N + 1)))
# Initialize current position
cur = 0
for i in range(N - 1, -1, -1):
# Update the segment tree
update(1, 1, N - sa[i], 1, 0, K - 1)
# Get the previous longest common prefix
prev = lcp[i - 1] if i > 0 else 0
# Set the search range for the binary search
l, r = prev + 1, N - sa[i]
# Initialize answer variable
ans = -1
# Binary search to find the longest common prefix that satisfies the condition
while l <= r:
m = (l + r) // 2
if cur + _sum(m, N - sa[i], 1, 0, K - 1) >= k:
ans = m
r = m - 1
else:
l = m + 1
# If answer is found, print the substring and exit the loop
if ans != -1:
print(S[sa[i]:sa[i] + ans])
break
# Update the current position and mark the segment tree
cur += _sum(prev + 1, N - sa[i], 1, 0, K - 1)
assign(0, prev + 1, N - sa[i], 1, 0, K - 1)
JavaScript
const mxN = 1e5 + 5;
let sa = new Array(mxN), pos = new Array(mxN), tmp = new Array(mxN),
lcp = new Array(mxN);
let gap, N;
let S;
// Function to compare two positions in the suffix array
function comp(x, y) {
if (pos[x] !== pos[y]) return pos[x] < pos[y];
x += gap;
y += gap;
return (x < N && y < N) ? pos[x] < pos[y] : x > y;
}
// Function to construct the suffix array
function suffix() {
for (let i = 0; i < N; i++) {
sa[i] = i;
pos[i] = S.charCodeAt(i);
}
for (gap = 1; ; gap <<= 1) {
sa.sort((a, b) => comp(a, b) ? -1 : 1);
tmp[0] = 0;
for (let i = 0; i < N - 1; i++) {
tmp[i + 1] = tmp[i] + (comp(sa[i], sa[i + 1]) ? 1 : 0);
}
for (let i = 0; i < N; i++) {
pos[sa[i]] = tmp[i];
}
if (tmp[N - 1] === N - 1) break;
}
}
// Function to construct the longest common prefix array
function build_lcp() {
for (let i = 0, k = 0; i < N; i++) {
if (pos[i] !== N - 1) {
let j = sa[pos[i] + 1];
while (i + k < N && j + k < N && S[i + k] === S[j + k]) k++;
lcp[pos[i]] = k;
if (k) k--;
}
}
}
// Segment tree related functions for efficient range queries and updates
let seg = new Array(mxN * 10).fill([0, 0]);
let mark = new Array(mxN * 10).fill(0);
// Function to push updates down the segment tree
function push(k) {
if (mark[k]) {
mark[k] = 0;
seg[2 * k] = [seg[k][0] / 2, 0];
seg[2 * k + 1] = [seg[k][0] / 2, 0];
mark[2 * k] = mark[2 * k + 1] = 1;
}
}
// Function to update values in the segment tree
function update(v, a, b, k, x, y) {
if (b < x || a > y) return;
if (a <= x && b >= y) {
seg[k][1] += v;
return;
}
let h = Math.min(b, y) - Math.max(a, x) + 1;
push(k);
seg[k][0] += h * v;
let d = Math.floor((x + y) / 2);
update(v, a, b, 2 * k, x, d);
update(v, a, b, 2 * k + 1, d + 1, y);
}
// Function to assign values in the segment tree
function assign(v, a, b, k, x, y) {
if (b < x || a > y) return seg[k][0] + (y - x + 1) * seg[k][1];
if (a <= x && b >= y) {
seg[k][0] = (y - x + 1) * v;
seg[k][1] = 0;
mark[k] = 1;
return seg[k][0];
}
push(k);
let d = Math.floor((x + y) / 2);
seg[2 * k][1] += seg[k][1];
seg[2 * k + 1][1] += seg[k][1];
seg[k][1] = 0;
seg[k][0] = assign(v, a, b, 2 * k, x, d) +
assign(v, a, b, 2 * k + 1, d + 1, y);
return seg[k][0];
}
// Function to calculate sum in a range using segment tree
function sum(a, b, k, x, y) {
if (b < x || a > y) return 0;
if (a <= x && b >= y) return seg[k][0] + (y - x + 1) * seg[k][1];
push(k);
seg[k][0] += (y - x + 1) * seg[k][1];
seg[2 * k][1] += seg[k][1];
seg[2 * k + 1][1] += seg[k][1];
seg[k][1] = 0;
let d = Math.floor((x + y) / 2);
return sum(a, b, 2 * k, x, d) + sum(a, b, 2 * k + 1, d + 1, y);
}
function main() {
// Input string
S = prompt("Enter the string:");
// Size of the string
N = S.length;
// Construct suffix array
suffix();
// Construct longest common prefix array
build_lcp();
// Given k value for substring extraction
let k = parseInt(prompt("Enter the value of k:"));
// Calculate the k-th smallest lexicographically suffix
k = N * (N + 1) / 2 - k + 1;
// Construct a segment tree
let K = 1 << Math.ceil(Math.log2(N + 1));
let cur = 0;
for (let i = N - 1; i >= 0; i--) {
// Update the segment tree
update(1, 1, N - sa[i], 1, 0, K - 1);
// Get the previous longest common prefix
let prev = (i ? lcp[i - 1] : 0);
// Set the search range for the binary search
let l = prev + 1, r = N - sa[i];
// Initialize answer variable
let ans = -1;
// Binary search to find the longest common prefix that
// satisfies the condition
while (l <= r) {
let m = l + Math.floor((r - l) / 2);
if (cur + sum(m, N - sa[i], 1, 0, K - 1) >= k) {
ans = m;
l = m + 1;
} else {
r = m - 1;
}
}
// If answer is found, print the substring and exit the loop
if (ans !== -1) {
console.log(S.substring(sa[i], sa[i] + ans));
return;
}
// Update the current position and mark the segment tree
cur += sum(prev + 1, N - sa[i], 1, 0, K - 1);
assign(0, prev + 1, N - sa[i], 1, 0, K - 1);
}
}
main();
// This code is contributed by Shivam Gupta
Time Complexity: O(N log N ), where N is the length of the input string. Auxiliary Space: O(N), where N is the length of the input string.
|