2 * lcpit_matchfinder_templates.h
4 * This file is included by lcpit_matchfinder.c.
9 * The author dedicates this file to the public domain.
10 * You can do whatever you want with this file.
14 * In normal mode, we can pack a buffer position and a LCP value into a 32-bit
15 * number. In huge mode, we can't.
18 # define GET_SA_ENTRY(r) (SA[r])
19 # define GET_LCP_ENTRY(r) (LCP[r])
20 # define SET_LCP_ENTRY(r, val) (LCP[r] = (val))
21 # define UNVISITED_TAG HUGE_UNVISITED_TAG
23 # define GET_SA_ENTRY(r) (SA_and_LCP[r] & SA_and_LCP_POS_MASK)
24 # define GET_LCP_ENTRY(r) (SA_and_LCP[r] >> SA_and_LCP_LCP_SHIFT)
25 # define SET_LCP_ENTRY(r, val) (SA_and_LCP[r] |= (val) << SA_and_LCP_LCP_SHIFT)
26 # define UNVISITED_TAG NORMAL_UNVISITED_TAG
30 * Build the LCP (Longest Common Prefix) array in linear time.
32 * LCP[r] will be the length of the longest common prefix between the suffixes
33 * with positions SA[r - 1] and SA[r]. LCP[0] will be undefined.
35 * Algorithm taken from Kasai et al. (2001), but modified slightly:
37 * - For decreased memory usage and improved memory locality, pack the two
38 * logically distinct SA and LCP arrays into a single array SA_and_LCP.
40 * - With bytes there is no realistic way to reserve a unique symbol for
41 * end-of-buffer, so use explicit checks for end-of-buffer.
43 * - If a LCP value is less than the minimum match length, then store 0. This
44 * avoids having to do comparisons against the minimum match length later.
46 * - If a LCP value is greater than the "nice match length", then store the
47 * "nice match length". This caps the number of bits needed to store each
48 * LCP value, and this caps the depth of the LCP-interval tree, without
49 * usually hurting the compression ratio too much.
53 * Kasai et al. 2001. Linear-Time Longest-Common-Prefix Computation in
54 * Suffix Arrays and Its Applications. CPM '01 Proceedings of the 12th
55 * Annual Symposium on Combinatorial Pattern Matching pp. 181-192.
59 build_LCP_huge(u32 LCP[restrict], const u32 SA[restrict], const u32 ISA[restrict],
60 const u8 T[restrict], u32 n, u32 min_lcp, u32 max_lcp)
63 build_LCP_normal(u32 SA_and_LCP[restrict], const u32 ISA[restrict],
64 const u8 T[restrict], u32 n, u32 min_lcp, u32 max_lcp)
68 for (u32 i = 0; i < n; i++) {
71 u32 j = GET_SA_ENTRY(r - 1);
72 u32 lim = min(n - i, n - j);
73 while (h < lim && T[i + h] == T[j + h])
76 if (stored_lcp < min_lcp)
78 else if (stored_lcp > max_lcp)
80 SET_LCP_ENTRY(r, stored_lcp);
88 * Use the suffix array accompanied with the longest-common-prefix array --- in
89 * other words, the "enhanced suffix array" --- to simulate a bottom-up
90 * traversal of the corresponding suffix tree, or equivalently the "lcp-interval
91 * tree", as described in Abouelhoda et al. (2004).
93 * While doing the traversal, create a table 'intervals' that contains data for
94 * each lcp-interval, specifically the lcp value of that interval, and the index
95 * of the superinterval.
97 * Also while doing the traversal, create a table 'pos_data' that contains a
98 * mapping from suffix index to the deepest lcp-interval containing it.
100 * The result is that we will later be able to do match-finding at a given
101 * position by looking up that position in 'pos_data' to get the deepest
102 * lcp-interval containing the corresponding suffix, then proceeding to the
103 * superintervals. See lcpit_advance_one_byte() for more details.
105 * Note: We limit the depth of the lcp-interval tree by capping the lcp at
106 * LCP_MAX. This can cause a sub-tree of intervals with lcp greater than
107 * LCP_MAX to be collapsed into a single interval with lcp LCP_MAX. This avoids
108 * degenerate cases and does not hurt match-finding very much, since if we find
109 * a match of length LCP_MAX and extend it as far as possible, that's usually
110 * good enough because that region of the input must already be highly
115 * M.I. Abouelhoda, S. Kurtz, E. Ohlebusch. 2004. Replacing Suffix Trees
116 * With Enhanced Suffix Arrays. Journal of Discrete Algorithms Volume 2
117 * Issue 1, March 2004, pp. 53-86.
119 * G. Chen, S.J. Puglisi, W.F. Smyth. 2008. Lempel-Ziv Factorization
120 * Using Less Time & Space. Mathematics in Computer Science June 2008,
121 * Volume 1, Issue 4, pp. 605-623.
123 * Kasai et al. Linear-Time Longest-Common-Prefix Computation in Suffix
124 * Arrays and Its Applications. 2001. CPM '01 Proceedings of the 12th
125 * Annual Symposium on Combinatorial Pattern Matching pp. 181-192.
129 build_LCPIT_huge(const u32 SA[restrict], u32 LCP[], u64 intervals[],
130 u32 pos_data[restrict], u32 n)
133 build_LCPIT_normal(const u32 SA_and_LCP[restrict], u32 intervals[restrict],
134 u32 pos_data[restrict], u32 n)
137 u32 next_interval_idx = 0;
138 u32 open_intervals[LCP_MAX + 1];
139 u32 *top = open_intervals;
140 u32 prev_pos = GET_SA_ENTRY(0);
142 /* The interval with lcp=0 covers the entire array. It remains open
144 *top = next_interval_idx;
145 intervals[next_interval_idx] = 0;
148 for (u32 r = 1; r < n; r++) {
149 u32 next_pos = GET_SA_ENTRY(r);
150 u32 next_lcp = GET_LCP_ENTRY(r);
151 u32 top_lcp = intervals[*top];
153 if (next_lcp == top_lcp) {
154 /* continuing the deepest open interval */
155 pos_data[prev_pos] = *top;
156 } else if (next_lcp > top_lcp) {
157 /* opening a new interval */
158 intervals[next_interval_idx] = next_lcp;
159 *++top = next_interval_idx;
160 pos_data[prev_pos] = next_interval_idx;
163 /* closing the deepest open interval */
164 pos_data[prev_pos] = *top;
166 u32 closed_interval_idx = *top;
167 u32 superinterval_idx = *--top;
168 u32 superinterval_lcp = intervals[superinterval_idx];
170 if (next_lcp == superinterval_lcp) {
171 /* continuing the superinterval */
172 intervals[closed_interval_idx] |=
173 (superinterval_idx << LCP_BITS) |
176 } else if (next_lcp > superinterval_lcp) {
177 /* creating a new interval that is a
178 * superinterval of the one being
179 * closed, but still a subinterval of
180 * its superinterval */
181 intervals[next_interval_idx] = next_lcp;
182 *++top = next_interval_idx;
183 intervals[closed_interval_idx] |=
184 (next_interval_idx << LCP_BITS) |
189 /* also closing the superinterval */
190 intervals[closed_interval_idx] |=
191 (superinterval_idx << LCP_BITS) |
199 /* close any still-open intervals */
200 pos_data[prev_pos] = *top;
201 while (top > open_intervals) {
202 u32 closed_interval_idx = *top;
203 u32 superinterval_idx = *--top;
204 intervals[closed_interval_idx] |=
205 (superinterval_idx << LCP_BITS) | UNVISITED_TAG;
210 * Advance the LCP-interval tree matchfinder by one byte.
212 * If @record_matches is true, then matches are recorded in the @matches array,
213 * and the return value is the number of matches found. Otherwise, @matches is
214 * ignored and the return value is always 0.
218 lcpit_advance_one_byte_huge
220 lcpit_advance_one_byte_normal
222 (struct lcpit_matchfinder *mf, struct lz_match * restrict matches,
225 const u32 cur_pos = mf->cur_pos++;
226 u32 * const pos_data = mf->pos_data;
228 u64 * const intervals = mf->intervals64;
230 u32 * const intervals = mf->intervals;
234 u32 interval, next_interval;
235 u32 cur_match, next_match;
237 /* Look up the deepest lcp-interval containing the current suffix. */
238 interval = pos_data[cur_pos];
240 /* Prefetch the deepest lcp-interval containing the next suffix. */
241 prefetch(&intervals[pos_data[cur_pos + 1]]);
243 /* Since the current position is greater than any position previously
244 * searched, set the "lcp interval of the next match" for this suffix to
245 * 0. This is the index of the root interval, and this indicates that
246 * there is no next match. */
247 pos_data[cur_pos] = 0;
249 /* Ascend the lcp-interval tree until we reach an lcp-interval that has
250 * already been visited. */
252 while (intervals[interval] & UNVISITED_TAG) {
254 /* Visiting this lcp-interval for the first time. Therefore,
255 * there are no matches with length equal to the lcp of this
258 /* Extract the LCP and superinterval reference. */
260 lcp = intervals[interval] & LCP_MASK;
262 /* If the LCP is shorter than the minimum length of matches to
263 * be produced, we're done, since the LCP will only ever get
264 * shorter from here. This also prevents ascending above the
265 * root of the lcp-interval tree, since the root is guaranteed
266 * to be a 0-interval. */
270 next_interval = (intervals[interval] & ~UNVISITED_TAG) >> LCP_BITS;
272 /* Set the position of the most-recently-seen suffix within this
273 * lcp-interval. Since this is the first visitation of this
274 * lcp-interval, this is simply the current suffix.
276 * Note that this overwrites the superinterval reference which
277 * was previously included in this lcp-interval data slot.
278 * Further visitations of this lcp-interval will detect that it
279 * is already visited and will follow the chain of
280 * most-recently-seen suffixes rather than ascend the tree
282 intervals[interval] = (cur_pos << LCP_BITS) | lcp;
284 /* Ascend to the superinterval of this lcp-interval. */
285 interval = next_interval;
288 /* We've already visited the current lcp-interval. */
290 /* Extract the LCP of this lcp-interval. */
291 lcp = intervals[interval] & LCP_MASK;
293 /* Extract the current match for this lcp-interval. This usually is the
294 * most-recently-seen suffix within this lcp-interval, but it may be
296 cur_match = intervals[interval] >> LCP_BITS;
299 /* If the LCP is shorter than the minimum length of matches to
300 * be produced, we're done, since the LCP will only ever get
301 * shorter from here. This also prevents ascending above the
302 * root of the lcp-interval tree, since the root is guaranteed
303 * to be a 0-interval. */
307 /* Advance the current match until the lcp of the *next* match
308 * is lower than the current lcp. When this is true we know
309 * that the current match is up to date (lowest offset /
310 * greatest position for that lcp). */
312 next_match = cur_match;
314 next_interval = pos_data[next_match];
315 next_lcp = intervals[next_interval] & LCP_MASK;
316 cur_match = next_match;
317 next_match = intervals[next_interval] >> LCP_BITS;
318 } while (next_lcp >= lcp);
320 /* Link the current position into the match chain, discarding
321 * any skipped matches. */
322 intervals[interval] = (cur_pos << LCP_BITS) | lcp;
323 pos_data[cur_match] = interval;
325 if (record_matches) {
326 /* Record the match. */
327 matches[num_matches].length = lcp;
328 matches[num_matches].offset = cur_pos - cur_match;
332 /* Advance to the next match. */
333 interval = next_interval;
335 cur_match = next_match;