2 * divsufsort.c for libdivsufsort
3 * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
5 * Permission is hereby granted, free of charge, to any person
6 * obtaining a copy of this software and associated documentation
7 * files (the "Software"), to deal in the Software without
8 * restriction, including without limitation the rights to use,
9 * copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following
14 * The above copyright notice and this permission notice shall be
15 * included in all copies or substantial portions of the Software.
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
19 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
21 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
22 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
24 * OTHER DEALINGS IN THE SOFTWARE.
27 #include "divsufsort_private.h"
32 /*- Private Functions -*/
34 /* Sorts suffixes of type B*. */
37 sort_typeBstar(const sauchar_t *T, saidx_t *SA,
38 saidx_t *bucket_A, saidx_t *bucket_B,
40 saidx_t *PAb, *ISAb, *buf;
41 saidx_t i, j, k, t, m, bufsize;
44 /* Initialize bucket arrays. */
45 for(i = 0; i < BUCKET_A_SIZE; ++i) { bucket_A[i] = 0; }
46 for(uint32_t i = 0; i < BUCKET_B_SIZE; ++i) { bucket_B[i] = 0; }
48 /* Count the number of occurrences of the first one or two characters of each
49 type A, B and B* suffix. Moreover, store the beginning position of all
50 type B* suffixes into the array SA. */
52 for(i = n - 1, m = n, c0 = T[n - 1]; 0 <= i;) {
54 do { ++BUCKET_A(c1 = c0); } while((0 <= --i) && ((c0 = T[i]) >= c1));
57 ++BUCKET_BSTAR(c0, c1);
60 for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) <= c1); --i, c1 = c0) {
68 A type B* suffix is lexicographically smaller than a type B suffix that
69 begins with the same first two characters.
72 /* Calculate the index of start/end point of each bucket. */
73 for(c0 = 0, i = 0, j = 0; c0 < ALPHABET_SIZE; ++c0) {
75 BUCKET_A(c0) = i + j; /* start point */
76 i = t + BUCKET_B(c0, c0);
77 for(c1 = c0 + 1; c1 < ALPHABET_SIZE; ++c1) {
78 j += BUCKET_BSTAR(c0, c1);
79 BUCKET_BSTAR(c0, c1) = j; /* end point */
80 i += BUCKET_B(c0, c1);
85 /* Sort the type B* suffixes by their first two characters. */
86 PAb = SA + n - m; ISAb = SA + m;
87 for(i = m - 2; 0 <= i; --i) {
88 t = PAb[i], c0 = T[t], c1 = T[t + 1];
89 SA[--BUCKET_BSTAR(c0, c1)] = i;
91 t = PAb[m - 1], c0 = T[t], c1 = T[t + 1];
92 SA[--BUCKET_BSTAR(c0, c1)] = m - 1;
94 /* Sort the type B* substrings using sssort. */
95 buf = SA + m, bufsize = n - (2 * m);
96 for(c0 = ALPHABET_SIZE - 2, j = m; 0 < j; --c0) {
97 for(c1 = ALPHABET_SIZE - 1; c0 < c1; j = i, --c1) {
98 i = BUCKET_BSTAR(c0, c1);
100 sssort(T, PAb, SA + i, SA + j,
101 buf, bufsize, 2, n, *(SA + i) == (m - 1));
106 /* Compute ranks of type B* substrings. */
107 for(i = m - 1; 0 <= i; --i) {
110 do { ISAb[SA[i]] = i; } while((0 <= --i) && (0 <= SA[i]));
112 if(i <= 0) { break; }
115 do { ISAb[SA[i] = ~SA[i]] = j; } while(SA[--i] < 0);
119 /* Construct the inverse suffix array of type B* suffixes using trsort. */
120 trsort(ISAb, SA, m, 1);
122 /* Set the sorted order of tyoe B* suffixes. */
123 for(i = n - 1, j = m, c0 = T[n - 1]; 0 <= i;) {
124 for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) >= c1); --i, c1 = c0) { }
127 for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) <= c1); --i, c1 = c0) { }
128 SA[ISAb[--j]] = ((t == 0) || (1 < (t - i))) ? t : ~t;
132 /* Calculate the index of start/end point of each bucket. */
133 BUCKET_B(ALPHABET_SIZE - 1, ALPHABET_SIZE - 1) = n; /* end point */
134 for(c0 = ALPHABET_SIZE - 2, k = m - 1; 0 <= c0; --c0) {
135 i = BUCKET_A(c0 + 1) - 1;
136 for(c1 = ALPHABET_SIZE - 1; c0 < c1; --c1) {
137 t = i - BUCKET_B(c0, c1);
138 BUCKET_B(c0, c1) = i; /* end point */
140 /* Move all type B* suffixes to the correct position. */
141 for(i = t, j = BUCKET_BSTAR(c0, c1);
143 --i, --k) { SA[i] = SA[k]; }
145 BUCKET_BSTAR(c0, c0 + 1) = i - BUCKET_B(c0, c0) + 1; /* start point */
146 BUCKET_B(c0, c0) = i; /* end point */
153 /* Constructs the suffix array by using the sorted order of type B* suffixes. */
156 construct_SA(const sauchar_t *T, saidx_t *SA,
157 saidx_t *bucket_A, saidx_t *bucket_B,
158 saidx_t n, saidx_t m) {
164 /* Construct the sorted order of type B suffixes by using
165 the sorted order of type B* suffixes. */
166 for(c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) {
167 /* Scan the suffix array from right to left. */
168 for(i = SA + BUCKET_BSTAR(c1, c1 + 1),
169 j = SA + BUCKET_A(c1 + 1) - 1, k = NULL, c2 = -1;
174 assert(((s + 1) < n) && (T[s] <= T[s + 1]));
175 assert(T[s - 1] <= T[s]);
178 if((0 < s) && (T[s - 1] > c0)) { s = ~s; }
180 if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; }
181 k = SA + BUCKET_B(c2 = c0, c1);
186 assert(((s == 0) && (T[s] == c1)) || (s < 0));
193 /* Construct the suffix array by using
194 the sorted order of type B suffixes. */
195 k = SA + BUCKET_A(c2 = T[n - 1]);
196 *k++ = (T[n - 2] < c2) ? ~(n - 1) : (n - 1);
197 /* Scan the suffix array from left to right. */
198 for(i = SA, j = SA + n; i < j; ++i) {
200 assert(T[s - 1] >= T[s]);
202 if((s == 0) || (T[s - 1] < c0)) { s = ~s; }
204 BUCKET_A(c2) = k - SA;
205 k = SA + BUCKET_A(c2 = c0);
217 /* Constructs the burrows-wheeler transformed string directly
218 by using the sorted order of type B* suffixes. */
221 construct_BWT(const sauchar_t *T, saidx_t *SA,
222 saidx_t *bucket_A, saidx_t *bucket_B,
223 saidx_t n, saidx_t m) {
224 saidx_t *i, *j, *k, *orig;
229 /* Construct the sorted order of type B suffixes by using
230 the sorted order of type B* suffixes. */
231 for(c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) {
232 /* Scan the suffix array from right to left. */
233 for(i = SA + BUCKET_BSTAR(c1, c1 + 1),
234 j = SA + BUCKET_A(c1 + 1) - 1, k = NULL, c2 = -1;
239 assert(((s + 1) < n) && (T[s] <= T[s + 1]));
240 assert(T[s - 1] <= T[s]);
243 if((0 < s) && (T[s - 1] > c0)) { s = ~s; }
245 if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; }
246 k = SA + BUCKET_B(c2 = c0, c1);
261 /* Construct the BWTed string by using
262 the sorted order of type B suffixes. */
263 k = SA + BUCKET_A(c2 = T[n - 1]);
264 *k++ = (T[n - 2] < c2) ? ~((saidx_t)T[n - 2]) : (n - 1);
265 /* Scan the suffix array from left to right. */
266 for(i = SA, j = SA + n, orig = SA; i < j; ++i) {
268 assert(T[s - 1] >= T[s]);
271 if((0 < s) && (T[s - 1] < c0)) { s = ~((saidx_t)T[s - 1]); }
273 BUCKET_A(c2) = k - SA;
274 k = SA + BUCKET_A(c2 = c0);
290 /*---------------------------------------------------------------------------*/
294 /* XXX Modified from original: use provided temporary space instead of
297 divsufsort(const sauchar_t *T, saidx_t *SA, saidx_t n,
298 saidx_t *bucket_A, saidx_t *bucket_B) {
316 m = sort_typeBstar(T, SA, bucket_A, bucket_B, n);
317 construct_SA(T, SA, bucket_A, bucket_B, n, m);
325 divbwt(const sauchar_t *T, sauchar_t *U, saidx_t *A, saidx_t n) {
327 saidx_t *bucket_A, *bucket_B;
330 /* Check arguments. */
331 if((T == NULL) || (U == NULL) || (n < 0)) { return -1; }
332 else if(n <= 1) { if(n == 1) { U[0] = T[0]; } return n; }
334 if((B = A) == NULL) { B = (saidx_t *)malloc((size_t)(n + 1) * sizeof(saidx_t)); }
335 bucket_A = (saidx_t *)malloc(BUCKET_A_SIZE * sizeof(saidx_t));
336 bucket_B = (saidx_t *)malloc(BUCKET_B_SIZE * sizeof(saidx_t));
338 /* Burrows-Wheeler Transform. */
339 if((B != NULL) && (bucket_A != NULL) && (bucket_B != NULL)) {
340 m = sort_typeBstar(T, B, bucket_A, bucket_B, n);
341 pidx = construct_BWT(T, B, bucket_A, bucket_B, n, m);
343 /* Copy to output string. */
345 for(i = 0; i < pidx; ++i) { U[i + 1] = (sauchar_t)B[i]; }
346 for(i += 1; i < n; ++i) { U[i] = (sauchar_t)B[i]; }
354 if(A == NULL) { free(B); }
362 divsufsort_version(void) {
363 return PROJECT_VERSION_FULL;