+/******************************************************************************/
+
+/*
+ * Block splitting algorithm. The problem is to decide when it is worthwhile to
+ * start a new block with new entropy codes. There is a theoretically optimal
+ * solution: recursively consider every possible block split, considering the
+ * exact cost of each block, and choose the minimum cost approach. But this is
+ * far too slow. Instead, as an approximation, we can count symbols and after
+ * every N symbols, compare the expected distribution of symbols based on the
+ * previous data with the actual distribution. If they differ "by enough", then
+ * start a new block.
+ *
+ * As an optimization and heuristic, we don't distinguish between every symbol
+ * but rather we combine many symbols into a single "observation type". For
+ * literals we only look at the high bits and low bits, and for matches we only
+ * look at whether the match is long or not. The assumption is that for typical
+ * "real" data, places that are good block boundaries will tend to be noticable
+ * based only on changes in these aggregate frequencies, without looking for
+ * subtle differences in individual symbols. For example, a change from ASCII
+ * bytes to non-ASCII bytes, or from few matches (generally less compressible)
+ * to many matches (generally more compressible), would be easily noticed based
+ * on the aggregates.
+ *
+ * For determining whether the frequency distributions are "different enough" to
+ * start a new block, the simply heuristic of splitting when the sum of absolute
+ * differences exceeds a constant seems to be good enough. We also add a number
+ * proportional to the block size so that the algorithm is more likely to end
+ * large blocks than small blocks. This reflects the general expectation that
+ * it will become increasingly beneficial to start a new block as the current
+ * blocks grows larger.
+ *
+ * Finally, for an approximation, it is not strictly necessary that the exact
+ * symbols being used are considered. With "near-optimal parsing", for example,
+ * the actual symbols that will be used are unknown until after the block
+ * boundary is chosen and the block has been optimized. Since the final choices
+ * cannot be used, we can use preliminary "greedy" choices instead.
+ */
+
+/* Initialize the block split statistics when starting a new block. */
+static void
+init_block_split_stats(struct block_split_stats *stats)
+{
+ for (int i = 0; i < NUM_OBSERVATION_TYPES; i++) {
+ stats->new_observations[i] = 0;
+ stats->observations[i] = 0;
+ }
+ stats->num_new_observations = 0;
+ stats->num_observations = 0;
+}
+
+/* Literal observation. Heuristic: use the top 2 bits and low 1 bits of the
+ * literal, for 8 possible literal observation types. */
+static inline void
+observe_literal(struct block_split_stats *stats, u8 lit)
+{
+ stats->new_observations[((lit >> 5) & 0x6) | (lit & 1)]++;
+ stats->num_new_observations++;
+}
+
+/* Match observation. Heuristic: use one observation type for "short match" and
+ * one observation type for "long match". */
+static inline void
+observe_match(struct block_split_stats *stats, unsigned length)
+{
+ stats->new_observations[NUM_LITERAL_OBSERVATION_TYPES + (length >= 5)]++;
+ stats->num_new_observations++;
+}
+
+static bool
+do_end_block_check(struct block_split_stats *stats, u32 block_size)
+{
+ if (stats->num_observations > 0) {
+
+ /* Note: to avoid slow divisions, we do not divide by
+ * 'num_observations', but rather do all math with the numbers
+ * multiplied by 'num_observations'. */
+ u32 total_delta = 0;
+ for (int i = 0; i < NUM_OBSERVATION_TYPES; i++) {
+ u32 expected = stats->observations[i] * stats->num_new_observations;
+ u32 actual = stats->new_observations[i] * stats->num_observations;
+ u32 delta = (actual > expected) ? actual - expected :
+ expected - actual;
+ total_delta += delta;
+ }
+
+ /* Ready to end the block? */
+ if (total_delta + (block_size / 1024) * stats->num_observations >=
+ stats->num_new_observations * 51 / 64 * stats->num_observations)
+ return true;
+ }
+
+ for (int i = 0; i < NUM_OBSERVATION_TYPES; i++) {
+ stats->num_observations += stats->new_observations[i];
+ stats->observations[i] += stats->new_observations[i];
+ stats->new_observations[i] = 0;
+ }
+ stats->num_new_observations = 0;
+ return false;
+}
+
+static inline bool
+should_end_block(struct block_split_stats *stats,
+ const u8 *in_block_begin, const u8 *in_next, const u8 *in_end)
+{
+ /* Ready to check block split statistics? */
+ if (stats->num_new_observations < NUM_OBSERVATIONS_PER_BLOCK_CHECK ||
+ in_next - in_block_begin < MIN_BLOCK_SIZE ||
+ in_end - in_next < MIN_BLOCK_SIZE)
+ return false;
+
+ return do_end_block_check(stats, in_next - in_block_begin);
+}
+
+/******************************************************************************/
+