001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing,
013 * software distributed under the License is distributed on an
014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015 * KIND, either express or implied.  See the License for the
016 * specific language governing permissions and limitations
017 * under the License.
018 */
019package org.apache.commons.compress.compressors.lz77support;
020
021import java.io.IOException;
022import java.util.Arrays;
023
024/**
025 * Helper class for compression algorithms that use the ideas of LZ77.
026 *
027 * <p>Most LZ77 derived algorithms split input data into blocks of
028 * uncompressed data (called literal blocks) and back-references
029 * (pairs of offsets and lengths) that state "add <code>length</code>
030 * bytes that are the same as those already written starting
031 * <code>offset</code> bytes before the current position. The details
032 * of how those blocks and back-references are encoded are quite
033 * different between the algorithms and some algorithms perform
034 * additional steps (Huffman encoding in the case of DEFLATE for
035 * example).</p>
036 *
037 * <p>This class attempts to extract the core logic - finding
038 * back-references - so it can be re-used. It follows the algorithm
039 * explained in section 4 of RFC 1951 (DEFLATE) and currently doesn't
040 * implement the "lazy match" optimization. The three-byte hash
041 * function used in this class is the same as the one used by zlib and
042 * InfoZIP's ZIP implementation of DEFLATE. The whole class is
043 * strongly inspired by InfoZIP's implementation.</p>
044 *
045 * <p>LZ77 is used vaguely here (as well as many other places that
046 * talk about it :-), LZSS would likely be closer to the truth but
047 * LZ77 has become the synonym for a whole family of algorithms.</p>
048 *
049 * <p>The API consists of a compressor that is fed <code>byte</code>s
050 * and emits {@link Block}s to a registered callback where the blocks
051 * represent either {@link LiteralBlock literal blocks}, {@link
052 * BackReference back-references} or {@link EOD end of data
053 * markers}. In order to ensure the callback receives all information,
054 * the {@code #finish} method must be used once all data has been fed
055 * into the compressor.</p>
056 *
057 * <p>Several parameters influence the outcome of the "compression":</p>
058 * <dl>
059 *
060 *  <dt><code>windowSize</code></dt> <dd>the size of the sliding
061 *  window, must be a power of two - this determines the maximum
062 *  offset a back-reference can take. The compressor maintains a
063 *  buffer of twice of <code>windowSize</code> - real world values are
064 *  in the area of 32k.</dd>
065 *
066 *  <dt><code>minBackReferenceLength</code></dt>
067 *  <dd>Minimal length of a back-reference found. A true minimum of 3 is
068 *  hard-coded inside of this implemention but bigger lengths can be
069 *  configured.</dd>
070 *
071 *  <dt><code>maxBackReferenceLength</code></dt>
072 *  <dd>Maximal length of a back-reference found.</dd>
073 *
074 *  <dt><code>maxOffset</code></dt>
075 *  <dd>Maximal offset of a back-reference.</dd>
076 *
077 *  <dt><code>maxLiteralLength</code></dt>
078 *  <dd>Maximal length of a literal block.</dd>
079 * </dl>
080 *
081 * @see "https://tools.ietf.org/html/rfc1951#section-4"
082 * @since 1.14
083 * @NotThreadSafe
084 */
085public class LZ77Compressor {
086
087    /**
088     * Base class representing blocks the compressor may emit.
089     *
090     * <p>This class is not supposed to be subclassed by classes
091     * outside of Commons Compress so it is considered internal and
092     * changed that would break subclasses may get introduced with
093     * future releases.</p>
094     */
095    public static abstract class Block {
096        /** Enumeration of the block types the compressor may emit. */
097        public enum BlockType {
098            LITERAL, BACK_REFERENCE, EOD
099        }
100        public abstract BlockType getType();
101    }
102
103    /**
104     * Represents a literal block of data.
105     *
106     * <p>For performance reasons this encapsulates the real data, not
107     * a copy of it. Don't modify the data and process it inside of
108     * {@link Callback#accept} immediately as it will get overwritten
109     * sooner or later.</p>
110     */
111    public static final class LiteralBlock extends Block {
112        private final byte[] data;
113        private final int offset, length;
114        public LiteralBlock(byte[] data, int offset, int length) {
115            this.data = data;
116            this.offset = offset;
117            this.length = length;
118        }
119        /**
120         * The literal data.
121         *
122         * <p>This returns a life view of the actual data in order to
123         * avoid copying, modify the array at your own risk.</p>
124         * @return the data
125         */
126        public byte[] getData() {
127            return data;
128        }
129        /**
130         * Offset into data where the literal block starts.
131         * @return the offset
132         */
133        public int getOffset() {
134            return offset;
135        }
136        /**
137         * Length of literal block.
138         * @return the length
139         */
140        public int getLength() {
141            return length;
142        }
143        @Override
144        public BlockType getType() {
145            return BlockType.LITERAL;
146        }
147        @Override
148        public String toString() {
149            return "LiteralBlock starting at " + offset + " with length " + length;
150        }
151    }
152
153    /**
154     * Represents a back-reference.
155     */
156    public static final class BackReference extends Block {
157        private final int offset, length;
158        public BackReference(int offset, int length) {
159            this.offset = offset;
160            this.length = length;
161        }
162        /**
163         * Provides the offset of the back-reference.
164         * @return the offset
165         */
166        public int getOffset() {
167            return offset;
168        }
169        /**
170         * Provides the length of the back-reference.
171         * @return the length
172         */
173        public int getLength() {
174            return length;
175        }
176        @Override
177        public BlockType getType() {
178            return BlockType.BACK_REFERENCE;
179        }
180        @Override
181        public String toString() {
182            return "BackReference with offset " + offset + " and length " + length;
183        }
184    }
185
186    /** A simple "we are done" marker. */
187    public static final class EOD extends Block {
188        @Override
189        public BlockType getType() {
190            return BlockType.EOD;
191        }
192    }
193
194    private static final Block THE_EOD = new EOD();
195
196    /**
197     * Callback invoked while the compressor processes data.
198     *
199     * <p>The callback is invoked on the same thread that receives the
200     * bytes to compress and may be invoked multiple times during the
201     * execution of {@link #compress} or {@link #finish}.</p>
202     */
203    public interface Callback {
204        /**
205         * Consumes a block.
206         * @param b the block to consume
207         * @throws IOException in case of an error
208         */
209        void accept(Block b) throws IOException;
210    }
211
212    static final int NUMBER_OF_BYTES_IN_HASH = 3;
213    private static final int NO_MATCH = -1;
214
215    private final Parameters params;
216    private final Callback callback;
217
218    // the sliding window, twice as big as "windowSize" parameter
219    private final byte[] window;
220    // the head of hash-chain - indexed by hash-code, points to the
221    // location inside of window of the latest sequence of bytes with
222    // the given hash.
223    private final int[] head;
224    // for each window-location points to the latest earlier location
225    // with the same hash. Only stores values for the latest
226    // "windowSize" elements, the index is "window location modulo
227    // windowSize".
228    private final int[] prev;
229
230    // bit mask used when indexing into prev
231    private final int wMask;
232
233    private boolean initialized = false;
234    // the position inside of window that shall be encoded right now
235    private int currentPosition;
236    // the number of bytes available to compress including the one at
237    // currentPosition
238    private int lookahead = 0;
239    // the hash of the three bytes stating at the current position
240    private int insertHash = 0;
241    // the position inside of the window where the current literal
242    // block starts (in case we are inside of a literal block).
243    private int blockStart = 0;
244    // position of the current match
245    private int matchStart = NO_MATCH;
246    // number of missed insertString calls for the up to three last
247    // bytes of the last match that can only be performed once more
248    // data has been read
249    private int missedInserts = 0;
250
251    /**
252     * Initializes a compressor with parameters and a callback.
253     * @param params the parameters
254     * @param callback the callback
255     * @throws NullPointerException if either parameter is <code>null</code>
256     */
257    public LZ77Compressor(Parameters params, Callback callback) {
258        if (params == null) {
259            throw new NullPointerException("params must not be null");
260        }
261        if (callback == null) {
262            throw new NullPointerException("callback must not be null");
263        }
264        this.params = params;
265        this.callback = callback;
266
267        final int wSize = params.getWindowSize();
268        window = new byte[wSize * 2];
269        wMask = wSize - 1;
270        head = new int[HASH_SIZE];
271        Arrays.fill(head, NO_MATCH);
272        prev = new int[wSize];
273    }
274
275    /**
276     * Feeds bytes into the compressor which in turn may emit zero or
277     * more blocks to the callback during the execution of this
278     * method.
279     * @param data the data to compress - must not be null
280     * @throws IOException if the callback throws an exception
281     */
282    public void compress(byte[] data) throws IOException {
283        compress(data, 0, data.length);
284    }
285
286    /**
287     * Feeds bytes into the compressor which in turn may emit zero or
288     * more blocks to the callback during the execution of this
289     * method.
290     * @param data the data to compress - must not be null
291     * @param off the start offset of the data
292     * @param len the number of bytes to compress
293     * @throws IOException if the callback throws an exception
294     */
295    public void compress(byte[] data, int off, int len) throws IOException {
296        final int wSize = params.getWindowSize();
297        while (len > wSize) { // chop into windowSize sized chunks
298            doCompress(data, off, wSize);
299            off += wSize;
300            len -= wSize;
301        }
302        if (len > 0) {
303            doCompress(data, off, len);
304        }
305    }
306
307    /**
308     * Tells the compressor to process all remaining data and signal
309     * end of data to the callback.
310     *
311     * <p>The compressor will in turn emit at least one block ({@link
312     * EOD}) but potentially multiple blocks to the callback during
313     * the execution of this method.</p>
314     * @throws IOException if the callback throws an exception
315     */
316    public void finish() throws IOException {
317        if (blockStart != currentPosition || lookahead > 0) {
318            currentPosition += lookahead;
319            flushLiteralBlock();
320        }
321        callback.accept(THE_EOD);
322    }
323
324    /**
325     * Adds some initial data to fill the window with.
326     *
327     * <p>This is used if the stream has been cut into blocks and
328     * back-references of one block may refer to data of the previous
329     * block(s). One such example is the LZ4 frame format using block
330     * dependency.</p>
331     *
332     * @param data the data to fill the window with.
333     * @throws IllegalStateException if the compressor has already started to accept data
334     */
335    public void prefill(byte[] data) {
336        if (currentPosition != 0 || lookahead != 0) {
337            throw new IllegalStateException("the compressor has already started to accept data, can't prefill anymore");
338        }
339
340        // don't need more than windowSize for back-references
341        final int len = Math.min(params.getWindowSize(), data.length);
342        System.arraycopy(data, data.length - len, window, 0, len);
343
344        if (len >= NUMBER_OF_BYTES_IN_HASH) {
345            initialize();
346            final int stop = len - NUMBER_OF_BYTES_IN_HASH + 1;
347            for (int i = 0; i < stop; i++) {
348                insertString(i);
349            }
350            missedInserts = NUMBER_OF_BYTES_IN_HASH - 1;
351        } else { // not enough data to hash anything
352            missedInserts = len;
353        }
354        blockStart = currentPosition = len;
355    }
356
357    // we use a 15 bit hashcode as calculated in updateHash
358    private static final int HASH_SIZE = 1 << 15;
359    private static final int HASH_MASK = HASH_SIZE - 1;
360    private static final int H_SHIFT = 5;
361
362    /**
363     * Assumes we are calculating the hash for three consecutive bytes
364     * as a rolling hash, i.e. for bytes ABCD if H is the hash of ABC
365     * the new hash for BCD is nextHash(H, D).
366     *
367     * <p>The hash is shifted by five bits on each update so all
368     * effects of A have been swapped after the third update.</p>
369     */
370    private int nextHash(int oldHash, byte nextByte) {
371        final int nextVal = nextByte & 0xFF;
372        return ((oldHash << H_SHIFT) ^ nextVal) & HASH_MASK;
373    }
374
375    // performs the actual algorithm with the pre-condition len <= windowSize
376    private void doCompress(byte[] data, int off, int len) throws IOException {
377        int spaceLeft = window.length - currentPosition - lookahead;
378        if (len > spaceLeft) {
379            slide();
380        }
381        System.arraycopy(data, off, window, currentPosition + lookahead, len);
382        lookahead += len;
383        if (!initialized && lookahead >= params.getMinBackReferenceLength()) {
384            initialize();
385        }
386        if (initialized) {
387            compress();
388        }
389    }
390
391    private void slide() throws IOException {
392        final int wSize = params.getWindowSize();
393        if (blockStart != currentPosition && blockStart < wSize) {
394            flushLiteralBlock();
395            blockStart = currentPosition;
396        }
397        System.arraycopy(window, wSize, window, 0, wSize);
398        currentPosition -= wSize;
399        matchStart -= wSize;
400        blockStart -= wSize;
401        for (int i = 0; i < HASH_SIZE; i++) {
402            int h = head[i];
403            head[i] = h >= wSize ? h - wSize : NO_MATCH;
404        }
405        for (int i = 0; i < wSize; i++) {
406            int p = prev[i];
407            prev[i] = p >= wSize ? p - wSize : NO_MATCH;
408        }
409    }
410
411    private void initialize() {
412        for (int i = 0; i < NUMBER_OF_BYTES_IN_HASH - 1; i++) {
413            insertHash = nextHash(insertHash, window[i]);
414        }
415        initialized = true;
416    }
417
418    private void compress() throws IOException {
419        final int minMatch = params.getMinBackReferenceLength();
420        final boolean lazy = params.getLazyMatching();
421        final int lazyThreshold = params.getLazyMatchingThreshold();
422
423        while (lookahead >= minMatch) {
424            catchUpMissedInserts();
425            int matchLength = 0;
426            int hashHead = insertString(currentPosition);
427            if (hashHead != NO_MATCH && hashHead - currentPosition <= params.getMaxOffset()) {
428                // sets matchStart as a side effect
429                matchLength = longestMatch(hashHead);
430
431                if (lazy && matchLength <= lazyThreshold && lookahead > minMatch) {
432                    // try to find a longer match using the next position
433                    matchLength = longestMatchForNextPosition(matchLength);
434                }
435            }
436            if (matchLength >= minMatch) {
437                if (blockStart != currentPosition) {
438                    // emit preceeding literal block
439                    flushLiteralBlock();
440                    blockStart = NO_MATCH;
441                }
442                flushBackReference(matchLength);
443                insertStringsInMatch(matchLength);
444                lookahead -= matchLength;
445                currentPosition += matchLength;
446                blockStart = currentPosition;
447            } else {
448                // no match, append to current or start a new literal
449                lookahead--;
450                currentPosition++;
451                if (currentPosition - blockStart >= params.getMaxLiteralLength()) {
452                    flushLiteralBlock();
453                    blockStart = currentPosition;
454                }
455            }
456        }
457    }
458
459    /**
460     * Inserts the current three byte sequence into the dictionary and
461     * returns the previous head of the hash-chain.
462     *
463     * <p>Updates <code>insertHash</code> and <code>prev</code> as a
464     * side effect.</p>
465     */
466    private int insertString(int pos) {
467        insertHash = nextHash(insertHash, window[pos - 1 + NUMBER_OF_BYTES_IN_HASH]);
468        int hashHead = head[insertHash];
469        prev[pos & wMask] = hashHead;
470        head[insertHash] = pos;
471        return hashHead;
472    }
473
474    private int longestMatchForNextPosition(final int prevMatchLength) {
475        // save a bunch of values to restore them if the next match isn't better than the current one
476        final int prevMatchStart = matchStart;
477        final int prevInsertHash = insertHash;
478
479        lookahead--;
480        currentPosition++;
481        int hashHead = insertString(currentPosition);
482        final int prevHashHead = prev[currentPosition & wMask];
483        int matchLength = longestMatch(hashHead);
484
485        if (matchLength <= prevMatchLength) {
486            // use the first match, as the next one isn't any better
487            matchLength = prevMatchLength;
488            matchStart = prevMatchStart;
489
490            // restore modified values
491            head[insertHash] = prevHashHead;
492            insertHash = prevInsertHash;
493            currentPosition--;
494            lookahead++;
495        }
496        return matchLength;
497    }
498
499    private void insertStringsInMatch(int matchLength) {
500        // inserts strings contained in current match
501        // insertString inserts the byte 2 bytes after position, which may not yet be available -> missedInserts
502        final int stop = Math.min(matchLength - 1, lookahead - NUMBER_OF_BYTES_IN_HASH);
503        // currentPosition has been inserted already
504        for (int i = 1; i <= stop; i++) {
505            insertString(currentPosition + i);
506        }
507        missedInserts = matchLength - stop - 1;
508    }
509
510    private void catchUpMissedInserts() {
511        while (missedInserts > 0) {
512            insertString(currentPosition - missedInserts--);
513        }
514    }
515
516    private void flushBackReference(int matchLength) throws IOException {
517        callback.accept(new BackReference(currentPosition - matchStart, matchLength));
518    }
519
520    private void flushLiteralBlock() throws IOException {
521        callback.accept(new LiteralBlock(window, blockStart, currentPosition - blockStart));
522    }
523
524    /**
525     * Searches the hash chain for real matches and returns the length
526     * of the longest match (0 if none were found) that isn't too far
527     * away (WRT maxOffset).
528     *
529     * <p>Sets matchStart to the index of the start position of the
530     * longest match as a side effect.</p>
531     */
532    private int longestMatch(int matchHead) {
533        final int minLength = params.getMinBackReferenceLength();
534        int longestMatchLength = minLength - 1;
535        final int maxPossibleLength = Math.min(params.getMaxBackReferenceLength(), lookahead);
536        final int minIndex = Math.max(0, currentPosition - params.getMaxOffset());
537        final int niceBackReferenceLength = Math.min(maxPossibleLength, params.getNiceBackReferenceLength());
538        final int maxCandidates = params.getMaxCandidates();
539        for (int candidates = 0; candidates < maxCandidates && matchHead >= minIndex; candidates++) {
540            int currentLength = 0;
541            for (int i = 0; i < maxPossibleLength; i++) {
542                if (window[matchHead + i] != window[currentPosition + i]) {
543                    break;
544                }
545                currentLength++;
546            }
547            if (currentLength > longestMatchLength) {
548                longestMatchLength = currentLength;
549                matchStart = matchHead;
550                if (currentLength >= niceBackReferenceLength) {
551                    // no need to search any further
552                    break;
553                }
554            }
555            matchHead = prev[matchHead & wMask];
556        }
557        return longestMatchLength; // < minLength if no matches have been found, will be ignored in compress()
558    }
559}