001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing,
013 * software distributed under the License is distributed on an
014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015 * KIND, either express or implied.  See the License for the
016 * specific language governing permissions and limitations
017 * under the License.
018 */
019package org.apache.commons.compress.compressors.lz4;
020
021import java.io.IOException;
022import java.io.OutputStream;
023import java.util.Arrays;
024import java.util.Deque;
025import java.util.Iterator;
026import java.util.LinkedList;
027
028import org.apache.commons.compress.compressors.CompressorOutputStream;
029import org.apache.commons.compress.compressors.lz77support.LZ77Compressor;
030import org.apache.commons.compress.compressors.lz77support.Parameters;
031import org.apache.commons.compress.utils.ByteUtils;
032
033/**
034 * CompressorOutputStream for the LZ4 block format.
035 *
036 * @see <a href="http://lz4.github.io/lz4/lz4_Block_format.html">LZ4 Block Format Description</a>
037 * @since 1.14
038 * @NotThreadSafe
039 */
040public class BlockLZ4CompressorOutputStream extends CompressorOutputStream {
041
042    private static final int MIN_BACK_REFERENCE_LENGTH = 4;
043    private static final int MIN_OFFSET_OF_LAST_BACK_REFERENCE = 12;
044
045    /*
046
047      The LZ4 block format has a few properties that make it less
048      straight-forward than one would hope:
049
050      * literal blocks and back-references must come in pairs (except
051        for the very last literal block), so consecutive literal
052        blocks created by the compressor must be merged into a single
053        block.
054
055      * the start of a literal/back-reference pair contains the length
056        of the back-reference (at least some part of it) so we can't
057        start writing the literal before we know how long the next
058        back-reference is going to be.
059
060      * there are special rules for the final blocks
061
062        > There are specific parsing rules to respect in order to remain
063        > compatible with assumptions made by the decoder :
064        >
065        >     1. The last 5 bytes are always literals
066        >
067        >     2. The last match must start at least 12 bytes before end of
068        >        block. Consequently, a block with less than 13 bytes cannot be
069        >        compressed.
070
071        which means any back-reference may need to get rewritten as a
072        literal block unless we know the next block is at least of
073        length 5 and the sum of this block's length and offset and the
074        next block's length is at least twelve.
075
076    */
077
078    private final LZ77Compressor compressor;
079    private final OutputStream os;
080
081    // used in one-arg write method
082    private final byte[] oneByte = new byte[1];
083
084    private boolean finished = false;
085
086    private Deque<Pair> pairs = new LinkedList<>();
087    // keeps track of the last window-size bytes (64k) in order to be
088    // able to expand back-references when needed
089    private Deque<byte[]> expandedBlocks = new LinkedList<>();
090
091    /**
092     * Creates a new LZ4 output stream.
093     *
094     * @param os
095     *            An OutputStream to read compressed data from
096     *
097     * @throws IOException if reading fails
098     */
099    public BlockLZ4CompressorOutputStream(final OutputStream os) throws IOException {
100        this(os, createParameterBuilder().build());
101    }
102
103    /**
104     * Creates a new LZ4 output stream.
105     *
106     * @param os
107     *            An OutputStream to read compressed data from
108     * @param params
109     *            The parameters to use for LZ77 compression.
110     *
111     * @throws IOException if reading fails
112     */
113    public BlockLZ4CompressorOutputStream(final OutputStream os, Parameters params) throws IOException {
114        this.os = os;
115        compressor = new LZ77Compressor(params,
116            new LZ77Compressor.Callback() {
117                @Override
118                public void accept(LZ77Compressor.Block block) throws IOException {
119                    switch (block.getType()) {
120                    case LITERAL:
121                        addLiteralBlock((LZ77Compressor.LiteralBlock) block);
122                        break;
123                    case BACK_REFERENCE:
124                        addBackReference((LZ77Compressor.BackReference) block);
125                        break;
126                    case EOD:
127                        writeFinalLiteralBlock();
128                        break;
129                    }
130                }
131            });
132    }
133
134    @Override
135    public void write(int b) throws IOException {
136        oneByte[0] = (byte) (b & 0xff);
137        write(oneByte);
138    }
139
140    @Override
141    public void write(byte[] data, int off, int len) throws IOException {
142        compressor.compress(data, off, len);
143    }
144
145    @Override
146    public void close() throws IOException {
147        try {
148            finish();
149        } finally {
150            os.close();
151        }
152    }
153
154    /**
155     * Compresses all remaining data and writes it to the stream,
156     * doesn't close the underlying stream.
157     * @throws IOException if an error occurs
158     */
159    public void finish() throws IOException {
160        if (!finished) {
161            compressor.finish();
162            finished = true;
163        }
164    }
165
166    /**
167     * Adds some initial data to fill the window with.
168     *
169     * @param data the data to fill the window with.
170     * @param off offset of real data into the array
171     * @param len amount of data
172     * @throws IllegalStateException if the stream has already started to write data
173     * @see LZ77Compressor#prefill
174     */
175    public void prefill(byte[] data, int off, int len) {
176        if (len > 0) {
177            byte[] b = Arrays.copyOfRange(data, off, off + len);
178            compressor.prefill(b);
179            recordLiteral(b);
180        }
181    }
182
183    private void addLiteralBlock(LZ77Compressor.LiteralBlock block) throws IOException {
184        Pair last = writeBlocksAndReturnUnfinishedPair(block.getLength());
185        recordLiteral(last.addLiteral(block));
186        clearUnusedBlocksAndPairs();
187    }
188
189    private void addBackReference(LZ77Compressor.BackReference block) throws IOException {
190        Pair last = writeBlocksAndReturnUnfinishedPair(block.getLength());
191        last.setBackReference(block);
192        recordBackReference(block);
193        clearUnusedBlocksAndPairs();
194    }
195
196    private Pair writeBlocksAndReturnUnfinishedPair(int length) throws IOException {
197        writeWritablePairs(length);
198        Pair last = pairs.peekLast();
199        if (last == null || last.hasBackReference()) {
200            last = new Pair();
201            pairs.addLast(last);
202        }
203        return last;
204    }
205
206    private void recordLiteral(byte[] b) {
207        expandedBlocks.addFirst(b);
208    }
209
210    private void clearUnusedBlocksAndPairs() {
211        clearUnusedBlocks();
212        clearUnusedPairs();
213    }
214
215    private void clearUnusedBlocks() {
216        int blockLengths = 0;
217        int blocksToKeep = 0;
218        for (byte[] b : expandedBlocks) {
219            blocksToKeep++;
220            blockLengths += b.length;
221            if (blockLengths >= BlockLZ4CompressorInputStream.WINDOW_SIZE) {
222                break;
223            }
224        }
225        final int size = expandedBlocks.size();
226        for (int i = blocksToKeep; i < size; i++) {
227            expandedBlocks.removeLast();
228        }
229    }
230
231    private void recordBackReference(LZ77Compressor.BackReference block) {
232        expandedBlocks.addFirst(expand(block.getOffset(), block.getLength()));
233    }
234
235    private byte[] expand(final int offset, final int length) {
236        byte[] expanded = new byte[length];
237        if (offset == 1) { // surprisingly common special case
238            byte[] block = expandedBlocks.peekFirst();
239            byte b = block[block.length - 1];
240            if (b != 0) { // the fresh array contains 0s anyway
241                Arrays.fill(expanded, b);
242            }
243        } else {
244            expandFromList(expanded, offset, length);
245        }
246        return expanded;
247    }
248
249    private void expandFromList(final byte[] expanded, int offset, int length) {
250        int offsetRemaining = offset;
251        int lengthRemaining = length;
252        int writeOffset = 0;
253        while (lengthRemaining > 0) {
254            // find block that contains offsetRemaining
255            byte[] block = null;
256            int copyLen, copyOffset;
257            if (offsetRemaining > 0) {
258                int blockOffset = 0;
259                for (byte[] b : expandedBlocks) {
260                    if (b.length + blockOffset >= offsetRemaining) {
261                        block = b;
262                        break;
263                    }
264                    blockOffset += b.length;
265                }
266                if (block == null) {
267                    // should not be possible
268                    throw new IllegalStateException("failed to find a block containing offset " + offset);
269                }
270                copyOffset = blockOffset + block.length - offsetRemaining;
271                copyLen = Math.min(lengthRemaining, block.length - copyOffset);
272            } else {
273                // offsetRemaining is negative or 0 and points into the expanded bytes
274                block = expanded;
275                copyOffset = -offsetRemaining;
276                copyLen = Math.min(lengthRemaining, writeOffset + offsetRemaining);
277            }
278            System.arraycopy(block, copyOffset, expanded, writeOffset, copyLen);
279            offsetRemaining -= copyLen;
280            lengthRemaining -= copyLen;
281            writeOffset += copyLen;
282        }
283    }
284
285    private void clearUnusedPairs() {
286        int pairLengths = 0;
287        int pairsToKeep = 0;
288        for (Iterator<Pair> it = pairs.descendingIterator(); it.hasNext(); ) {
289            Pair p = it.next();
290            pairsToKeep++;
291            pairLengths += p.length();
292            if (pairLengths >= BlockLZ4CompressorInputStream.WINDOW_SIZE) {
293                break;
294            }
295        }
296        final int size = pairs.size();
297        for (int i = pairsToKeep; i < size; i++) {
298            Pair p = pairs.peekFirst();
299            if (p.hasBeenWritten()) {
300                pairs.removeFirst();
301            } else {
302                break;
303            }
304        }
305    }
306
307    private void writeFinalLiteralBlock() throws IOException {
308        rewriteLastPairs();
309        for (Pair p : pairs) {
310            if (!p.hasBeenWritten()) {
311                p.writeTo(os);
312            }
313        }
314        pairs.clear();
315    }
316
317    private void writeWritablePairs(int lengthOfBlocksAfterLastPair) throws IOException {
318        int unwrittenLength = lengthOfBlocksAfterLastPair;
319        for (Iterator<Pair> it = pairs.descendingIterator(); it.hasNext(); ) {
320            Pair p = it.next();
321            if (p.hasBeenWritten()) {
322                break;
323            }
324            unwrittenLength += p.length();
325        }
326        for (Pair p : pairs) {
327            if (p.hasBeenWritten()) {
328                continue;
329            }
330            unwrittenLength -= p.length();
331            if (p.canBeWritten(unwrittenLength)) {
332                p.writeTo(os);
333            } else {
334                break;
335            }
336        }
337    }
338
339    private void rewriteLastPairs() {
340        LinkedList<Pair> lastPairs = new LinkedList<>();
341        LinkedList<Integer> pairLength = new LinkedList<>();
342        int offset = 0;
343        for (Iterator<Pair> it = pairs.descendingIterator(); it.hasNext(); ) {
344            Pair p = it.next();
345            if (p.hasBeenWritten()) {
346                break;
347            }
348            int len = p.length();
349            pairLength.addFirst(len);
350            lastPairs.addFirst(p);
351            offset += len;
352            if (offset >= MIN_OFFSET_OF_LAST_BACK_REFERENCE) {
353                break;
354            }
355        }
356        for (Pair p : lastPairs) {
357            pairs.remove(p);
358        }
359        // lastPairs may contain between one and four Pairs:
360        // * the last pair may be a one byte literal
361        // * all other Pairs contain a back-reference which must be four bytes long at minimum
362        // we could merge them all into a single literal block but
363        // this may harm compression. For example compressing
364        // "bla.tar" from our tests yields a last block containing a
365        // back-reference of length > 2k and we'd end up with a last
366        // literal of that size rather than a 2k back-reference and a
367        // 12 byte literal at the end.
368
369        // Instead we merge all but the first of lastPairs into a new
370        // literal-only Pair "replacement" and look at the
371        // back-reference in the first of lastPairs and see if we can
372        // split it. We can split it if it is longer than 16 -
373        // replacement.length (i.e. the minimal length of four is kept
374        // while making sure the last literal is at least twelve bytes
375        // long). If we can't split it, we expand the first of the pairs
376        // as well.
377
378        // this is not optimal, we could get better compression
379        // results with more complex approaches as the last literal
380        // only needs to be five bytes long if the previous
381        // back-reference has an offset big enough
382
383        final int lastPairsSize = lastPairs.size();
384        int toExpand = 0;
385        for (int i = 1; i < lastPairsSize; i++) {
386            toExpand += pairLength.get(i);
387        }
388        Pair replacement = new Pair();
389        if (toExpand > 0) {
390            replacement.prependLiteral(expand(toExpand, toExpand));
391        }
392        Pair splitCandidate = lastPairs.get(0);
393        int stillNeeded = MIN_OFFSET_OF_LAST_BACK_REFERENCE - toExpand;
394        int brLen = splitCandidate.hasBackReference() ? splitCandidate.backReferenceLength() : 0;
395        if (splitCandidate.hasBackReference() && brLen >= MIN_BACK_REFERENCE_LENGTH + stillNeeded) {
396            replacement.prependLiteral(expand(toExpand + stillNeeded, stillNeeded));
397            pairs.add(splitCandidate.splitWithNewBackReferenceLengthOf(brLen - stillNeeded));
398        } else {
399            if (splitCandidate.hasBackReference()) {
400                replacement.prependLiteral(expand(toExpand + brLen, brLen));
401            }
402            splitCandidate.prependTo(replacement);
403        }
404        pairs.add(replacement);
405    }
406
407    /**
408     * Returns a builder correctly configured for the LZ4 algorithm.
409     * @return a builder correctly configured for the LZ4 algorithm
410     */
411    public static Parameters.Builder createParameterBuilder() {
412        int maxLen = BlockLZ4CompressorInputStream.WINDOW_SIZE - 1;
413        return Parameters.builder(BlockLZ4CompressorInputStream.WINDOW_SIZE)
414            .withMinBackReferenceLength(MIN_BACK_REFERENCE_LENGTH)
415            .withMaxBackReferenceLength(maxLen)
416            .withMaxOffset(maxLen)
417            .withMaxLiteralLength(maxLen);
418    }
419
420    final static class Pair {
421        private final Deque<byte[]> literals = new LinkedList<>();
422        private int brOffset, brLength;
423        private boolean written;
424
425        private void prependLiteral(byte[] data) {
426            literals.addFirst(data);
427        }
428        byte[] addLiteral(LZ77Compressor.LiteralBlock block) {
429            byte[] copy = Arrays.copyOfRange(block.getData(), block.getOffset(),
430                block.getOffset() + block.getLength());
431            literals.add(copy);
432            return copy;
433        }
434        void setBackReference(LZ77Compressor.BackReference block) {
435            if (hasBackReference()) {
436                throw new IllegalStateException();
437            }
438            brOffset = block.getOffset();
439            brLength = block.getLength();
440        }
441        boolean hasBackReference() {
442            return brOffset > 0;
443        }
444        boolean canBeWritten(int lengthOfBlocksAfterThisPair) {
445            return hasBackReference()
446                && lengthOfBlocksAfterThisPair >= MIN_OFFSET_OF_LAST_BACK_REFERENCE + MIN_BACK_REFERENCE_LENGTH;
447        }
448        int length() {
449            return literalLength() + brLength;
450        }
451        private boolean hasBeenWritten() {
452            return written;
453        }
454        void writeTo(OutputStream out) throws IOException {
455            int litLength = literalLength();
456            out.write(lengths(litLength, brLength));
457            if (litLength >= BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK) {
458                writeLength(litLength - BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK, out);
459            }
460            for (byte[] b : literals) {
461                out.write(b);
462            }
463            if (hasBackReference()) {
464                ByteUtils.toLittleEndian(out, brOffset, 2);
465                if (brLength - MIN_BACK_REFERENCE_LENGTH >= BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK) {
466                    writeLength(brLength - MIN_BACK_REFERENCE_LENGTH
467                        - BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK, out);
468                }
469            }
470            written = true;
471        }
472        private int literalLength() {
473            int length = 0;
474            for (byte[] b : literals) {
475                length += b.length;
476            }
477            return length;
478        }
479        private static int lengths(int litLength, int brLength) {
480            int l = litLength < 15 ? litLength : 15;
481            int br = brLength < 4 ? 0 : (brLength < 19 ? brLength - 4 : 15);
482            return (l << BlockLZ4CompressorInputStream.SIZE_BITS) | br;
483        }
484        private static void writeLength(int length, OutputStream out) throws IOException {
485            while (length >= 255) {
486                out.write(255);
487                length -= 255;
488            }
489            out.write(length);
490        }
491        private int backReferenceLength() {
492            return brLength;
493        }
494        private void prependTo(Pair other) {
495            Iterator<byte[]> listBackwards = literals.descendingIterator();
496            while (listBackwards.hasNext()) {
497                other.prependLiteral(listBackwards.next());
498            }
499        }
500        private Pair splitWithNewBackReferenceLengthOf(int newBackReferenceLength) {
501            Pair p = new Pair();
502            p.literals.addAll(literals);
503            p.brOffset = brOffset;
504            p.brLength = newBackReferenceLength;
505            return p;
506        }
507    }
508}