001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing,
013 * software distributed under the License is distributed on an
014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015 * KIND, either express or implied.  See the License for the
016 * specific language governing permissions and limitations
017 * under the License.
018 */
019package org.apache.commons.compress.compressors.snappy;
020
021import java.io.IOException;
022import java.io.OutputStream;
023
024import org.apache.commons.compress.compressors.CompressorOutputStream;
025import org.apache.commons.compress.compressors.lz77support.LZ77Compressor;
026import org.apache.commons.compress.compressors.lz77support.Parameters;
027import org.apache.commons.compress.utils.ByteUtils;
028
029/**
030 * CompressorOutputStream for the raw Snappy format.
031 *
032 * <p>This implementation uses an internal buffer in order to handle
033 * the back-references that are at the heart of the LZ77 algorithm.
034 * The size of the buffer must be at least as big as the biggest
035 * offset used in the compressed stream.  The current version of the
036 * Snappy algorithm as defined by Google works on 32k blocks and
037 * doesn't contain offsets bigger than 32k which is the default block
038 * size used by this class.</p>
039 *
040 * <p>The raw Snappy format requires the uncompressed size to be
041 * written at the beginning of the stream using a varint
042 * representation, i.e. the number of bytes needed to write the
043 * information is not known before the uncompressed size is
044 * known. We've chosen to make the uncompressedSize a parameter of the
045 * constructor in favor of buffering the whole output until the size
046 * is known. When using the {@link FramedSnappyCompressorOutputStream}
047 * this limitation is taken care of by the warpping framing
048 * format.</p>
049 *
050 * @see <a href="https://github.com/google/snappy/blob/master/format_description.txt">Snappy compressed format description</a>
051 * @since 1.14
052 * @NotThreadSafe
053 */
054public class SnappyCompressorOutputStream extends CompressorOutputStream {
055    private final LZ77Compressor compressor;
056    private final OutputStream os;
057    private final ByteUtils.ByteConsumer consumer;
058
059    // used in one-arg write method
060    private final byte[] oneByte = new byte[1];
061
062    private boolean finished = false;
063
064    /**
065     * Constructor using the default block size of 32k.
066     *
067     * @param os the outputstream to write compressed data to
068     * @param uncompressedSize the uncompressed size of data
069     * @throws IOException if writing of the size fails
070     */
071    public SnappyCompressorOutputStream(final OutputStream os, final long uncompressedSize) throws IOException {
072        this(os, uncompressedSize, SnappyCompressorInputStream.DEFAULT_BLOCK_SIZE);
073    }
074
075    /**
076     * Constructor using a configurable block size.
077     *
078     * @param os the outputstream to write compressed data to
079     * @param uncompressedSize the uncompressed size of data
080     * @param blockSize the block size used - must be a power of two
081     * @throws IOException if writing of the size fails
082     */
083    public SnappyCompressorOutputStream(final OutputStream os, final long uncompressedSize, final int blockSize)
084        throws IOException {
085        this(os, uncompressedSize, createParameterBuilder(blockSize).build());
086    }
087
088    /**
089     * Constructor providing full control over the underlying LZ77 compressor.
090     *
091     * @param os the outputstream to write compressed data to
092     * @param uncompressedSize the uncompressed size of data
093     * @param params the parameters to use by the compressor - note
094     * that the format itself imposes some limits like a maximum match
095     * length of 64 bytes
096     * @throws IOException if writing of the size fails
097     */
098    public SnappyCompressorOutputStream(final OutputStream os, final long uncompressedSize, Parameters params)
099        throws IOException {
100        this.os = os;
101        consumer = new ByteUtils.OutputStreamByteConsumer(os);
102        compressor = new LZ77Compressor(params, new LZ77Compressor.Callback() {
103                @Override
104                public void accept(LZ77Compressor.Block block) throws IOException {
105                    switch (block.getType()) {
106                    case LITERAL:
107                        writeLiteralBlock((LZ77Compressor.LiteralBlock) block);
108                        break;
109                    case BACK_REFERENCE:
110                        writeBackReference((LZ77Compressor.BackReference) block);
111                        break;
112                    case EOD:
113                        break;
114                    }
115                }
116            });
117        writeUncompressedSize(uncompressedSize);
118    }
119
120    @Override
121    public void write(int b) throws IOException {
122        oneByte[0] = (byte) (b & 0xff);
123        write(oneByte);
124    }
125
126    @Override
127    public void write(byte[] data, int off, int len) throws IOException {
128        compressor.compress(data, off, len);
129    }
130
131    @Override
132    public void close() throws IOException {
133        try {
134            finish();
135        } finally {
136            os.close();
137        }
138    }
139
140    /**
141     * Compresses all remaining data and writes it to the stream,
142     * doesn't close the underlying stream.
143     * @throws IOException if an error occurs
144     */
145    public void finish() throws IOException {
146        if (!finished) {
147            compressor.finish();
148            finished = true;
149        }
150    }
151
152    private void writeUncompressedSize(long uncompressedSize) throws IOException {
153        boolean more = false;
154        do {
155            int currentByte = (int) (uncompressedSize & 0x7F);
156            more = uncompressedSize > currentByte;
157            if (more) {
158                currentByte |= 0x80;
159            }
160            os.write(currentByte);
161            uncompressedSize >>= 7;
162        } while (more);
163    }
164
165    // literal length is stored as (len - 1) either inside the tag
166    // (six bits minus four flags) or in 1 to 4 bytes after the tag
167    private static final int MAX_LITERAL_SIZE_WITHOUT_SIZE_BYTES = 60;
168    private static final int MAX_LITERAL_SIZE_WITH_ONE_SIZE_BYTE = 1 << 8;
169    private static final int MAX_LITERAL_SIZE_WITH_TWO_SIZE_BYTES = 1 << 16;
170    private static final int MAX_LITERAL_SIZE_WITH_THREE_SIZE_BYTES = 1 << 24;
171
172    private static final int ONE_SIZE_BYTE_MARKER = 60 << 2;
173    private static final int TWO_SIZE_BYTE_MARKER = 61 << 2;
174    private static final int THREE_SIZE_BYTE_MARKER = 62 << 2;
175    private static final int FOUR_SIZE_BYTE_MARKER = 63 << 2;
176
177    private void writeLiteralBlock(LZ77Compressor.LiteralBlock block) throws IOException {
178        int len = block.getLength();
179        if (len <= MAX_LITERAL_SIZE_WITHOUT_SIZE_BYTES) {
180            writeLiteralBlockNoSizeBytes(block, len);
181        } else if (len <= MAX_LITERAL_SIZE_WITH_ONE_SIZE_BYTE) {
182            writeLiteralBlockOneSizeByte(block, len);
183        } else if (len <= MAX_LITERAL_SIZE_WITH_TWO_SIZE_BYTES) {
184            writeLiteralBlockTwoSizeBytes(block, len);
185        } else if (len <= MAX_LITERAL_SIZE_WITH_THREE_SIZE_BYTES) {
186            writeLiteralBlockThreeSizeBytes(block, len);
187        } else {
188            writeLiteralBlockFourSizeBytes(block, len);
189        }
190    }
191
192    private void writeLiteralBlockNoSizeBytes(LZ77Compressor.LiteralBlock block, int len) throws IOException {
193        writeLiteralBlockWithSize(len - 1 << 2, 0, len, block);
194    }
195
196    private void writeLiteralBlockOneSizeByte(LZ77Compressor.LiteralBlock block, int len) throws IOException {
197        writeLiteralBlockWithSize(ONE_SIZE_BYTE_MARKER, 1, len, block);
198    }
199
200    private void writeLiteralBlockTwoSizeBytes(LZ77Compressor.LiteralBlock block, int len) throws IOException {
201        writeLiteralBlockWithSize(TWO_SIZE_BYTE_MARKER, 2, len, block);
202    }
203
204    private void writeLiteralBlockThreeSizeBytes(LZ77Compressor.LiteralBlock block, int len) throws IOException {
205        writeLiteralBlockWithSize(THREE_SIZE_BYTE_MARKER, 3, len, block);
206    }
207
208    private void writeLiteralBlockFourSizeBytes(LZ77Compressor.LiteralBlock block, int len) throws IOException {
209        writeLiteralBlockWithSize(FOUR_SIZE_BYTE_MARKER, 4, len, block);
210    }
211
212    private void writeLiteralBlockWithSize(int tagByte, int sizeBytes, int len, LZ77Compressor.LiteralBlock block)
213        throws IOException {
214        os.write(tagByte);
215        writeLittleEndian(sizeBytes, len - 1);
216        os.write(block.getData(), block.getOffset(), len);
217    }
218
219    private void writeLittleEndian(final int numBytes, int num) throws IOException {
220        ByteUtils.toLittleEndian(consumer, num, numBytes);
221    }
222
223    // Back-references ("copies") have their offset/size information
224    // in two, three or five bytes.
225    private static final int MIN_MATCH_LENGTH_WITH_ONE_OFFSET_BYTE = 4;
226    private static final int MAX_MATCH_LENGTH_WITH_ONE_OFFSET_BYTE = 11;
227    private static final int MAX_OFFSET_WITH_ONE_OFFSET_BYTE = 1 << 11 - 1;
228    private static final int MAX_OFFSET_WITH_TWO_OFFSET_BYTES = 1 << 16 - 1;
229
230    private static final int ONE_BYTE_COPY_TAG = 1;
231    private static final int TWO_BYTE_COPY_TAG = 2;
232    private static final int FOUR_BYTE_COPY_TAG = 3;
233
234    private void writeBackReference(LZ77Compressor.BackReference block) throws IOException {
235        final int len = block.getLength();
236        final int offset = block.getOffset();
237        if (len >= MIN_MATCH_LENGTH_WITH_ONE_OFFSET_BYTE && len <= MAX_MATCH_LENGTH_WITH_ONE_OFFSET_BYTE
238            && offset <= MAX_OFFSET_WITH_ONE_OFFSET_BYTE) {
239            writeBackReferenceWithOneOffsetByte(len, offset);
240        } else if (offset < MAX_OFFSET_WITH_TWO_OFFSET_BYTES) {
241            writeBackReferenceWithTwoOffsetBytes(len, offset);
242        } else {
243            writeBackReferenceWithFourOffsetBytes(len, offset);
244        }
245    }
246
247    private void writeBackReferenceWithOneOffsetByte(int len, int offset) throws IOException {
248        os.write(ONE_BYTE_COPY_TAG | ((len - 4) << 2) | ((offset & 0x700) >> 3));
249        os.write(offset & 0xff);
250    }
251
252    private void writeBackReferenceWithTwoOffsetBytes(int len, int offset) throws IOException {
253        writeBackReferenceWithLittleEndianOffset(TWO_BYTE_COPY_TAG, 2, len, offset);
254    }
255
256    private void writeBackReferenceWithFourOffsetBytes(int len, int offset) throws IOException {
257        writeBackReferenceWithLittleEndianOffset(FOUR_BYTE_COPY_TAG, 4, len, offset);
258    }
259
260    private void writeBackReferenceWithLittleEndianOffset(int tag, int offsetBytes, int len, int offset)
261        throws IOException {
262        os.write(tag | ((len - 1) << 2));
263        writeLittleEndian(offsetBytes, offset);
264    }
265
266    // technically the format could use shorter matches but with a
267    // length of three the offset would be encoded as at least two
268    // bytes in addition to the tag, so yield no compression at all
269    private static final int MIN_MATCH_LENGTH = 4;
270    // Snappy stores the match length in six bits of the tag
271    private static final int MAX_MATCH_LENGTH = 64;
272
273    /**
274     * Returns a builder correctly configured for the Snappy algorithm using the gven block size.
275     * @param blockSize the block size.
276     * @return a builder correctly configured for the Snappy algorithm using the gven block size
277     */
278    public static Parameters.Builder createParameterBuilder(int blockSize) {
279        // the max offset and max literal length defined by the format
280        // are 2^32 - 1 and 2^32 respectively - with blockSize being
281        // an integer we will never exceed that
282        return Parameters.builder(blockSize)
283            .withMinBackReferenceLength(MIN_MATCH_LENGTH)
284            .withMaxBackReferenceLength(MAX_MATCH_LENGTH)
285            .withMaxOffset(blockSize)
286            .withMaxLiteralLength(blockSize);
287    }
288}