001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing,
013 * software distributed under the License is distributed on an
014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015 * KIND, either express or implied.  See the License for the
016 * specific language governing permissions and limitations
017 * under the License.
018 */
019package org.apache.commons.compress.compressors.snappy;
020
021import java.io.IOException;
022import java.io.InputStream;
023
024import org.apache.commons.compress.compressors.lz77support.AbstractLZ77CompressorInputStream;
025import org.apache.commons.compress.utils.ByteUtils;
026
027/**
028 * CompressorInputStream for the raw Snappy format.
029 *
030 * <p>This implementation uses an internal buffer in order to handle
031 * the back-references that are at the heart of the LZ77 algorithm.
032 * The size of the buffer must be at least as big as the biggest
033 * offset used in the compressed stream.  The current version of the
034 * Snappy algorithm as defined by Google works on 32k blocks and
035 * doesn't contain offsets bigger than 32k which is the default block
036 * size used by this class.</p>
037 *
038 * @see <a href="https://github.com/google/snappy/blob/master/format_description.txt">Snappy compressed format description</a>
039 * @since 1.7
040 */
041public class SnappyCompressorInputStream extends AbstractLZ77CompressorInputStream {
042
043    /** Mask used to determine the type of "tag" is being processed */
044    private static final int TAG_MASK = 0x03;
045
046    /** Default block size */
047    public static final int DEFAULT_BLOCK_SIZE = 32768;
048
049    /** The size of the uncompressed data */
050    private final int size;
051
052    /** Number of uncompressed bytes still to be read. */
053    private int uncompressedBytesRemaining;
054
055    /** Current state of the stream */
056    private State state = State.NO_BLOCK;
057
058    private boolean endReached = false;
059
060    /**
061     * Constructor using the default buffer size of 32k.
062     *
063     * @param is
064     *            An InputStream to read compressed data from
065     *
066     * @throws IOException if reading fails
067     */
068    public SnappyCompressorInputStream(final InputStream is) throws IOException {
069        this(is, DEFAULT_BLOCK_SIZE);
070    }
071
072    /**
073     * Constructor using a configurable buffer size.
074     *
075     * @param is
076     *            An InputStream to read compressed data from
077     * @param blockSize
078     *            The block size used in compression
079     *
080     * @throws IOException if reading fails
081     */
082    public SnappyCompressorInputStream(final InputStream is, final int blockSize)
083            throws IOException {
084        super(is, blockSize);
085        uncompressedBytesRemaining = size = (int) readSize();
086    }
087
088    /**
089     * {@inheritDoc}
090     */
091    @Override
092    public int read(final byte[] b, final int off, final int len) throws IOException {
093        if (endReached) {
094            return -1;
095        }
096        switch (state) {
097        case NO_BLOCK:
098            fill();
099            return read(b, off, len);
100        case IN_LITERAL:
101            int litLen = readLiteral(b, off, len);
102            if (!hasMoreDataInBlock()) {
103                state = State.NO_BLOCK;
104            }
105            return litLen > 0 ? litLen : read(b, off, len);
106        case IN_BACK_REFERENCE:
107            int backReferenceLen = readBackReference(b, off, len);
108            if (!hasMoreDataInBlock()) {
109                state = State.NO_BLOCK;
110            }
111            return backReferenceLen > 0 ? backReferenceLen : read(b, off, len);
112        default:
113            throw new IOException("Unknown stream state " + state);
114        }
115    }
116
117    /**
118     * Try to fill the buffer with the next block of data.
119     */
120    private void fill() throws IOException {
121        if (uncompressedBytesRemaining == 0) {
122            endReached = true;
123            return;
124        }
125
126        int b = readOneByte();
127        if (b == -1) {
128            throw new IOException("Premature end of stream reading block start");
129        }
130        int length = 0;
131        int offset = 0;
132
133        switch (b & TAG_MASK) {
134
135        case 0x00:
136
137            length = readLiteralLength(b);
138            uncompressedBytesRemaining -= length;
139            startLiteral(length);
140            state = State.IN_LITERAL;
141            break;
142
143        case 0x01:
144
145            /*
146             * These elements can encode lengths between [4..11] bytes and
147             * offsets between [0..2047] bytes. (len-4) occupies three bits
148             * and is stored in bits [2..4] of the tag byte. The offset
149             * occupies 11 bits, of which the upper three are stored in the
150             * upper three bits ([5..7]) of the tag byte, and the lower
151             * eight are stored in a byte following the tag byte.
152             */
153
154            length = 4 + ((b >> 2) & 0x07);
155            uncompressedBytesRemaining -= length;
156            offset = (b & 0xE0) << 3;
157            b = readOneByte();
158            if (b == -1) {
159                throw new IOException("Premature end of stream reading back-reference length");
160            }
161            offset |= b;
162
163            startBackReference(offset, length);
164            state = State.IN_BACK_REFERENCE;
165            break;
166
167        case 0x02:
168
169            /*
170             * These elements can encode lengths between [1..64] and offsets
171             * from [0..65535]. (len-1) occupies six bits and is stored in
172             * the upper six bits ([2..7]) of the tag byte. The offset is
173             * stored as a little-endian 16-bit integer in the two bytes
174             * following the tag byte.
175             */
176
177            length = (b >> 2) + 1;
178            uncompressedBytesRemaining -= length;
179
180            offset = (int) ByteUtils.fromLittleEndian(supplier, 2);
181
182            startBackReference(offset, length);
183            state = State.IN_BACK_REFERENCE;
184            break;
185
186        case 0x03:
187
188            /*
189             * These are like the copies with 2-byte offsets (see previous
190             * subsection), except that the offset is stored as a 32-bit
191             * integer instead of a 16-bit integer (and thus will occupy
192             * four bytes).
193             */
194
195            length = (b >> 2) + 1;
196            uncompressedBytesRemaining -= length;
197
198            offset = (int) ByteUtils.fromLittleEndian(supplier, 4) & 0x7fffffff;
199
200            startBackReference(offset, length);
201            state = State.IN_BACK_REFERENCE;
202            break;
203        default:
204            // impossible as TAG_MASK is two bits and all four possible cases have been covered
205            break;
206        }
207    }
208
209    /*
210     * For literals up to and including 60 bytes in length, the
211     * upper six bits of the tag byte contain (len-1). The literal
212     * follows immediately thereafter in the bytestream. - For
213     * longer literals, the (len-1) value is stored after the tag
214     * byte, little-endian. The upper six bits of the tag byte
215     * describe how many bytes are used for the length; 60, 61, 62
216     * or 63 for 1-4 bytes, respectively. The literal itself follows
217     * after the length.
218     */
219    private int readLiteralLength(final int b) throws IOException {
220        int length;
221        switch (b >> 2) {
222        case 60:
223            length = readOneByte();
224            if (length == -1) {
225                throw new IOException("Premature end of stream reading literal length");
226            }
227            break;
228        case 61:
229            length = (int) ByteUtils.fromLittleEndian(supplier, 2);
230            break;
231        case 62:
232            length = (int) ByteUtils.fromLittleEndian(supplier, 3);
233            break;
234        case 63:
235            length = (int) ByteUtils.fromLittleEndian(supplier, 4);
236            break;
237        default:
238            length = b >> 2;
239            break;
240        }
241
242        return length + 1;
243    }
244
245    /**
246     * The stream starts with the uncompressed length (up to a maximum of 2^32 -
247     * 1), stored as a little-endian varint. Varints consist of a series of
248     * bytes, where the lower 7 bits are data and the upper bit is set iff there
249     * are more bytes to be read. In other words, an uncompressed length of 64
250     * would be stored as 0x40, and an uncompressed length of 2097150 (0x1FFFFE)
251     * would be stored as 0xFE 0xFF 0x7F.
252     *
253     * @return The size of the uncompressed data
254     *
255     * @throws IOException
256     *             Could not read a byte
257     */
258    private long readSize() throws IOException {
259        int index = 0;
260        long sz = 0;
261        int b = 0;
262
263        do {
264            b = readOneByte();
265            if (b == -1) {
266                throw new IOException("Premature end of stream reading size");
267            }
268            sz |= (b & 0x7f) << (index++ * 7);
269        } while (0 != (b & 0x80));
270        return sz;
271    }
272
273    /**
274     * Get the uncompressed size of the stream
275     *
276     * @return the uncompressed size
277     */
278    @Override
279    public int getSize() {
280        return size;
281    }
282
283    private enum State {
284        NO_BLOCK, IN_LITERAL, IN_BACK_REFERENCE
285    }
286}