001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, 013 * software distributed under the License is distributed on an 014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 * KIND, either express or implied. See the License for the 016 * specific language governing permissions and limitations 017 * under the License. 018 */ 019package org.apache.commons.compress.compressors.snappy; 020 021import java.io.IOException; 022import java.io.InputStream; 023 024import org.apache.commons.compress.compressors.lz77support.AbstractLZ77CompressorInputStream; 025import org.apache.commons.compress.utils.ByteUtils; 026 027/** 028 * CompressorInputStream for the raw Snappy format. 029 * 030 * <p>This implementation uses an internal buffer in order to handle 031 * the back-references that are at the heart of the LZ77 algorithm. 032 * The size of the buffer must be at least as big as the biggest 033 * offset used in the compressed stream. The current version of the 034 * Snappy algorithm as defined by Google works on 32k blocks and 035 * doesn't contain offsets bigger than 32k which is the default block 036 * size used by this class.</p> 037 * 038 * @see <a href="https://github.com/google/snappy/blob/master/format_description.txt">Snappy compressed format description</a> 039 * @since 1.7 040 */ 041public class SnappyCompressorInputStream extends AbstractLZ77CompressorInputStream { 042 043 /** Mask used to determine the type of "tag" is being processed */ 044 private static final int TAG_MASK = 0x03; 045 046 /** Default block size */ 047 public static final int DEFAULT_BLOCK_SIZE = 32768; 048 049 /** The size of the uncompressed data */ 050 private final int size; 051 052 /** Number of uncompressed bytes still to be read. */ 053 private int uncompressedBytesRemaining; 054 055 /** Current state of the stream */ 056 private State state = State.NO_BLOCK; 057 058 private boolean endReached = false; 059 060 /** 061 * Constructor using the default buffer size of 32k. 062 * 063 * @param is 064 * An InputStream to read compressed data from 065 * 066 * @throws IOException if reading fails 067 */ 068 public SnappyCompressorInputStream(final InputStream is) throws IOException { 069 this(is, DEFAULT_BLOCK_SIZE); 070 } 071 072 /** 073 * Constructor using a configurable buffer size. 074 * 075 * @param is 076 * An InputStream to read compressed data from 077 * @param blockSize 078 * The block size used in compression 079 * 080 * @throws IOException if reading fails 081 */ 082 public SnappyCompressorInputStream(final InputStream is, final int blockSize) 083 throws IOException { 084 super(is, blockSize); 085 uncompressedBytesRemaining = size = (int) readSize(); 086 } 087 088 /** 089 * {@inheritDoc} 090 */ 091 @Override 092 public int read(final byte[] b, final int off, final int len) throws IOException { 093 if (endReached) { 094 return -1; 095 } 096 switch (state) { 097 case NO_BLOCK: 098 fill(); 099 return read(b, off, len); 100 case IN_LITERAL: 101 int litLen = readLiteral(b, off, len); 102 if (!hasMoreDataInBlock()) { 103 state = State.NO_BLOCK; 104 } 105 return litLen > 0 ? litLen : read(b, off, len); 106 case IN_BACK_REFERENCE: 107 int backReferenceLen = readBackReference(b, off, len); 108 if (!hasMoreDataInBlock()) { 109 state = State.NO_BLOCK; 110 } 111 return backReferenceLen > 0 ? backReferenceLen : read(b, off, len); 112 default: 113 throw new IOException("Unknown stream state " + state); 114 } 115 } 116 117 /** 118 * Try to fill the buffer with the next block of data. 119 */ 120 private void fill() throws IOException { 121 if (uncompressedBytesRemaining == 0) { 122 endReached = true; 123 return; 124 } 125 126 int b = readOneByte(); 127 if (b == -1) { 128 throw new IOException("Premature end of stream reading block start"); 129 } 130 int length = 0; 131 int offset = 0; 132 133 switch (b & TAG_MASK) { 134 135 case 0x00: 136 137 length = readLiteralLength(b); 138 uncompressedBytesRemaining -= length; 139 startLiteral(length); 140 state = State.IN_LITERAL; 141 break; 142 143 case 0x01: 144 145 /* 146 * These elements can encode lengths between [4..11] bytes and 147 * offsets between [0..2047] bytes. (len-4) occupies three bits 148 * and is stored in bits [2..4] of the tag byte. The offset 149 * occupies 11 bits, of which the upper three are stored in the 150 * upper three bits ([5..7]) of the tag byte, and the lower 151 * eight are stored in a byte following the tag byte. 152 */ 153 154 length = 4 + ((b >> 2) & 0x07); 155 uncompressedBytesRemaining -= length; 156 offset = (b & 0xE0) << 3; 157 b = readOneByte(); 158 if (b == -1) { 159 throw new IOException("Premature end of stream reading back-reference length"); 160 } 161 offset |= b; 162 163 startBackReference(offset, length); 164 state = State.IN_BACK_REFERENCE; 165 break; 166 167 case 0x02: 168 169 /* 170 * These elements can encode lengths between [1..64] and offsets 171 * from [0..65535]. (len-1) occupies six bits and is stored in 172 * the upper six bits ([2..7]) of the tag byte. The offset is 173 * stored as a little-endian 16-bit integer in the two bytes 174 * following the tag byte. 175 */ 176 177 length = (b >> 2) + 1; 178 uncompressedBytesRemaining -= length; 179 180 offset = (int) ByteUtils.fromLittleEndian(supplier, 2); 181 182 startBackReference(offset, length); 183 state = State.IN_BACK_REFERENCE; 184 break; 185 186 case 0x03: 187 188 /* 189 * These are like the copies with 2-byte offsets (see previous 190 * subsection), except that the offset is stored as a 32-bit 191 * integer instead of a 16-bit integer (and thus will occupy 192 * four bytes). 193 */ 194 195 length = (b >> 2) + 1; 196 uncompressedBytesRemaining -= length; 197 198 offset = (int) ByteUtils.fromLittleEndian(supplier, 4) & 0x7fffffff; 199 200 startBackReference(offset, length); 201 state = State.IN_BACK_REFERENCE; 202 break; 203 default: 204 // impossible as TAG_MASK is two bits and all four possible cases have been covered 205 break; 206 } 207 } 208 209 /* 210 * For literals up to and including 60 bytes in length, the 211 * upper six bits of the tag byte contain (len-1). The literal 212 * follows immediately thereafter in the bytestream. - For 213 * longer literals, the (len-1) value is stored after the tag 214 * byte, little-endian. The upper six bits of the tag byte 215 * describe how many bytes are used for the length; 60, 61, 62 216 * or 63 for 1-4 bytes, respectively. The literal itself follows 217 * after the length. 218 */ 219 private int readLiteralLength(final int b) throws IOException { 220 int length; 221 switch (b >> 2) { 222 case 60: 223 length = readOneByte(); 224 if (length == -1) { 225 throw new IOException("Premature end of stream reading literal length"); 226 } 227 break; 228 case 61: 229 length = (int) ByteUtils.fromLittleEndian(supplier, 2); 230 break; 231 case 62: 232 length = (int) ByteUtils.fromLittleEndian(supplier, 3); 233 break; 234 case 63: 235 length = (int) ByteUtils.fromLittleEndian(supplier, 4); 236 break; 237 default: 238 length = b >> 2; 239 break; 240 } 241 242 return length + 1; 243 } 244 245 /** 246 * The stream starts with the uncompressed length (up to a maximum of 2^32 - 247 * 1), stored as a little-endian varint. Varints consist of a series of 248 * bytes, where the lower 7 bits are data and the upper bit is set iff there 249 * are more bytes to be read. In other words, an uncompressed length of 64 250 * would be stored as 0x40, and an uncompressed length of 2097150 (0x1FFFFE) 251 * would be stored as 0xFE 0xFF 0x7F. 252 * 253 * @return The size of the uncompressed data 254 * 255 * @throws IOException 256 * Could not read a byte 257 */ 258 private long readSize() throws IOException { 259 int index = 0; 260 long sz = 0; 261 int b = 0; 262 263 do { 264 b = readOneByte(); 265 if (b == -1) { 266 throw new IOException("Premature end of stream reading size"); 267 } 268 sz |= (b & 0x7f) << (index++ * 7); 269 } while (0 != (b & 0x80)); 270 return sz; 271 } 272 273 /** 274 * Get the uncompressed size of the stream 275 * 276 * @return the uncompressed size 277 */ 278 @Override 279 public int getSize() { 280 return size; 281 } 282 283 private enum State { 284 NO_BLOCK, IN_LITERAL, IN_BACK_REFERENCE 285 } 286}