/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.commons.compress.compressors.gzip;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.EOFException;
import java.io.InputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.BufferedInputStream;
import java.util.zip.DataFormatException;
import java.util.zip.Deflater;
import java.util.zip.Inflater;
import java.util.zip.CRC32;

import org.apache.commons.compress.compressors.CompressorInputStream;
import org.apache.commons.compress.utils.ByteUtils;
import org.apache.commons.compress.utils.CharsetNames;
import org.apache.commons.compress.utils.CountingInputStream;
import org.apache.commons.compress.utils.IOUtils;
import org.apache.commons.compress.utils.InputStreamStatistics;

/**
 * Input stream that decompresses .gz files.
 *
 * <p>This supports decompressing concatenated .gz files which is important
 * when decompressing standalone .gz files.</p>
 *
 * <p>
 * {@link java.util.zip.GZIPInputStream} doesn't decompress concatenated .gz
 * files: it stops after the first member and silently ignores the rest.
 * It doesn't leave the read position to point to the beginning of the next
 * member, which makes it difficult to work around the lack of concatenation
 * support.
 * </p>
 *
 * <p>
 * Instead of using <code>GZIPInputStream</code>, this class has its own .gz
 * container format decoder. The actual decompression is done with
 * {@link java.util.zip.Inflater}.
 * </p>
 *
 * <p>If you use the constructor {@code GzipCompressorInputStream(in)}
 * or {@code GzipCompressorInputStream(in, false)} with some {@code
 * InputStream} {@code in} then {@link #read} will return -1 as soon
 * as the first internal member has been read completely. The stream
 * {@code in} will be positioned at the start of the second gzip
 * member if there is one.</p>
 *
 * <p>If you use the constructor {@code GzipCompressorInputStream(in,
 * true)} with some {@code InputStream} {@code in} then {@link #read}
 * will return -1 once the stream {@code in} has been exhausted. The
 * data read from a stream constructed this way will consist of the
 * concatenated data of all gzip members contained inside {@code
 * in}.</p>
 *
 * @see "https://tools.ietf.org/html/rfc1952"
 */
public class GzipCompressorInputStream extends CompressorInputStream
    implements InputStreamStatistics {

    // Header flags
    // private static final int FTEXT = 0x01; // Uninteresting for us
    private static final int FHCRC = 0x02;
    private static final int FEXTRA = 0x04;
    private static final int FNAME = 0x08;
    private static final int FCOMMENT = 0x10;
    private static final int FRESERVED = 0xE0;

    private final CountingInputStream countingStream;

    // Compressed input stream, possibly wrapped in a
    // BufferedInputStream, always wrapped in countingStream above
    private final InputStream in;

    // True if the stream handed to the constructor was System.in.
    // "in" is always a wrapper object, so comparing "in" itself against
    // System.in can never match; the decision is recorded here instead.
    private final boolean wrapsSystemIn;

    // True if decompressing multi member streams.
    private final boolean decompressConcatenated;

    // Buffer to hold the input data
    private final byte[] buf = new byte[8192];

    // Amount of data in buf.
    private int bufUsed;

    // Decompressor; set to null once it has been released via end()
    private Inflater inf = new Inflater(true);

    // CRC32 from uncompressed data
    private final CRC32 crc = new CRC32();

    // True once everything has been decompressed
    private boolean endReached = false;

    // used in no-arg read method
    private final byte[] oneByte = new byte[1];

    private final GzipParameters parameters = new GzipParameters();

    /**
     * Constructs a new input stream that decompresses gzip-compressed data
     * from the specified input stream.
     * <p>
     * This is equivalent to
     * <code>GzipCompressorInputStream(inputStream, false)</code> and thus
     * will not decompress concatenated .gz files.
     *
     * @param inputStream  the InputStream from which this object is
     * created
     *
     * @throws IOException if the stream could not be created
     */
    public GzipCompressorInputStream(final InputStream inputStream)
            throws IOException {
        this(inputStream, false);
    }

    /**
     * Constructs a new input stream that decompresses gzip-compressed data
     * from the specified input stream.
     * <p>
     * If <code>decompressConcatenated</code> is {@code false}:
     * This decompressor might read more input than it will actually use.
     * If <code>inputStream</code> supports <code>mark</code> and
     * <code>reset</code>, then the input position will be adjusted
     * so that it is right after the last byte of the compressed stream.
     * If <code>mark</code> isn't supported, the input position will be
     * undefined.
     *
     * @param inputStream  the InputStream from which this object is
     * created
     * @param decompressConcatenated
     *                     if true, decompress until the end of the input;
     *                     if false, stop after the first .gz member
     *
     * @throws IOException if the stream could not be created
     */
    public GzipCompressorInputStream(final InputStream inputStream,
                                     final boolean decompressConcatenated)
            throws IOException {
        wrapsSystemIn = inputStream == System.in;
        countingStream = new CountingInputStream(inputStream);
        // Mark support is strictly needed for concatenated files only,
        // but it's simpler if it is always available.
        if (countingStream.markSupported()) {
            in = countingStream;
        } else {
            in = new BufferedInputStream(countingStream);
        }

        this.decompressConcatenated = decompressConcatenated;
        init(true);
    }

    /**
     * Provides the stream's meta data - may change with each stream
     * when decompressing concatenated streams.
     * @return the stream's meta data
     * @since 1.8
     */
    public GzipParameters getMetaData() {
        return parameters;
    }

    /**
     * Reads and validates the header of the next .gz member and resets
     * the inflater and the CRC for it.
     *
     * @param isFirstMember true when parsing the very first member of
     *        the stream, false for any subsequent member
     * @return true if a member header was parsed, false if the end of
     *         the input was reached before a subsequent member started
     * @throws IOException if the magic bytes, the compression method or
     *         the flags are invalid, or if the header is truncated
     */
    private boolean init(final boolean isFirstMember) throws IOException {
        assert isFirstMember || decompressConcatenated;

        // Check the magic bytes without a possibility of EOFException.
        final int magic0 = in.read();
        final int magic1 = in.read();

        // If end of input was reached after decompressing at least
        // one .gz member, we have reached the end of the file successfully.
        if (magic0 == -1 && !isFirstMember) {
            return false;
        }

        if (magic0 != 31 || magic1 != 139) {
            throw new IOException(isFirstMember
                                  ? "Input is not in the .gz format"
                                  : "Garbage after a valid .gz stream");
        }

        // Parsing the rest of the header may throw EOFException.
        final DataInput inData = new DataInputStream(in);
        final int method = inData.readUnsignedByte();
        if (method != Deflater.DEFLATED) {
            throw new IOException("Unsupported compression method "
                                  + method + " in the .gz header");
        }

        final int flg = inData.readUnsignedByte();
        if ((flg & FRESERVED) != 0) {
            throw new IOException(
                    "Reserved flags are set in the .gz header");
        }

        // The header stores the modification time in seconds;
        // the value is scaled by 1000 before being handed on.
        parameters.setModificationTime(ByteUtils.fromLittleEndian(inData, 4) * 1000);
        switch (inData.readUnsignedByte()) { // extra flags
        case 2:
            parameters.setCompressionLevel(Deflater.BEST_COMPRESSION);
            break;
        case 4:
            parameters.setCompressionLevel(Deflater.BEST_SPEED);
            break;
        default:
            // ignored for now
            break;
        }
        parameters.setOperatingSystem(inData.readUnsignedByte());

        // Extra field, ignored
        if ((flg & FEXTRA) != 0) {
            int xlen = inData.readUnsignedByte();
            xlen |= inData.readUnsignedByte() << 8;

            // This isn't as efficient as calling in.skip would be,
            // but it's lazier to handle unexpected end of input this way.
            // Most files don't have an extra field anyway.
            while (xlen-- > 0) {
                inData.readUnsignedByte();
            }
        }

        // Original file name
        if ((flg & FNAME) != 0) {
            parameters.setFilename(new String(readToNull(inData),
                                              CharsetNames.ISO_8859_1));
        }

        // Comment
        if ((flg & FCOMMENT) != 0) {
            parameters.setComment(new String(readToNull(inData),
                                             CharsetNames.ISO_8859_1));
        }

        // Header "CRC16" which is actually a truncated CRC32 (which isn't
        // as good as real CRC16). I don't know if any encoder implementation
        // sets this, so it's not worth trying to verify it. GNU gzip 1.4
        // doesn't support this field, but zlib seems to be able to at least
        // skip over it.
        if ((flg & FHCRC) != 0) {
            inData.readShort();
        }

        // Reset
        inf.reset();
        crc.reset();

        return true;
    }

    /**
     * Reads bytes up to, but not including, the next zero byte.
     *
     * @param inData the input to read from
     * @return the bytes read before the terminating zero byte
     * @throws IOException if the input ends before a zero byte is found
     *         or reading fails otherwise
     */
    private static byte[] readToNull(final DataInput inData) throws IOException {
        final ByteArrayOutputStream bos = new ByteArrayOutputStream();
        int b = 0;
        while ((b = inData.readUnsignedByte()) != 0x00) { // NOPMD
            bos.write(b);
        }
        return bos.toByteArray();
    }

    @Override
    public int read() throws IOException {
        return read(oneByte, 0, 1) == -1 ? -1 : oneByte[0] & 0xFF;
    }

    /**
     * {@inheritDoc}
     *
     * @since 1.1
     */
    @Override
    public int read(final byte[] b, int off, int len) throws IOException {
        // A zero-length read always yields 0, even at the end of the stream.
        if (len == 0) {
            return 0;
        }
        if (endReached) {
            return -1;
        }
        // close() releases the inflater; fail with a clear message
        // instead of a NullPointerException.
        if (inf == null) {
            throw new IOException("Stream closed");
        }

        int size = 0;

        while (len > 0) {
            if (inf.needsInput()) {
                // Remember the current position because we may need to
                // rewind after reading too much input.
                in.mark(buf.length);

                bufUsed = in.read(buf);
                if (bufUsed == -1) {
                    throw new EOFException("Unexpected end of .gz stream");
                }

                inf.setInput(buf, 0, bufUsed);
            }

            int ret;
            try {
                ret = inf.inflate(b, off, len);
            } catch (final DataFormatException e) {
                // Keep the cause so callers can see what the inflater rejected.
                throw new IOException("Gzip-compressed data is corrupt", e);
            }

            crc.update(b, off, ret);
            off += ret;
            len -= ret;
            size += ret;
            count(ret);

            if (inf.finished()) {
                // We may have read too many bytes. Rewind the read
                // position to match the actual amount used.
                //
                // NOTE: The "if" is there just in case. Since we used
                // in.mark earlier, it should always skip enough.
                in.reset();

                final int skipAmount = bufUsed - inf.getRemaining();
                if (IOUtils.skip(in, skipAmount) != skipAmount) {
                    throw new IOException(
                            "Unable to reposition input after the end of the .gz member");
                }

                bufUsed = 0;

                final DataInput inData = new DataInputStream(in);

                // CRC32
                final long crcStored = ByteUtils.fromLittleEndian(inData, 4);

                if (crcStored != crc.getValue()) {
                    throw new IOException("Gzip-compressed data is corrupt "
                                          + "(CRC32 error)");
                }

                // Uncompressed size modulo 2^32 (ISIZE in the spec)
                final long isize = ByteUtils.fromLittleEndian(inData, 4);

                if (isize != (inf.getBytesWritten() & 0xffffffffL)) {
                    throw new IOException("Gzip-compressed data is corrupt "
                                          + "(uncompressed size mismatch)");
                }

                // See if this is the end of the file.
                if (!decompressConcatenated || !init(false)) {
                    inf.end();
                    inf = null;
                    endReached = true;
                    return size == 0 ? -1 : size;
                }
            }
        }

        return size;
    }

    /**
     * Checks if the signature matches what is expected for a .gz file.
     *
     * @param signature the bytes to check
     * @param length    the number of bytes to check
     * @return          true if this is a .gz stream, false otherwise
     *
     * @since 1.1
     */
    public static boolean matches(final byte[] signature, final int length) {
        // -117 is the signed byte representation of the magic value 139
        return length >= 2 && signature[0] == 31 && signature[1] == -117;
    }

    /**
     * Closes the input stream (unless it is System.in).
     *
     * @since 1.2
     */
    @Override
    public void close() throws IOException {
        if (inf != null) {
            inf.end();
            inf = null;
        }

        // "in" is a wrapper around the caller's stream, so the decision
        // is based on the stream originally passed to the constructor.
        if (!wrapsSystemIn) {
            this.in.close();
        }
    }

    /**
     * Returns the number of bytes read from the underlying stream so far,
     * as reported by the counting wrapper.
     *
     * @return the number of compressed bytes read
     * @since 1.17
     */
    @Override
    public long getCompressedCount() {
        return countingStream.getBytesRead();
    }
}