001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing,
013 * software distributed under the License is distributed on an
014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015 * KIND, either express or implied.  See the License for the
016 * specific language governing permissions and limitations
017 * under the License.
018 */
019package org.apache.commons.compress.compressors.gzip;
020
021import java.io.ByteArrayOutputStream;
022import java.io.IOException;
023import java.io.EOFException;
024import java.io.InputStream;
025import java.io.DataInput;
026import java.io.DataInputStream;
027import java.io.BufferedInputStream;
028import java.util.zip.DataFormatException;
029import java.util.zip.Deflater;
030import java.util.zip.Inflater;
031import java.util.zip.CRC32;
032
033import org.apache.commons.compress.compressors.CompressorInputStream;
034import org.apache.commons.compress.utils.ByteUtils;
035import org.apache.commons.compress.utils.CharsetNames;
036import org.apache.commons.compress.utils.CountingInputStream;
037import org.apache.commons.compress.utils.IOUtils;
038import org.apache.commons.compress.utils.InputStreamStatistics;
039
040/**
041 * Input stream that decompresses .gz files.
042 *
043 * <p>This supports decompressing concatenated .gz files which is important
044 * when decompressing standalone .gz files.</p>
045 *
046 * <p>
047 * {@link java.util.zip.GZIPInputStream} doesn't decompress concatenated .gz
048 * files: it stops after the first member and silently ignores the rest.
049 * It doesn't leave the read position to point to the beginning of the next
 * member, which makes it difficult to work around the lack of concatenation
051 * support.
052 * </p>
053 *
054 * <p>
055 * Instead of using <code>GZIPInputStream</code>, this class has its own .gz
056 * container format decoder. The actual decompression is done with
057 * {@link java.util.zip.Inflater}.
058 * </p>
059 *
060 * <p>If you use the constructor {@code GzipCompressorInputStream(in)}
061 * or {@code GzipCompressorInputStream(in, false)} with some {@code
062 * InputStream} {@code in} then {@link #read} will return -1 as soon
063 * as the first internal member has been read completely. The stream
064 * {@code in} will be positioned at the start of the second gzip
065 * member if there is one.</p>
066 *
067 * <p>If you use the constructor {@code GzipCompressorInputStream(in,
068 * true)} with some {@code InputStream} {@code in} then {@link #read}
069 * will return -1 once the stream {@code in} has been exhausted. The
070 * data read from a stream constructed this way will consist of the
071 * concatenated data of all gzip members contained inside {@code
072 * in}.</p>
073 *
074 * @see "https://tools.ietf.org/html/rfc1952"
075 */
076public class GzipCompressorInputStream extends CompressorInputStream
077    implements InputStreamStatistics {
078
079    // Header flags
080    // private static final int FTEXT = 0x01; // Uninteresting for us
081    private static final int FHCRC = 0x02;
082    private static final int FEXTRA = 0x04;
083    private static final int FNAME = 0x08;
084    private static final int FCOMMENT = 0x10;
085    private static final int FRESERVED = 0xE0;
086
087    private final CountingInputStream countingStream;
088
089    // Compressed input stream, possibly wrapped in a
090    // BufferedInputStream, always wrapped in countingStream above
091    private final InputStream in;
092
093    // True if decompressing multi member streams.
094    private final boolean decompressConcatenated;
095
096    // Buffer to hold the input data
097    private final byte[] buf = new byte[8192];
098
099    // Amount of data in buf.
100    private int bufUsed;
101
102    // Decompressor
103    private Inflater inf = new Inflater(true);
104
105    // CRC32 from uncompressed data
106    private final CRC32 crc = new CRC32();
107
108    // True once everything has been decompressed
109    private boolean endReached = false;
110
111    // used in no-arg read method
112    private final byte[] oneByte = new byte[1];
113
114    private final GzipParameters parameters = new GzipParameters();
115
116    /**
117     * Constructs a new input stream that decompresses gzip-compressed data
118     * from the specified input stream.
119     * <p>
120     * This is equivalent to
121     * <code>GzipCompressorInputStream(inputStream, false)</code> and thus
122     * will not decompress concatenated .gz files.
123     *
124     * @param inputStream  the InputStream from which this object should
125     *                     be created of
126     *
127     * @throws IOException if the stream could not be created
128     */
129    public GzipCompressorInputStream(final InputStream inputStream)
130            throws IOException {
131        this(inputStream, false);
132    }
133
134    /**
135     * Constructs a new input stream that decompresses gzip-compressed data
136     * from the specified input stream.
137     * <p>
138     * If <code>decompressConcatenated</code> is {@code false}:
139     * This decompressor might read more input than it will actually use.
140     * If <code>inputStream</code> supports <code>mark</code> and
141     * <code>reset</code>, then the input position will be adjusted
142     * so that it is right after the last byte of the compressed stream.
143     * If <code>mark</code> isn't supported, the input position will be
144     * undefined.
145     *
146     * @param inputStream  the InputStream from which this object should
147     *                     be created of
148     * @param decompressConcatenated
149     *                     if true, decompress until the end of the input;
150     *                     if false, stop after the first .gz member
151     *
152     * @throws IOException if the stream could not be created
153     */
154    public GzipCompressorInputStream(final InputStream inputStream,
155                                     final boolean decompressConcatenated)
156            throws IOException {
157        countingStream = new CountingInputStream(inputStream);
158        // Mark support is strictly needed for concatenated files only,
159        // but it's simpler if it is always available.
160        if (countingStream.markSupported()) {
161            in = countingStream;
162        } else {
163            in = new BufferedInputStream(countingStream);
164        }
165
166        this.decompressConcatenated = decompressConcatenated;
167        init(true);
168    }
169
170    /**
171     * Provides the stream's meta data - may change with each stream
172     * when decompressing concatenated streams.
173     * @return the stream's meta data
174     * @since 1.8
175     */
176    public GzipParameters getMetaData() {
177        return parameters;
178    }
179
180    private boolean init(final boolean isFirstMember) throws IOException {
181        assert isFirstMember || decompressConcatenated;
182
183        // Check the magic bytes without a possibility of EOFException.
184        final int magic0 = in.read();
185        final int magic1 = in.read();
186
187        // If end of input was reached after decompressing at least
188        // one .gz member, we have reached the end of the file successfully.
189        if (magic0 == -1 && !isFirstMember) {
190            return false;
191        }
192
193        if (magic0 != 31 || magic1 != 139) {
194            throw new IOException(isFirstMember
195                                  ? "Input is not in the .gz format"
196                                  : "Garbage after a valid .gz stream");
197        }
198
199        // Parsing the rest of the header may throw EOFException.
200        final DataInput inData = new DataInputStream(in);
201        final int method = inData.readUnsignedByte();
202        if (method != Deflater.DEFLATED) {
203            throw new IOException("Unsupported compression method "
204                                  + method + " in the .gz header");
205        }
206
207        final int flg = inData.readUnsignedByte();
208        if ((flg & FRESERVED) != 0) {
209            throw new IOException(
210                    "Reserved flags are set in the .gz header");
211        }
212
213        parameters.setModificationTime(ByteUtils.fromLittleEndian(inData, 4) * 1000);
214        switch (inData.readUnsignedByte()) { // extra flags
215        case 2:
216            parameters.setCompressionLevel(Deflater.BEST_COMPRESSION);
217            break;
218        case 4:
219            parameters.setCompressionLevel(Deflater.BEST_SPEED);
220            break;
221        default:
222            // ignored for now
223            break;
224        }
225        parameters.setOperatingSystem(inData.readUnsignedByte());
226
227        // Extra field, ignored
228        if ((flg & FEXTRA) != 0) {
229            int xlen = inData.readUnsignedByte();
230            xlen |= inData.readUnsignedByte() << 8;
231
232            // This isn't as efficient as calling in.skip would be,
233            // but it's lazier to handle unexpected end of input this way.
234            // Most files don't have an extra field anyway.
235            while (xlen-- > 0) {
236                inData.readUnsignedByte();
237            }
238        }
239
240        // Original file name
241        if ((flg & FNAME) != 0) {
242            parameters.setFilename(new String(readToNull(inData),
243                                              CharsetNames.ISO_8859_1));
244        }
245
246        // Comment
247        if ((flg & FCOMMENT) != 0) {
248            parameters.setComment(new String(readToNull(inData),
249                                             CharsetNames.ISO_8859_1));
250        }
251
252        // Header "CRC16" which is actually a truncated CRC32 (which isn't
253        // as good as real CRC16). I don't know if any encoder implementation
254        // sets this, so it's not worth trying to verify it. GNU gzip 1.4
255        // doesn't support this field, but zlib seems to be able to at least
256        // skip over it.
257        if ((flg & FHCRC) != 0) {
258            inData.readShort();
259        }
260
261        // Reset
262        inf.reset();
263        crc.reset();
264
265        return true;
266    }
267
268    private static byte[] readToNull(final DataInput inData) throws IOException {
269        final ByteArrayOutputStream bos = new ByteArrayOutputStream();
270        int b = 0;
271        while ((b = inData.readUnsignedByte()) != 0x00) { // NOPMD
272            bos.write(b);
273        }
274        return bos.toByteArray();
275    }
276
277    @Override
278    public int read() throws IOException {
279        return read(oneByte, 0, 1) == -1 ? -1 : oneByte[0] & 0xFF;
280    }
281
282    /**
283     * {@inheritDoc}
284     *
285     * @since 1.1
286     */
287    @Override
288    public int read(final byte[] b, int off, int len) throws IOException {
289        if (endReached) {
290            return -1;
291        }
292
293        int size = 0;
294
295        while (len > 0) {
296            if (inf.needsInput()) {
297                // Remember the current position because we may need to
298                // rewind after reading too much input.
299                in.mark(buf.length);
300
301                bufUsed = in.read(buf);
302                if (bufUsed == -1) {
303                    throw new EOFException();
304                }
305
306                inf.setInput(buf, 0, bufUsed);
307            }
308
309            int ret;
310            try {
311                ret = inf.inflate(b, off, len);
312            } catch (final DataFormatException e) {
313                throw new IOException("Gzip-compressed data is corrupt");
314            }
315
316            crc.update(b, off, ret);
317            off += ret;
318            len -= ret;
319            size += ret;
320            count(ret);
321
322            if (inf.finished()) {
323                // We may have read too many bytes. Rewind the read
324                // position to match the actual amount used.
325                //
326                // NOTE: The "if" is there just in case. Since we used
327                // in.mark earlier, it should always skip enough.
328                in.reset();
329
330                final int skipAmount = bufUsed - inf.getRemaining();
331                if (IOUtils.skip(in, skipAmount) != skipAmount) {
332                    throw new IOException();
333                }
334
335                bufUsed = 0;
336
337                final DataInput inData = new DataInputStream(in);
338
339                // CRC32
340                final long crcStored = ByteUtils.fromLittleEndian(inData, 4);
341
342                if (crcStored != crc.getValue()) {
343                    throw new IOException("Gzip-compressed data is corrupt "
344                                          + "(CRC32 error)");
345                }
346
347                // Uncompressed size modulo 2^32 (ISIZE in the spec)
348                final long isize = ByteUtils.fromLittleEndian(inData, 4);
349
350                if (isize != (inf.getBytesWritten() & 0xffffffffL)) {
351                    throw new IOException("Gzip-compressed data is corrupt"
352                                          + "(uncompressed size mismatch)");
353                }
354
355                // See if this is the end of the file.
356                if (!decompressConcatenated || !init(false)) {
357                    inf.end();
358                    inf = null;
359                    endReached = true;
360                    return size == 0 ? -1 : size;
361                }
362            }
363        }
364
365        return size;
366    }
367
368    /**
369     * Checks if the signature matches what is expected for a .gz file.
370     *
371     * @param signature the bytes to check
372     * @param length    the number of bytes to check
373     * @return          true if this is a .gz stream, false otherwise
374     *
375     * @since 1.1
376     */
377    public static boolean matches(final byte[] signature, final int length) {
378        return length >= 2 && signature[0] == 31 && signature[1] == -117;
379    }
380
381    /**
382     * Closes the input stream (unless it is System.in).
383     *
384     * @since 1.2
385     */
386    @Override
387    public void close() throws IOException {
388        if (inf != null) {
389            inf.end();
390            inf = null;
391        }
392
393        if (this.in != System.in) {
394            this.in.close();
395        }
396    }
397
398    /**
399     * @since 1.17
400     */
401    @Override
402    public long getCompressedCount() {
403        return countingStream.getBytesRead();
404    }
405}