/*
 *  Licensed to the Apache Software Foundation (ASF) under one or more
 *  contributor license agreements.  See the NOTICE file distributed with
 *  this work for additional information regarding copyright ownership.
 *  The ASF licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *
 */

/*
 * This package is based on the work done by Timothy Gerard Endres
 * (time@ice.com) to whom the Ant project is very grateful for his great code.
 */

package org.apache.commons.compress.archivers.tar;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.compress.archivers.ArchiveEntry;
import org.apache.commons.compress.archivers.ArchiveInputStream;
import org.apache.commons.compress.archivers.zip.ZipEncoding;
import org.apache.commons.compress.archivers.zip.ZipEncodingHelper;
import org.apache.commons.compress.utils.ArchiveUtils;
import org.apache.commons.compress.utils.CharsetNames;
import org.apache.commons.compress.utils.IOUtils;

/**
 * The TarArchiveInputStream reads a UNIX tar archive as an InputStream.
 * Methods are provided to position at each successive entry in
 * the archive, and then read each entry as a normal input stream
 * using read().
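 *
 * <p>A minimal usage sketch (the wrapped stream {@code in} and the
 * treatment of the entry data are left to the caller and are only
 * illustrative here):</p>
 * <pre>{@code
 * try (TarArchiveInputStream tarIn = new TarArchiveInputStream(in)) {
 *     TarArchiveEntry entry;
 *     while ((entry = tarIn.getNextTarEntry()) != null) {
 *         if (!entry.isDirectory()) {
 *             byte[] content = new byte[(int) entry.getSize()]; // assumes a small entry
 *             IOUtils.readFully(tarIn, content);
 *             // process content ...
 *         }
 *     }
 * }
 * }</pre>
 *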
 * @NotThreadSafe
 */
public class TarArchiveInputStream extends ArchiveInputStream {

    private static final int SMALL_BUFFER_SIZE = 256;

    private final byte[] smallBuf = new byte[SMALL_BUFFER_SIZE];

    /** The size of the TAR header */
    private final int recordSize;

    /** The size of a block */
    private final int blockSize;

    /** True if file has hit EOF */
    private boolean hasHitEOF;

    /** Size of the current entry */
    private long entrySize;

    /** How far into the entry the stream is at */
    private long entryOffset;

    /** An input stream to read from */
    private final InputStream is;

    /** The meta-data about the current entry */
    private TarArchiveEntry currEntry;

    /** The encoding of the file */
    private final ZipEncoding zipEncoding;

    // the provided encoding (for unit tests)
    final String encoding;

    // the global PAX header
    private Map<String, String> globalPaxHeaders = new HashMap<>();

    /**
     * Constructor for TarArchiveInputStream.
     * @param is the input stream to use
     */
    public TarArchiveInputStream(final InputStream is) {
        this(is, TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE);
    }

    /**
     * Constructor for TarArchiveInputStream.
     * @param is the input stream to use
     * @param encoding name of the encoding to use for file names
     * @since 1.4
     */
    public TarArchiveInputStream(final InputStream is, final String encoding) {
        this(is, TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE,
             encoding);
    }

    /**
     * Constructor for TarArchiveInputStream.
     * @param is the input stream to use
     * @param blockSize the block size to use
     */
    public TarArchiveInputStream(final InputStream is, final int blockSize) {
        this(is, blockSize, TarConstants.DEFAULT_RCDSIZE);
    }

    /**
     * Constructor for TarArchiveInputStream.
     * @param is the input stream to use
     * @param blockSize the block size to use
     * @param encoding name of the encoding to use for file names
     * @since 1.4
     */
    public TarArchiveInputStream(final InputStream is, final int blockSize,
                                 final String encoding) {
        this(is, blockSize, TarConstants.DEFAULT_RCDSIZE, encoding);
    }

    /**
     * Constructor for TarArchiveInputStream.
     * @param is the input stream to use
     * @param blockSize the block size to use
     * @param recordSize the record size to use
     */
    public TarArchiveInputStream(final InputStream is, final int blockSize, final int recordSize) {
        this(is, blockSize, recordSize, null);
    }

    /**
     * Constructor for TarArchiveInputStream.
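     *
     * <p>For example, the following call spells the library defaults out
     * explicitly (a 10240-byte block size and a 512-byte record size); the
     * encoding argument shown is purely illustrative:</p>
     * <pre>{@code
     * TarArchiveInputStream tarIn =
     *     new TarArchiveInputStream(in, 10240, 512, "UTF-8");
     * }</pre>
     *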
     * @param is the input stream to use
     * @param blockSize the block size to use
     * @param recordSize the record size to use
     * @param encoding name of the encoding to use for file names
     * @since 1.4
     */
    public TarArchiveInputStream(final InputStream is, final int blockSize, final int recordSize,
                                 final String encoding) {
        this.is = is;
        this.hasHitEOF = false;
        this.encoding = encoding;
        this.zipEncoding = ZipEncodingHelper.getZipEncoding(encoding);
        this.recordSize = recordSize;
        this.blockSize = blockSize;
    }

    /**
     * Closes this stream and the underlying {@code InputStream}.
     * @throws IOException on error
     */
    @Override
    public void close() throws IOException {
        is.close();
    }

    /**
     * Get the record size being used by this stream.
     *
     * @return The record size.
     */
    public int getRecordSize() {
        return recordSize;
    }

    /**
     * Get the available data that can be read from the current
     * entry in the archive. This does not indicate how much data
     * is left in the entire archive, only in the current entry.
     * This value is determined from the entry's size header field
     * and the amount of data already read from the current entry.
     * Integer.MAX_VALUE is returned in case more than Integer.MAX_VALUE
     * bytes are left in the current entry in the archive.
     *
     * @return The number of available bytes for the current entry.
     * @throws IOException declared to satisfy the {@code InputStream} signature; not thrown by this implementation
     */
    @Override
    public int available() throws IOException {
        if (isDirectory()) {
            return 0;
        }
        if (entrySize - entryOffset > Integer.MAX_VALUE) {
            return Integer.MAX_VALUE;
        }
        return (int) (entrySize - entryOffset);
    }

    /**
     * Skips over and discards <code>n</code> bytes of data from this input
     * stream. The <code>skip</code> method may, for a variety of reasons, end
     * up skipping over some smaller number of bytes, possibly <code>0</code>;
     * reaching end of file or end of entry before <code>n</code> bytes have
     * been skipped are only two of the possible reasons. The actual number of
     * bytes skipped is returned. If <code>n</code> is negative, no bytes are
     * skipped.
     *
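     * <p>For example, the following discards whatever is left of the current
     * entry (a sketch, with {@code tarIn} standing for this stream; this is
     * also what {@link #getNextTarEntry()} does internally before it reads
     * the next header):</p>
     * <pre>{@code
     * IOUtils.skip(tarIn, Long.MAX_VALUE);
     * }</pre>
     *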
     * @param n
     *            the number of bytes to be skipped.
     * @return the actual number of bytes skipped.
     * @throws IOException
     *                if an I/O error occurs.
     */
    @Override
    public long skip(final long n) throws IOException {
        if (n <= 0 || isDirectory()) {
            return 0;
        }

        final long available = entrySize - entryOffset;
        final long skipped = IOUtils.skip(is, Math.min(n, available));
        count(skipped);
        entryOffset += skipped;
        return skipped;
    }

    /**
     * Since we do not support marking just yet, we return false.
     *
     * @return False.
     */
    @Override
    public boolean markSupported() {
        return false;
    }

    /**
     * Since we do not support marking just yet, we do nothing.
     *
     * @param markLimit The limit to mark.
     */
    @Override
    public void mark(final int markLimit) {
    }

    /**
     * Since we do not support marking just yet, we do nothing.
     */
    @Override
    public synchronized void reset() {
    }

    /**
     * Get the next entry in this tar archive. This will skip
     * over any remaining data in the current entry, if there
     * is one, position the input stream at the header of the
     * next entry, read that header, and return a new
     * TarArchiveEntry instantiated from the header bytes.
     * If there are no more entries in the archive, null will
     * be returned to indicate that the end of the archive has
     * been reached.
     *
     * @return The next TarArchiveEntry in the archive, or null.
     * @throws IOException on error
     */
    public TarArchiveEntry getNextTarEntry() throws IOException {
        if (isAtEOF()) {
            return null;
        }

        if (currEntry != null) {
            /* Skip will only go to the end of the current entry */
            IOUtils.skip(this, Long.MAX_VALUE);

            /* skip to the end of the last record */
            skipRecordPadding();
        }

        final byte[] headerBuf = getRecord();

        if (headerBuf == null) {
            /* hit EOF */
            currEntry = null;
            return null;
        }

        try {
            currEntry = new TarArchiveEntry(headerBuf, zipEncoding);
        } catch (final IllegalArgumentException e) {
            throw new IOException("Error detected parsing the header", e);
        }

        entryOffset = 0;
        entrySize = currEntry.getSize();

        if (currEntry.isGNULongLinkEntry()) {
            final byte[] longLinkData = getLongNameData();
            if (longLinkData == null) {
                // Bugzilla: 40334
                // Malformed tar file - long link entry name not followed by
                // entry
                return null;
            }
            currEntry.setLinkName(zipEncoding.decode(longLinkData));
        }

        if (currEntry.isGNULongNameEntry()) {
            final byte[] longNameData = getLongNameData();
            if (longNameData == null) {
                // Bugzilla: 40334
                // Malformed tar file - long entry name not followed by
                // entry
                return null;
            }
            currEntry.setName(zipEncoding.decode(longNameData));
        }

        if (currEntry.isGlobalPaxHeader()) { // Process Global Pax headers
            readGlobalPaxHeaders();
        }

        if (currEntry.isPaxHeader()) { // Process Pax headers
            paxHeaders();
        } else if (!globalPaxHeaders.isEmpty()) {
            applyPaxHeadersToCurrentEntry(globalPaxHeaders);
        }

        if (currEntry.isOldGNUSparse()) { // Process sparse files
            readOldGNUSparse();
        }

        // If the size of the next element in the archive has changed
        // due to a new size being reported in the posix header
        // information, we update entrySize here so that it contains
        // the correct value.
        entrySize = currEntry.getSize();

        return currEntry;
    }

    /**
     * The last record of an entry is always written at the full record size,
     * so skip any additional padding that fills out that record after the
     * entry data.
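     *
     * <p>Worked example (using the default 512-byte record size): a
     * 1234-byte entry occupies ceil(1234 / 512) = 3 records, so
     * 3 * 512 - 1234 = 302 padding bytes are skipped here.</p>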
     */
    private void skipRecordPadding() throws IOException {
        if (!isDirectory() && this.entrySize > 0 && this.entrySize % this.recordSize != 0) {
            final long numRecords = (this.entrySize / this.recordSize) + 1;
            final long padding = (numRecords * this.recordSize) - this.entrySize;
            final long skipped = IOUtils.skip(is, padding);
            count(skipped);
        }
    }

    /**
     * Get the next entry in this tar archive as longname data.
     *
     * @return The next entry in the archive as longname data, or null.
     * @throws IOException on error
     */
    protected byte[] getLongNameData() throws IOException {
        // read in the name
        final ByteArrayOutputStream longName = new ByteArrayOutputStream();
        int length = 0;
        while ((length = read(smallBuf)) >= 0) {
            longName.write(smallBuf, 0, length);
        }
        getNextEntry();
        if (currEntry == null) {
            // Bugzilla: 40334
            // Malformed tar file - long entry name not followed by entry
            return null;
        }
        byte[] longNameData = longName.toByteArray();
        // remove trailing null terminator(s)
        length = longNameData.length;
        while (length > 0 && longNameData[length - 1] == 0) {
            --length;
        }
        if (length != longNameData.length) {
            final byte[] l = new byte[length];
            System.arraycopy(longNameData, 0, l, 0, length);
            longNameData = l;
        }
        return longNameData;
    }

    /**
     * Get the next record in this tar archive. This will skip
     * over any remaining data in the current entry, if there
     * is one, and place the input stream at the header of the
     * next entry.
     *
     * <p>If there are no more entries in the archive, null will be
     * returned to indicate that the end of the archive has been
     * reached.  At the same time the {@code hasHitEOF} marker will be
     * set to true.</p>
     *
     * @return The next header in the archive, or null.
     * @throws IOException on error
     */
    private byte[] getRecord() throws IOException {
        byte[] headerBuf = readRecord();
        setAtEOF(isEOFRecord(headerBuf));
        if (isAtEOF() && headerBuf != null) {
            tryToConsumeSecondEOFRecord();
            consumeRemainderOfLastBlock();
            headerBuf = null;
        }
        return headerBuf;
    }

    /**
     * Determine if an archive record indicates the end of the archive. The
     * end of the archive is indicated by a record that consists entirely of
     * null bytes.
     *
     * @param record The record data to check.
     * @return true if the record data signals the end of the archive
     */
    protected boolean isEOFRecord(final byte[] record) {
        return record == null || ArchiveUtils.isArrayZero(record, recordSize);
    }

    /**
     * Read a record from the input stream and return the data.
     *
     * @return The record data or null if EOF has been hit.
     * @throws IOException on error
     */
    protected byte[] readRecord() throws IOException {

        final byte[] record = new byte[recordSize];

        final int readNow = IOUtils.readFully(is, record);
        count(readNow);
        if (readNow != recordSize) {
            return null;
        }

        return record;
    }

    private void readGlobalPaxHeaders() throws IOException {
        globalPaxHeaders = parsePaxHeaders(this);
        getNextEntry(); // Get the actual file entry
    }

    private void paxHeaders() throws IOException {
        final Map<String, String> headers = parsePaxHeaders(this);
        getNextEntry(); // Get the actual file entry
        applyPaxHeadersToCurrentEntry(headers);
    }


    // NOTE, using a Map here makes it impossible to ever support GNU
    // sparse files using the PAX Format 0.0, see
    // https://www.gnu.org/software/tar/manual/html_section/tar_92.html#SEC188
    Map<String, String> parsePaxHeaders(final InputStream i)
        throws IOException {
        final Map<String, String> headers = new HashMap<>(globalPaxHeaders);
        // Format is "length keyword=value\n"
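        // e.g. "30 mtime=1321711775.972059463\n" - the decimal length at the
        // start counts every byte of the record, including the length field
        // itself, the separating space and the trailing newline.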
        while (true) { // get length
            int ch;
            int len = 0;
            int read = 0;
            while ((ch = i.read()) != -1) {
                read++;
                if (ch == '\n') { // blank line in header
                    break;
                } else if (ch == ' ') { // End of length string
                    // Get keyword
                    final ByteArrayOutputStream coll = new ByteArrayOutputStream();
                    while ((ch = i.read()) != -1) {
                        read++;
                        if (ch == '=') { // end of keyword
                            final String keyword = coll.toString(CharsetNames.UTF_8);
                            // Get rest of entry
                            final int restLen = len - read;
                            if (restLen == 1) { // only NL
                                headers.remove(keyword);
                            } else {
                                final byte[] rest = new byte[restLen];
                                final int got = IOUtils.readFully(i, rest);
                                if (got != restLen) {
                                    throw new IOException("Failed to read "
                                                          + "Paxheader. Expected "
                                                          + restLen
                                                          + " bytes, read "
                                                          + got);
                                }
                                // Drop trailing NL
                                final String value = new String(rest, 0,
                                                          restLen - 1, CharsetNames.UTF_8);
                                headers.put(keyword, value);
                            }
                            break;
                        }
                        coll.write((byte) ch);
                    }
                    break; // Processed single header
                }
                len *= 10;
                len += ch - '0';
            }
            if (ch == -1) { // EOF
                break;
            }
        }
        return headers;
    }

    private void applyPaxHeadersToCurrentEntry(final Map<String, String> headers) {
        currEntry.updateEntryFromPaxHeaders(headers);
    }


    /**
     * Adds the sparse chunks from the current entry to the sparse chunks,
     * including any additional sparse entries following the current entry.
     *
     * @throws IOException on error
     *
     * @todo Sparse files are not really processed yet.
     */
    private void readOldGNUSparse() throws IOException {
        /* we do not really process sparse files yet
        sparses = new ArrayList();
        sparses.addAll(currEntry.getSparses());
        */
        if (currEntry.isExtended()) {
            TarArchiveSparseEntry entry;
            do {
                final byte[] headerBuf = getRecord();
                if (headerBuf == null) {
                    currEntry = null;
                    break;
                }
                entry = new TarArchiveSparseEntry(headerBuf);
                /* we do not really process sparse files yet
                sparses.addAll(entry.getSparses());
                */
            } while (entry.isExtended());
        }
    }

    private boolean isDirectory() {
        return currEntry != null && currEntry.isDirectory();
    }

    /**
     * Returns the next Archive Entry in this Stream.
     *
     * @return the next entry,
     *         or {@code null} if there are no more entries
     * @throws IOException if the next entry could not be read
     */
    @Override
    public ArchiveEntry getNextEntry() throws IOException {
        return getNextTarEntry();
    }

    /**
     * Tries to read the next record, rewinding the stream if it is not an
     * EOF record.
     *
     * <p>This is meant to protect against cases where a tar
     * implementation has written only one EOF record when two are
     * expected. In practice this won't help much, since a non-conforming
     * implementation likely won't fill full blocks consisting of - by
     * default - twenty records either, so we probably have already read
     * beyond the archive anyway.</p>
     */
    private void tryToConsumeSecondEOFRecord() throws IOException {
        boolean shouldReset = true;
        final boolean marked = is.markSupported();
        if (marked) {
            is.mark(recordSize);
        }
        try {
            shouldReset = !isEOFRecord(readRecord());
        } finally {
            if (shouldReset && marked) {
                pushedBackBytes(recordSize);
                is.reset();
            }
        }
    }

    /**
     * Reads bytes from the current tar archive entry.
     *
     * This method is aware of the boundaries of the current
     * entry in the archive and will deal with them as if they
     * were this stream's start and EOF.
     *
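     * <p>A sketch of copying the current entry's data to an
     * {@code OutputStream} named {@code out} (both names are illustrative):</p>
     * <pre>{@code
     * byte[] buffer = new byte[8192];
     * int n;
     * while ((n = tarIn.read(buffer, 0, buffer.length)) != -1) {
     *     out.write(buffer, 0, n);
     * }
     * }</pre>
     *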
     * @param buf The buffer into which to place bytes read.
     * @param offset The offset at which to place bytes read.
     * @param numToRead The number of bytes to read.
     * @return The number of bytes read, or -1 at EOF.
     * @throws IOException on error
     */
    @Override
    public int read(final byte[] buf, final int offset, int numToRead) throws IOException {
        int totalRead = 0;

        if (isAtEOF() || isDirectory() || entryOffset >= entrySize) {
            return -1;
        }

        if (currEntry == null) {
            throw new IllegalStateException("No current tar entry");
        }

        numToRead = Math.min(numToRead, available());

        totalRead = is.read(buf, offset, numToRead);

        if (totalRead == -1) {
            if (numToRead > 0) {
                throw new IOException("Truncated TAR archive");
            }
            setAtEOF(true);
        } else {
            count(totalRead);
            entryOffset += totalRead;
        }

        return totalRead;
    }


    /**
     * Whether this class is able to read the given entry.
     *
     * <p>May return false if the current entry is a sparse file.</p>
     *
     * @param ae the entry to test
     * @return {@code true} if the entry is a non-sparse {@code TarArchiveEntry}
     */
    @Override
    public boolean canReadEntryData(final ArchiveEntry ae) {
        if (ae instanceof TarArchiveEntry) {
            final TarArchiveEntry te = (TarArchiveEntry) ae;
            return !te.isSparse();
        }
        return false;
    }

    /**
     * Get the current TAR Archive Entry that this input stream is processing.
     *
     * @return The current Archive Entry
     */
    public TarArchiveEntry getCurrentEntry() {
        return currEntry;
    }

    protected final void setCurrentEntry(final TarArchiveEntry e) {
        currEntry = e;
    }

    protected final boolean isAtEOF() {
        return hasHitEOF;
    }

    protected final void setAtEOF(final boolean b) {
        hasHitEOF = b;
    }

    /**
     * This method is invoked once the end of the archive is hit; it
     * tries to consume the remaining bytes under the assumption that
     * the tool creating this archive has padded the last block.
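     *
     * <p>For example, with the default 10240-byte block size, if 2560 bytes
     * of the final block have been read, the remaining 7680 bytes are
     * skipped (figures shown are purely illustrative).</p>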
     */
    private void consumeRemainderOfLastBlock() throws IOException {
        final long bytesReadOfLastBlock = getBytesRead() % blockSize;
        if (bytesReadOfLastBlock > 0) {
            final long skipped = IOUtils.skip(is, blockSize - bytesReadOfLastBlock);
            count(skipped);
        }
    }

    /**
     * Checks if the signature matches what is expected for a tar file.
     *
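     * <p>A typical use is sniffing a stream before committing to a format
     * (a sketch; {@code in} is assumed to deliver at least one full record):</p>
     * <pre>{@code
     * byte[] signature = new byte[512];
     * int read = IOUtils.readFully(in, signature);
     * boolean isTar = TarArchiveInputStream.matches(signature, read);
     * }</pre>
     *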
     * @param signature
     *            the bytes to check
     * @param length
     *            the number of bytes to check
     * @return true, if this stream is a tar archive stream, false otherwise
     */
    public static boolean matches(final byte[] signature, final int length) {
        if (length < TarConstants.VERSION_OFFSET + TarConstants.VERSIONLEN) {
            return false;
        }

        if (ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_POSIX,
                signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN)
            &&
            ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_POSIX,
                signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
            ) {
            return true;
        }
        if (ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_GNU,
                signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN)
            &&
            (
             ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_GNU_SPACE,
                signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
            ||
            ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_GNU_ZERO,
                signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
            )
            ) {
            return true;
        }
        // COMPRESS-107 - recognise Ant tar files
        return ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_ANT,
                signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN)
                &&
                ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_ANT,
                        signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN);
    }

}