001 /*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements. See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License. You may obtain a copy of the License at
008 *
009 * http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 *
017 */
018
019 /*
020 * This package is based on the work done by Timothy Gerard Endres
021 * (time@ice.com) to whom the Ant project is very grateful for his great code.
022 */
023
024 package org.apache.commons.compress.archivers.tar;
025
026 import java.io.ByteArrayOutputStream;
027 import java.io.IOException;
028 import java.io.InputStream;
029 import java.util.HashMap;
030 import java.util.Map;
031 import java.util.Map.Entry;
032
033 import org.apache.commons.compress.archivers.ArchiveEntry;
034 import org.apache.commons.compress.archivers.ArchiveInputStream;
035 import org.apache.commons.compress.archivers.zip.ZipEncoding;
036 import org.apache.commons.compress.archivers.zip.ZipEncodingHelper;
037 import org.apache.commons.compress.utils.ArchiveUtils;
038 import org.apache.commons.compress.utils.CharsetNames;
039
/**
 * The TarInputStream reads a UNIX tar archive as an InputStream.
 * Methods are provided to position at each successive entry in
 * the archive, and then read each entry as a normal input stream
 * using read().
 * @NotThreadSafe
 */
public class TarArchiveInputStream extends ArchiveInputStream {
    /** Chunk size used while draining the data of a GNU long-name entry. */
    private static final int SMALL_BUFFER_SIZE = 256;

    /** Scratch-buffer size used by {@link #skip(long)}. */
    private static final int BUFFER_SIZE = 8 * 1024;

    // True once the end-of-archive record (or physical EOF) has been seen.
    private boolean hasHitEOF;
    // Size in bytes of the current entry's data, taken from its header.
    private long entrySize;
    // Number of data bytes of the current entry already consumed.
    private long entryOffset;
    // Tail of the last record read but not yet handed out to callers, or null.
    private byte[] readBuf;
    // Record/block oriented view of the underlying input stream.
    protected final TarBuffer buffer;
    // Entry most recently returned by getNextTarEntry(), or null.
    private TarArchiveEntry currEntry;
    // Encoding used to decode file names from header bytes.
    private final ZipEncoding encoding;
058
    /**
     * Constructor for TarInputStream using the default block and record size.
     * @param is the input stream to use
     */
    public TarArchiveInputStream(InputStream is) {
        this(is, TarBuffer.DEFAULT_BLKSIZE, TarBuffer.DEFAULT_RCDSIZE);
    }

    /**
     * Constructor for TarInputStream using the default block and record size.
     * @param is the input stream to use
     * @param encoding name of the encoding to use for file names
     * @since Commons Compress 1.4
     */
    public TarArchiveInputStream(InputStream is, String encoding) {
        this(is, TarBuffer.DEFAULT_BLKSIZE, TarBuffer.DEFAULT_RCDSIZE, encoding);
    }

    /**
     * Constructor for TarInputStream using the default record size.
     * @param is the input stream to use
     * @param blockSize the block size to use
     */
    public TarArchiveInputStream(InputStream is, int blockSize) {
        this(is, blockSize, TarBuffer.DEFAULT_RCDSIZE);
    }

    /**
     * Constructor for TarInputStream using the default record size.
     * @param is the input stream to use
     * @param blockSize the block size to use
     * @param encoding name of the encoding to use for file names
     * @since Commons Compress 1.4
     */
    public TarArchiveInputStream(InputStream is, int blockSize,
                                 String encoding) {
        this(is, blockSize, TarBuffer.DEFAULT_RCDSIZE, encoding);
    }

    /**
     * Constructor for TarInputStream.
     * @param is the input stream to use
     * @param blockSize the block size to use
     * @param recordSize the record size to use
     */
    public TarArchiveInputStream(InputStream is, int blockSize, int recordSize) {
        this(is, blockSize, recordSize, null);
    }

    /**
     * Constructor for TarInputStream; all other constructors delegate here.
     * @param is the input stream to use
     * @param blockSize the block size to use
     * @param recordSize the record size to use
     * @param encoding name of the encoding to use for file names
     * @since Commons Compress 1.4
     */
    public TarArchiveInputStream(InputStream is, int blockSize, int recordSize,
                                 String encoding) {
        this.buffer = new TarBuffer(is, blockSize, recordSize);
        this.readBuf = null;
        this.hasHitEOF = false;
        this.encoding = ZipEncodingHelper.getZipEncoding(encoding);
    }
123
    /**
     * Closes this stream. Calls the TarBuffer's close() method.
     * @throws IOException on error
     */
    @Override
    public void close() throws IOException {
        buffer.close();
    }

    /**
     * Get the record size being used by this stream's TarBuffer.
     *
     * @return The TarBuffer record size.
     */
    public int getRecordSize() {
        return buffer.getRecordSize();
    }
141
142 /**
143 * Get the available data that can be read from the current
144 * entry in the archive. This does not indicate how much data
145 * is left in the entire archive, only in the current entry.
146 * This value is determined from the entry's size header field
147 * and the amount of data already read from the current entry.
148 * Integer.MAX_VALUE is returen in case more than Integer.MAX_VALUE
149 * bytes are left in the current entry in the archive.
150 *
151 * @return The number of available bytes for the current entry.
152 * @throws IOException for signature
153 */
154 @Override
155 public int available() throws IOException {
156 if (entrySize - entryOffset > Integer.MAX_VALUE) {
157 return Integer.MAX_VALUE;
158 }
159 return (int) (entrySize - entryOffset);
160 }
161
162 /**
163 * Skip bytes in the input buffer. This skips bytes in the
164 * current entry's data, not the entire archive, and will
165 * stop at the end of the current entry's data if the number
166 * to skip extends beyond that point.
167 *
168 * @param numToSkip The number of bytes to skip.
169 * @return the number actually skipped
170 * @throws IOException on error
171 */
172 @Override
173 public long skip(long numToSkip) throws IOException {
174 // REVIEW
175 // This is horribly inefficient, but it ensures that we
176 // properly skip over bytes via the TarBuffer...
177 //
178 byte[] skipBuf = new byte[BUFFER_SIZE];
179 long skip = numToSkip;
180 while (skip > 0) {
181 int realSkip = (int) (skip > skipBuf.length ? skipBuf.length : skip);
182 int numRead = read(skipBuf, 0, realSkip);
183 if (numRead == -1) {
184 break;
185 }
186 skip -= numRead;
187 }
188 return (numToSkip - skip);
189 }
190
    /**
     * Since we do not support marking just yet, we do nothing.
     */
    // NOTE: mark()/reset() are unsupported; this is deliberately a no-op
    // rather than throwing, matching the historical behaviour of this class.
    @Override
    public synchronized void reset() {
    }
197
198 /**
199 * Get the next entry in this tar archive. This will skip
200 * over any remaining data in the current entry, if there
201 * is one, and place the input stream at the header of the
202 * next entry, and read the header and instantiate a new
203 * TarEntry from the header bytes and return that entry.
204 * If there are no more entries in the archive, null will
205 * be returned to indicate that the end of the archive has
206 * been reached.
207 *
208 * @return The next TarEntry in the archive, or null.
209 * @throws IOException on error
210 */
211 public TarArchiveEntry getNextTarEntry() throws IOException {
212 if (hasHitEOF) {
213 return null;
214 }
215
216 if (currEntry != null) {
217 long numToSkip = entrySize - entryOffset;
218
219 while (numToSkip > 0) {
220 long skipped = skip(numToSkip);
221 if (skipped <= 0) {
222 throw new RuntimeException("failed to skip current tar entry");
223 }
224 numToSkip -= skipped;
225 }
226
227 readBuf = null;
228 }
229
230 byte[] headerBuf = getRecord();
231
232 if (hasHitEOF) {
233 currEntry = null;
234 return null;
235 }
236
237 try {
238 currEntry = new TarArchiveEntry(headerBuf, encoding);
239 } catch (IllegalArgumentException e) {
240 IOException ioe = new IOException("Error detected parsing the header");
241 ioe.initCause(e);
242 throw ioe;
243 }
244 entryOffset = 0;
245 entrySize = currEntry.getSize();
246
247 if (currEntry.isGNULongNameEntry()) {
248 // read in the name
249 StringBuffer longName = new StringBuffer();
250 byte[] buf = new byte[SMALL_BUFFER_SIZE];
251 int length = 0;
252 while ((length = read(buf)) >= 0) {
253 longName.append(new String(buf, 0, length)); // TODO default charset?
254 }
255 getNextEntry();
256 if (currEntry == null) {
257 // Bugzilla: 40334
258 // Malformed tar file - long entry name not followed by entry
259 return null;
260 }
261 // remove trailing null terminator
262 if (longName.length() > 0
263 && longName.charAt(longName.length() - 1) == 0) {
264 longName.deleteCharAt(longName.length() - 1);
265 }
266 currEntry.setName(longName.toString());
267 }
268
269 if (currEntry.isPaxHeader()){ // Process Pax headers
270 paxHeaders();
271 }
272
273 if (currEntry.isGNUSparse()){ // Process sparse files
274 readGNUSparse();
275 }
276
277 // If the size of the next element in the archive has changed
278 // due to a new size being reported in the posix header
279 // information, we update entrySize here so that it contains
280 // the correct value.
281 entrySize = currEntry.getSize();
282 return currEntry;
283 }
284
285 /**
286 * Get the next record in this tar archive. This will skip
287 * over any remaining data in the current entry, if there
288 * is one, and place the input stream at the header of the
289 * next entry.
290 * If there are no more entries in the archive, null will
291 * be returned to indicate that the end of the archive has
292 * been reached.
293 *
294 * @return The next header in the archive, or null.
295 * @throws IOException on error
296 */
297 private byte[] getRecord() throws IOException {
298 if (hasHitEOF) {
299 return null;
300 }
301
302 byte[] headerBuf = buffer.readRecord();
303
304 if (headerBuf == null) {
305 hasHitEOF = true;
306 } else if (buffer.isEOFRecord(headerBuf)) {
307 hasHitEOF = true;
308 }
309
310 return hasHitEOF ? null : headerBuf;
311 }
312
    /**
     * Reads and parses the Pax header data of the current entry, advances
     * to the entry the headers describe, and applies the parsed attributes
     * to that entry.
     *
     * @throws IOException on error
     */
    private void paxHeaders() throws IOException{
        Map<String, String> headers = parsePaxHeaders(this);
        getNextEntry(); // Get the actual file entry
        applyPaxHeadersToCurrentEntry(headers);
    }
318
319 Map<String, String> parsePaxHeaders(InputStream i) throws IOException {
320 Map<String, String> headers = new HashMap<String, String>();
321 // Format is "length keyword=value\n";
322 while(true){ // get length
323 int ch;
324 int len = 0;
325 int read = 0;
326 while((ch = i.read()) != -1) {
327 read++;
328 if (ch == ' '){ // End of length string
329 // Get keyword
330 ByteArrayOutputStream coll = new ByteArrayOutputStream();
331 while((ch = i.read()) != -1) {
332 read++;
333 if (ch == '='){ // end of keyword
334 String keyword = coll.toString(CharsetNames.UTF_8);
335 // Get rest of entry
336 byte[] rest = new byte[len - read];
337 int got = i.read(rest);
338 if (got != len - read){
339 throw new IOException("Failed to read "
340 + "Paxheader. Expected "
341 + (len - read)
342 + " bytes, read "
343 + got);
344 }
345 // Drop trailing NL
346 String value = new String(rest, 0,
347 len - read - 1, CharsetNames.UTF_8);
348 headers.put(keyword, value);
349 break;
350 }
351 coll.write((byte) ch);
352 }
353 break; // Processed single header
354 }
355 len *= 10;
356 len += ch - '0';
357 }
358 if (ch == -1){ // EOF
359 break;
360 }
361 }
362 return headers;
363 }
364
365 private void applyPaxHeadersToCurrentEntry(Map<String, String> headers) {
366 /*
367 * The following headers are defined for Pax.
368 * atime, ctime, charset: cannot use these without changing TarArchiveEntry fields
369 * mtime
370 * comment
371 * gid, gname
372 * linkpath
373 * size
374 * uid,uname
375 * SCHILY.devminor, SCHILY.devmajor: don't have setters/getters for those
376 */
377 for (Entry<String, String> ent : headers.entrySet()){
378 String key = ent.getKey();
379 String val = ent.getValue();
380 if ("path".equals(key)){
381 currEntry.setName(val);
382 } else if ("linkpath".equals(key)){
383 currEntry.setLinkName(val);
384 } else if ("gid".equals(key)){
385 currEntry.setGroupId(Integer.parseInt(val));
386 } else if ("gname".equals(key)){
387 currEntry.setGroupName(val);
388 } else if ("uid".equals(key)){
389 currEntry.setUserId(Integer.parseInt(val));
390 } else if ("uname".equals(key)){
391 currEntry.setUserName(val);
392 } else if ("size".equals(key)){
393 currEntry.setSize(Long.parseLong(val));
394 } else if ("mtime".equals(key)){
395 currEntry.setModTime((long) (Double.parseDouble(val) * 1000));
396 } else if ("SCHILY.devminor".equals(key)){
397 currEntry.setDevMinor(Integer.parseInt(val));
398 } else if ("SCHILY.devmajor".equals(key)){
399 currEntry.setDevMajor(Integer.parseInt(val));
400 }
401 }
402 }
403
404 /**
405 * Adds the sparse chunks from the current entry to the sparse chunks,
406 * including any additional sparse entries following the current entry.
407 *
408 * @throws IOException on error
409 *
410 * @todo Sparse files get not yet really processed.
411 */
412 private void readGNUSparse() throws IOException {
413 /* we do not really process sparse files yet
414 sparses = new ArrayList();
415 sparses.addAll(currEntry.getSparses());
416 */
417 if (currEntry.isExtended()) {
418 TarArchiveSparseEntry entry;
419 do {
420 byte[] headerBuf = getRecord();
421 if (hasHitEOF) {
422 currEntry = null;
423 break;
424 }
425 entry = new TarArchiveSparseEntry(headerBuf);
426 /* we do not really process sparse files yet
427 sparses.addAll(entry.getSparses());
428 */
429 } while (entry.isExtended());
430 }
431 }
432
    /**
     * Returns the next entry in the archive; delegates to
     * {@link #getNextTarEntry()}.
     *
     * @return the next entry, or null at end of archive
     * @throws IOException on error
     */
    @Override
    public ArchiveEntry getNextEntry() throws IOException {
        return getNextTarEntry();
    }
437
    /**
     * Reads bytes from the current tar archive entry.
     *
     * This method is aware of the boundaries of the current
     * entry in the archive and will deal with them as if they
     * were this stream's start and EOF.
     *
     * @param buf The buffer into which to place bytes read.
     * @param offset The offset at which to place bytes read.
     * @param numToRead The number of bytes to read.
     * @return The number of bytes read, or -1 at EOF.
     * @throws IOException on error
     */
    @Override
    public int read(byte[] buf, int offset, int numToRead) throws IOException {
        int totalRead = 0;

        // All of the current entry's data has been delivered already.
        if (entryOffset >= entrySize) {
            return -1;
        }

        // Clamp the request so we never read past the end of the entry.
        if ((numToRead + entryOffset) > entrySize) {
            numToRead = (int) (entrySize - entryOffset);
        }

        // First serve any bytes left over from a previously read record.
        if (readBuf != null) {
            int sz = (numToRead > readBuf.length) ? readBuf.length
                : numToRead;

            System.arraycopy(readBuf, 0, buf, offset, sz);

            if (sz >= readBuf.length) {
                // Leftover fully consumed.
                readBuf = null;
            } else {
                // Keep the unconsumed tail for the next call.
                int newLen = readBuf.length - sz;
                byte[] newBuf = new byte[newLen];

                System.arraycopy(readBuf, sz, newBuf, 0, newLen);

                readBuf = newBuf;
            }

            totalRead += sz;
            numToRead -= sz;
            offset += sz;
        }

        // Then pull whole records from the TarBuffer, stashing any surplus
        // bytes of the final record in readBuf.
        while (numToRead > 0) {
            byte[] rec = buffer.readRecord();

            if (rec == null) {
                // Unexpected EOF!
                throw new IOException("unexpected EOF with " + numToRead
                                      + " bytes unread. Occured at byte: " + getBytesRead());
            }
            // Count the full record against the underlying stream, even if
            // only part of it is handed to the caller this call.
            count(rec.length);
            int sz = numToRead;
            int recLen = rec.length;

            if (recLen > sz) {
                System.arraycopy(rec, 0, buf, offset, sz);

                // Stash the unconsumed remainder of the record.
                readBuf = new byte[recLen - sz];

                System.arraycopy(rec, sz, readBuf, 0, recLen - sz);
            } else {
                sz = recLen;

                System.arraycopy(rec, 0, buf, offset, recLen);
            }

            totalRead += sz;
            numToRead -= sz;
            offset += sz;
        }

        entryOffset += totalRead;

        return totalRead;
    }
518
519 /**
520 * Whether this class is able to read the given entry.
521 *
522 * <p>May return false if the current entry is a sparse file.</p>
523 */
524 @Override
525 public boolean canReadEntryData(ArchiveEntry ae) {
526 if (ae instanceof TarArchiveEntry) {
527 TarArchiveEntry te = (TarArchiveEntry) ae;
528 return !te.isGNUSparse();
529 }
530 return false;
531 }
532
    /** @return the entry most recently read, or null if none has been read yet */
    protected final TarArchiveEntry getCurrentEntry() {
        return currEntry;
    }

    /** @param e the entry subclasses want to make current */
    protected final void setCurrentEntry(TarArchiveEntry e) {
        currEntry = e;
    }

    /** @return true once the end of the archive has been reached */
    protected final boolean isAtEOF() {
        return hasHitEOF;
    }

    /** @param b the end-of-archive state subclasses want to set */
    protected final void setAtEOF(boolean b) {
        hasHitEOF = b;
    }
548
549 /**
550 * Checks if the signature matches what is expected for a tar file.
551 *
552 * @param signature
553 * the bytes to check
554 * @param length
555 * the number of bytes to check
556 * @return true, if this stream is a tar archive stream, false otherwise
557 */
558 public static boolean matches(byte[] signature, int length) {
559 if (length < TarConstants.VERSION_OFFSET+TarConstants.VERSIONLEN) {
560 return false;
561 }
562
563 if (ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_POSIX,
564 signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN)
565 &&
566 ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_POSIX,
567 signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
568 ){
569 return true;
570 }
571 if (ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_GNU,
572 signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN)
573 &&
574 (
575 ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_GNU_SPACE,
576 signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
577 ||
578 ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_GNU_ZERO,
579 signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
580 )
581 ){
582 return true;
583 }
584 // COMPRESS-107 - recognise Ant tar files
585 if (ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_ANT,
586 signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN)
587 &&
588 ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_ANT,
589 signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
590 ){
591 return true;
592 }
593 return false;
594 }
595
596 }