001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, 013 * software distributed under the License is distributed on an 014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 * KIND, either express or implied. See the License for the 016 * specific language governing permissions and limitations 017 * under the License. 018 */ 019package org.apache.commons.compress.compressors.gzip; 020 021import static java.nio.charset.StandardCharsets.ISO_8859_1; 022 023import java.io.BufferedInputStream; 024import java.io.ByteArrayOutputStream; 025import java.io.DataInput; 026import java.io.DataInputStream; 027import java.io.EOFException; 028import java.io.IOException; 029import java.io.InputStream; 030import java.util.zip.CRC32; 031import java.util.zip.DataFormatException; 032import java.util.zip.Deflater; 033import java.util.zip.Inflater; 034 035import org.apache.commons.compress.compressors.CompressorInputStream; 036import org.apache.commons.compress.utils.ByteUtils; 037import org.apache.commons.compress.utils.CountingInputStream; 038import org.apache.commons.compress.utils.IOUtils; 039import org.apache.commons.compress.utils.InputStreamStatistics; 040 041/** 042 * Input stream that decompresses .gz files. 043 * 044 * <p>This supports decompressing concatenated .gz files which is important 045 * when decompressing standalone .gz files.</p> 046 * 047 * <p> 048 * {@link java.util.zip.GZIPInputStream} doesn't decompress concatenated .gz 049 * files: it stops after the first member and silently ignores the rest. 050 * It doesn't leave the read position to point to the beginning of the next 051 * member, which makes it difficult workaround the lack of concatenation 052 * support. 053 * </p> 054 * 055 * <p> 056 * Instead of using {@code GZIPInputStream}, this class has its own .gz 057 * container format decoder. The actual decompression is done with 058 * {@link java.util.zip.Inflater}. 059 * </p> 060 * 061 * <p>If you use the constructor {@code GzipCompressorInputStream(in)} 062 * or {@code GzipCompressorInputStream(in, false)} with some {@code 063 * InputStream} {@code in} then {@link #read} will return -1 as soon 064 * as the first internal member has been read completely. The stream 065 * {@code in} will be positioned at the start of the second gzip 066 * member if there is one.</p> 067 * 068 * <p>If you use the constructor {@code GzipCompressorInputStream(in, 069 * true)} with some {@code InputStream} {@code in} then {@link #read} 070 * will return -1 once the stream {@code in} has been exhausted. The 071 * data read from a stream constructed this way will consist of the 072 * concatenated data of all gzip members contained inside {@code 073 * in}.</p> 074 * 075 * @see "https://tools.ietf.org/html/rfc1952" 076 */ 077public class GzipCompressorInputStream extends CompressorInputStream 078 implements InputStreamStatistics { 079 080 // Header flags 081 // private static final int FTEXT = 0x01; // Uninteresting for us 082 private static final int FHCRC = 0x02; 083 private static final int FEXTRA = 0x04; 084 private static final int FNAME = 0x08; 085 private static final int FCOMMENT = 0x10; 086 private static final int FRESERVED = 0xE0; 087 088 private final CountingInputStream countingStream; 089 090 // Compressed input stream, possibly wrapped in a 091 // BufferedInputStream, always wrapped in countingStream above 092 private final InputStream in; 093 094 // True if decompressing multi member streams. 095 private final boolean decompressConcatenated; 096 097 // Buffer to hold the input data 098 private final byte[] buf = new byte[8192]; 099 100 // Amount of data in buf. 101 private int bufUsed; 102 103 // Decompressor 104 private Inflater inf = new Inflater(true); 105 106 // CRC32 from uncompressed data 107 private final CRC32 crc = new CRC32(); 108 109 // True once everything has been decompressed 110 private boolean endReached; 111 112 // used in no-arg read method 113 private final byte[] oneByte = new byte[1]; 114 115 private final GzipParameters parameters = new GzipParameters(); 116 117 /** 118 * Constructs a new input stream that decompresses gzip-compressed data 119 * from the specified input stream. 120 * <p> 121 * This is equivalent to 122 * {@code GzipCompressorInputStream(inputStream, false)} and thus 123 * will not decompress concatenated .gz files. 124 * 125 * @param inputStream the InputStream from which this object should 126 * be created of 127 * 128 * @throws IOException if the stream could not be created 129 */ 130 public GzipCompressorInputStream(final InputStream inputStream) 131 throws IOException { 132 this(inputStream, false); 133 } 134 135 /** 136 * Constructs a new input stream that decompresses gzip-compressed data 137 * from the specified input stream. 138 * <p> 139 * If {@code decompressConcatenated} is {@code false}: 140 * This decompressor might read more input than it will actually use. 141 * If {@code inputStream} supports {@code mark} and 142 * {@code reset}, then the input position will be adjusted 143 * so that it is right after the last byte of the compressed stream. 144 * If {@code mark} isn't supported, the input position will be 145 * undefined. 146 * 147 * @param inputStream the InputStream from which this object should 148 * be created of 149 * @param decompressConcatenated 150 * if true, decompress until the end of the input; 151 * if false, stop after the first .gz member 152 * 153 * @throws IOException if the stream could not be created 154 */ 155 public GzipCompressorInputStream(final InputStream inputStream, 156 final boolean decompressConcatenated) 157 throws IOException { 158 countingStream = new CountingInputStream(inputStream); 159 // Mark support is strictly needed for concatenated files only, 160 // but it's simpler if it is always available. 161 if (countingStream.markSupported()) { 162 in = countingStream; 163 } else { 164 in = new BufferedInputStream(countingStream); 165 } 166 167 this.decompressConcatenated = decompressConcatenated; 168 init(true); 169 } 170 171 /** 172 * Provides the stream's meta data - may change with each stream 173 * when decompressing concatenated streams. 174 * @return the stream's meta data 175 * @since 1.8 176 */ 177 public GzipParameters getMetaData() { 178 return parameters; 179 } 180 181 private boolean init(final boolean isFirstMember) throws IOException { 182 assert isFirstMember || decompressConcatenated; 183 184 // Check the magic bytes without a possibility of EOFException. 185 final int magic0 = in.read(); 186 187 // If end of input was reached after decompressing at least 188 // one .gz member, we have reached the end of the file successfully. 189 if (magic0 == -1 && !isFirstMember) { 190 return false; 191 } 192 193 if (magic0 != 31 || in.read() != 139) { 194 throw new IOException(isFirstMember 195 ? "Input is not in the .gz format" 196 : "Garbage after a valid .gz stream"); 197 } 198 199 // Parsing the rest of the header may throw EOFException. 200 final DataInput inData = new DataInputStream(in); 201 final int method = inData.readUnsignedByte(); 202 if (method != Deflater.DEFLATED) { 203 throw new IOException("Unsupported compression method " 204 + method + " in the .gz header"); 205 } 206 207 final int flg = inData.readUnsignedByte(); 208 if ((flg & FRESERVED) != 0) { 209 throw new IOException( 210 "Reserved flags are set in the .gz header"); 211 } 212 213 parameters.setModificationTime(ByteUtils.fromLittleEndian(inData, 4) * 1000); 214 switch (inData.readUnsignedByte()) { // extra flags 215 case 2: 216 parameters.setCompressionLevel(Deflater.BEST_COMPRESSION); 217 break; 218 case 4: 219 parameters.setCompressionLevel(Deflater.BEST_SPEED); 220 break; 221 default: 222 // ignored for now 223 break; 224 } 225 parameters.setOperatingSystem(inData.readUnsignedByte()); 226 227 // Extra field, ignored 228 if ((flg & FEXTRA) != 0) { 229 int xlen = inData.readUnsignedByte(); 230 xlen |= inData.readUnsignedByte() << 8; 231 232 // This isn't as efficient as calling in.skip would be, 233 // but it's lazier to handle unexpected end of input this way. 234 // Most files don't have an extra field anyway. 235 while (xlen-- > 0) { 236 inData.readUnsignedByte(); 237 } 238 } 239 240 // Original file name 241 if ((flg & FNAME) != 0) { 242 parameters.setFilename(new String(readToNull(inData), ISO_8859_1)); 243 } 244 245 // Comment 246 if ((flg & FCOMMENT) != 0) { 247 parameters.setComment(new String(readToNull(inData), ISO_8859_1)); 248 } 249 250 // Header "CRC16" which is actually a truncated CRC32 (which isn't 251 // as good as real CRC16). I don't know if any encoder implementation 252 // sets this, so it's not worth trying to verify it. GNU gzip 1.4 253 // doesn't support this field, but zlib seems to be able to at least 254 // skip over it. 255 if ((flg & FHCRC) != 0) { 256 inData.readShort(); 257 } 258 259 // Reset 260 inf.reset(); 261 crc.reset(); 262 263 return true; 264 } 265 266 private static byte[] readToNull(final DataInput inData) throws IOException { 267 try (final ByteArrayOutputStream bos = new ByteArrayOutputStream()) { 268 int b = 0; 269 while ((b = inData.readUnsignedByte()) != 0x00) { // NOPMD NOSONAR 270 bos.write(b); 271 } 272 return bos.toByteArray(); 273 } 274 } 275 276 @Override 277 public int read() throws IOException { 278 return read(oneByte, 0, 1) == -1 ? -1 : oneByte[0] & 0xFF; 279 } 280 281 /** 282 * {@inheritDoc} 283 * 284 * @since 1.1 285 */ 286 @Override 287 public int read(final byte[] b, int off, int len) throws IOException { 288 if (len == 0) { 289 return 0; 290 } 291 if (endReached) { 292 return -1; 293 } 294 295 int size = 0; 296 297 while (len > 0) { 298 if (inf.needsInput()) { 299 // Remember the current position because we may need to 300 // rewind after reading too much input. 301 in.mark(buf.length); 302 303 bufUsed = in.read(buf); 304 if (bufUsed == -1) { 305 throw new EOFException(); 306 } 307 308 inf.setInput(buf, 0, bufUsed); 309 } 310 311 final int ret; 312 try { 313 ret = inf.inflate(b, off, len); 314 } catch (final DataFormatException e) { // NOSONAR 315 throw new IOException("Gzip-compressed data is corrupt"); 316 } 317 318 crc.update(b, off, ret); 319 off += ret; 320 len -= ret; 321 size += ret; 322 count(ret); 323 324 if (inf.finished()) { 325 // We may have read too many bytes. Rewind the read 326 // position to match the actual amount used. 327 in.reset(); 328 329 final int skipAmount = bufUsed - inf.getRemaining(); 330 if (IOUtils.skip(in, skipAmount) != skipAmount) { 331 throw new IOException(); 332 } 333 334 bufUsed = 0; 335 336 final DataInput inData = new DataInputStream(in); 337 338 // CRC32 339 final long crcStored = ByteUtils.fromLittleEndian(inData, 4); 340 341 if (crcStored != crc.getValue()) { 342 throw new IOException("Gzip-compressed data is corrupt " 343 + "(CRC32 error)"); 344 } 345 346 // Uncompressed size modulo 2^32 (ISIZE in the spec) 347 final long isize = ByteUtils.fromLittleEndian(inData, 4); 348 349 if (isize != (inf.getBytesWritten() & 0xffffffffL)) { 350 throw new IOException("Gzip-compressed data is corrupt" 351 + "(uncompressed size mismatch)"); 352 } 353 354 // See if this is the end of the file. 355 if (!decompressConcatenated || !init(false)) { 356 inf.end(); 357 inf = null; 358 endReached = true; 359 return size == 0 ? -1 : size; 360 } 361 } 362 } 363 364 return size; 365 } 366 367 /** 368 * Checks if the signature matches what is expected for a .gz file. 369 * 370 * @param signature the bytes to check 371 * @param length the number of bytes to check 372 * @return true if this is a .gz stream, false otherwise 373 * 374 * @since 1.1 375 */ 376 public static boolean matches(final byte[] signature, final int length) { 377 return length >= 2 && signature[0] == 31 && signature[1] == -117; 378 } 379 380 /** 381 * Closes the input stream (unless it is System.in). 382 * 383 * @since 1.2 384 */ 385 @Override 386 public void close() throws IOException { 387 if (inf != null) { 388 inf.end(); 389 inf = null; 390 } 391 392 if (this.in != System.in) { 393 this.in.close(); 394 } 395 } 396 397 /** 398 * @since 1.17 399 */ 400 @Override 401 public long getCompressedCount() { 402 return countingStream.getBytesRead(); 403 } 404}