001 /*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements. See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership. The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License. You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing,
013 * software distributed under the License is distributed on an
014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015 * KIND, either express or implied. See the License for the
016 * specific language governing permissions and limitations
017 * under the License.
018 */
019 package org.apache.commons.compress.compressors.gzip;
020
021 import java.io.IOException;
022 import java.io.EOFException;
023 import java.io.InputStream;
024 import java.io.DataInputStream;
025 import java.io.BufferedInputStream;
026 import java.util.zip.DataFormatException;
027 import java.util.zip.Inflater;
028 import java.util.zip.CRC32;
029
030 import org.apache.commons.compress.compressors.CompressorInputStream;
031
032 /**
033 * Input stream that decompresses .gz files.
034 * This supports decompressing concatenated .gz files which is important
035 * when decompressing standalone .gz files.
036 * <p>
037 * {@link java.util.zip.GZIPInputStream} doesn't decompress concatenated .gz
038 * files: it stops after the first member and silently ignores the rest.
039 * It doesn't leave the read position to point to the beginning of the next
040 * member, which makes it difficult workaround the lack of concatenation
041 * support.
042 * <p>
043 * Instead of using <code>GZIPInputStream</code>, this class has its own .gz
044 * container format decoder. The actual decompression is done with
045 * {@link java.util.zip.Inflater}.
046 */
047 public class GzipCompressorInputStream extends CompressorInputStream {
048 // Header flags
049 // private static final int FTEXT = 0x01; // Uninteresting for us
050 private static final int FHCRC = 0x02;
051 private static final int FEXTRA = 0x04;
052 private static final int FNAME = 0x08;
053 private static final int FCOMMENT = 0x10;
054 private static final int FRESERVED = 0xE0;
055
056 // Compressed input stream, possibly wrapped in a BufferedInputStream
057 private final InputStream in;
058
059 // True if decompressing multimember streams.
060 private final boolean decompressConcatenated;
061
062 // Buffer to hold the input data
063 private final byte[] buf = new byte[8192];
064
065 // Amount of data in buf.
066 private int bufUsed = 0;
067
068 // Decompressor
069 private Inflater inf = new Inflater(true);
070
071 // CRC32 from uncompressed data
072 private CRC32 crc = new CRC32();
073
074 private int memberSize;
075
076 // True once everything has been decompressed
077 private boolean endReached = false;
078
079 /**
080 * Constructs a new input stream that decompresses gzip-compressed data
081 * from the specified input stream.
082 * <p>
083 * This is equivalent to
084 * <code>GzipCompressorInputStream(inputStream, false)</code> and thus
085 * will not decompress concatenated .gz files.
086 *
087 * @param inputStream the InputStream from which this object should
088 * be created of
089 *
090 * @throws IOException if the stream could not be created
091 */
092 public GzipCompressorInputStream(InputStream inputStream)
093 throws IOException {
094 this(inputStream, false);
095 }
096
097 /**
098 * Constructs a new input stream that decompresses gzip-compressed data
099 * from the specified input stream.
100 * <p>
101 * If <code>decompressConcatenated</code> is {@code false}:
102 * This decompressor might read more input than it will actually use.
103 * If <code>inputStream</code> supports <code>mark</code> and
104 * <code>reset</code>, then the input position will be adjusted
105 * so that it is right after the last byte of the compressed stream.
106 * If <code>mark</code> isn't supported, the input position will be
107 * undefined.
108 *
109 * @param inputStream the InputStream from which this object should
110 * be created of
111 * @param decompressConcatenated
112 * if true, decompress until the end of the input;
113 * if false, stop after the first .gz member
114 *
115 * @throws IOException if the stream could not be created
116 */
117 public GzipCompressorInputStream(InputStream inputStream,
118 boolean decompressConcatenated)
119 throws IOException {
120 // Mark support is strictly needed for concatenated files only,
121 // but it's simpler if it is always available.
122 if (inputStream.markSupported()) {
123 in = inputStream;
124 } else {
125 in = new BufferedInputStream(inputStream);
126 }
127
128 this.decompressConcatenated = decompressConcatenated;
129 init(true);
130 }
131
132 private boolean init(boolean isFirstMember) throws IOException {
133 assert isFirstMember || decompressConcatenated;
134
135 // Check the magic bytes without a possibility of EOFException.
136 int magic0 = in.read();
137 int magic1 = in.read();
138
139 // If end of input was reached after decompressing at least
140 // one .gz member, we have reached the end of the file successfully.
141 if (magic0 == -1 && !isFirstMember) {
142 return false;
143 }
144
145 if (magic0 != 31 || magic1 != 139) {
146 throw new IOException(isFirstMember
147 ? "Input is not in the .gz format"
148 : "Garbage after a valid .gz stream");
149 }
150
151 // Parsing the rest of the header may throw EOFException.
152 DataInputStream inData = new DataInputStream(in);
153 int method = inData.readUnsignedByte();
154 if (method != 8) {
155 throw new IOException("Unsupported compression method "
156 + method + " in the .gz header");
157 }
158
159 int flg = inData.readUnsignedByte();
160 if ((flg & FRESERVED) != 0) {
161 throw new IOException(
162 "Reserved flags are set in the .gz header");
163 }
164
165 inData.readInt(); // mtime, ignored
166 inData.readUnsignedByte(); // extra flags, ignored
167 inData.readUnsignedByte(); // operating system, ignored
168
169 // Extra field, ignored
170 if ((flg & FEXTRA) != 0) {
171 int xlen = inData.readUnsignedByte();
172 xlen |= inData.readUnsignedByte() << 8;
173
174 // This isn't as efficient as calling in.skip would be,
175 // but it's lazier to handle unexpected end of input this way.
176 // Most files don't have an extra field anyway.
177 while (xlen-- > 0) {
178 inData.readUnsignedByte();
179 }
180 }
181
182 // Original file name, ignored
183 if ((flg & FNAME) != 0) {
184 readToNull(inData);
185 }
186
187 // Comment, ignored
188 if ((flg & FCOMMENT) != 0) {
189 readToNull(inData);
190 }
191
192 // Header "CRC16" which is actually a truncated CRC32 (which isn't
193 // as good as real CRC16). I don't know if any encoder implementation
194 // sets this, so it's not worth trying to verify it. GNU gzip 1.4
195 // doesn't support this field, but zlib seems to be able to at least
196 // skip over it.
197 if ((flg & FHCRC) != 0) {
198 inData.readShort();
199 }
200
201 // Reset
202 inf.reset();
203 crc.reset();
204 memberSize = 0;
205
206 return true;
207 }
208
209 private void readToNull(DataInputStream inData) throws IOException {
210 while (inData.readUnsignedByte() != 0x00) {}
211 }
212
213 /** {@inheritDoc} */
214 @Override
215 public int read() throws IOException {
216 byte[] buf = new byte[1];
217 return read(buf, 0, 1) == -1 ? -1 : (buf[0] & 0xFF);
218 }
219
220 /**
221 * {@inheritDoc}
222 *
223 * @since 1.1
224 */
225 @Override
226 public int read(byte[] b, int off, int len) throws IOException {
227 if (endReached) {
228 return -1;
229 }
230
231 int size = 0;
232
233 while (len > 0) {
234 if (inf.needsInput()) {
235 // Remember the current position because we may need to
236 // rewind after reading too much input.
237 in.mark(buf.length);
238
239 bufUsed = in.read(buf);
240 if (bufUsed == -1) {
241 throw new EOFException();
242 }
243
244 inf.setInput(buf, 0, bufUsed);
245 }
246
247 int ret;
248 try {
249 ret = inf.inflate(b, off, len);
250 } catch (DataFormatException e) {
251 throw new IOException("Gzip-compressed data is corrupt");
252 }
253
254 crc.update(b, off, ret);
255 memberSize += ret;
256 off += ret;
257 len -= ret;
258 size += ret;
259 count(ret);
260
261 if (inf.finished()) {
262 // We may have read too many bytes. Rewind the read
263 // position to match the actual amount used.
264 //
265 // NOTE: The "if" is there just in case. Since we used
266 // in.mark earler, it should always skip enough.
267 in.reset();
268
269 int skipAmount = bufUsed - inf.getRemaining();
270 if (in.skip(skipAmount) != skipAmount) {
271 throw new IOException();
272 }
273
274 bufUsed = 0;
275
276 DataInputStream inData = new DataInputStream(in);
277
278 // CRC32
279 long crcStored = 0;
280 for (int i = 0; i < 4; ++i) {
281 crcStored |= (long)inData.readUnsignedByte() << (i * 8);
282 }
283
284 if (crcStored != crc.getValue()) {
285 throw new IOException("Gzip-compressed data is corrupt "
286 + "(CRC32 error)");
287 }
288
289 // Uncompressed size modulo 2^32 (ISIZE in the spec)
290 int isize = 0;
291 for (int i = 0; i < 4; ++i) {
292 isize |= inData.readUnsignedByte() << (i * 8);
293 }
294
295 if (isize != memberSize) {
296 throw new IOException("Gzip-compressed data is corrupt"
297 + "(uncompressed size mismatch)");
298 }
299
300 // See if this is the end of the file.
301 if (!decompressConcatenated || !init(false)) {
302 inf.end();
303 inf = null;
304 endReached = true;
305 return size == 0 ? -1 : size;
306 }
307 }
308 }
309
310 return size;
311 }
312
313 /**
314 * Checks if the signature matches what is expected for a .gz file.
315 *
316 * @param signature the bytes to check
317 * @param length the number of bytes to check
318 * @return true if this is a .gz stream, false otherwise
319 *
320 * @since 1.1
321 */
322 public static boolean matches(byte[] signature, int length) {
323
324 if (length < 2) {
325 return false;
326 }
327
328 if (signature[0] != 31) {
329 return false;
330 }
331
332 if (signature[1] != -117) {
333 return false;
334 }
335
336 return true;
337 }
338
339 /**
340 * Closes the input stream (unless it is System.in).
341 *
342 * @since 1.2
343 */
344 @Override
345 public void close() throws IOException {
346 if (inf != null) {
347 inf.end();
348 inf = null;
349 }
350
351 if (this.in != System.in) {
352 this.in.close();
353 }
354 }
355 }