001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, 013 * software distributed under the License is distributed on an 014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 * KIND, either express or implied. See the License for the 016 * specific language governing permissions and limitations 017 * under the License. 018 */ 019package org.apache.commons.compress.compressors.lz4; 020 021import java.io.IOException; 022import java.io.OutputStream; 023import java.util.Arrays; 024import java.util.Deque; 025import java.util.Iterator; 026import java.util.LinkedList; 027 028import org.apache.commons.compress.compressors.CompressorOutputStream; 029import org.apache.commons.compress.compressors.lz77support.LZ77Compressor; 030import org.apache.commons.compress.compressors.lz77support.Parameters; 031import org.apache.commons.compress.utils.ByteUtils; 032 033/** 034 * CompressorOutputStream for the LZ4 block format. 035 * 036 * @see <a href="http://lz4.github.io/lz4/lz4_Block_format.html">LZ4 Block Format Description</a> 037 * @since 1.14 038 * @NotThreadSafe 039 */ 040public class BlockLZ4CompressorOutputStream extends CompressorOutputStream { 041 042 private static final int MIN_BACK_REFERENCE_LENGTH = 4; 043 private static final int MIN_OFFSET_OF_LAST_BACK_REFERENCE = 12; 044 045 /* 046 047 The LZ4 block format has a few properties that make it less 048 straight-forward than one would hope: 049 050 * literal blocks and back-references must come in pairs (except 051 for the very last literal block), so consecutive literal 052 blocks created by the compressor must be merged into a single 053 block. 054 055 * the start of a literal/back-reference pair contains the length 056 of the back-reference (at least some part of it) so we can't 057 start writing the literal before we know how long the next 058 back-reference is going to be. 059 060 * there are special rules for the final blocks 061 062 > There are specific parsing rules to respect in order to remain 063 > compatible with assumptions made by the decoder : 064 > 065 > 1. The last 5 bytes are always literals 066 > 067 > 2. The last match must start at least 12 bytes before end of 068 > block. Consequently, a block with less than 13 bytes cannot be 069 > compressed. 070 071 which means any back-reference may need to get rewritten as a 072 literal block unless we know the next block is at least of 073 length 5 and the sum of this block's length and offset and the 074 next block's length is at least twelve. 075 076 */ 077 078 private final LZ77Compressor compressor; 079 private final OutputStream os; 080 081 // used in one-arg write method 082 private final byte[] oneByte = new byte[1]; 083 084 private boolean finished; 085 086 private final Deque<Pair> pairs = new LinkedList<>(); 087 // keeps track of the last window-size bytes (64k) in order to be 088 // able to expand back-references when needed 089 private final Deque<byte[]> expandedBlocks = new LinkedList<>(); 090 091 /** 092 * Creates a new LZ4 output stream. 093 * 094 * @param os 095 * An OutputStream to read compressed data from 096 */ 097 public BlockLZ4CompressorOutputStream(final OutputStream os) { 098 this(os, createParameterBuilder().build()); 099 } 100 101 /** 102 * Creates a new LZ4 output stream. 103 * 104 * @param os 105 * An OutputStream to read compressed data from 106 * @param params 107 * The parameters to use for LZ77 compression. 108 */ 109 public BlockLZ4CompressorOutputStream(final OutputStream os, final Parameters params) { 110 this.os = os; 111 compressor = new LZ77Compressor(params, 112 block -> { 113 switch (block.getType()) { 114 case LITERAL: 115 addLiteralBlock((LZ77Compressor.LiteralBlock) block); 116 break; 117 case BACK_REFERENCE: 118 addBackReference((LZ77Compressor.BackReference) block); 119 break; 120 case EOD: 121 writeFinalLiteralBlock(); 122 break; 123 } 124 }); 125 } 126 127 @Override 128 public void write(final int b) throws IOException { 129 oneByte[0] = (byte) (b & 0xff); 130 write(oneByte); 131 } 132 133 @Override 134 public void write(final byte[] data, final int off, final int len) throws IOException { 135 compressor.compress(data, off, len); 136 } 137 138 @Override 139 public void close() throws IOException { 140 try { 141 finish(); 142 } finally { 143 os.close(); 144 } 145 } 146 147 /** 148 * Compresses all remaining data and writes it to the stream, 149 * doesn't close the underlying stream. 150 * @throws IOException if an error occurs 151 */ 152 public void finish() throws IOException { 153 if (!finished) { 154 compressor.finish(); 155 finished = true; 156 } 157 } 158 159 /** 160 * Adds some initial data to fill the window with. 161 * 162 * @param data the data to fill the window with. 163 * @param off offset of real data into the array 164 * @param len amount of data 165 * @throws IllegalStateException if the stream has already started to write data 166 * @see LZ77Compressor#prefill 167 */ 168 public void prefill(final byte[] data, final int off, final int len) { 169 if (len > 0) { 170 final byte[] b = Arrays.copyOfRange(data, off, off + len); 171 compressor.prefill(b); 172 recordLiteral(b); 173 } 174 } 175 176 private void addLiteralBlock(final LZ77Compressor.LiteralBlock block) throws IOException { 177 final Pair last = writeBlocksAndReturnUnfinishedPair(block.getLength()); 178 recordLiteral(last.addLiteral(block)); 179 clearUnusedBlocksAndPairs(); 180 } 181 182 private void addBackReference(final LZ77Compressor.BackReference block) throws IOException { 183 final Pair last = writeBlocksAndReturnUnfinishedPair(block.getLength()); 184 last.setBackReference(block); 185 recordBackReference(block); 186 clearUnusedBlocksAndPairs(); 187 } 188 189 private Pair writeBlocksAndReturnUnfinishedPair(final int length) throws IOException { 190 writeWritablePairs(length); 191 Pair last = pairs.peekLast(); 192 if (last == null || last.hasBackReference()) { 193 last = new Pair(); 194 pairs.addLast(last); 195 } 196 return last; 197 } 198 199 private void recordLiteral(final byte[] b) { 200 expandedBlocks.addFirst(b); 201 } 202 203 private void clearUnusedBlocksAndPairs() { 204 clearUnusedBlocks(); 205 clearUnusedPairs(); 206 } 207 208 private void clearUnusedBlocks() { 209 int blockLengths = 0; 210 int blocksToKeep = 0; 211 for (final byte[] b : expandedBlocks) { 212 blocksToKeep++; 213 blockLengths += b.length; 214 if (blockLengths >= BlockLZ4CompressorInputStream.WINDOW_SIZE) { 215 break; 216 } 217 } 218 final int size = expandedBlocks.size(); 219 for (int i = blocksToKeep; i < size; i++) { 220 expandedBlocks.removeLast(); 221 } 222 } 223 224 private void recordBackReference(final LZ77Compressor.BackReference block) { 225 expandedBlocks.addFirst(expand(block.getOffset(), block.getLength())); 226 } 227 228 private byte[] expand(final int offset, final int length) { 229 final byte[] expanded = new byte[length]; 230 if (offset == 1) { // surprisingly common special case 231 final byte[] block = expandedBlocks.peekFirst(); 232 final byte b = block[block.length - 1]; 233 if (b != 0) { // the fresh array contains 0s anyway 234 Arrays.fill(expanded, b); 235 } 236 } else { 237 expandFromList(expanded, offset, length); 238 } 239 return expanded; 240 } 241 242 private void expandFromList(final byte[] expanded, final int offset, final int length) { 243 int offsetRemaining = offset; 244 int lengthRemaining = length; 245 int writeOffset = 0; 246 while (lengthRemaining > 0) { 247 // find block that contains offsetRemaining 248 byte[] block = null; 249 final int copyLen; 250 final int copyOffset; 251 if (offsetRemaining > 0) { 252 int blockOffset = 0; 253 for (final byte[] b : expandedBlocks) { 254 if (b.length + blockOffset >= offsetRemaining) { 255 block = b; 256 break; 257 } 258 blockOffset += b.length; 259 } 260 if (block == null) { 261 // should not be possible 262 throw new IllegalStateException("Failed to find a block containing offset " + offset); 263 } 264 copyOffset = blockOffset + block.length - offsetRemaining; 265 copyLen = Math.min(lengthRemaining, block.length - copyOffset); 266 } else { 267 // offsetRemaining is negative or 0 and points into the expanded bytes 268 block = expanded; 269 copyOffset = -offsetRemaining; 270 copyLen = Math.min(lengthRemaining, writeOffset + offsetRemaining); 271 } 272 System.arraycopy(block, copyOffset, expanded, writeOffset, copyLen); 273 offsetRemaining -= copyLen; 274 lengthRemaining -= copyLen; 275 writeOffset += copyLen; 276 } 277 } 278 279 private void clearUnusedPairs() { 280 int pairLengths = 0; 281 int pairsToKeep = 0; 282 for (final Iterator<Pair> it = pairs.descendingIterator(); it.hasNext(); ) { 283 final Pair p = it.next(); 284 pairsToKeep++; 285 pairLengths += p.length(); 286 if (pairLengths >= BlockLZ4CompressorInputStream.WINDOW_SIZE) { 287 break; 288 } 289 } 290 final int size = pairs.size(); 291 for (int i = pairsToKeep; i < size; i++) { 292 final Pair p = pairs.peekFirst(); 293 if (!p.hasBeenWritten()) { 294 break; 295 } 296 pairs.removeFirst(); 297 } 298 } 299 300 private void writeFinalLiteralBlock() throws IOException { 301 rewriteLastPairs(); 302 for (final Pair p : pairs) { 303 if (!p.hasBeenWritten()) { 304 p.writeTo(os); 305 } 306 } 307 pairs.clear(); 308 } 309 310 private void writeWritablePairs(final int lengthOfBlocksAfterLastPair) throws IOException { 311 int unwrittenLength = lengthOfBlocksAfterLastPair; 312 for (final Iterator<Pair> it = pairs.descendingIterator(); it.hasNext(); ) { 313 final Pair p = it.next(); 314 if (p.hasBeenWritten()) { 315 break; 316 } 317 unwrittenLength += p.length(); 318 } 319 for (final Pair p : pairs) { 320 if (p.hasBeenWritten()) { 321 continue; 322 } 323 unwrittenLength -= p.length(); 324 if (!p.canBeWritten(unwrittenLength)) { 325 break; 326 } 327 p.writeTo(os); 328 } 329 } 330 331 private void rewriteLastPairs() { 332 final LinkedList<Pair> lastPairs = new LinkedList<>(); 333 final LinkedList<Integer> pairLength = new LinkedList<>(); 334 int offset = 0; 335 for (final Iterator<Pair> it = pairs.descendingIterator(); it.hasNext(); ) { 336 final Pair p = it.next(); 337 if (p.hasBeenWritten()) { 338 break; 339 } 340 final int len = p.length(); 341 pairLength.addFirst(len); 342 lastPairs.addFirst(p); 343 offset += len; 344 if (offset >= MIN_OFFSET_OF_LAST_BACK_REFERENCE) { 345 break; 346 } 347 } 348 lastPairs.forEach(pairs::remove); 349 // lastPairs may contain between one and four Pairs: 350 // * the last pair may be a one byte literal 351 // * all other Pairs contain a back-reference which must be four bytes long at minimum 352 // we could merge them all into a single literal block but 353 // this may harm compression. For example compressing 354 // "bla.tar" from our tests yields a last block containing a 355 // back-reference of length > 2k and we'd end up with a last 356 // literal of that size rather than a 2k back-reference and a 357 // 12 byte literal at the end. 358 359 // Instead we merge all but the first of lastPairs into a new 360 // literal-only Pair "replacement" and look at the 361 // back-reference in the first of lastPairs and see if we can 362 // split it. We can split it if it is longer than 16 - 363 // replacement.length (i.e. the minimal length of four is kept 364 // while making sure the last literal is at least twelve bytes 365 // long). If we can't split it, we expand the first of the pairs 366 // as well. 367 368 // this is not optimal, we could get better compression 369 // results with more complex approaches as the last literal 370 // only needs to be five bytes long if the previous 371 // back-reference has an offset big enough 372 373 final int lastPairsSize = lastPairs.size(); 374 int toExpand = 0; 375 for (int i = 1; i < lastPairsSize; i++) { 376 toExpand += pairLength.get(i); 377 } 378 final Pair replacement = new Pair(); 379 if (toExpand > 0) { 380 replacement.prependLiteral(expand(toExpand, toExpand)); 381 } 382 final Pair splitCandidate = lastPairs.get(0); 383 final int stillNeeded = MIN_OFFSET_OF_LAST_BACK_REFERENCE - toExpand; 384 final int brLen = splitCandidate.hasBackReference() ? splitCandidate.backReferenceLength() : 0; 385 if (splitCandidate.hasBackReference() && brLen >= MIN_BACK_REFERENCE_LENGTH + stillNeeded) { 386 replacement.prependLiteral(expand(toExpand + stillNeeded, stillNeeded)); 387 pairs.add(splitCandidate.splitWithNewBackReferenceLengthOf(brLen - stillNeeded)); 388 } else { 389 if (splitCandidate.hasBackReference()) { 390 replacement.prependLiteral(expand(toExpand + brLen, brLen)); 391 } 392 splitCandidate.prependTo(replacement); 393 } 394 pairs.add(replacement); 395 } 396 397 /** 398 * Returns a builder correctly configured for the LZ4 algorithm. 399 * @return a builder correctly configured for the LZ4 algorithm 400 */ 401 public static Parameters.Builder createParameterBuilder() { 402 final int maxLen = BlockLZ4CompressorInputStream.WINDOW_SIZE - 1; 403 return Parameters.builder(BlockLZ4CompressorInputStream.WINDOW_SIZE) 404 .withMinBackReferenceLength(MIN_BACK_REFERENCE_LENGTH) 405 .withMaxBackReferenceLength(maxLen) 406 .withMaxOffset(maxLen) 407 .withMaxLiteralLength(maxLen); 408 } 409 410 final static class Pair { 411 private final Deque<byte[]> literals = new LinkedList<>(); 412 private int brOffset, brLength; 413 private boolean written; 414 415 private void prependLiteral(final byte[] data) { 416 literals.addFirst(data); 417 } 418 419 byte[] addLiteral(final LZ77Compressor.LiteralBlock block) { 420 final byte[] copy = Arrays.copyOfRange(block.getData(), block.getOffset(), 421 block.getOffset() + block.getLength()); 422 literals.add(copy); 423 return copy; 424 } 425 426 void setBackReference(final LZ77Compressor.BackReference block) { 427 if (hasBackReference()) { 428 throw new IllegalStateException(); 429 } 430 brOffset = block.getOffset(); 431 brLength = block.getLength(); 432 } 433 434 boolean hasBackReference() { 435 return brOffset > 0; 436 } 437 438 boolean canBeWritten(final int lengthOfBlocksAfterThisPair) { 439 return hasBackReference() 440 && lengthOfBlocksAfterThisPair >= MIN_OFFSET_OF_LAST_BACK_REFERENCE + MIN_BACK_REFERENCE_LENGTH; 441 } 442 443 int length() { 444 return literalLength() + brLength; 445 } 446 447 private boolean hasBeenWritten() { 448 return written; 449 } 450 451 void writeTo(final OutputStream out) throws IOException { 452 final int litLength = literalLength(); 453 out.write(lengths(litLength, brLength)); 454 if (litLength >= BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK) { 455 writeLength(litLength - BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK, out); 456 } 457 for (final byte[] b : literals) { 458 out.write(b); 459 } 460 if (hasBackReference()) { 461 ByteUtils.toLittleEndian(out, brOffset, 2); 462 if (brLength - MIN_BACK_REFERENCE_LENGTH >= BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK) { 463 writeLength(brLength - MIN_BACK_REFERENCE_LENGTH 464 - BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK, out); 465 } 466 } 467 written = true; 468 } 469 470 private int literalLength() { 471 return literals.stream().mapToInt(b -> b.length).sum(); 472 } 473 474 private static int lengths(final int litLength, final int brLength) { 475 final int l = Math.min(litLength, 15); 476 final int br = brLength < 4 ? 0 : (brLength < 19 ? brLength - 4 : 15); 477 return (l << BlockLZ4CompressorInputStream.SIZE_BITS) | br; 478 } 479 480 private static void writeLength(int length, final OutputStream out) throws IOException { 481 while (length >= 255) { 482 out.write(255); 483 length -= 255; 484 } 485 out.write(length); 486 } 487 488 private int backReferenceLength() { 489 return brLength; 490 } 491 492 private void prependTo(final Pair other) { 493 final Iterator<byte[]> listBackwards = literals.descendingIterator(); 494 while (listBackwards.hasNext()) { 495 other.prependLiteral(listBackwards.next()); 496 } 497 } 498 499 private Pair splitWithNewBackReferenceLengthOf(final int newBackReferenceLength) { 500 final Pair p = new Pair(); 501 p.literals.addAll(literals); 502 p.brOffset = brOffset; 503 p.brLength = newBackReferenceLength; 504 return p; 505 } 506 } 507}