001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, 013 * software distributed under the License is distributed on an 014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 * KIND, either express or implied. See the License for the 016 * specific language governing permissions and limitations 017 * under the License. 018 */ 019package org.apache.commons.compress.compressors.lz4; 020 021import java.io.IOException; 022import java.io.OutputStream; 023import java.util.Arrays; 024import java.util.Deque; 025import java.util.Iterator; 026import java.util.LinkedList; 027 028import org.apache.commons.compress.compressors.CompressorOutputStream; 029import org.apache.commons.compress.compressors.lz77support.LZ77Compressor; 030import org.apache.commons.compress.compressors.lz77support.Parameters; 031import org.apache.commons.compress.utils.ByteUtils; 032 033/** 034 * CompressorOutputStream for the LZ4 block format. 035 * 036 * @see <a href="http://lz4.github.io/lz4/lz4_Block_format.html">LZ4 Block Format Description</a> 037 * @since 1.14 038 * @NotThreadSafe 039 */ 040public class BlockLZ4CompressorOutputStream extends CompressorOutputStream { 041 042 private static final int MIN_BACK_REFERENCE_LENGTH = 4; 043 private static final int MIN_OFFSET_OF_LAST_BACK_REFERENCE = 12; 044 045 /* 046 047 The LZ4 block format has a few properties that make it less 048 straight-forward than one would hope: 049 050 * literal blocks and back-references must come in pairs (except 051 for the very last literal block), so consecutive literal 052 blocks created by the compressor must be merged into a single 053 block. 054 055 * the start of a literal/back-reference pair contains the length 056 of the back-reference (at least some part of it) so we can't 057 start writing the literal before we know how long the next 058 back-reference is going to be. 059 060 * there are special rules for the final blocks 061 062 > There are specific parsing rules to respect in order to remain 063 > compatible with assumptions made by the decoder : 064 > 065 > 1. The last 5 bytes are always literals 066 > 067 > 2. The last match must start at least 12 bytes before end of 068 > block. Consequently, a block with less than 13 bytes cannot be 069 > compressed. 070 071 which means any back-reference may need to get rewritten as a 072 literal block unless we know the next block is at least of 073 length 5 and the sum of this block's length and offset and the 074 next block's length is at least twelve. 075 076 */ 077 078 private final LZ77Compressor compressor; 079 private final OutputStream os; 080 081 // used in one-arg write method 082 private final byte[] oneByte = new byte[1]; 083 084 private boolean finished = false; 085 086 private Deque<Pair> pairs = new LinkedList<>(); 087 // keeps track of the last window-size bytes (64k) in order to be 088 // able to expand back-references when needed 089 private Deque<byte[]> expandedBlocks = new LinkedList<>(); 090 091 /** 092 * Creates a new LZ4 output stream. 093 * 094 * @param os 095 * An OutputStream to read compressed data from 096 * 097 * @throws IOException if reading fails 098 */ 099 public BlockLZ4CompressorOutputStream(final OutputStream os) throws IOException { 100 this(os, createParameterBuilder().build()); 101 } 102 103 /** 104 * Creates a new LZ4 output stream. 105 * 106 * @param os 107 * An OutputStream to read compressed data from 108 * @param params 109 * The parameters to use for LZ77 compression. 110 * 111 * @throws IOException if reading fails 112 */ 113 public BlockLZ4CompressorOutputStream(final OutputStream os, Parameters params) throws IOException { 114 this.os = os; 115 compressor = new LZ77Compressor(params, 116 new LZ77Compressor.Callback() { 117 @Override 118 public void accept(LZ77Compressor.Block block) throws IOException { 119 switch (block.getType()) { 120 case LITERAL: 121 addLiteralBlock((LZ77Compressor.LiteralBlock) block); 122 break; 123 case BACK_REFERENCE: 124 addBackReference((LZ77Compressor.BackReference) block); 125 break; 126 case EOD: 127 writeFinalLiteralBlock(); 128 break; 129 } 130 } 131 }); 132 } 133 134 @Override 135 public void write(int b) throws IOException { 136 oneByte[0] = (byte) (b & 0xff); 137 write(oneByte); 138 } 139 140 @Override 141 public void write(byte[] data, int off, int len) throws IOException { 142 compressor.compress(data, off, len); 143 } 144 145 @Override 146 public void close() throws IOException { 147 try { 148 finish(); 149 } finally { 150 os.close(); 151 } 152 } 153 154 /** 155 * Compresses all remaining data and writes it to the stream, 156 * doesn't close the underlying stream. 157 * @throws IOException if an error occurs 158 */ 159 public void finish() throws IOException { 160 if (!finished) { 161 compressor.finish(); 162 finished = true; 163 } 164 } 165 166 /** 167 * Adds some initial data to fill the window with. 168 * 169 * @param data the data to fill the window with. 170 * @param off offset of real data into the array 171 * @param len amount of data 172 * @throws IllegalStateException if the stream has already started to write data 173 * @see LZ77Compressor#prefill 174 */ 175 public void prefill(byte[] data, int off, int len) { 176 if (len > 0) { 177 byte[] b = Arrays.copyOfRange(data, off, off + len); 178 compressor.prefill(b); 179 recordLiteral(b); 180 } 181 } 182 183 private void addLiteralBlock(LZ77Compressor.LiteralBlock block) throws IOException { 184 Pair last = writeBlocksAndReturnUnfinishedPair(block.getLength()); 185 recordLiteral(last.addLiteral(block)); 186 clearUnusedBlocksAndPairs(); 187 } 188 189 private void addBackReference(LZ77Compressor.BackReference block) throws IOException { 190 Pair last = writeBlocksAndReturnUnfinishedPair(block.getLength()); 191 last.setBackReference(block); 192 recordBackReference(block); 193 clearUnusedBlocksAndPairs(); 194 } 195 196 private Pair writeBlocksAndReturnUnfinishedPair(int length) throws IOException { 197 writeWritablePairs(length); 198 Pair last = pairs.peekLast(); 199 if (last == null || last.hasBackReference()) { 200 last = new Pair(); 201 pairs.addLast(last); 202 } 203 return last; 204 } 205 206 private void recordLiteral(byte[] b) { 207 expandedBlocks.addFirst(b); 208 } 209 210 private void clearUnusedBlocksAndPairs() { 211 clearUnusedBlocks(); 212 clearUnusedPairs(); 213 } 214 215 private void clearUnusedBlocks() { 216 int blockLengths = 0; 217 int blocksToKeep = 0; 218 for (byte[] b : expandedBlocks) { 219 blocksToKeep++; 220 blockLengths += b.length; 221 if (blockLengths >= BlockLZ4CompressorInputStream.WINDOW_SIZE) { 222 break; 223 } 224 } 225 final int size = expandedBlocks.size(); 226 for (int i = blocksToKeep; i < size; i++) { 227 expandedBlocks.removeLast(); 228 } 229 } 230 231 private void recordBackReference(LZ77Compressor.BackReference block) { 232 expandedBlocks.addFirst(expand(block.getOffset(), block.getLength())); 233 } 234 235 private byte[] expand(final int offset, final int length) { 236 byte[] expanded = new byte[length]; 237 if (offset == 1) { // surprisingly common special case 238 byte[] block = expandedBlocks.peekFirst(); 239 byte b = block[block.length - 1]; 240 if (b != 0) { // the fresh array contains 0s anyway 241 Arrays.fill(expanded, b); 242 } 243 } else { 244 expandFromList(expanded, offset, length); 245 } 246 return expanded; 247 } 248 249 private void expandFromList(final byte[] expanded, int offset, int length) { 250 int offsetRemaining = offset; 251 int lengthRemaining = length; 252 int writeOffset = 0; 253 while (lengthRemaining > 0) { 254 // find block that contains offsetRemaining 255 byte[] block = null; 256 int copyLen, copyOffset; 257 if (offsetRemaining > 0) { 258 int blockOffset = 0; 259 for (byte[] b : expandedBlocks) { 260 if (b.length + blockOffset >= offsetRemaining) { 261 block = b; 262 break; 263 } 264 blockOffset += b.length; 265 } 266 if (block == null) { 267 // should not be possible 268 throw new IllegalStateException("failed to find a block containing offset " + offset); 269 } 270 copyOffset = blockOffset + block.length - offsetRemaining; 271 copyLen = Math.min(lengthRemaining, block.length - copyOffset); 272 } else { 273 // offsetRemaining is negative or 0 and points into the expanded bytes 274 block = expanded; 275 copyOffset = -offsetRemaining; 276 copyLen = Math.min(lengthRemaining, writeOffset + offsetRemaining); 277 } 278 System.arraycopy(block, copyOffset, expanded, writeOffset, copyLen); 279 offsetRemaining -= copyLen; 280 lengthRemaining -= copyLen; 281 writeOffset += copyLen; 282 } 283 } 284 285 private void clearUnusedPairs() { 286 int pairLengths = 0; 287 int pairsToKeep = 0; 288 for (Iterator<Pair> it = pairs.descendingIterator(); it.hasNext(); ) { 289 Pair p = it.next(); 290 pairsToKeep++; 291 pairLengths += p.length(); 292 if (pairLengths >= BlockLZ4CompressorInputStream.WINDOW_SIZE) { 293 break; 294 } 295 } 296 final int size = pairs.size(); 297 for (int i = pairsToKeep; i < size; i++) { 298 Pair p = pairs.peekFirst(); 299 if (p.hasBeenWritten()) { 300 pairs.removeFirst(); 301 } else { 302 break; 303 } 304 } 305 } 306 307 private void writeFinalLiteralBlock() throws IOException { 308 rewriteLastPairs(); 309 for (Pair p : pairs) { 310 if (!p.hasBeenWritten()) { 311 p.writeTo(os); 312 } 313 } 314 pairs.clear(); 315 } 316 317 private void writeWritablePairs(int lengthOfBlocksAfterLastPair) throws IOException { 318 int unwrittenLength = lengthOfBlocksAfterLastPair; 319 for (Iterator<Pair> it = pairs.descendingIterator(); it.hasNext(); ) { 320 Pair p = it.next(); 321 if (p.hasBeenWritten()) { 322 break; 323 } 324 unwrittenLength += p.length(); 325 } 326 for (Pair p : pairs) { 327 if (p.hasBeenWritten()) { 328 continue; 329 } 330 unwrittenLength -= p.length(); 331 if (p.canBeWritten(unwrittenLength)) { 332 p.writeTo(os); 333 } else { 334 break; 335 } 336 } 337 } 338 339 private void rewriteLastPairs() { 340 LinkedList<Pair> lastPairs = new LinkedList<>(); 341 LinkedList<Integer> pairLength = new LinkedList<>(); 342 int offset = 0; 343 for (Iterator<Pair> it = pairs.descendingIterator(); it.hasNext(); ) { 344 Pair p = it.next(); 345 if (p.hasBeenWritten()) { 346 break; 347 } 348 int len = p.length(); 349 pairLength.addFirst(len); 350 lastPairs.addFirst(p); 351 offset += len; 352 if (offset >= MIN_OFFSET_OF_LAST_BACK_REFERENCE) { 353 break; 354 } 355 } 356 for (Pair p : lastPairs) { 357 pairs.remove(p); 358 } 359 // lastPairs may contain between one and four Pairs: 360 // * the last pair may be a one byte literal 361 // * all other Pairs contain a back-reference which must be four bytes long at minimum 362 // we could merge them all into a single literal block but 363 // this may harm compression. For example compressing 364 // "bla.tar" from our tests yields a last block containing a 365 // back-reference of length > 2k and we'd end up with a last 366 // literal of that size rather than a 2k back-reference and a 367 // 12 byte literal at the end. 368 369 // Instead we merge all but the first of lastPairs into a new 370 // literal-only Pair "replacement" and look at the 371 // back-reference in the first of lastPairs and see if we can 372 // split it. We can split it if it is longer than 16 - 373 // replacement.length (i.e. the minimal length of four is kept 374 // while making sure the last literal is at least twelve bytes 375 // long). If we can't split it, we expand the first of the pairs 376 // as well. 377 378 // this is not optimal, we could get better compression 379 // results with more complex approaches as the last literal 380 // only needs to be five bytes long if the previous 381 // back-reference has an offset big enough 382 383 final int lastPairsSize = lastPairs.size(); 384 int toExpand = 0; 385 for (int i = 1; i < lastPairsSize; i++) { 386 toExpand += pairLength.get(i); 387 } 388 Pair replacement = new Pair(); 389 if (toExpand > 0) { 390 replacement.prependLiteral(expand(toExpand, toExpand)); 391 } 392 Pair splitCandidate = lastPairs.get(0); 393 int stillNeeded = MIN_OFFSET_OF_LAST_BACK_REFERENCE - toExpand; 394 int brLen = splitCandidate.hasBackReference() ? splitCandidate.backReferenceLength() : 0; 395 if (splitCandidate.hasBackReference() && brLen >= MIN_BACK_REFERENCE_LENGTH + stillNeeded) { 396 replacement.prependLiteral(expand(toExpand + stillNeeded, stillNeeded)); 397 pairs.add(splitCandidate.splitWithNewBackReferenceLengthOf(brLen - stillNeeded)); 398 } else { 399 if (splitCandidate.hasBackReference()) { 400 replacement.prependLiteral(expand(toExpand + brLen, brLen)); 401 } 402 splitCandidate.prependTo(replacement); 403 } 404 pairs.add(replacement); 405 } 406 407 /** 408 * Returns a builder correctly configured for the LZ4 algorithm. 409 * @return a builder correctly configured for the LZ4 algorithm 410 */ 411 public static Parameters.Builder createParameterBuilder() { 412 int maxLen = BlockLZ4CompressorInputStream.WINDOW_SIZE - 1; 413 return Parameters.builder(BlockLZ4CompressorInputStream.WINDOW_SIZE) 414 .withMinBackReferenceLength(MIN_BACK_REFERENCE_LENGTH) 415 .withMaxBackReferenceLength(maxLen) 416 .withMaxOffset(maxLen) 417 .withMaxLiteralLength(maxLen); 418 } 419 420 final static class Pair { 421 private final Deque<byte[]> literals = new LinkedList<>(); 422 private int brOffset, brLength; 423 private boolean written; 424 425 private void prependLiteral(byte[] data) { 426 literals.addFirst(data); 427 } 428 byte[] addLiteral(LZ77Compressor.LiteralBlock block) { 429 byte[] copy = Arrays.copyOfRange(block.getData(), block.getOffset(), 430 block.getOffset() + block.getLength()); 431 literals.add(copy); 432 return copy; 433 } 434 void setBackReference(LZ77Compressor.BackReference block) { 435 if (hasBackReference()) { 436 throw new IllegalStateException(); 437 } 438 brOffset = block.getOffset(); 439 brLength = block.getLength(); 440 } 441 boolean hasBackReference() { 442 return brOffset > 0; 443 } 444 boolean canBeWritten(int lengthOfBlocksAfterThisPair) { 445 return hasBackReference() 446 && lengthOfBlocksAfterThisPair >= MIN_OFFSET_OF_LAST_BACK_REFERENCE + MIN_BACK_REFERENCE_LENGTH; 447 } 448 int length() { 449 return literalLength() + brLength; 450 } 451 private boolean hasBeenWritten() { 452 return written; 453 } 454 void writeTo(OutputStream out) throws IOException { 455 int litLength = literalLength(); 456 out.write(lengths(litLength, brLength)); 457 if (litLength >= BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK) { 458 writeLength(litLength - BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK, out); 459 } 460 for (byte[] b : literals) { 461 out.write(b); 462 } 463 if (hasBackReference()) { 464 ByteUtils.toLittleEndian(out, brOffset, 2); 465 if (brLength - MIN_BACK_REFERENCE_LENGTH >= BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK) { 466 writeLength(brLength - MIN_BACK_REFERENCE_LENGTH 467 - BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK, out); 468 } 469 } 470 written = true; 471 } 472 private int literalLength() { 473 int length = 0; 474 for (byte[] b : literals) { 475 length += b.length; 476 } 477 return length; 478 } 479 private static int lengths(int litLength, int brLength) { 480 int l = litLength < 15 ? litLength : 15; 481 int br = brLength < 4 ? 0 : (brLength < 19 ? brLength - 4 : 15); 482 return (l << BlockLZ4CompressorInputStream.SIZE_BITS) | br; 483 } 484 private static void writeLength(int length, OutputStream out) throws IOException { 485 while (length >= 255) { 486 out.write(255); 487 length -= 255; 488 } 489 out.write(length); 490 } 491 private int backReferenceLength() { 492 return brLength; 493 } 494 private void prependTo(Pair other) { 495 Iterator<byte[]> listBackwards = literals.descendingIterator(); 496 while (listBackwards.hasNext()) { 497 other.prependLiteral(listBackwards.next()); 498 } 499 } 500 private Pair splitWithNewBackReferenceLengthOf(int newBackReferenceLength) { 501 Pair p = new Pair(); 502 p.literals.addAll(literals); 503 p.brOffset = brOffset; 504 p.brLength = newBackReferenceLength; 505 return p; 506 } 507 } 508}