001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, 013 * software distributed under the License is distributed on an 014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 * KIND, either express or implied. See the License for the 016 * specific language governing permissions and limitations 017 * under the License. 018 */ 019package org.apache.commons.compress.compressors.lz77support; 020 021import java.io.IOException; 022import java.util.Arrays; 023 024/** 025 * Helper class for compression algorithms that use the ideas of LZ77. 026 * 027 * <p>Most LZ77 derived algorithms split input data into blocks of 028 * uncompressed data (called literal blocks) and back-references 029 * (pairs of offsets and lengths) that state "add <code>length</code> 030 * bytes that are the same as those already written starting 031 * <code>offset</code> bytes before the current position. The details 032 * of how those blocks and back-references are encoded are quite 033 * different between the algorithms and some algorithms perform 034 * additional steps (Huffman encoding in the case of DEFLATE for 035 * example).</p> 036 * 037 * <p>This class attempts to extract the core logic - finding 038 * back-references - so it can be re-used. It follows the algorithm 039 * explained in section 4 of RFC 1951 (DEFLATE) and currently doesn't 040 * implement the "lazy match" optimization. The three-byte hash 041 * function used in this class is the same as the one used by zlib and 042 * InfoZIP's ZIP implementation of DEFLATE. The whole class is 043 * strongly inspired by InfoZIP's implementation.</p> 044 * 045 * <p>LZ77 is used vaguely here (as well as many other places that 046 * talk about it :-), LZSS would likely be closer to the truth but 047 * LZ77 has become the synonym for a whole family of algorithms.</p> 048 * 049 * <p>The API consists of a compressor that is fed <code>byte</code>s 050 * and emits {@link Block}s to a registered callback where the blocks 051 * represent either {@link LiteralBlock literal blocks}, {@link 052 * BackReference back-references} or {@link EOD end of data 053 * markers}. In order to ensure the callback receives all information, 054 * the {@code #finish} method must be used once all data has been fed 055 * into the compressor.</p> 056 * 057 * <p>Several parameters influence the outcome of the "compression":</p> 058 * <dl> 059 * 060 * <dt><code>windowSize</code></dt> <dd>the size of the sliding 061 * window, must be a power of two - this determines the maximum 062 * offset a back-reference can take. The compressor maintains a 063 * buffer of twice of <code>windowSize</code> - real world values are 064 * in the area of 32k.</dd> 065 * 066 * <dt><code>minBackReferenceLength</code></dt> 067 * <dd>Minimal length of a back-reference found. A true minimum of 3 is 068 * hard-coded inside of this implemention but bigger lengths can be 069 * configured.</dd> 070 * 071 * <dt><code>maxBackReferenceLength</code></dt> 072 * <dd>Maximal length of a back-reference found.</dd> 073 * 074 * <dt><code>maxOffset</code></dt> 075 * <dd>Maximal offset of a back-reference.</dd> 076 * 077 * <dt><code>maxLiteralLength</code></dt> 078 * <dd>Maximal length of a literal block.</dd> 079 * </dl> 080 * 081 * @see "https://tools.ietf.org/html/rfc1951#section-4" 082 * @since 1.14 083 * @NotThreadSafe 084 */ 085public class LZ77Compressor { 086 087 /** 088 * Base class representing blocks the compressor may emit. 089 * 090 * <p>This class is not supposed to be subclassed by classes 091 * outside of Commons Compress so it is considered internal and 092 * changed that would break subclasses may get introduced with 093 * future releases.</p> 094 */ 095 public static abstract class Block { 096 /** Enumeration of the block types the compressor may emit. */ 097 public enum BlockType { 098 LITERAL, BACK_REFERENCE, EOD 099 } 100 public abstract BlockType getType(); 101 } 102 103 /** 104 * Represents a literal block of data. 105 * 106 * <p>For performance reasons this encapsulates the real data, not 107 * a copy of it. Don't modify the data and process it inside of 108 * {@link Callback#accept} immediately as it will get overwritten 109 * sooner or later.</p> 110 */ 111 public static final class LiteralBlock extends Block { 112 private final byte[] data; 113 private final int offset, length; 114 public LiteralBlock(byte[] data, int offset, int length) { 115 this.data = data; 116 this.offset = offset; 117 this.length = length; 118 } 119 /** 120 * The literal data. 121 * 122 * <p>This returns a life view of the actual data in order to 123 * avoid copying, modify the array at your own risk.</p> 124 * @return the data 125 */ 126 public byte[] getData() { 127 return data; 128 } 129 /** 130 * Offset into data where the literal block starts. 131 * @return the offset 132 */ 133 public int getOffset() { 134 return offset; 135 } 136 /** 137 * Length of literal block. 138 * @return the length 139 */ 140 public int getLength() { 141 return length; 142 } 143 @Override 144 public BlockType getType() { 145 return BlockType.LITERAL; 146 } 147 @Override 148 public String toString() { 149 return "LiteralBlock starting at " + offset + " with length " + length; 150 } 151 } 152 153 /** 154 * Represents a back-reference. 155 */ 156 public static final class BackReference extends Block { 157 private final int offset, length; 158 public BackReference(int offset, int length) { 159 this.offset = offset; 160 this.length = length; 161 } 162 /** 163 * Provides the offset of the back-reference. 164 * @return the offset 165 */ 166 public int getOffset() { 167 return offset; 168 } 169 /** 170 * Provides the length of the back-reference. 171 * @return the length 172 */ 173 public int getLength() { 174 return length; 175 } 176 @Override 177 public BlockType getType() { 178 return BlockType.BACK_REFERENCE; 179 } 180 @Override 181 public String toString() { 182 return "BackReference with offset " + offset + " and length " + length; 183 } 184 } 185 186 /** A simple "we are done" marker. */ 187 public static final class EOD extends Block { 188 @Override 189 public BlockType getType() { 190 return BlockType.EOD; 191 } 192 } 193 194 private static final Block THE_EOD = new EOD(); 195 196 /** 197 * Callback invoked while the compressor processes data. 198 * 199 * <p>The callback is invoked on the same thread that receives the 200 * bytes to compress and may be invoked multiple times during the 201 * execution of {@link #compress} or {@link #finish}.</p> 202 */ 203 public interface Callback { 204 /** 205 * Consumes a block. 206 * @param b the block to consume 207 * @throws IOException in case of an error 208 */ 209 void accept(Block b) throws IOException; 210 } 211 212 static final int NUMBER_OF_BYTES_IN_HASH = 3; 213 private static final int NO_MATCH = -1; 214 215 private final Parameters params; 216 private final Callback callback; 217 218 // the sliding window, twice as big as "windowSize" parameter 219 private final byte[] window; 220 // the head of hash-chain - indexed by hash-code, points to the 221 // location inside of window of the latest sequence of bytes with 222 // the given hash. 223 private final int[] head; 224 // for each window-location points to the latest earlier location 225 // with the same hash. Only stores values for the latest 226 // "windowSize" elements, the index is "window location modulo 227 // windowSize". 228 private final int[] prev; 229 230 // bit mask used when indexing into prev 231 private final int wMask; 232 233 private boolean initialized = false; 234 // the position inside of window that shall be encoded right now 235 private int currentPosition; 236 // the number of bytes available to compress including the one at 237 // currentPosition 238 private int lookahead = 0; 239 // the hash of the three bytes stating at the current position 240 private int insertHash = 0; 241 // the position inside of the window where the current literal 242 // block starts (in case we are inside of a literal block). 243 private int blockStart = 0; 244 // position of the current match 245 private int matchStart = NO_MATCH; 246 // number of missed insertString calls for the up to three last 247 // bytes of the last match that can only be performed once more 248 // data has been read 249 private int missedInserts = 0; 250 251 /** 252 * Initializes a compressor with parameters and a callback. 253 * @param params the parameters 254 * @param callback the callback 255 * @throws NullPointerException if either parameter is <code>null</code> 256 */ 257 public LZ77Compressor(Parameters params, Callback callback) { 258 if (params == null) { 259 throw new NullPointerException("params must not be null"); 260 } 261 if (callback == null) { 262 throw new NullPointerException("callback must not be null"); 263 } 264 this.params = params; 265 this.callback = callback; 266 267 final int wSize = params.getWindowSize(); 268 window = new byte[wSize * 2]; 269 wMask = wSize - 1; 270 head = new int[HASH_SIZE]; 271 Arrays.fill(head, NO_MATCH); 272 prev = new int[wSize]; 273 } 274 275 /** 276 * Feeds bytes into the compressor which in turn may emit zero or 277 * more blocks to the callback during the execution of this 278 * method. 279 * @param data the data to compress - must not be null 280 * @throws IOException if the callback throws an exception 281 */ 282 public void compress(byte[] data) throws IOException { 283 compress(data, 0, data.length); 284 } 285 286 /** 287 * Feeds bytes into the compressor which in turn may emit zero or 288 * more blocks to the callback during the execution of this 289 * method. 290 * @param data the data to compress - must not be null 291 * @param off the start offset of the data 292 * @param len the number of bytes to compress 293 * @throws IOException if the callback throws an exception 294 */ 295 public void compress(byte[] data, int off, int len) throws IOException { 296 final int wSize = params.getWindowSize(); 297 while (len > wSize) { // chop into windowSize sized chunks 298 doCompress(data, off, wSize); 299 off += wSize; 300 len -= wSize; 301 } 302 if (len > 0) { 303 doCompress(data, off, len); 304 } 305 } 306 307 /** 308 * Tells the compressor to process all remaining data and signal 309 * end of data to the callback. 310 * 311 * <p>The compressor will in turn emit at least one block ({@link 312 * EOD}) but potentially multiple blocks to the callback during 313 * the execution of this method.</p> 314 * @throws IOException if the callback throws an exception 315 */ 316 public void finish() throws IOException { 317 if (blockStart != currentPosition || lookahead > 0) { 318 currentPosition += lookahead; 319 flushLiteralBlock(); 320 } 321 callback.accept(THE_EOD); 322 } 323 324 /** 325 * Adds some initial data to fill the window with. 326 * 327 * <p>This is used if the stream has been cut into blocks and 328 * back-references of one block may refer to data of the previous 329 * block(s). One such example is the LZ4 frame format using block 330 * dependency.</p> 331 * 332 * @param data the data to fill the window with. 333 * @throws IllegalStateException if the compressor has already started to accept data 334 */ 335 public void prefill(byte[] data) { 336 if (currentPosition != 0 || lookahead != 0) { 337 throw new IllegalStateException("the compressor has already started to accept data, can't prefill anymore"); 338 } 339 340 // don't need more than windowSize for back-references 341 final int len = Math.min(params.getWindowSize(), data.length); 342 System.arraycopy(data, data.length - len, window, 0, len); 343 344 if (len >= NUMBER_OF_BYTES_IN_HASH) { 345 initialize(); 346 final int stop = len - NUMBER_OF_BYTES_IN_HASH + 1; 347 for (int i = 0; i < stop; i++) { 348 insertString(i); 349 } 350 missedInserts = NUMBER_OF_BYTES_IN_HASH - 1; 351 } else { // not enough data to hash anything 352 missedInserts = len; 353 } 354 blockStart = currentPosition = len; 355 } 356 357 // we use a 15 bit hashcode as calculated in updateHash 358 private static final int HASH_SIZE = 1 << 15; 359 private static final int HASH_MASK = HASH_SIZE - 1; 360 private static final int H_SHIFT = 5; 361 362 /** 363 * Assumes we are calculating the hash for three consecutive bytes 364 * as a rolling hash, i.e. for bytes ABCD if H is the hash of ABC 365 * the new hash for BCD is nextHash(H, D). 366 * 367 * <p>The hash is shifted by five bits on each update so all 368 * effects of A have been swapped after the third update.</p> 369 */ 370 private int nextHash(int oldHash, byte nextByte) { 371 final int nextVal = nextByte & 0xFF; 372 return ((oldHash << H_SHIFT) ^ nextVal) & HASH_MASK; 373 } 374 375 // performs the actual algorithm with the pre-condition len <= windowSize 376 private void doCompress(byte[] data, int off, int len) throws IOException { 377 int spaceLeft = window.length - currentPosition - lookahead; 378 if (len > spaceLeft) { 379 slide(); 380 } 381 System.arraycopy(data, off, window, currentPosition + lookahead, len); 382 lookahead += len; 383 if (!initialized && lookahead >= params.getMinBackReferenceLength()) { 384 initialize(); 385 } 386 if (initialized) { 387 compress(); 388 } 389 } 390 391 private void slide() throws IOException { 392 final int wSize = params.getWindowSize(); 393 if (blockStart != currentPosition && blockStart < wSize) { 394 flushLiteralBlock(); 395 blockStart = currentPosition; 396 } 397 System.arraycopy(window, wSize, window, 0, wSize); 398 currentPosition -= wSize; 399 matchStart -= wSize; 400 blockStart -= wSize; 401 for (int i = 0; i < HASH_SIZE; i++) { 402 int h = head[i]; 403 head[i] = h >= wSize ? h - wSize : NO_MATCH; 404 } 405 for (int i = 0; i < wSize; i++) { 406 int p = prev[i]; 407 prev[i] = p >= wSize ? p - wSize : NO_MATCH; 408 } 409 } 410 411 private void initialize() { 412 for (int i = 0; i < NUMBER_OF_BYTES_IN_HASH - 1; i++) { 413 insertHash = nextHash(insertHash, window[i]); 414 } 415 initialized = true; 416 } 417 418 private void compress() throws IOException { 419 final int minMatch = params.getMinBackReferenceLength(); 420 final boolean lazy = params.getLazyMatching(); 421 final int lazyThreshold = params.getLazyMatchingThreshold(); 422 423 while (lookahead >= minMatch) { 424 catchUpMissedInserts(); 425 int matchLength = 0; 426 int hashHead = insertString(currentPosition); 427 if (hashHead != NO_MATCH && hashHead - currentPosition <= params.getMaxOffset()) { 428 // sets matchStart as a side effect 429 matchLength = longestMatch(hashHead); 430 431 if (lazy && matchLength <= lazyThreshold && lookahead > minMatch) { 432 // try to find a longer match using the next position 433 matchLength = longestMatchForNextPosition(matchLength); 434 } 435 } 436 if (matchLength >= minMatch) { 437 if (blockStart != currentPosition) { 438 // emit preceeding literal block 439 flushLiteralBlock(); 440 blockStart = NO_MATCH; 441 } 442 flushBackReference(matchLength); 443 insertStringsInMatch(matchLength); 444 lookahead -= matchLength; 445 currentPosition += matchLength; 446 blockStart = currentPosition; 447 } else { 448 // no match, append to current or start a new literal 449 lookahead--; 450 currentPosition++; 451 if (currentPosition - blockStart >= params.getMaxLiteralLength()) { 452 flushLiteralBlock(); 453 blockStart = currentPosition; 454 } 455 } 456 } 457 } 458 459 /** 460 * Inserts the current three byte sequence into the dictionary and 461 * returns the previous head of the hash-chain. 462 * 463 * <p>Updates <code>insertHash</code> and <code>prev</code> as a 464 * side effect.</p> 465 */ 466 private int insertString(int pos) { 467 insertHash = nextHash(insertHash, window[pos - 1 + NUMBER_OF_BYTES_IN_HASH]); 468 int hashHead = head[insertHash]; 469 prev[pos & wMask] = hashHead; 470 head[insertHash] = pos; 471 return hashHead; 472 } 473 474 private int longestMatchForNextPosition(final int prevMatchLength) { 475 // save a bunch of values to restore them if the next match isn't better than the current one 476 final int prevMatchStart = matchStart; 477 final int prevInsertHash = insertHash; 478 479 lookahead--; 480 currentPosition++; 481 int hashHead = insertString(currentPosition); 482 final int prevHashHead = prev[currentPosition & wMask]; 483 int matchLength = longestMatch(hashHead); 484 485 if (matchLength <= prevMatchLength) { 486 // use the first match, as the next one isn't any better 487 matchLength = prevMatchLength; 488 matchStart = prevMatchStart; 489 490 // restore modified values 491 head[insertHash] = prevHashHead; 492 insertHash = prevInsertHash; 493 currentPosition--; 494 lookahead++; 495 } 496 return matchLength; 497 } 498 499 private void insertStringsInMatch(int matchLength) { 500 // inserts strings contained in current match 501 // insertString inserts the byte 2 bytes after position, which may not yet be available -> missedInserts 502 final int stop = Math.min(matchLength - 1, lookahead - NUMBER_OF_BYTES_IN_HASH); 503 // currentPosition has been inserted already 504 for (int i = 1; i <= stop; i++) { 505 insertString(currentPosition + i); 506 } 507 missedInserts = matchLength - stop - 1; 508 } 509 510 private void catchUpMissedInserts() { 511 while (missedInserts > 0) { 512 insertString(currentPosition - missedInserts--); 513 } 514 } 515 516 private void flushBackReference(int matchLength) throws IOException { 517 callback.accept(new BackReference(currentPosition - matchStart, matchLength)); 518 } 519 520 private void flushLiteralBlock() throws IOException { 521 callback.accept(new LiteralBlock(window, blockStart, currentPosition - blockStart)); 522 } 523 524 /** 525 * Searches the hash chain for real matches and returns the length 526 * of the longest match (0 if none were found) that isn't too far 527 * away (WRT maxOffset). 528 * 529 * <p>Sets matchStart to the index of the start position of the 530 * longest match as a side effect.</p> 531 */ 532 private int longestMatch(int matchHead) { 533 final int minLength = params.getMinBackReferenceLength(); 534 int longestMatchLength = minLength - 1; 535 final int maxPossibleLength = Math.min(params.getMaxBackReferenceLength(), lookahead); 536 final int minIndex = Math.max(0, currentPosition - params.getMaxOffset()); 537 final int niceBackReferenceLength = Math.min(maxPossibleLength, params.getNiceBackReferenceLength()); 538 final int maxCandidates = params.getMaxCandidates(); 539 for (int candidates = 0; candidates < maxCandidates && matchHead >= minIndex; candidates++) { 540 int currentLength = 0; 541 for (int i = 0; i < maxPossibleLength; i++) { 542 if (window[matchHead + i] != window[currentPosition + i]) { 543 break; 544 } 545 currentLength++; 546 } 547 if (currentLength > longestMatchLength) { 548 longestMatchLength = currentLength; 549 matchStart = matchHead; 550 if (currentLength >= niceBackReferenceLength) { 551 // no need to search any further 552 break; 553 } 554 } 555 matchHead = prev[matchHead & wMask]; 556 } 557 return longestMatchLength; // < minLength if no matches have been found, will be ignored in compress() 558 } 559}