001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, 013 * software distributed under the License is distributed on an 014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 * KIND, either express or implied. See the License for the 016 * specific language governing permissions and limitations 017 * under the License. 018 */ 019package org.apache.commons.compress.compressors.snappy; 020 021import java.io.IOException; 022import java.io.OutputStream; 023 024import org.apache.commons.compress.compressors.CompressorOutputStream; 025import org.apache.commons.compress.compressors.lz77support.LZ77Compressor; 026import org.apache.commons.compress.compressors.lz77support.Parameters; 027import org.apache.commons.compress.utils.ByteUtils; 028 029/** 030 * CompressorOutputStream for the raw Snappy format. 031 * 032 * <p>This implementation uses an internal buffer in order to handle 033 * the back-references that are at the heart of the LZ77 algorithm. 034 * The size of the buffer must be at least as big as the biggest 035 * offset used in the compressed stream. The current version of the 036 * Snappy algorithm as defined by Google works on 32k blocks and 037 * doesn't contain offsets bigger than 32k which is the default block 038 * size used by this class.</p> 039 * 040 * <p>The raw Snappy format requires the uncompressed size to be 041 * written at the beginning of the stream using a varint 042 * representation, i.e. the number of bytes needed to write the 043 * information is not known before the uncompressed size is 044 * known. We've chosen to make the uncompressedSize a parameter of the 045 * constructor in favor of buffering the whole output until the size 046 * is known. When using the {@link FramedSnappyCompressorOutputStream} 047 * this limitation is taken care of by the warpping framing 048 * format.</p> 049 * 050 * @see <a href="https://github.com/google/snappy/blob/master/format_description.txt">Snappy compressed format description</a> 051 * @since 1.14 052 * @NotThreadSafe 053 */ 054public class SnappyCompressorOutputStream extends CompressorOutputStream { 055 private final LZ77Compressor compressor; 056 private final OutputStream os; 057 private final ByteUtils.ByteConsumer consumer; 058 059 // used in one-arg write method 060 private final byte[] oneByte = new byte[1]; 061 062 private boolean finished = false; 063 064 /** 065 * Constructor using the default block size of 32k. 066 * 067 * @param os the outputstream to write compressed data to 068 * @param uncompressedSize the uncompressed size of data 069 * @throws IOException if writing of the size fails 070 */ 071 public SnappyCompressorOutputStream(final OutputStream os, final long uncompressedSize) throws IOException { 072 this(os, uncompressedSize, SnappyCompressorInputStream.DEFAULT_BLOCK_SIZE); 073 } 074 075 /** 076 * Constructor using a configurable block size. 077 * 078 * @param os the outputstream to write compressed data to 079 * @param uncompressedSize the uncompressed size of data 080 * @param blockSize the block size used - must be a power of two 081 * @throws IOException if writing of the size fails 082 */ 083 public SnappyCompressorOutputStream(final OutputStream os, final long uncompressedSize, final int blockSize) 084 throws IOException { 085 this(os, uncompressedSize, createParameterBuilder(blockSize).build()); 086 } 087 088 /** 089 * Constructor providing full control over the underlying LZ77 compressor. 090 * 091 * @param os the outputstream to write compressed data to 092 * @param uncompressedSize the uncompressed size of data 093 * @param params the parameters to use by the compressor - note 094 * that the format itself imposes some limits like a maximum match 095 * length of 64 bytes 096 * @throws IOException if writing of the size fails 097 */ 098 public SnappyCompressorOutputStream(final OutputStream os, final long uncompressedSize, Parameters params) 099 throws IOException { 100 this.os = os; 101 consumer = new ByteUtils.OutputStreamByteConsumer(os); 102 compressor = new LZ77Compressor(params, new LZ77Compressor.Callback() { 103 @Override 104 public void accept(LZ77Compressor.Block block) throws IOException { 105 switch (block.getType()) { 106 case LITERAL: 107 writeLiteralBlock((LZ77Compressor.LiteralBlock) block); 108 break; 109 case BACK_REFERENCE: 110 writeBackReference((LZ77Compressor.BackReference) block); 111 break; 112 case EOD: 113 break; 114 } 115 } 116 }); 117 writeUncompressedSize(uncompressedSize); 118 } 119 120 @Override 121 public void write(int b) throws IOException { 122 oneByte[0] = (byte) (b & 0xff); 123 write(oneByte); 124 } 125 126 @Override 127 public void write(byte[] data, int off, int len) throws IOException { 128 compressor.compress(data, off, len); 129 } 130 131 @Override 132 public void close() throws IOException { 133 try { 134 finish(); 135 } finally { 136 os.close(); 137 } 138 } 139 140 /** 141 * Compresses all remaining data and writes it to the stream, 142 * doesn't close the underlying stream. 143 * @throws IOException if an error occurs 144 */ 145 public void finish() throws IOException { 146 if (!finished) { 147 compressor.finish(); 148 finished = true; 149 } 150 } 151 152 private void writeUncompressedSize(long uncompressedSize) throws IOException { 153 boolean more = false; 154 do { 155 int currentByte = (int) (uncompressedSize & 0x7F); 156 more = uncompressedSize > currentByte; 157 if (more) { 158 currentByte |= 0x80; 159 } 160 os.write(currentByte); 161 uncompressedSize >>= 7; 162 } while (more); 163 } 164 165 // literal length is stored as (len - 1) either inside the tag 166 // (six bits minus four flags) or in 1 to 4 bytes after the tag 167 private static final int MAX_LITERAL_SIZE_WITHOUT_SIZE_BYTES = 60; 168 private static final int MAX_LITERAL_SIZE_WITH_ONE_SIZE_BYTE = 1 << 8; 169 private static final int MAX_LITERAL_SIZE_WITH_TWO_SIZE_BYTES = 1 << 16; 170 private static final int MAX_LITERAL_SIZE_WITH_THREE_SIZE_BYTES = 1 << 24; 171 172 private static final int ONE_SIZE_BYTE_MARKER = 60 << 2; 173 private static final int TWO_SIZE_BYTE_MARKER = 61 << 2; 174 private static final int THREE_SIZE_BYTE_MARKER = 62 << 2; 175 private static final int FOUR_SIZE_BYTE_MARKER = 63 << 2; 176 177 private void writeLiteralBlock(LZ77Compressor.LiteralBlock block) throws IOException { 178 int len = block.getLength(); 179 if (len <= MAX_LITERAL_SIZE_WITHOUT_SIZE_BYTES) { 180 writeLiteralBlockNoSizeBytes(block, len); 181 } else if (len <= MAX_LITERAL_SIZE_WITH_ONE_SIZE_BYTE) { 182 writeLiteralBlockOneSizeByte(block, len); 183 } else if (len <= MAX_LITERAL_SIZE_WITH_TWO_SIZE_BYTES) { 184 writeLiteralBlockTwoSizeBytes(block, len); 185 } else if (len <= MAX_LITERAL_SIZE_WITH_THREE_SIZE_BYTES) { 186 writeLiteralBlockThreeSizeBytes(block, len); 187 } else { 188 writeLiteralBlockFourSizeBytes(block, len); 189 } 190 } 191 192 private void writeLiteralBlockNoSizeBytes(LZ77Compressor.LiteralBlock block, int len) throws IOException { 193 writeLiteralBlockWithSize(len - 1 << 2, 0, len, block); 194 } 195 196 private void writeLiteralBlockOneSizeByte(LZ77Compressor.LiteralBlock block, int len) throws IOException { 197 writeLiteralBlockWithSize(ONE_SIZE_BYTE_MARKER, 1, len, block); 198 } 199 200 private void writeLiteralBlockTwoSizeBytes(LZ77Compressor.LiteralBlock block, int len) throws IOException { 201 writeLiteralBlockWithSize(TWO_SIZE_BYTE_MARKER, 2, len, block); 202 } 203 204 private void writeLiteralBlockThreeSizeBytes(LZ77Compressor.LiteralBlock block, int len) throws IOException { 205 writeLiteralBlockWithSize(THREE_SIZE_BYTE_MARKER, 3, len, block); 206 } 207 208 private void writeLiteralBlockFourSizeBytes(LZ77Compressor.LiteralBlock block, int len) throws IOException { 209 writeLiteralBlockWithSize(FOUR_SIZE_BYTE_MARKER, 4, len, block); 210 } 211 212 private void writeLiteralBlockWithSize(int tagByte, int sizeBytes, int len, LZ77Compressor.LiteralBlock block) 213 throws IOException { 214 os.write(tagByte); 215 writeLittleEndian(sizeBytes, len - 1); 216 os.write(block.getData(), block.getOffset(), len); 217 } 218 219 private void writeLittleEndian(final int numBytes, int num) throws IOException { 220 ByteUtils.toLittleEndian(consumer, num, numBytes); 221 } 222 223 // Back-references ("copies") have their offset/size information 224 // in two, three or five bytes. 225 private static final int MIN_MATCH_LENGTH_WITH_ONE_OFFSET_BYTE = 4; 226 private static final int MAX_MATCH_LENGTH_WITH_ONE_OFFSET_BYTE = 11; 227 private static final int MAX_OFFSET_WITH_ONE_OFFSET_BYTE = 1 << 11 - 1; 228 private static final int MAX_OFFSET_WITH_TWO_OFFSET_BYTES = 1 << 16 - 1; 229 230 private static final int ONE_BYTE_COPY_TAG = 1; 231 private static final int TWO_BYTE_COPY_TAG = 2; 232 private static final int FOUR_BYTE_COPY_TAG = 3; 233 234 private void writeBackReference(LZ77Compressor.BackReference block) throws IOException { 235 final int len = block.getLength(); 236 final int offset = block.getOffset(); 237 if (len >= MIN_MATCH_LENGTH_WITH_ONE_OFFSET_BYTE && len <= MAX_MATCH_LENGTH_WITH_ONE_OFFSET_BYTE 238 && offset <= MAX_OFFSET_WITH_ONE_OFFSET_BYTE) { 239 writeBackReferenceWithOneOffsetByte(len, offset); 240 } else if (offset < MAX_OFFSET_WITH_TWO_OFFSET_BYTES) { 241 writeBackReferenceWithTwoOffsetBytes(len, offset); 242 } else { 243 writeBackReferenceWithFourOffsetBytes(len, offset); 244 } 245 } 246 247 private void writeBackReferenceWithOneOffsetByte(int len, int offset) throws IOException { 248 os.write(ONE_BYTE_COPY_TAG | ((len - 4) << 2) | ((offset & 0x700) >> 3)); 249 os.write(offset & 0xff); 250 } 251 252 private void writeBackReferenceWithTwoOffsetBytes(int len, int offset) throws IOException { 253 writeBackReferenceWithLittleEndianOffset(TWO_BYTE_COPY_TAG, 2, len, offset); 254 } 255 256 private void writeBackReferenceWithFourOffsetBytes(int len, int offset) throws IOException { 257 writeBackReferenceWithLittleEndianOffset(FOUR_BYTE_COPY_TAG, 4, len, offset); 258 } 259 260 private void writeBackReferenceWithLittleEndianOffset(int tag, int offsetBytes, int len, int offset) 261 throws IOException { 262 os.write(tag | ((len - 1) << 2)); 263 writeLittleEndian(offsetBytes, offset); 264 } 265 266 // technically the format could use shorter matches but with a 267 // length of three the offset would be encoded as at least two 268 // bytes in addition to the tag, so yield no compression at all 269 private static final int MIN_MATCH_LENGTH = 4; 270 // Snappy stores the match length in six bits of the tag 271 private static final int MAX_MATCH_LENGTH = 64; 272 273 /** 274 * Returns a builder correctly configured for the Snappy algorithm using the gven block size. 275 * @param blockSize the block size. 276 * @return a builder correctly configured for the Snappy algorithm using the gven block size 277 */ 278 public static Parameters.Builder createParameterBuilder(int blockSize) { 279 // the max offset and max literal length defined by the format 280 // are 2^32 - 1 and 2^32 respectively - with blockSize being 281 // an integer we will never exceed that 282 return Parameters.builder(blockSize) 283 .withMinBackReferenceLength(MIN_MATCH_LENGTH) 284 .withMaxBackReferenceLength(MAX_MATCH_LENGTH) 285 .withMaxOffset(blockSize) 286 .withMaxLiteralLength(blockSize); 287 } 288}