7 #include <unordered_map> 9 #include "base/pod_array.h" 10 #include "base/flit.h" 12 #include "strings/unique_strings.h" 13 #include "util/coding/sequence_array.h" 26 unsigned DeltaEncode16(
const uint16_t* src,
unsigned cnt, uint16_t* dest);
// Identifier type for dictionary symbols: an alias of the 16-bit unsigned type uint16.
30 typedef uint16 SymbId;
// kMaxAlphabetSize evaluates to (1 << 14) - 1 = 16383; kInvalidId is one past it (16384).
// NOTE(review): kInvalidId is presumably the "no such symbol" sentinel - confirm against its users.
33 enum { kMaxAlphabetSize = (1 << 14) - 1, kInvalidId = kMaxAlphabetSize + 1 };
// Default constructor: a fresh Record starts with both cnt and id equal to 0.
40 Record() : cnt(0), id(0) {}
// Records one occurrence of val by incrementing the cnt field of its freq_map_ entry.
// freq_map_ is a std::unordered_map, so operator[] first inserts a default-constructed
// Record when val has not been seen before.
53 void Add(T val) { ++freq_map_[val].cnt; }
// Returns the number of elements currently stored in alphabet_.
55 size_t alphabet_size()
const {
return alphabet_.size(); }
// Returns the alphabet_ element stored at index i.
// The lookup uses std::vector::operator[], which does no bounds checking, so the
// caller must pass i < alphabet_size().
57 T FromId(SymbId i)
const {
return alphabet_[i]; }
60 SymbId Resolve(T t)
const {
61 auto it = freq_map_.find(t);
62 if (it == std::end(freq_map_))
// Array overload of Resolve: takes `count` input values at src and a SymbId output buffer dest.
// NOTE(review): the definition is not visible here; presumably it writes one SymbId per input
// value and reports failure through the bool result - confirm against the definition.
68 bool Resolve(
const T* src, uint32_t count, SymbId* dest);
// Const member that takes a raw output buffer and returns a size_t.
// NOTE(review): presumably the return value is the number of bytes written and dest must hold
// at least GetMaxSerializedSize() bytes - confirm against the definitions.
76 size_t SerializeTo(uint8_t* dest)
const;
// Const member returning a size_t. NOTE(review): presumably an upper bound on what
// SerializeTo() writes - confirm.
77 size_t GetMaxSerializedSize()
const;
// Returns the number of distinct keys currently held in freq_map_.
81 size_t dict_size()
const {
return freq_map_.size(); }
// One Record per distinct value; Add() increments Record::cnt here and dict_size() reports its size.
84 std::unordered_map<T, Record> freq_map_;
// Values addressed by SymbId: FromId(i) returns alphabet_[i].
85 std::vector<T> alphabet_;
// Three distinct single-bit values (0x1, 0x2, 0x4), so they can be OR-ed together in one byte.
// NOTE(review): presumably these are the bits stored in the header's `flags` field - confirm.
90 enum { kDictBit = 0x1, kFinalBit = 0x2, kDictSeqBit = 0x4 };
// Header fields; the BlockHeader() constructor sets all three to 0.
// NOTE(review): the units are not visible here; the *_comprs fields presumably hold compressed
// sizes in bytes - confirm.
95 uint16_t num_sequences;
98 uint32_t byte_len_size_comprs;
99 uint32_t sequence_size_comprs;
// Declarations only; the definitions are not part of this view.
// Write: const member taking an output buffer and returning a uint8_t.
// NOTE(review): presumably the number of header bytes written - confirm.
101 uint8_t Write(uint8_t* dest)
const;
// Read: fills this object from the bytes at src (non-const member, returns nothing).
// NOTE(review): no length is passed, so src must be long enough for the header - confirm how
// callers guarantee that.
105 void Read(
const uint8_t* src);
// Static helper taking a flags byte and returning a uint8_t.
// NOTE(review): presumably the serialized header length for those flags - confirm.
108 static uint8_t HeaderSize(uint8_t flags);
// Default constructor: zero-initializes flags, num_sequences, byte_len_size_comprs and
// sequence_size_comprs.
110 BlockHeader() : flags(0), num_sequences(0), byte_len_size_comprs(0),
111 sequence_size_comprs(0) {}
120 const std::vector<strings::ByteRange>& compressed_blocks()
const {
121 return compressed_blocks_;
124 void ClearCompressedData() {
125 compressed_bufs_.clear();
126 compressed_blocks_.clear();
// Sets disable_seq_dict_ to true. There is no visible counterpart that sets it back to false.
129 void DisableSeqDictionary() { disable_seq_dict_ =
true; }
// Brings LiteralDictBase::SymbId into this scope under the short name SymbId.
136 using SymbId = LiteralDictBase::SymbId;
// The member functions below are declarations only; their definitions are not part of this
// view, so the notes describe signatures rather than behavior.
138 void AnalyzePreDict();
// Takes one entry as a byte range and returns a bool.
141 bool LearnSeqDict(strings::ByteRange entry);
// NOTE(review): `final` presumably marks the last block of a stream (cf. kFinalBit) - confirm.
145 void CompressFlitSequences(
bool final);
148 void CompressRawLit(
bool final);
152 void AnalyzeSequenceDict();
156 bool PrepareForSymbAvailability(uint32_t cnt);
158 void BacktrackToRaw();
// Takes a mutable array of cnt SymbId values.
159 void AddEncodedSymbols(SymbId* src, uint32_t cnt);
// Pure virtual: each concrete subclass must provide its own PrepareDict().
161 virtual uint32_t PrepareDict() = 0;
// Byte buffers and a buffer of 32-bit codes.
// NOTE(review): their exact roles are not visible in this view - see the definitions that fill them.
163 base::PODArray<uint8> lit_data_, prev_block_;
168 base::PODArray<uint32> len_code_;
// Compile-time check that lit_data_'s declared alignment is at least that of uint32_t.
// NOTE(review): presumably needed because 32-bit values are accessed in place inside lit_data_ - confirm.
170 static_assert(
alignof(uint32_t) <= decltype(lit_data_)::alignment_v,
"");
// Heap buffers owned through unique_ptr, so clearing or destroying the vector frees them.
172 std::vector<std::unique_ptr<uint8_t[]>> compressed_bufs_;
// The ranges returned by compressed_blocks().
// NOTE(review): presumably views into compressed_bufs_, which would make them dangle once those
// buffers are released - confirm.
173 std::vector<strings::ByteRange> compressed_blocks_;
// Off by default; DisableSeqDictionary() switches it to true.
175 bool disable_seq_dict_ =
false;
// Has no in-class initializer, unlike disable_seq_dict_ above.
// NOTE(review): verify it is assigned before its first read.
176 uint32_t literal_size_;
// Three-valued state with the member state_ starting at PRE_DICT.
// NOTE(review): by their names, LIT_DICT / NO_LIT_DICT presumably record whether a literal
// dictionary ended up being used - confirm against the code that assigns state_.
197 enum State { PRE_DICT, LIT_DICT, NO_LIT_DICT } state_ = PRE_DICT;
// Hash map keyed by byte range with an EntryVal payload (EntryVal is declared outside this view).
206 google::dense_hash_map<strings::ByteRange, EntryVal> seq_map_;
207 base::PODArray<uint32_t> duplicate_seq_;
// Hash map from byte range to a 32-bit value.
// NOTE(review): presumably the id of that sequence in the sequence dictionary - confirm.
211 google::dense_hash_map<strings::ByteRange, uint32> dict_seq_map_;
// Scratch and output buffers; their exact use is not visible in this view.
213 base::PODArray<uint8> compress_data_, tmp_space_;
214 base::PODArray<SymbId> tmp_symb_;
// Owned through unique_ptr; ZstdCntx is declared outside this view.
217 std::unique_ptr<ZstdCntx> zstd_cntx_;
// Owned byte buffer and its length; the length starts at 0.
219 std::unique_ptr<uint8_t[]> zstd_dict_;
220 size_t zstd_dict_size_ = 0;
// Counters and a ratio, all starting at 0.
221 size_t added_lit_cnt_ = 0, dict_ref_bytes_ = 0;
222 double dict_nominal_ratio_ = 0;
// Only 4- and 8-byte integer widths are accepted; any other INT_SIZE fails to compile.
226 static_assert(INT_SIZE == 4 || INT_SIZE == 8,
"");
// Unsigned integer type of the selected width: uint32_t for INT_SIZE == 4, uint64_t otherwise.
229 using UT = std::conditional_t<INT_SIZE == 4, uint32_t, uint64_t>;
// Declarations only; the definitions are not part of this view.
// Takes cnt values of type UT starting at src.
233 void Add(
const UT* src,
unsigned cnt);
// Takes a std::string output parameter and returns a bool.
// NOTE(review): the meaning of the bool result is not visible here - confirm against the definition.
236 bool GetDictSerialized(std::string* dest);
// Overrides a virtual PrepareDict() inherited from the base class.
241 uint32_t PrepareDict()
override;
// Takes cnt values of type UT starting at src and returns a bool.
243 bool AddDictEncoded(
const UT* src,
unsigned cnt);
// Brings LiteralDictBase::SymbId into this scope under the short name SymbId.
249 using SymbId = LiteralDictBase::SymbId;
// The member functions below are declarations only; their definitions are not part of this view.
// Takes an input byte range plus an output parameter `consumed`, and returns an int.
// NOTE(review): the meaning of the int result and of *consumed (presumably bytes read from br)
// is not visible here - confirm.
258 int Decompress(strings::ByteRange br, uint32_t* consumed);
// Takes cnt bytes starting at src.
260 void SetDict(
const uint8_t* src,
unsigned cnt);
263 void InflateSequences();
// NOTE(review): no length accompanies src, so the extent of the input must be known some other way - confirm.
265 void DecompressCodes(
const uint8_t* src);
// Pure virtual hooks that concrete subclasses must implement; both receive a byte range.
267 virtual void SetLitDict(strings::ByteRange br) = 0;
268 virtual bool AddFlitSeq(strings::ByteRange src) = 0;
// Starts out false.
// NOTE(review): presumably set once a block header has been parsed - confirm.
271 bool read_header_ =
false;
// Working buffers; their exact use is not visible in this view.
273 base::PODArray<uint32_t> len_code_;
274 base::PODArray<uint8_t> code_buf_, data_buf_;
277 std::vector<strings::ByteRange> seq_dict_range_;
// Starts at 0.
279 uint32_t next_seq_id_ = 0;
// Has no in-class initializer, unlike the neighbouring members.
// NOTE(review): verify it is assigned before its first dereference.
280 uint8_t* next_flit_ptr_;
// Owned through unique_ptr; Zstd is declared outside this view.
283 std::unique_ptr<Zstd> zstd_cntx_;
// Only 4- and 8-byte integer widths are accepted; any other INT_SIZE fails to compile.
288 static_assert(INT_SIZE == 4 || INT_SIZE == 8,
"");
// Unsigned integer type of the selected width: uint32_t for INT_SIZE == 4, uint64_t otherwise.
293 using UT = std::conditional_t<INT_SIZE == 4, uint32_t, uint64_t>;
// Range over mutable UT elements.
294 using IntRange = strings::Range<UT*>;
// Declaration only; returns an IntRange.
// NOTE(review): how the end of the data is signalled (presumably an empty range) is not visible here - confirm.
306 IntRange GetNextIntPage();
// Overrides of virtual functions inherited from the base class.
309 void SetLitDict(strings::ByteRange br)
override;
310 bool AddFlitSeq(strings::ByteRange src)
override;
// Buffers of UT values; their exact use is not visible in this view.
312 base::PODArray<UT> lit_dict_, int_buf_;
// Threshold used by DeflateFlitAndMap below: a parsed value counts as "small" when it is
// strictly less than this constant (`prev_small = val < kSmallNum`).
318 constexpr
unsigned kSmallNum = 5;
320 template<
typename UT,
typename MapperFn> uint32_t DeflateFlitAndMap(
321 const uint8_t* src, uint32_t cnt, MapperFn mapper_fn,
322 UT* dest, uint32_t dest_capacity) {
323 namespace flit = base::flit;
325 const uint8_t* end = src + cnt;
327 const uint8_t* next = src + flit::ParseT(src, &val);
328 LiteralDictBase::SymbId symbid = val;
330 dest[0] = mapper_fn(symbid);
332 uint32_t dest_index = 1;
333 bool prev_small = val < kSmallNum;
334 uint32_t prev_val = val + 1;
337 next += flit::ParseT(next, &val);
340 bool is_rep = val & 1;
346 if (dest_index + val > dest_capacity)
347 return dest_index + val;
349 for (
unsigned i = 0; i < val; ++i) {
351 dest[dest_index++] = mapper_fn(symbid);
358 if (dest_index >= dest_capacity)
359 return dest_index + 1;
361 prev_small = val < kSmallNum;
366 dest[dest_index++] = mapper_fn(symbid);
State
dictionary. In any case, lit_data_ and len_code_ contain the binary blobs and the length codes needed to decode them.