set_encoder.h
1 // Copyright 2017, Beeri 15. All rights reserved.
2 // Author: Roman Gershman (romange@gmail.com)
3 //
4 #pragma once
5 
6 #include <vector>
7 #include <unordered_map>
8 
9 #include "base/pod_array.h"
10 #include "base/flit.h"
11 
12 #include "strings/unique_strings.h"
13 #include "util/coding/sequence_array.h"
14 
15 namespace util {
16 
17 // src must be sorted. Returns 16bit array of literals (shifted left with lsb 0)
18 // and rep-codes(lsb 1). The input array should have numbers less than 1 << 15.
19 // rep-code - means that the previous literal should be replicated another (rep >> 1) times.
20 // For example, "{1,2,3,4,5,6,7,8,9}" will be translated into 3 numbers:
21 // "{1, 0 << 1 | 0, (7 << 1) | 1}".
22 // Encodes inline, i.e. dest and src can point to the same memory region.
23 // returns number of written literals in the dest.
24 // *** If src contains 14 bit numbers than it's guaranteed that the flit encoding of output array
25 // won't be larger than the flit encoding of the input array.
26 unsigned DeltaEncode16(const uint16_t* src, unsigned cnt, uint16_t* dest);
27 
29  public:
30  typedef uint16 SymbId;
31 
32  // To allow varint encoding of at most 2 bytes.
33  enum { kMaxAlphabetSize = (1 << 14) - 1, kInvalidId = kMaxAlphabetSize + 1 };
34 
35  protected:
36  struct Record {
37  unsigned cnt;
38  SymbId id;
39 
40  Record() : cnt(0), id(0) {}
41  };
42 };
43 
44 template<typename T> class LiteralDict : public LiteralDictBase {
45  public:
46  typedef T Literal;
47 
48  // If true was returned, then arr was recoded into 'cnt' 2-byte Dict::SymbId symbols into dest.
49  // False - then the input data does not have a narrow dictionary that justifies its creation,
50  // it's too random. dest and arr can be the same pointer to allow recoding in place.
51  // bool Prepare(const T* arr, uint32_t cnt, SymbId* dest);
52 
53  void Add(T val) { ++freq_map_[val].cnt; }
54 
55  size_t alphabet_size() const { return alphabet_.size(); }
56 
57  T FromId(SymbId i) const { return alphabet_[i]; }
58 
59  // returns kInvalidId if not found.
60  SymbId Resolve(T t) const {
61  auto it = freq_map_.find(t);
62  if (it == std::end(freq_map_))
63  return kInvalidId;
64  return it->second.id;
65  }
66 
67  // dest and src can point to the same address to allow recoding in place.
68  bool Resolve(const T* src, uint32_t count, SymbId* dest);
69 
70  void Clear() {
71  freq_map_.clear();
72  alphabet_.clear();
73  }
74 
75  // dest must have at least GetMaxSerializedSize.
76  size_t SerializeTo(uint8_t* dest) const;
77  size_t GetMaxSerializedSize() const;
78 
79  void Build();
80 
81  size_t dict_size() const { return freq_map_.size(); }
82  private:
83 
84  std::unordered_map<T, Record> freq_map_;
85  std::vector<T> alphabet_;
86 };
87 
88 
89 struct BlockHeader {
90  enum { kDictBit = 0x1, kFinalBit = 0x2, kDictSeqBit = 0x4 };
91 
92  uint8_t flags; // use dictionary.
93 
94  // num_sequences and byte_len_size_comprs defined only if "use dictionary" is set.
95  uint16_t num_sequences;
96 
97  // 3 bytes because we do not store more than 128K of intermediate data in a single block.
98  uint32_t byte_len_size_comprs; // 3 bytes
99  uint32_t sequence_size_comprs; // 3 bytes.
100 
101  uint8_t Write(uint8_t* dest) const;
102 
103  // Reads data from src into the object. must have at least HeaderSize bytes.
104  // src should point right after magic string.
105  void Read(const uint8_t* src);
106 
107  // Not including magic string but including flags byte.
108  static uint8_t HeaderSize(uint8_t flags);
109 
110  BlockHeader() : flags(0), num_sequences(0), byte_len_size_comprs(0),
111  sequence_size_comprs(0) {}
112 };
113 
115  public:
116  SeqEncoderBase();
117 
118  virtual ~SeqEncoderBase();
119 
120  const std::vector<strings::ByteRange>& compressed_blocks() const {
121  return compressed_blocks_;
122  }
123 
124  void ClearCompressedData() {
125  compressed_bufs_.clear();
126  compressed_blocks_.clear();
127  }
128 
129  void DisableSeqDictionary() { disable_seq_dict_ = true; }
130 
131  uint32 Cost() const;
132 
133  void Flush();
134 
135  protected:
136  using SymbId = LiteralDictBase::SymbId;
137 
138  void AnalyzePreDict();
139 
140  // Returns true of the sequence was assed to seq_map_. .
141  bool LearnSeqDict(strings::ByteRange entry);
142 
143  // Compresses flit sequence data into a binary blob.
144  // Empties lit_data_ buffer.
145  void CompressFlitSequences(bool final);
146 
147  // Compress raw literals as is.
148  void CompressRawLit(bool final);
149 
150  void AddCompressedBuf(const BlockHeader& bh);
151 
152  void AnalyzeSequenceDict();
153 
154  // Checks if cnt literals can be added to batch buffers.
155  // Returns true if succeeded, false if encoder state has been changed.
156  bool PrepareForSymbAvailability(uint32_t cnt);
157 
158  void BacktrackToRaw();
159  void AddEncodedSymbols(SymbId* src, uint32_t cnt);
160 
161  virtual uint32_t PrepareDict() = 0;
162 
163  base::PODArray<uint8> lit_data_, prev_block_;
164 
165  // For sequence dictionary mode, after dict_seq_map_ is built len_ points either to
166  // consequetive sequences in frame_seq_
167  // (using id = 0) or using index from dict_seq_ (by actually storing index + 1).
168  base::PODArray<uint32> len_code_;
169 
170  static_assert(alignof(uint32_t) <= decltype(lit_data_)::alignment_v, "");
171 
172  std::vector<std::unique_ptr<uint8_t[]>> compressed_bufs_;
173  std::vector<strings::ByteRange> compressed_blocks_;
174 
175  bool disable_seq_dict_ = false;
176  uint32_t literal_size_; // Single literal size (4 or 8 bytes).
177 
178  /*
179  The sequences are stored fully in each block, no overflows to the next one.
180  In case of NO_LIT_DICT we just store "uncompressible data" only literals.
181 */
182  // NO_LIT_DICT is final and can be switched to from any state. It means that the literals
183  // dictionary can not be built or not worth to be built. This can be decide abruptly in the
184  // middle of processing so the algorithm should always know how to fallback to that state.
185  // PRE_DICT switches to LIT_DICT or to NO_LIT_DICT.
186  // LIT_DICT switches to NO_LIT_DICT.
187  //
188  // PRE_DICT - to gather the first batch and to decide if it's possible to encode literals with
189  // dictionary encoding + varint/flit encoding.
190  // If yes, switches to LIT_DICT with all sequences reduced using dictionary and flit encodings.
191  // They are mapped for building seq dictionary via seq_map_.
192  // LIT_DICT - gathers all the sequences via seq_map_ until it's full.
193  // LIT_DICT has mini states by its own. It can process seq_map_ and to decide whether dict_seq_
194  // should be filled. If dict_seq_ is filled then LIT_DICT uses both literal and sequence
196  // At this moment sequence dictionary is final once it's built in CreateSequenceDict().
197  enum State { PRE_DICT, LIT_DICT, NO_LIT_DICT } state_ = PRE_DICT;
198 
199  struct EntryVal {
200  uint32 ref_cnt = 0;
201  uint32 dict_id = 0;
202  };
203 
204  // First step map - gathers all the sequences.
205  // Allows to store the information about duplicates during the learning phase.
206  google::dense_hash_map<strings::ByteRange, EntryVal> seq_map_;
207  base::PODArray<uint32_t> duplicate_seq_;
208 
209  // dict_seq_ is the extracted dictionary.
210  SequenceArray dict_seq_;
211  google::dense_hash_map<strings::ByteRange, uint32> dict_seq_map_; // reverse mapping.
212 
213  base::PODArray<uint8> compress_data_, tmp_space_;
214  base::PODArray<SymbId> tmp_symb_;
215  struct ZstdCntx;
216 
217  std::unique_ptr<ZstdCntx> zstd_cntx_;
218 
219  std::unique_ptr<uint8_t[]> zstd_dict_;
220  size_t zstd_dict_size_ = 0;
221  size_t added_lit_cnt_ = 0, dict_ref_bytes_ = 0;
222  double dict_nominal_ratio_ = 0;
223 };
224 
225 template<size_t INT_SIZE> class SeqEncoder : public SeqEncoderBase {
226  static_assert(INT_SIZE == 4 || INT_SIZE == 8, "");
227 
228  public:
229  using UT = std::conditional_t<INT_SIZE == 4, uint32_t, uint64_t>;
230 
231  SeqEncoder();
232 
233  void Add(const UT* src, unsigned cnt);
234 
235  // Return false if no dictionary was used.
236  bool GetDictSerialized(std::string* dest);
237 
238  private:
239 
240  // Returns alphabet size.
241  uint32_t PrepareDict() override;
242 
243  bool AddDictEncoded(const UT* src, unsigned cnt);
244 
245  LiteralDict<UT> dict_;
246 };
247 
249  using SymbId = LiteralDictBase::SymbId;
250  public:
251  SeqDecoderBase();
252  virtual ~SeqDecoderBase();
253 
254  // Returns 0 if decompression of the frame is ended, 1 if it's still going.
255  // In any case "*consumed" will hold how many bytes were consumed from br.
256  // If negative number is returned - then last portion of br is too small to decompress
257  // In that case, the -(return value) will tell how many input bytes are needed.
258  int Decompress(strings::ByteRange br, uint32_t* consumed);
259 
260  void SetDict(const uint8_t* src, unsigned cnt);
261 
262  protected:
263  void InflateSequences();
264 
265  void DecompressCodes(const uint8_t* src);
266 
267  virtual void SetLitDict(strings::ByteRange br) = 0;
268  virtual bool AddFlitSeq(strings::ByteRange src) = 0;
269 
270  BlockHeader bh_;
271  bool read_header_ = false;
272 
273  base::PODArray<uint32_t> len_code_;
274  base::PODArray<uint8_t> code_buf_, data_buf_;
275 
276  SequenceArray seq_dict_;
277  std::vector<strings::ByteRange> seq_dict_range_;
278 
279  uint32_t next_seq_id_ = 0;
280  uint8_t* next_flit_ptr_;
281 
282  struct Zstd;
283  std::unique_ptr<Zstd> zstd_cntx_;
284 };
285 
286 
287 template<size_t INT_SIZE> class SeqDecoder: public SeqDecoderBase {
288  static_assert(INT_SIZE == 4 || INT_SIZE == 8, "");
289 
290  public:
291  SeqDecoder();
292 
293  using UT = std::conditional_t<INT_SIZE == 4, uint32_t, uint64_t>;
294  using IntRange = strings::Range<UT*>;
295 
296  // After calling Decompress, this function returns the currently available non-empty integer page
297  // based on the decompressed block. If no more data is available - returns an empty range.
298  /* res = decoder.Decompress(src, &consumed);
299  for (auto page = decoder.GetNextIntPage(); !page.empty(); page = decoder.GetNextIntPage()) {
300  // *do something with the page
301  };
302  TODO: this function requires to allocate int_buf_ which is not really needed if we copy
303  into a 3rd party buffer. To revisit the interface and to allow copying data
304  using the sliding window.
305  */
306  IntRange GetNextIntPage();
307 
308  private:
309  void SetLitDict(strings::ByteRange br) override;
310  bool AddFlitSeq(strings::ByteRange src) override;
311 
312  base::PODArray<UT> lit_dict_, int_buf_;
313 
314  UT* next_int_ptr_;
315 };
316 
317 namespace internal {
318 constexpr unsigned kSmallNum = 5;
319 
320 template<typename UT, typename MapperFn> uint32_t DeflateFlitAndMap(
321  const uint8_t* src, uint32_t cnt, MapperFn mapper_fn,
322  UT* dest, uint32_t dest_capacity) {
323  namespace flit = base::flit;
324 
325  const uint8_t* end = src + cnt;
326  uint32_t val;
327  const uint8_t* next = src + flit::ParseT(src, &val);
328  LiteralDictBase::SymbId symbid = val;
329 
330  dest[0] = mapper_fn(symbid);
331 
332  uint32_t dest_index = 1;
333  bool prev_small = val < kSmallNum;
334  uint32_t prev_val = val + 1;
335 
336  while (next < end) {
337  next += flit::ParseT(next, &val);
338 
339  if (prev_small) {
340  bool is_rep = val & 1;
341  val >>= 1;
342 
343  if (is_rep) {
344  val += 1;
345 
346  if (dest_index + val > dest_capacity)
347  return dest_index + val;
348 
349  for (unsigned i = 0; i < val; ++i) {
350  symbid += prev_val;
351  dest[dest_index++] = mapper_fn(symbid);
352  }
353  prev_small = false;
354  continue;
355  }
356  }
357 
358  if (dest_index >= dest_capacity)
359  return dest_index + 1;
360 
361  prev_small = val < kSmallNum;
362  ++val;
363  symbid += val;
364  prev_val = val;
365 
366  dest[dest_index++] = mapper_fn(symbid);
367  }
368 
369  return dest_index;
370 }
371 
372 } // namespace internal
373 
374 } // namespace util
State
dictionary. In any case lit_data_ and len_ contain binary blobs and len codes to decode them.
Definition: set_encoder.h:197