pprint_utils.cc
1 // Copyright 2016, Beeri 15. All rights reserved.
2 // Author: Roman Gershman (romange@gmail.com)
3 //
#include "util/pprint/pprint_utils.h"

#include <algorithm>
#include <iostream>
#include <unordered_map>
#include <unordered_set>

#include <glog/stl_logging.h>
#include <google/protobuf/compiler/importer.h>
#include <google/protobuf/dynamic_message.h>

#include "base/flags.h"
#include "base/logging.h"
#include "strings/escaping.h"
#include "strings/numbers.h"
#include "strings/split.h"
#include "strings/strcat.h"
18 
19 DEFINE_bool(short, false, "");
20 DEFINE_string(csv, "", "comma delimited list of tag numbers. For repeated fields, it's possible "
21  "to add :[delimiting char] after a tag number.");
22 DEFINE_bool(use_csv_null, true, "When printing csv format use \\N for outputing undefined "
23  "optional fields.");
24 DEFINE_bool(aggregate_repeated, false, "When printing csv format, aggregate repeated leaves in one "
25  "line: \"xx,yy,..\"");
26 
27 DEFINE_bool(omit_blobs, true, "");
28 DEFINE_bool(skip_value_escaping, false, "");
29 DEFINE_string(root_node, "", "");
30 DEFINE_bool(omit_double_quotes, false, "Omits double quotes when printing string values");
31 
32 using std::cout;
33 using std::string;
34 using std::vector;
35 
36 using absl::StrCat;
37 
38 namespace util {
39 namespace pprint {
40 
41 FdPath::FdPath(const gpb::Descriptor* root, StringPiece path) {
42  std::vector<StringPiece> parts = absl::StrSplit(path, ".");
43  CHECK(!parts.empty()) << path;
44  const gpb::Descriptor* cur_descr = root;
45  for (size_t j = 0; j < parts.size(); ++j) {
46  const gpb::FieldDescriptor* field = nullptr;
47  uint32 tag_id;
48 
49  if (safe_strtou32(parts[j], &tag_id)) {
50  field = cur_descr->FindFieldByNumber(tag_id);
51  } else {
52  string tmp(parts[j].data(), parts[j].size());
53  field = cur_descr->FindFieldByName(tmp);
54  }
55 
56  CHECK(field) << "Can not find tag id " << parts[j];
57  if (j + 1 < parts.size()) {
58  CHECK_EQ(field->cpp_type(), gpb::FieldDescriptor::CPPTYPE_MESSAGE);
59  cur_descr = field->message_type();
60  }
61  path_.push_back(field);
62  }
63 }
64 
65 bool FdPath::IsRepeated() const {
66  for (auto v : path_) {
67  if (v->is_repeated())
68  return true;
69  }
70  return false;
71 }
72 
73 
74 void FdPath::ExtractValueRecur(const gpb::Message& msg, uint32 index, ValueCb cb) const {
75  CHECK_LT(index, path_.size());
76  auto fd = path_[index];
77  const gpb::Reflection* reflection = msg.GetReflection();
78  uint32 cur_repeated_depth = 0;
79  for (uint32 i = 0; i < index; ++i) {
80  if (path_[i]->is_repeated()) ++cur_repeated_depth;
81  }
82  if (fd->is_repeated()) {
83  int sz = reflection->FieldSize(msg, fd);
84  if (sz > 0) {
85  if (index + 1 < path_.size()) {
86  // Non leaves, repeated messages.
87  if (cur_repeated_depth < cur_repeated_stack_.size()) {
88  const gpb::Message& new_msg =
89  reflection->GetRepeatedMessage(msg, fd, cur_repeated_stack_[cur_repeated_depth]);
90  ExtractValueRecur(new_msg, index + 1, cb);
91  } else {
92  for (int i = 0; i < sz; ++i) {
93  cur_repeated_stack_.push_back(i);
94  const gpb::Message& new_msg = reflection->GetRepeatedMessage(msg, fd, i);
95  ExtractValueRecur(new_msg, index + 1, cb);
96  cur_repeated_stack_.pop_back();
97  }
98  }
99 
100  } else {
101  // Repeated leaves.
102  if (FLAGS_aggregate_repeated) {
103  cb(msg, fd, -1, sz);
104  } else {
105  for (int i = 0; i < sz; ++i) {
106  cb(msg, fd, i, -1);
107  }
108  }
109  }
110  }
111  return;
112  }
113 
114  if (index + 1 < path_.size()) {
115  const gpb::Message& new_msg = reflection->GetMessage(msg, fd);
116  ExtractValueRecur(new_msg, index + 1, cb);
117  return;
118  }
119  /*if (FLAGS_use_csv_null && !reflection->HasField(msg, fd)) {
120  cb("\\N");
121  return;
122  }
123  string res;
124  printer_.PrintFieldValueToString(msg, fd, -1, &res);*/
125  cb(msg, fd, -1, -1);
126 }
127 
128 static gpb::SimpleDescriptorDatabase proto_db;
129 static gpb::DescriptorPool proto_db_pool(&proto_db);
130 
131 gpb::Message* AllocateMsgByMeta(const string& type, const string& fd_set) {
132  CHECK(!type.empty());
133  CHECK(!fd_set.empty());
134 
135 
136  const gpb::Descriptor* descriptor = proto_db_pool.FindMessageTypeByName(type);
137  if (!descriptor) {
138  gpb::FileDescriptorSet fd_set_proto;
139  CHECK(fd_set_proto.ParseFromString(fd_set));
140  for (int i = 0; i < fd_set_proto.file_size(); ++i) {
141  // LOG(INFO) << fd_set_proto.file(i).DebugString();
142  /*const gpb::FileDescriptor* filed =
143  gpb::DescriptorPool::generated_pool()->FindFileByName(fd_set_proto.file(i).name());
144  if (filed != nullptr) {
145  LOG(INFO) << "Already exists " << filed->name();
146  } else {*/
147  CHECK(proto_db.Add(fd_set_proto.file(i)));
148  // filed = proto_db_pool.BuildFile(fd_set_proto.file(i));
149  // VLOG(1) << "Built " << filed->name() << "\n" << filed->DebugString();
150 
151  //
152  /*vector<int> exts;
153  proto_db.FindAllExtensionNumbers("google.protobuf.FieldOptions", &exts);
154  LOG(INFO) << "extensions " << exts;*/
155  }
156  descriptor = proto_db_pool.FindMessageTypeByName(type);
157  }
158 
159  CHECK(descriptor) << "Can not find " << type << " in the proto pool.";
160  return AllocateMsgFromDescr(descriptor);
161 }
162 
163 gpb::Message* AllocateMsgFromDescr(const gpb::Descriptor* descr) {
164  static gpb::DynamicMessageFactory message_factory(&proto_db_pool);
165  message_factory.SetDelegateToGeneratedFactory(true);
166 
167  const gpb::Message* msg_proto = message_factory.GetPrototype(descr);
168  CHECK_NOTNULL(msg_proto);
169  return msg_proto->New();
170 }
171 
172 PathNode* PathNode::AddChild(const gpb::FieldDescriptor* fd) {
173  for (PathNode& n : children) {
174  if (n.fd == fd) return &n;
175  }
176  children.push_back(PathNode(fd));
177  return &children.back();
178 }
179 
180 class BetterPrinter : public gpb::TextFormat::FieldValuePrinter {
181 public:
182  virtual string PrintString(const string& val) const override {
183  if (FLAGS_omit_blobs) {
184  if (val.size() > 100 && std::any_of(val.begin(), val.end(),
185  [](char c) { return c < 32; })) {
186  return "\"Not work safe!\"";
187  }
188  }
189  const string& val2 = FLAGS_skip_value_escaping ? val : absl::Utf8SafeCEscape(val);
190  if (FLAGS_omit_double_quotes) {
191  return val2;
192  }
193  return absl::StrCat("\"", val2, "\"");
194  }
195 };
196 
197 void RegisterCustomFieldPrinter(
198  const gpb::Descriptor* descriptor, Printer::FieldPrinterPredicate pred,
199  const std::unordered_map<int, const gpb::FieldDescriptor*>& fo_tags_map,
200  gpb::TextFormat::Printer* printer) {
201  CHECK_NOTNULL(descriptor);
202 
203  for (int i = 0; i < descriptor->field_count(); ++i) {
204  const gpb::FieldDescriptor* fd = descriptor->field(i);
205 
206  if (fd->cpp_type() == gpb::FieldDescriptor::CPPTYPE_MESSAGE) {
207  RegisterCustomFieldPrinter(fd->message_type(), pred, fo_tags_map, printer);
208  continue;
209  }
210  gpb::TextFormat::FieldValuePrinter* custom = pred(*fd);
211  if (custom) {
212  printer->RegisterFieldValuePrinter(fd, custom);
213  }
214  }
215 }
216 
217 Printer::Printer(const gpb::Descriptor* descriptor, FieldPrinterPredicate pred)
218  : type_name_(descriptor->full_name()) {
219  printer_.SetDefaultFieldValuePrinter(new BetterPrinter());
220  printer_.SetUseShortRepeatedPrimitives(true);
221 
222 
223  std::vector<StringPiece> tags = absl::StrSplit(FLAGS_csv, ",", absl::SkipWhitespace());
224  if (tags.empty()) {
225  printer_.SetInitialIndentLevel(1);
226  printer_.SetSingleLineMode(FLAGS_short);
227  if (!FLAGS_root_node.empty()) {
228  root_path_ = FdPath{descriptor, FLAGS_root_node};
229  CHECK(root_path_.valid());
230  const gpb::FieldDescriptor* fd = root_path_.path().back();
231  CHECK_EQ(gpb::FieldDescriptor::CPPTYPE_MESSAGE, fd->cpp_type());
232  }
233  } else {
234  for (StringPiece tag_path : tags) {
235  FdPath fd_path(descriptor, tag_path);
236  PathNode* cur_node = &root_;
237  for (const gpb::FieldDescriptor* fd: fd_path.path()) {
238  cur_node = cur_node->AddChild(fd);
239  }
240  fds_.push_back(std::move(fd_path));
241  }
242  }
243 
244  const gpb::Descriptor* fo_descr_root =
245  proto_db_pool.FindMessageTypeByName("google.protobuf.FieldOptions");
246  if (fo_descr_root == nullptr) {
247  fo_descr_root = gpb::DescriptorPool::generated_pool()
248  ->FindMessageTypeByName("google.protobuf.FieldOptions");
249  }
250 
251  CHECK_NOTNULL(fo_descr_root);
252 
253  std::unordered_map<int, const gpb::FieldDescriptor*> fo_tags_map;
254  vector<const gpb::FieldDescriptor*> fields;
255  proto_db_pool.FindAllExtensions(fo_descr_root, &fields);
256 
257  for (const gpb::FieldDescriptor* fl : fields) {
258  fo_tags_map[fl->number()] = fl;
259  }
260 
261  if (pred)
262  RegisterCustomFieldPrinter(descriptor, pred, fo_tags_map, &printer_);
263 
264  google::FlushLogFiles(google::GLOG_INFO);
265 
266 }
267 
268 void Printer::Output(const gpb::Message& msg) const {
269  string text_output;
270  if (fds_.empty()) {
271  CHECK(printer_.PrintToString(msg, &text_output));
272  std::cout << type_name_ << " {" << (FLAGS_short ? " " : "\n")
273  << text_output << "}\n";
274  } else {
275  PrintValueRecur(0, "", false, msg);
276  }
277 }
278 
279 void Printer::PrintValueRecur(size_t path_index, const string& prefix,
280  bool has_value, const gpb::Message& msg) const {
281  CHECK_LT(path_index, fds_.size());
282  auto cb_fun = [path_index, this, has_value, &prefix, &msg](
283  // num_items - #items in leaf repeated field. if given (!-1): aggregate all values: "xx,yy,.."
284  // item_index - item index in leaf repeated field. if given (!-1): print line with this item.
285  const gpb::Message& parent, const gpb::FieldDescriptor* fd, int item_index, int num_items) {
286  string val;
287  CHECK_NE(num_items, 0);
288 
289  if (num_items != -1) {
290  if (!FLAGS_omit_double_quotes)
291  val = "\"";
292  for (int i=0; i < num_items; i++) {
293  string repeated_val;
294  printer_.PrintFieldValueToString(parent, fd, i, &repeated_val);
295  absl::StrAppend(&val, repeated_val, ",");
296  }
297  if (FLAGS_omit_double_quotes)
298  val.pop_back();
299  else
300  val.back() = '"';
301  } else {
302  printer_.PrintFieldValueToString(parent, fd, item_index, &val);
303  if (item_index == -1) {
304  const gpb::Reflection* reflection = parent.GetReflection();
305  if (FLAGS_use_csv_null && !reflection->HasField(parent, fd)) {
306  val = "\\N";
307  }
308  }
309  }
310 
311  string next_val = (path_index == 0) ? val : StrCat(prefix, ",", val);
312  bool next_has_value = has_value | !val.empty();
313  if (path_index + 1 == fds_.size()) {
314  if (next_has_value)
315  cout << next_val << std::endl;
316  } else {
317  PrintValueRecur(path_index + 1, next_val, next_has_value, msg);
318  }
319  };
320  fds_[path_index].ExtractValue(msg, cb_fun);
321 }
322 
323 using FD = gpb::FieldDescriptor;
324 
325 static void PrintBqSchemaInternal(unsigned offset, const gpb::Descriptor* descr,
326  const PrintBqSchemaOptions& options) {
327  cout << "[\n";
328  bool continuation_field = false;
329  for (int i = 0; i < descr->field_count(); ++i) {
330  const gpb::FieldDescriptor* fd = descr->field(i);
331  string fname = options.field_name_cb ? options.field_name_cb(*fd) : fd->name();
332  if (fname.empty())
333  continue;
334 
335  if (continuation_field) {
336  cout << ",\n"; // Finalize previous field.
337  }
338 
339  continuation_field = true;
340  cout << string(offset, ' ') << R"( { "name": ")" << fname << R"(", "type": ")";
341  const string& type_name = options.type_name_cb ? options.type_name_cb(*fd) : string{};
342 
343  if (type_name.empty()) {
344  switch (fd->cpp_type()) {
345  case FD::CPPTYPE_INT32:
346  case FD::CPPTYPE_UINT32:
347  case FD::CPPTYPE_INT64:
348  case FD::CPPTYPE_UINT64:
349  cout << "INTEGER\"";
350  break;
351  case FD::CPPTYPE_BOOL:
352  cout << "BOOLEAN\"";
353  break;
354 
355  case FD::CPPTYPE_STRING:
356  cout << "STRING\"";
357  break;
358  case FD::CPPTYPE_DOUBLE:
359  case FD::CPPTYPE_FLOAT:
360  cout << "FLOAT\"";
361  break;
362  case FD::CPPTYPE_ENUM:
363  cout << "INTEGER\"";
364  break;
365  case FD::CPPTYPE_MESSAGE:
366  cout << R"(RECORD", "fields": )";
367  PrintBqSchemaInternal(offset + 2, fd->message_type(), options);
368  cout << string(offset + 4, ' ');
369  break;
370  default:
371  LOG(FATAL) << " not supported " << fd->cpp_type_name();
372  }
373  } else {
374  cout << type_name << "\"";
375  }
376  if (fd->is_repeated()) {
377  cout << R"(, "mode": "REPEATED")";
378  } else if (fd->is_required()) {
379  cout << R"(, "mode": "REQUIRED")";
380  }
381  cout << " }";
382  }
383  cout << " ]\n";
384 }
385 
386 void PrintBqSchema(const gpb::Descriptor* descr, const PrintBqSchemaOptions& options) {
387  PrintBqSchemaInternal(0, descr, options);
388 }
389 
390 static std::vector<const gpb::FieldDescriptor *> ListFields(const gpb::Message &msg) {
391  std::vector<const gpb::FieldDescriptor *> initialized_fields;
392  msg.GetReflection()->ListFields(msg, &initialized_fields);
393  return initialized_fields;
394 }
395 
396 static size_t GetSize(const gpb::Message &msg,
397  const gpb::FieldDescriptor *field) {
398  const gpb::Reflection *reflect = msg.GetReflection();
399  const size_t field_size = field->is_repeated() ? reflect->FieldSize(msg, field) : 1;
400  // TODO(ORI): Need to handle variant encoding in protobufs
401  // (otherwise our calculation of the integral fields is very inaccurate)
402  switch (field->type()) {
403  case gpb::FieldDescriptor::TYPE_DOUBLE:
404  case gpb::FieldDescriptor::TYPE_INT64:
405  case gpb::FieldDescriptor::TYPE_UINT64:
406  case gpb::FieldDescriptor::TYPE_FIXED64:
407  case gpb::FieldDescriptor::TYPE_SFIXED64:
408  case gpb::FieldDescriptor::TYPE_SINT64:
409  return 8 * field_size;
410  case gpb::FieldDescriptor::TYPE_FLOAT:
411  case gpb::FieldDescriptor::TYPE_INT32:
412  case gpb::FieldDescriptor::TYPE_UINT32:
413  case gpb::FieldDescriptor::TYPE_FIXED32:
414  case gpb::FieldDescriptor::TYPE_SFIXED32:
415  case gpb::FieldDescriptor::TYPE_SINT32:
416  case gpb::FieldDescriptor::TYPE_ENUM: // TODO(ORI): Is this correct?
417  return 4 * field_size;
418  case gpb::FieldDescriptor::TYPE_BOOL:
419  return field_size;
420  case gpb::FieldDescriptor::TYPE_STRING:
421  case gpb::FieldDescriptor::TYPE_BYTES: {
422  std::string temp;
423  if (field->is_repeated()) {
424  size_t sum = 0;
425  for (size_t i = 0; i < field_size; ++i)
426  sum += reflect->GetRepeatedStringReference(msg, field, i, &temp).size();
427  return sum;
428  } else {
429  return reflect->GetStringReference(msg, field, &temp).size();
430  }
431  }
432  default:
433  LOG(FATAL) << " not supported " << field->type();
434  return -1;
435  }
436 }
437 
438 static SizeSummarizer::Trie FillTrie(const gpb::Descriptor *descr) {
439  using Trie = SizeSummarizer::Trie;
440  Trie trie;
441  trie.Resize(descr->field_count());
442  for (int i = 0; i < descr->field_count(); ++i) {
443  if (descr->field(i)->type() == gpb::FieldDescriptor::TYPE_MESSAGE)
444  trie.Put(i, std::unique_ptr<Trie>(new Trie(FillTrie(descr->field(i)->message_type()))));
445  else
446  trie.Put(i, std::unique_ptr<Trie>(new Trie));
447  trie.Get(i)->name = descr->field(i)->name();
448  }
449  return trie;
450 }
451 
452 SizeSummarizer::SizeSummarizer(const gpb::Descriptor *descr)
453  : trie_(FillTrie(descr)) {}
454 
455 static size_t AddSizesImpl(const gpb::Message &msg,
456  SizeSummarizer::Trie *trie) {
457  size_t ret = 0;
458  for (const auto &field : ListFields(msg)) {
459  size_t sz;
460  auto subtrie = trie->Get(field->index());
461  if (field->type() == gpb::FieldDescriptor::TYPE_MESSAGE) {
462  const gpb::Reflection *reflect = msg.GetReflection();
463  if (field->is_repeated()) {
464  size_t field_size = reflect->FieldSize(msg, field);
465  sz = 0;
466  for (size_t i = 0; i < field_size; ++i) {
467  const gpb::Message &msg2 = reflect->GetRepeatedMessage(msg, field, i);
468  sz += AddSizesImpl(msg2, subtrie);
469  }
470  } else {
471  const gpb::Message &msg2 = reflect->GetMessage(msg, field);
472  sz = AddSizesImpl(msg2, subtrie);
473  }
474  } else {
475  sz = GetSize(msg, field);
476  }
477  subtrie->bytes += sz;
478  ret += sz;
479  }
480  return ret;
481 }
482 
483 void SizeSummarizer::AddSizes(const gpb::Message &msg) {
484  AddSizesImpl(msg, &trie_);
485 }
486 
487 static void GetSizesImpl(const SizeSummarizer::Trie &trie,
488  const std::string &path,
489  std::map<std::string, size_t> *out) {
490  std::string new_path;
491  if (path.empty()) {
492  if (trie.name.empty())
493  new_path = path;
494  else
495  new_path = trie.name;
496  } else {
497  CHECK(!trie.name.empty());
498  new_path = path + "." + trie.name;
499  }
500 
501  if (trie.bytes) {
502  auto iter_and_is_new = out->emplace(new_path, trie.bytes);
503  auto iter = iter_and_is_new.first;
504  bool is_new = iter_and_is_new.second;
505  CHECK(is_new);
506  iter->second = trie.bytes;
507  }
508 
509  for (size_t i = 0; i < trie.Size(); ++i)
510  GetSizesImpl(*trie.Get(i), new_path, out);
511 
512 }
513 
514 std::map<std::string, size_t> SizeSummarizer::GetSizes() const {
515  std::map<std::string, size_t> ret;
516  GetSizesImpl(trie_, "", &ret);
517  return ret;
518 }
519 
520 void SizeSummarizer::Print(std::ostream *out_p) const {
521  for (const auto &name_and_size : this->GetSizes())
522  std::cout << name_and_size.first << " - " << name_and_size.second << "\n";
523 }
524 } // namespace pprint
525 } // namespace util