Version: 1.0
image_frequencies.cpp
Go to the documentation of this file.
1 // Copyright (c) 2020, the GRAPHGEN contributors, as
2 // shown by the AUTHORS file. All rights reserved.
3 //
4 // Use of this source code is governed by a BSD-style
5 // license that can be found in the LICENSE file.
6 
7 #include "image_frequencies.h"
8 
9 #include <iostream>
10 #include <limits>
11 #include <filesystem>
12 #include <algorithm>
13 #include <iterator>
14 
15 #include "utilities.h"
16 #include "performance_evaluator.h"
17 
18 using namespace std;
19 using namespace filesystem;
20 using namespace cv;
21 
22 mask::mask(const rule_set& rs) : rs_{ rs } {
23  const auto& ps = rs.ps_;
24  increment_ = ps.GetShiftX();
25  exp_ = static_cast<int>(ps.pixels_.size());
26  for (int i = 0; i < exp_; ++i) {
27  border_ = max(border_, max(abs(ps.pixels_[i].GetDx()), abs(ps.pixels_[i].GetDy())));
28  top_ = min(top_, ps.pixels_[i].GetDy());
29  right_ = max(right_, ps.pixels_[i].GetDx());
30  left_ = min(left_, ps.pixels_[i].GetDx());
31  bottom_ = max(bottom_, ps.pixels_[i].GetDy());
32  }
33 
34  left_ = abs(left_);
35  top_ = abs(top_);
36 
37  mask_ = Mat1b(top_ + bottom_ + 1, left_ + right_ + 1, uchar(0));
38  for (int i = 0; i < exp_; ++i) {
39  mask_(ps.pixels_[i].GetDy() + top_, ps.pixels_[i].GetDx() + left_) = 1;
40  }
41 }
42 
43 size_t mask::MaskToLinearMask(const cv::Mat1b& r_img) const {
44  size_t linearMask = 0;
45 
46  for (const auto& p : rs_.ps_) {
47  linearMask |= r_img(p.GetDy() + top_, p.GetDx() + left_) << rs_.conditions_pos.at(p.name_);
48  }
49 
50  return linearMask;
51 }
52 
53 // This function extracts all the configurations of a given mask (mask) in a given image (img) and stores the occurrences (frequencies) in the rRules vector
54 //void CalculateConfigurationsFrequencyOnImage(const cv::Mat1b& img, const mask& msk, rule_set& rs) {
55 //
56 // cv::Mat1b clone;
57 // copyMakeBorder(img, clone, msk.border_, msk.border_, msk.border_, msk.border_, cv::BORDER_CONSTANT, 0);
58 // const int h = clone.rows, w = clone.cols;
59 //
60 // for (int r = msk.border_; r < h - msk.border_; r += msk.increment_) {
61 // for (int c = msk.border_; c < w - msk.border_; c += msk.increment_) {
62 // cv::Mat1b read_pixels = clone(cv::Rect(cv::Point(c - msk.left_, r - msk.top_), cv::Point(c + 1 + msk.right_, r + 1 + msk.bottom_))).clone();
63 // // bitwise_and(msk.mask_, read_pixels, read_pixels);
64 // size_t rule = msk.MaskToLinearMask(read_pixels);
65 // rs.rules[rule].frequency++;
66 // if (rs.rules[rule].frequency == numeric_limits<unsigned long long>::max()) {
67 // cout << "OVERFLOW freq\n";
68 // }
69 // }
70 // }
71 //}
72 
73 
74 // Overloaded function that accepts a vector instead of a ruleset
75 void CalculateConfigurationsFrequencyOnImage(const cv::Mat1b& img, const mask& msk, vector<unsigned long long>& freqs) {
76 
77  cv::Mat1b clone;
78  copyMakeBorder(img, clone, msk.border_, msk.border_, msk.border_, msk.border_, cv::BORDER_CONSTANT, 0);
79  const int h = clone.rows, w = clone.cols;
80 
81  for (int r = msk.border_; r < h - msk.border_; r += msk.increment_) {
82 
83  for (int c = msk.border_; c < w - msk.border_; c += msk.increment_) {
84 
85  const cv::Mat1b read_pixels = clone(cv::Rect(cv::Point(c - msk.left_, r - msk.top_), cv::Point(c + 1 + msk.right_, r + 1 + msk.bottom_)));
86  size_t rule = msk.MaskToLinearMask(read_pixels);
87  freqs[rule]++;
88  if (freqs[rule] == numeric_limits<unsigned long long>::max()) {
89  cout << "OVERFLOW freq\n";
90  }
91  }
92  }
93 }
94 
95 bool GetBinaryImage(const string& FileName, cv::Mat1b& binary) {
96  // Image load
97  cv::Mat image;
98  image = cv::imread(FileName, cv::IMREAD_GRAYSCALE); // Read the file
99 
100  if (image.empty()) // Check if image exists
101  return false;
102 
104  //Mat grayscaleMat;
105  //cvtColor(image, grayscaleMat, CV_RGB2GRAY);
106 
107  // Adjust the threshold to actually make it binary
108  cv::threshold(image, binary, 100, 1, cv::THRESH_BINARY);
109 
110  return true;
111 }
112 
113 bool LoadFileList(vector<pair<string, bool>>& filenames, const string& files_path)
114 {
115  // Open files_path (files.txt)
116  ifstream is(files_path);
117  if (!is.is_open()) {
118  return false;
119  }
120 
121  string cur_filename;
122  while (getline(is, cur_filename)) {
123  // To delete possible carriage return in the file name
124  // (especially designed for windows file newline format)
125  RemoveCharacter(cur_filename, '\r');
126  filenames.push_back(make_pair(cur_filename, true));
127  }
128 
129  is.close();
130  return true;
131 }
132 
133 //bool CalculateRulesFrequencies(const pixel_set& ps, vector<pair<path, bool>>& paths, rule_set& rs) {
134 // mask msk(ps);
135 //
136 // cv::Mat1b img;
137 //
138 // cout << "Counting frequencies of patterns in datasets . . . \n";
139 //
140 // PerformanceEvaluator perf;
141 // perf.start();
142 //
143 // unsigned int existing_datasets = 0;
144 // for (uint i = 0; i < paths.size(); ++i) {
145 // path dataset_path = paths[i].first;
146 // vector<pair<string, bool>> files_list;
147 // if (!LoadFileList(files_list, (dataset_path / path("files.txt")).string())) {
148 // cout << "Unable to find 'files.txt' of " << dataset_path << ", dataset skipped.\n";
149 // paths[i].second = false;
150 // continue;
151 // }
152 // cout << dataset_path.filename().string() << ":\n";
153 //
154 // unsigned int files_list_size = files_list.size();
155 // for (uint d = 0; d < files_list_size; ++d) {
156 // cout << '\r' << d << '/' << files_list_size;
157 // path file_name = files_list[d].first;
158 // GetBinaryImage((dataset_path / file_name).string(), img);
159 // if (img.empty()) {
160 // cout << "Unable to find '" << file_name << "' image in '" << dataset_path << "' dataset, image skipped\n";
161 // continue;
162 // }
163 // CalculateConfigurationsFrequencyOnImage(img, msk, rs);
164 // }
165 // cout << '\r' << files_list_size << '/' << files_list_size << '\n';
166 // existing_datasets++;
167 // }
168 //
169 // cout << "done. " << perf.stop() << " ms.\n";
170 //
171 // return existing_datasets > 0;
172 //}
173 
174 
175 bool CountFrequenciesOnDataset(const string& dataset, rule_set& rs, bool force) {
176 
177  path frequencies_output_path = conf.frequencies_path_ / conf.mask_name_ / (dataset + conf.frequencies_suffix_);
178 
179  if (!force) {
180  // Try to load frequencies from file
181  ifstream is;
182  is.exceptions(fstream::badbit | fstream::failbit | fstream::eofbit);
183 
184  try {
185  is.open(frequencies_output_path, ios::binary);
186  vector<rule> new_rules = rs.rules;
187  std::for_each(new_rules.begin(), new_rules.end(), [&is](rule& r) {
188  unsigned long long v;
189  is.read(reinterpret_cast<char*>(&v), 8);
190  r.frequency += v;
191  });
192  rs.rules = new_rules;
193  cout << "Frequencies of " << dataset << " were loaded from file.\n";
194  return true;
195  }
196  catch (const ifstream::failure&) {
197  cout << "Frequencies of " << dataset << " couldn't be loaded from file.\n";
198  }
199  catch (const runtime_error&) {
200  cout << "Frequencies of " << dataset << " couldn't be loaded from file.\n";
201  }
202  }
203 
204  mask msk(rs);
205 
206  cv::Mat1b img;
207 
208  path dataset_path = conf.global_input_path_ / path(dataset);
209  vector<pair<string, bool>> files_list;
210  if (!LoadFileList(files_list, (dataset_path / path("files.txt")).string())) {
211  cout << "Unable to find 'files.txt' of " << dataset_path << ", dataset skipped.\n";
212  return false;
213  }
214  cout << dataset << ":\n";
215 
216  vector<unsigned long long> freqs(rs.rules.size(), 0);
217 
218  size_t files_list_size = files_list.size();
219  for (size_t d = 0; d < files_list_size; ++d) {
220  cout << '\r' << d << '/' << files_list_size;
221  path file_name = files_list[d].first;
222  GetBinaryImage((dataset_path / file_name).string(), img);
223  if (img.empty()) {
224  cout << "Unable to find '" << file_name << "' image in '" << dataset_path << "' dataset, image skipped\n";
225  continue;
226  }
228  }
229  cout << '\r' << files_list_size << '/' << files_list_size << '\n';
230 
231  for_each(freqs.begin(), freqs.end(), [rs_it = rs.rules.begin()](unsigned long long f) mutable { (*rs_it++).frequency += f; });
232 
233  ofstream os(frequencies_output_path, ios::binary);
234  if (!os) {
235  cerr << "Frequencies of " << dataset << " couldn't be stored into file.\n";
236  }
237  else {
238  for_each(freqs.begin(), freqs.end(), [&os](unsigned long long f) { os.write(reinterpret_cast<const char*>(&f), 8); });
239  }
240 
241  return true;
242 }
243 
244 
245 //void CalculateRulesFrequencies(const pixel_set& ps, const vector<string>& paths, rule_set& rs) {
246 // vector<path> datasets_path(paths.size());
247 // generate(datasets_path.begin(), datasets_path.end(), [paths_it = paths.begin()]() mutable { return path(*paths_it++); });
248 // CalculateRulesFrequencies(ps, datasets_path, rs);
249 //}
250 
251 //bool AddFrequenciesToRuleset(const ConfigData& config, rule_set& rs, bool force) {
252 //
253 // std::string dataset_names;
254 // bool first = true;
255 // for (const auto& piece : conf.datasets_) {
256 // if (!first) {
257 // dataset_names += '-';
258 // }
259 // else {
260 // first = false;
261 // }
262 // dataset_names += piece;
263 // }
264 //
265 // if (!force) {
266 // ifstream is;
267 // is.exceptions(fstream::badbit | fstream::failbit | fstream::eofbit);
268 //
269 // try {
270 // is.open(config.GetFrequenciesPath(dataset_names), ios::binary);
271 // vector<rule> new_rules = rs.rules;
272 // unsigned int n;
273 // is.read(reinterpret_cast<char*>(&n), 4);
274 // if (config.datasets_.size() != n) {
275 // throw runtime_error("Number of datasets in stored file doesn't match that of datasets in config.\n");
276 // }
277 // set<string> stored_datasets;
278 // for (unsigned int i = 0; i < n; i++) {
279 // string buf;
280 // char c;
281 // while (true) {
282 // is.get(c);
283 // if (c == '\n') {
284 // break;
285 // }
286 // else {
287 // buf += c;
288 // }
289 // }
290 // stored_datasets.insert(buf);
291 // }
292 // if (set<string>(config.datasets_.begin(), config.datasets_.end()) != stored_datasets) {
293 // throw runtime_error("Datasets in stored file and don't match those in config.\n");
294 // }
295 // for_each(new_rules.begin(), new_rules.end(), [&is](rule& r) { is.read(reinterpret_cast<char*>(&r.frequency), 8); });
296 // rs.rules = new_rules;
297 // cout << "Frequencies were loaded from file.\n";
298 // return true;
299 // }
300 // catch (const ifstream::failure&) {
301 // cout << "Frequencies couldn't be loaded from file.\n";
302 // }
303 // catch (const runtime_error&) {
304 // cout << "Frequencies couldn't be loaded from file.\n";
305 // }
306 // }
307 //
308 // if (config.datasets_path_.empty()) {
309 // cerr << "Frequencies couldn't be counted because no input dataset is specified.\n";
310 // return false;
311 // }
312 //
313 // vector<pair<path, bool>> existing_datasets(config.datasets_path_.size());
314 // generate(existing_datasets.begin(), existing_datasets.end(), [it = config.datasets_path_.begin()]() mutable {return make_pair(*it++, true); });
315 //
316 // if (!CalculateRulesFrequencies(rs.ps_, existing_datasets, rs)) {
317 // cerr << "Couldn't count frequencies.\n";
318 // return false;
319 // }
320 //
321 // // Store frequencies into file
322 // // Format is the following
323 // // - n: little endian 4 bytes unsigned integer, stores the number of datasets considered
324 // // - datasets: a sequence of n datasets names, each of them ending with a '\n'
325 // // - frequencies: the array of frequencies stored as little endian 8 bytes unsigned integers
326 // ofstream os(config.GetFrequenciesPath(dataset_names), ios::binary);
327 // if (!os) {
328 // cerr << "Frequencies couldn't be stored into file.\n";
329 // }
330 // else {
331 // unsigned int n = 0;
332 // for_each(existing_datasets.begin(), existing_datasets.end(), [&n](const pair<path, bool>& dataset) {n += dataset.second; });
333 // os.write(reinterpret_cast<const char*>(&n), 4);
334 // for (const pair<path, bool>& dataset : existing_datasets) {
335 // if (dataset.second) {
336 // string dataset_name = dataset.first.filename().string();
337 // os.write(dataset_name.c_str(), dataset_name.size() * sizeof(char));
338 // os.put('\n');
339 // }
340 // }
341 // for_each(rs.rules.begin(), rs.rules.end(), [&os](const rule& r) { os.write(reinterpret_cast<const char*>(&r.frequency), 8); });
342 // }
343 //
344 // return true;
345 //}
346 
347 bool AddFrequenciesToRuleset(rule_set& rs, bool force, bool is_thinning) {
348 
349  int n = 0;
350 
351  for (const string& dataset : conf.datasets_) {
352  n += CountFrequenciesOnDataset(dataset, rs, force);
353  }
354 
355  if (is_thinning) {
356  assert((rs.rules.size() % 2) == 0);
357  size_t half = rs.rules.size() / 2;
358  for (size_t i = half; i < rs.rules.size(); i++) {
359  rs.rules[i].frequency = rs.rules[i - half].frequency;
360  }
361  }
362 
363  return n > 0;
364 
365 }
bool AddFrequenciesToRuleset(rule_set &rs, bool force, bool is_thinning)
bool CountFrequenciesOnDataset(const string &dataset, rule_set &rs, bool force)
void CalculateConfigurationsFrequencyOnImage(const cv::Mat1b &img, const mask &msk, vector< unsigned long long > &freqs)
bool GetBinaryImage(const string &FileName, cv::Mat1b &binary)
bool LoadFileList(vector< pair< string, bool >> &filenames, const string &files_path)
std::filesystem::path global_input_path_
Definition: config_data.h:29
std::string mask_name_
Definition: config_data.h:31
std::string frequencies_suffix_
Definition: config_data.h:73
std::vector< std::string > datasets_
Definition: config_data.h:67
std::filesystem::path frequencies_path_
Definition: config_data.h:72
int increment_
size_t MaskToLinearMask(const cv::Mat1b &r_img) const
mask(const rule_set &rs)
cv::Mat1b mask_
const rule_set & rs_
pixel_set ps_
Definition: rule_set.h:53
std::vector< rule > rules
Definition: rule_set.h:52
std::unordered_map< std::string, size_t > conditions_pos
Definition: rule_set.h:49
Definition: rule_set.h:41
ConfigData conf
Definition: utilities.cpp:9
std::string binary(size_t u, size_t nbits)
Definition: utilities.cpp:11