1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
|
#include "query.h"
#include <fcntl.h>
#include <cstdio>
#include <cstring>
#include <memory>
#include <string>
#include <vector>
namespace dictzip {
// Returns every line of the file at `path` that begins with `substring`.
//
// Lines are delimited by '\n'; the delimiter is not part of the returned
// strings. The first line of the file and a final line that lacks a trailing
// newline are both considered. An empty `substring` matches every line.
// If the file cannot be opened an empty vector is returned.
//
// The file is read in fixed-size chunks, so lines (and the prefix itself) may
// straddle any number of chunk boundaries.
std::vector<std::string> get_lines_starting_with(
    const std::string& path, const std::string& substring) {
  // Closes the stream on every exit path, including exceptions thrown by the
  // string/vector allocations below.
  struct FileCloser {
    void operator()(FILE* handle) const { std::fclose(handle); }
  };

  std::vector<std::string> result;

  const std::unique_ptr<FILE, FileCloser> file{std::fopen(path.c_str(), "r")};
  if ( !file ) {
    return result;
  }

#ifdef POSIX_FADV_SEQUENTIAL
  // Purely a read-ahead hint; failure is harmless, so the result is ignored.
  // Use the named constant: the raw value 1 is POSIX_FADV_RANDOM on Linux.
  (void)posix_fadvise(fileno(file.get()), 0, 0, POSIX_FADV_SEQUENTIAL);
#endif

  // True when the `size` bytes at `data` begin with `substring`. The length
  // check keeps the comparison inside the bytes that were actually read.
  const auto has_prefix = [&substring](const char* data, std::size_t size) {
    return size >= substring.size() &&
           std::memcmp(data, substring.data(), substring.size()) == 0;
  };

  char buffer[16 * 1024];
  // Head of a line whose terminating '\n' has not been read yet. It is
  // non-empty exactly when the previous chunk ended in the middle of a line.
  std::string pending;

  std::size_t n = 0;
  while ( (n = std::fread(buffer, sizeof(char), sizeof(buffer), file.get())) > 0 ) {
    const char* const chunk_end = buffer + n;
    const char* line_begin = buffer;

    while ( line_begin != chunk_end ) {
      const std::size_t remaining =
          static_cast<std::size_t>(chunk_end - line_begin);
      const char* const newline =
          static_cast<const char*>(std::memchr(line_begin, '\n', remaining));

      if ( newline == nullptr ) {
        // The line continues in the next chunk (or ends at EOF): copy what we
        // have, because `buffer` is overwritten by the next fread.
        pending.append(line_begin, remaining);
        break;
      }

      const std::size_t length = static_cast<std::size_t>(newline - line_begin);
      if ( pending.empty() ) {
        // Common case: the whole line lies in this chunk; test it in place.
        if ( has_prefix(line_begin, length) ) {
          result.emplace_back(line_begin, length);
        }
      } else {
        pending.append(line_begin, length);
        if ( has_prefix(pending.data(), pending.size()) ) {
          result.push_back(pending);
        }
        pending.clear();
      }
      line_begin = newline + 1;
    }
  }

  // A final line without a trailing '\n' is still a line.
  if ( !pending.empty() && has_prefix(pending.data(), pending.size()) ) {
    result.push_back(pending);
  }
  return result;
}
// Returns the text between the first and the last tab character of `line`.
// NOTE(review): presumably `line` is an index entry of the form
// `headword<TAB>offset<TAB>length` — confirm against callers.
//
// Behaviour for lines with fewer than two tabs (follows from unsigned
// wrap-around of std::string::npos):
//   - no tab:  the whole line is returned;
//   - one tab: everything after that tab is returned.
std::string get_encoded_offset(const std::string& line) {
  const std::size_t first_tab = line.find('\t');
  const std::size_t last_tab = line.rfind('\t');
  // npos + 1 wraps to 0, so a tab-less line yields field_begin == 0.
  const std::size_t field_begin = first_tab + 1;
  const std::size_t field_length = last_tab - field_begin;
  return line.substr(field_begin, field_length);
}
// Returns everything after the last tab character of `line`.
// NOTE(review): presumably the trailing `length` field of an index entry —
// confirm against callers.
//
// If `line` contains no tab, the whole line is returned (npos + 1 wraps to 0).
std::string get_encoded_length(const std::string& line) {
  const std::size_t field_begin = line.rfind('\t') + 1;
  return line.substr(field_begin);
}
}
|