vkdb
A time series database engine in C++.
Loading...
Searching...
No Matches
bloom_filter.cpp
1#include <vkdb/bloom_filter.h>
2#include <iostream>
3
4namespace vkdb {
5BloomFilter::BloomFilter(std::string&& str) noexcept {
6 std::stringstream ss{str};
7 size_type no_of_bits;
8 ss >> no_of_bits;
9 bits_.resize(no_of_bits);
10
11 size_type no_of_hashes;
12 ss >> no_of_hashes;
13 seeds_.resize(no_of_hashes);
14 for (size_type i{0}; i < no_of_hashes; ++i) {
15 ss >> seeds_[i];
16 }
17
18 std::string bits;
19 ss >> bits;
20 for (size_type i{0}; i < bits.size(); ++i) {
21 bits_[i] = bits[i] - '0';
22 }
23}
24
26 size_type expected_no_of_elems,
27 double false_positive_rate) {
28 if (expected_no_of_elems == 0) {
29 throw std::invalid_argument{
30 "BloomFilter(): Expected elements must be greater than 0."
31 };
32 }
33
34 if (false_positive_rate <= MIN_FALSE_POSITIVE_RATE ||
35 false_positive_rate >= MAX_FALSE_POSITIVE_RATE) {
36 throw std::invalid_argument{
37 "BloomFilter(): False positive rate must be in the range (0, 1)."
38 };
39 }
40
41 auto log_false_positive_rate{std::log(false_positive_rate)};
42 auto log_2_squared{std::log(2) * std::log(2)};
43
44 auto no_of_bits{static_cast<size_type>(
45 -(expected_no_of_elems * log_false_positive_rate) / log_2_squared
46 )};
47 bits_.resize(no_of_bits);
48
49 auto no_of_hashes{static_cast<size_type>(
50 (no_of_bits / expected_no_of_elems) * std::log(2)
51 )};
52 initialise_seeds(no_of_hashes);
53}
54
55void BloomFilter::insert(const key_type& key) noexcept {
56 for (size_type i{0}; i < seeds_.size(); ++i) {
57 bits_[hash(key, i)] = true;
58 }
59}
60
61bool BloomFilter::mayContain(const key_type& key) const noexcept {
62 for (size_type i{0}; i < seeds_.size(); ++i) {
63 if (!bits_[hash(key, i)]) {
64 return false;
65 }
66 }
67 return true;
68}
69
70std::string BloomFilter::str() const noexcept {
71 std::stringstream ss;
72 ss << bits_.size() << " ";
73 ss << seeds_.size() << " ";
74 auto first{true};
75 for (const auto& seed : seeds_) {
76 if (!first) {
77 ss << " ";
78 }
79 ss << seed;
80 first = false;
81 }
82 ss << " ";
83 for (const auto& bit : bits_) {
84 ss << bit;
85 }
86 return ss.str();
87}
88
89void BloomFilter::initialise_seeds(size_type no_of_hashes) noexcept {
90 seeds_.resize(no_of_hashes);
91 for (size_type i{0}; i < no_of_hashes; ++i) {
92 seeds_[i] = random<size_type>();
93 }
94}
95
96BloomFilter::HashValue BloomFilter::hash(
97 const key_type& key,
98 size_type i
99) const noexcept {
100 HashValue hash_value{0};
101 auto std_hash_value{std::hash<std::string>{}(key.str())};
102 MurmurHash3_x86_32(&std_hash_value, sizeof(std_hash_value), i, &hash_value);
103 return hash_value % bits_.size();
104}
105} // namespace vkdb
std::string str() const noexcept
Get the string representation of the Bloom filter.
BloomFilter()=delete
Deleted default constructor.
bool mayContain(const key_type &key) const noexcept
Check if the Bloom filter may contain a key.
void insert(const key_type &key) noexcept
Insert a key into the Bloom filter.
static constexpr double MIN_FALSE_POSITIVE_RATE
Minimum (exclusive) false positive rate.
static constexpr double MAX_FALSE_POSITIVE_RATE
Maximum (exclusive) false positive rate.
Represents a key in vkdb.