Some documentation

This commit is contained in:
TinyAtoms
2020-02-18 00:00:25 -03:00
parent b82199e3b0
commit 94b7399926
39 changed files with 87 additions and 8900 deletions

View File

@@ -1,15 +1,15 @@
#include "./includes/generator.h"
#include "./includes/generator.h" // imports a generator to be used for the functions
// Generates a random string to be used as a key.
// Every character is drawn from `singlechar` using the seeded `generator`
// (both declared in includes/generator.h). `singlechar` is uniform over the
// inclusive range [33, 123], i.e. 91 distinct characters, so there are
// 91^key_length possible keys.
std::string gen_string() {
    constexpr int key_length = 5;
    std::string randomstring;
    for (int i = 0; i < key_length; ++i) {
        // The distribution yields an int character code; append it as a char.
        randomstring += static_cast<char>(singlechar(generator));
    }
    return randomstring;
}
// gen strings that dont exist in the hashmap
std::string gen_unsuccesfull_string() { // 90^size posibilities
std::string randomstring;
for (int i = 0; i < 4; ++i) {
@@ -18,10 +18,11 @@ std::string gen_unsuccesfull_string() { // 90^size posibilities
return randomstring;
}
// Produces one integer key for insertion into a map.
// The value is drawn from the `insert_int` distribution, fed by the seeded
// `generator`; both are declared in includes/generator.h.
int gen_int() {
    const int key = insert_int(generator);
    return key;
}
// Produces one integer key intended for unsuccessful lookups, i.e. a key that
// is not supposed to be present in the map.
// The value is drawn from the `noninsert_int` distribution, fed by the seeded
// `generator`; both are declared in includes/generator.h.
int gen_unsuccesfull_int() {
    const int missing_key = noninsert_int(generator);
    return missing_key;
}

View File

@@ -4,13 +4,23 @@
#include <fstream>
#include "./tests.h"
// Map sizes (number of elements) that will be tested, listed in increasing
// order from 50,000 up to 50,000,000. The step grows with the magnitude:
// 50k steps at the low end, then 100k, 1M and finally 5M steps.
vector<int> sizes = {
50000, 100000, 150000, 200000, 250000, 300000, 350000, 400000, 500000,
600000, 700000, 800000, 900000, 1000000,
2000000, 3000000, 4000000, 5000000, 6000000, 7000000, 8000000, 9000000, 10000000,
15000000, 20000000, 25000000, 30000000, 35000000, 40000000, 45000000, 50000000
};
// to print type info
/*
to print typenames
for more info, see
https://stackoverflow.com/a/20170989
and
https://stackoverflow.com/a/56766138
*/
template<typename T>
constexpr auto type_name() {
@@ -39,6 +49,17 @@ std::basic_string_view<char> name(T var) {
return type_name<decltype(var)>();
}
/*
This is the function that outputs the results to a file.
It calls int_test `runs` times for every size < maxsize, and then appends the
results to the output file.
We use a template function to massively reduce the amount of code needed.
Instead of writing 17 different functions for all different hashmaps, we can
do this. The compiler will then see that a call is made where the map is of type
std::unordered_map, for example, and then generate the function where T is
replaced with std::unordered_map. Well, that's the simple explanation,
more info at https://en.cppreference.com/w/cpp/language/templates
*/
template<class T>
void int_test_aggregate(T map, int runs, int maxsize=20000000) {
@@ -71,7 +92,10 @@ void int_test_aggregate(T map, int runs, int maxsize=20000000) {
}
/*
This is pretty much the same function, but it calls string_test instead of
int_test. More info on why we needed to split this, can be seen in tests.h
*/
template<class T>
void string_test_aggregate(T map, int runs, int maxsize=20000000) {
std::ofstream output{"results.csv", std::ios_base::app};

View File

@@ -5,13 +5,15 @@
#include <random>
#include <string>
// Pseudo-random engine that feeds every distribution below. It is seeded with
// a fixed value so that each run produces the same sequence of keys.
// NOTE(review): these objects are `static`, so every translation unit that
// includes this file gets its own copy of the engine state -- confirm that is
// intended.
static std::mt19937 generator(INT32_MAX - 2020);
// Distributions take the raw output of the engine and map it onto a closed
// range [a, b] in which every value is equally likely.
// Keys that get inserted into the maps: 1 up to 87.5% of INT32_MAX.
static std::uniform_int_distribution<int> insert_int(1, static_cast<int>(INT32_MAX * 0.875));
// Keys used for unsuccessful lookups. Both bounds of uniform_int_distribution
// are inclusive, so this range starts one past the largest insertable key;
// otherwise the boundary value could be drawn from both distributions and a
// "nonexistent" key might actually be present in the map.
static std::uniform_int_distribution<int> noninsert_int(insert_int.max() + 1, INT32_MAX);
// Character codes used to build string keys: 33..123 inclusive (91 values).
static std::uniform_int_distribution<int> singlechar(33, 123);
// see generator.cpp for more detail, but the names are pretty self explanatory
int gen_int();
int gen_unsuccesfull_int();

View File

@@ -28,11 +28,17 @@
using std::string;
using absl::Hash;
// Generic prepare step, run on a map before it is benchmarked.
// This template covers every map type whose only required setup is
// pre-allocating room for `size` elements through its reserve() member.
// Map types that need different setup get their own non-template overloads.
template<typename Container>
void prepare(Container& map, int size) {
    map.reserve(size);
}
// prepare overload for google::sparse_hash_map<int, int>.
// This map needs a tombstone marker (a key that is used exclusively to signify
// that an entry was deleted) before it can be used, and according to the
// original note it has no reserve(size) member, so `size` is ignored here.
void prepare(google::sparse_hash_map<int, int>& map, int size) {
    static_cast<void>(size);  // intentionally unused
    const int tombstone = -1;
    map.set_deleted_key(tombstone);
}
@@ -41,6 +47,10 @@ void prepare(google::sparse_hash_map<string, string>& map, int size) {
map.set_deleted_key("a");
}
// needs a tombstone marker(a key that's exclusively used to signify something
// is deleted) and an empty key marker
// and it doesn't have a reserve(size) member
void prepare(google::dense_hash_map<int, int>& map, int size) {
map.set_empty_key(0);
map.set_deleted_key(-1);
@@ -51,7 +61,9 @@ void prepare(google::dense_hash_map<string, string>& map, int size) {
map.set_empty_key("");
}
// prepare overload for google::sparse_hash_map<int, int> that uses absl's
// Hash<int> as its hashing function. It does the same setup as the overload
// for the default-hash sparse_hash_map<int, int>: only the tombstone
// (deleted-key) marker is set, and `size` is ignored.
void prepare(google::sparse_hash_map<int, int, Hash<int>>& map, int size) {
    static_cast<void>(size);  // intentionally unused
    const int tombstone = -1;
    map.set_deleted_key(tombstone);
}

View File

@@ -18,14 +18,34 @@ using namespace std::chrono;
using std::vector;
using std::cout;
/*
This is yet again a template function.
basic functionality is like this:
1. we create a vector to store the times
2. we create and populate vectors for keys that will be used for the tests
3. create the hashmap.
4. call prepare(hashmap, size)
5. populate the hashmap with (size - 10k) key/value pairs
6. benchmark the vector access time, which will be subtracted later
7. insert 10k keys(from insert_keys) and time it
8. lookup 10k keys(from sample_keys) and time it
9. lookup 10k nonexistent keys(nonkeys) and time it
10. delete 10k keys(sample_keys) and time it
times are added to the results vector, and that is returned.
(4) this step is called because some hashmaps require some extra steps before
you use them. For example, setting a key that will be the tombstone marker, the
key that will mark a location as empty, etc.
*/
template<class T>
vector<int> int_test(T map, int size) {
vector<int> results; // insert, lookup, unsuccesful lookup, delete times
vector<int> sample_keys; // get a sample of keys to lookup and later delete
vector<int> sample_keys; // get a sample of keys to lookup and later delete, will be filled later
// unsuccesful lookup keys
vector<int> nonkeys(10000);
// generate uses a function(here, gen_unsuccesfull_int) to fill a container with values
std::generate(nonkeys.begin(), nonkeys.end(), gen_unsuccesfull_int);
// keys for insert test
@@ -38,12 +58,17 @@ vector<int> int_test(T map, int size) {
{ // seperate scope, so all_keys gets destroyed. for good measure, empty it too
vector<int> all_keys(size - 10000);
std::generate(all_keys.begin(), all_keys.end(), gen_int);
// std::sample copies a given number of randomly chosen elements from an input
// range to an output iterator, using a random generator instance;
// in this case, 10k random keys from all_keys are appended to sample_keys
std::sample(all_keys.begin(), all_keys.end(), std::back_inserter(sample_keys), 10000, generator);
for (auto i : all_keys) {
testmap.insert({i, i});
}
all_keys.clear();
all_keys.clear(); // going out of scope should call the destructor to
// clear it, but just making sure it's done
}
@@ -65,7 +90,7 @@ vector<int> int_test(T map, int size) {
auto insert_time = (duration_cast<nanoseconds>(insert_end - insert_start) - vector_acces_time) / 10000;
results.push_back(insert_time.count());
// remove some memory
// clear all values in here, clear up some memory
insert_keys.clear();
// lookup test
@@ -103,7 +128,9 @@ vector<int> int_test(T map, int size) {
}
// pretty much the same, but with strings
// the reason it's split up in 2 functions is because we need other functions to
// generate the keys, and unfortunately we can't overload based on return type
template<class T>
vector<int> string_test(T map, int size) {
vector<int> results; // insert, lookup, unsuccesful lookup, delete times

View File

@@ -24,7 +24,13 @@ bool use_abseil_hash = false;
int runs = 1;
int maxsize = 50000000;
int main(int argc, char** argv) {
/*
int_test_aggregate and string_test_aggregate are called for different hashmaps
based on the choices
see implementation of these in includes/aggregate_tests
*/
int main(int argc, char **argv) {
// This is just stuff to add options like selecting which hashmaps need to get benchmarked
CLI::App app{"Hashmap benchmarks"};
app.add_option("-i,--implementation", hashmaps, choicetext )->delimiter(',');
app.add_option("-a,--abseil", use_abseil_hash, "use absl::Hash, default is false");
@@ -32,6 +38,7 @@ int main(int argc, char** argv) {
app.add_option("-m, --maxsize", maxsize, "The max size of the hashmaps to test for. Default is 50 million.");
CLI11_PARSE(app, argc, argv);
time_point<steady_clock> start_test = steady_clock::now();
// calls int_test_aggregate and its string version for different hashmaps based on the choices selected
if (use_abseil_hash) {
for (auto i : hashmaps) {
switch (i) {