Some documentation

2020-02-18 00:00:25 -03:00
parent b82199e3b0
commit 94b7399926
39 changed files with 87 additions and 8900 deletions
--- a/src/generator.cpp
+++ b/src/generator.cpp
@@ -1,15 +1,15 @@

-#include "./includes/generator.h"
-
+#include "./includes/generator.h" // imports a generator to be used for the functions

+// generates string to be used as a key
 std::string gen_string() { // 90^size posibilities
-    std::string randomstring;
+  std::string randomstring;
    for (int i = 0; i < 5; ++i) {
        randomstring += singlechar(generator);
    }
    return randomstring;
 }
-
+// generates strings that don't exist in the hashmap
 std::string gen_unsuccesfull_string() { // 90^size posibilities
    std::string randomstring;
    for (int i = 0; i < 4; ++i) {
@@ -18,10 +18,11 @@ std::string gen_unsuccesfull_string() { // 90^size posibilities
    return randomstring;
 }

+// gen integers to be used as keys
 int gen_int() {
    return insert_int(generator);
 }
-
+// generates ints that don't exist in the hashmap
 int gen_unsuccesfull_int() {
    return noninsert_int(generator);
 }
--- a/src/includes/aggregate_tests.h
+++ b/src/includes/aggregate_tests.h
@@ -4,13 +4,23 @@
 #include <fstream>
 #include "./tests.h"

+// sizes that will be tested
 vector<int> sizes = {
        50000, 100000, 150000, 200000, 250000, 300000, 350000, 400000, 500000,
        600000, 700000, 800000, 900000, 1000000,
        2000000, 3000000, 4000000, 5000000, 6000000, 7000000, 8000000, 9000000, 10000000,
        15000000, 20000000, 25000000, 30000000, 35000000, 40000000, 45000000, 50000000
 };
-// to print type info
+
+
+
+/*
+to print typenames
+for more info, see
+https://stackoverflow.com/a/20170989
+and
+https://stackoverflow.com/a/56766138
+*/

 template<typename T>
 constexpr auto type_name() {
@@ -39,6 +49,17 @@ std::basic_string_view<char> name(T var) {
    return type_name<decltype(var)>();
 }

+/*
+This is the function that outputs the results to a file.
+It calls int_test (runs) times for all sizes < maxsize, and then appends the
+results to the output file.
+We use a template function to massively reduce the amount of code needed.
+Instead of writing 17 different functions for all different hashmaps, we can
+do this. The compiler will then see that a call is made where the map is of type
+std::unordered_map, for example, and then generate the function where T is
+replaced with std::unordered_map. Well, that's the simple explanation,
+more info at https://en.cppreference.com/w/cpp/language/templates
+*/

 template<class T>
 void int_test_aggregate(T map, int runs, int maxsize=20000000) {
@@ -71,7 +92,10 @@ void int_test_aggregate(T map, int runs, int maxsize=20000000) {

 }

-
+/*
+This is pretty much the same function, but it calls string_test instead of
+int_test. More info on why we needed to split this can be found in tests.h.
+*/
 template<class T>
 void string_test_aggregate(T map, int runs, int maxsize=20000000) {
    std::ofstream output{"results.csv", std::ios_base::app};
--- a/src/includes/generator.h
+++ b/src/includes/generator.h
@@ -5,13 +5,15 @@

 #include <random>
 #include <string>
-
+// this generates pseudo-random 32-bit numbers; we use a fixed seed so it stays deterministic
 static std::mt19937 generator(INT32_MAX - 2020);
+// these are distributions. they take the random numbers that generator outputs
+// and map them onto a range so that every value in that range is equally likely to be produced
 static std::uniform_int_distribution<int> insert_int(1, INT32_MAX * 0.875);
 static std::uniform_int_distribution<int> noninsert_int(INT32_MAX * 0.875, INT32_MAX);
 static std::uniform_int_distribution<int> singlechar(33, 123);

-
+// see generator.cpp for more detail, but the names are pretty self explanatory
 int gen_int();

 int gen_unsuccesfull_int();
--- a/src/includes/prepare.h
+++ b/src/includes/prepare.h
@@ -28,11 +28,17 @@
 using std::string;
 using absl::Hash;

+
+// this is the prepare function, again using template
+// this is for all maps which only need this. Below are the ones that
+// need something different
 template<class T>
 void prepare(T& map, int size) {
    map.reserve(size);
 }

+// needs a tombstone marker(a key that's exclusively used to signify something
+// is deleted) and it doesn't have a reserve(size) member
 void prepare(google::sparse_hash_map<int, int>& map, int size) {
    map.set_deleted_key(-1);
 }
@@ -41,6 +47,10 @@ void prepare(google::sparse_hash_map<string, string>& map, int size) {
    map.set_deleted_key("a");
 }

+// needs a tombstone marker(a key that's exclusively used to signify something
+// is deleted) and an empty key marker
+// and it doesn't have a reserve(size) member
+
 void prepare(google::dense_hash_map<int, int>& map, int size) {
    map.set_empty_key(0);
    map.set_deleted_key(-1);
@@ -51,7 +61,9 @@ void prepare(google::dense_hash_map<string, string>& map, int size) {
    map.set_empty_key("");
 }

+
 // with abseil hash
+// this is a repeat of the 4 written above, but with types that use absl::Hash as the hashing function
 void prepare(google::sparse_hash_map<int, int, Hash<int>>& map, int size) {
    map.set_deleted_key(-1);
 }
--- a/src/includes/tests.h
+++ b/src/includes/tests.h
@@ -18,14 +18,34 @@ using namespace std::chrono;
 using std::vector;
 using std::cout;

+/*
+This is yet again a template function.
+basic functionality is like this:
+1. we create a vector to store the times
+2. we create and populate vectors for keys that will be used for the tests
+3. create the hashmap.
+4. call prepare(hashmap, size)
+5. populate the hashmap with (size - 10k) key/value pairs
+6. benchmark the vector access time, which will be subtracted later
+7. insert 10k keys(from insert_keys) and time it
+8. lookup 10k keys(from sample_keys) and time it
+9. lookup 10k nonexistent keys(nonkeys) and time it
+10. delete 10k keys(sample_keys) and time it
+times are added to the results vector, and that is returned.

+(4) this step is called because some hashmaps require some extra steps before
+you use them. For example, setting a key that will be the tombstone marker, the
+key that will mark a location as empty, etc.
+
+*/
 template<class T>
 vector<int> int_test(T map, int size) {
    vector<int> results; // insert, lookup, unsuccesful lookup, delete times
-    vector<int> sample_keys; // get a sample of keys to lookup and later delete
+    vector<int> sample_keys; // get a sample of keys to lookup and later delete, will be filled later

    // unsuccesful lookup keys
    vector<int> nonkeys(10000);
+    // generate uses a function(here, gen_unsuccesfull_int) to fill a container with values
    std::generate(nonkeys.begin(), nonkeys.end(), gen_unsuccesfull_int);

    // keys for insert test
@@ -38,12 +58,17 @@ vector<int> int_test(T map, int size) {
    { // seperate scope, so all_keys gets destroyed. for good measure, empty it too
        vector<int> all_keys(size - 10000);
        std::generate(all_keys.begin(), all_keys.end(), gen_int);
+        // sample copies a given number of randomly chosen values from one
+        // container into another with the help of a generator instance
+        // in this case, random 10k keys from all_keys to sample_keys
        std::sample(all_keys.begin(), all_keys.end(), std::back_inserter(sample_keys), 10000, generator);

        for (auto i : all_keys) {
            testmap.insert({i, i});
        }
-        all_keys.clear();
+        all_keys.clear(); // going out of scope should call the destructor to
+                          // clear it, but just making sure it's done
+        
    }


@@ -65,7 +90,7 @@ vector<int> int_test(T map, int size) {

    auto insert_time = (duration_cast<nanoseconds>(insert_end - insert_start) - vector_acces_time) / 10000;
    results.push_back(insert_time.count());
-    // remove some memory
+    // clear all values in here, clear up some memory
    insert_keys.clear();

    // lookup test
@@ -103,7 +128,9 @@ vector<int> int_test(T map, int size) {

 }

-
+// pretty much the same, but with strings
+// the reason it's split up in 2 functions is because we need other functions to
+// generate the keys, and unfortunately we can't overload based on return type
 template<class T>
 vector<int> string_test(T map, int size) {
    vector<int> results; // insert, lookup, unsuccesful lookup, delete times
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -24,7 +24,13 @@ bool use_abseil_hash = false;
 int runs = 1;
 int maxsize = 50000000;

-int main(int argc, char** argv) {
+/*
+int_test_aggregate and string_test_aggregate are called for different hashmaps
+based on the choices
+see implementation of these in includes/aggregate_tests
+*/
+int main(int argc, char **argv) {
+  // This is just stuff to add options like selecting which hashmaps need to get benchmarked
    CLI::App app{"Hashmap benchmarks"};
    app.add_option("-i,--implementation", hashmaps, choicetext )->delimiter(',');
    app.add_option("-a,--abseil", use_abseil_hash, "use absl::Hash, default is false");
@@ -32,6 +38,7 @@ int main(int argc, char** argv) {
    app.add_option("-m, --maxsize", maxsize, "The max size of the hashmaps to test for. Default is 50 million.");
    CLI11_PARSE(app, argc, argv);
    time_point<steady_clock> start_test = steady_clock::now();
+    // calls int_test_aggregate and its string version for different hashmaps based on the choices selected
    if (use_abseil_hash) {
        for (auto i : hashmaps) {
            switch (i) {