updated to do 95% CI instead of 1 std error bars. As if a difference is even noticeable

This commit is contained in:
MassiveAtoms 2020-05-11 21:28:43 -03:00
parent 607cd2e1dd
commit 529fd8841a
14 changed files with 14286 additions and 5036 deletions
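
For context, the change the message describes boils down to one argument in seaborn's lineplot (assuming the 0.10-era API, where the parameter is called ci); a minimal sketch, not the notebook's literal code:

import seaborn as sns

# ci=95 (the default) bootstraps a 95% confidence band around the mean line;
# ci="sd" is the seaborn analogue of the old +/- 1 std error bars
sns.lineplot(x="SIZE", y="TIME", hue="MAP", data=data, ci=95)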


@@ -1,3 +0,0 @@
{
"python.pythonPath": "C:\\python3.8\\python.exe"
}


last_attempt.html (new file, 14038 lines; diff suppressed)


@@ -1,223 +0,0 @@
#!/usr/bin/env python
# coding: utf-8
# In[1]:
import pandas as pd
from pandas import DataFrame, Series
from numpy import nan
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (24,5)
import seaborn as sns
sns.set()
sns.set(font_scale=1.75)
# In[2]:
cols = [
"TEST","MAP","SIZE", "TIME"
]
onmodded = pd.read_csv("new_repr.csv", quotechar="'", header=None)
onmodded = onmodded.drop(columns=[0])
onmodded.columns = cols
onmodded = onmodded.drop(index=[0])
onmodded.iloc[:, 2:] = onmodded.iloc[:, 2:].astype("int32")
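# after this cleanup each row is one measurement in long format, e.g.
# (hypothetical values): TEST='int_insert', MAP='std::unordered_map',
# SIZE=50000, TIME=83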
styles = {'absl::flat_hash_map': ["#0000cc"], # blue
"absl::node_hash_map'": ["#3366ff"],
'absl::node_hash_map': ["#99ccff"],
'google::dense_hash_map': ["#ff0000"], # reds
'google::sparse_hash_map': ["#ff6666"],
'phmap::parallel_flat_hash_map': ["#ff0066"],
'ska::bytell_hash_map': ["#009933"], # greens
'ska::flat_hash_map': ["#33cc33"],
'ska::unordered_map': ["#99ff66"],
'tsl::hopscotch_map': ["#9900cc"], # purples
'tsl::robin_map': ["#cc33ff"],
'tsl::sparse_map': ["#cc99ff"],
'robin_hood::unordered_flat_map': ["#ffcc99"],
'robin_hood::unordered_node_map': ["#ccff66"],
'boost::unordered::unordered_map': ["#663300"], # brown
'emilib::HashMap': ["#9900cc"], # purple
# weird orange
'phmap::parallel_node_hash_map': ["#ffcc66", "solid"],
'std::unordered_map': ["#000000", "solid"], # black
}
ticks = [50000, 100000, 150000, 200000, 250000, 300000, 350000, 400000, 500000,
600000, 700000, 800000, 900000, 1000000,
2000000, 3000000, 4000000, 5000000, 6000000, 7000000, 8000000, 9000000, 10000000,
15000000, 20000000, 25000000, 30000000, 35000000, 40000000, 45000000, 50000000]
ticklabels = ['50 K', '100 K',
'150 K', '200 K', '250 K', '300 K',
'350 K', '400 K', '0.5 M', '0.6 M',
'0.7 M', '0.8 M', '0.9 M', '1 M',
'2 M', '3 M', '4 M', '5 M',
'6 M', '7 M', '8 M', '9 M',
'10 M', '15 M', '20 M', '25 M',
'30 M', '35 M', '40 M', '45 M', '50 M']
labels = {
    'int_delete': ["mean int deletion time", "deletion time (ns)"],
    'int_insert': ["mean int insertion time", "insertion time (ns)"],
    'int_nosucc_lookup': ["mean int unsuccessful lookup time", "unsuccessful lookup time (ns)"],
    'int_succ_lookup': ["mean int successful lookup time", "successful lookup time (ns)"],
    'string_delete': ["mean string deletion time", "deletion time (ns)"],
    'string_insert': ["mean string insertion time", "insertion time (ns)"],
    'string_nosucc_lookup': ["mean string unsuccessful lookup time", "unsuccessful lookup time (ns)"],
    'string_succ_lookup': ["mean string successful lookup time", "successful lookup time (ns)"]
}
# In[3]:
# outlier testing functions
def remove_with_modified_z_score(data, threshold=3.5):
    # https://www.itl.nist.gov/div898/handbook/eda/section3/eda35h.htm
    data = data.astype(int)
    median_absolute_deviation = abs(data - data.median()).median()
    if not median_absolute_deviation:
        return data
    modified_z_scores = abs(0.6745 * (data - data.median()) / median_absolute_deviation)
    # keep values inside the threshold, turn outliers into NaN
    return data.where(modified_z_scores <= threshold)
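# worked example with made-up numbers: for pd.Series([100, 101, 99, 500]) the
# median is 100.5 and the MAD is 1.0, so 500 gets a modified z-score of about
# 269.5 (far above the 3.5 threshold) and becomes NaN; the rest pass through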
# function that takes one of the outlier testers and data, and removes outliers
def remove_outlier(data, function):
    # reset the index so the positional block arithmetic below lines up
    new_data = data.copy(True).reset_index(drop=True)
    new_data["TIME"] = new_data["TIME"].astype(int)
    new_data["SIZE"] = new_data["SIZE"].astype(int)
    # 4216 (TEST, MAP, SIZE) combinations, 30 repeated measurements each
    for i in range(4216):
        start = i * 30
        end = start + 29  # .loc slicing is inclusive on both ends
        new_data.loc[start:end, "TIME"] = function(new_data.loc[start:end, "TIME"])
        if not i % 420:
            print(i / 42, "% done")
    return new_data
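# a vectorized sketch of the same idea (not in the original; assumes TEST,
# MAP and SIZE together identify one block of repeated measurements):
def remove_outlier_grouped(data, function):
    new_data = data.copy(True)
    new_data["TIME"] = new_data["TIME"].astype(int)
    new_data["SIZE"] = new_data["SIZE"].astype(int)
    new_data["TIME"] = new_data.groupby(["TEST", "MAP", "SIZE"])["TIME"].transform(function)
    return new_data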
# helpers for plot functions
def sort_maps(test):
    # order the maps for one test by their maximum time, so the plots can
    # group maps of similar speed (gr_max is defined further down)
    maps = data[data["TEST"] == test]["MAP"].unique()
    new = [(gr_max.loc[test, i]["TIME"], i) for i in maps]
    new.sort()
    return [i[1] for i in new]
def divider(df, maplist):
filters = df['MAP'].isin(maplist)
return df[filters]
def plotter2(test, data):
    mydata = data[data["TEST"] == test]
    maps = sort_maps(test)
    # split the maps into three groups of similar speed so each plot stays readable
    subsets = [divider(mydata, maps[:5]),
               divider(mydata, maps[5:11]),
               divider(mydata, maps[11:])]
    for n, subset in enumerate(subsets, start=1):
        # seaborn aggregates the repeated measurements per SIZE into a mean
        # line with a 95% confidence band by default
        plot = sns.lineplot(x="SIZE", y="TIME", hue="MAP", data=subset)
        plt.xscale("log")
        plt.xticks(ticks, ticklabels)
        plot.set_xticklabels(
            plot.get_xticklabels(),
            rotation=55,
            horizontalalignment='center',
            fontweight='light',
        )
        plt.ylabel(labels[test][1])
        plt.legend()
        plt.title(labels[test][0])
        plt.savefig("./plots/{}/{}.png".format(test, n))
        plt.clf()
# In[4]:
data = remove_outlier(onmodded, remove_with_modified_z_score)
groups = data.groupby(["TEST", "MAP"])
gr_max = groups.max()
gr_mean = groups.mean()
# In[5]:
import pathlib
tests = data["TEST"].unique()
for i in tests:
    # make sure the output directory exists before plotter2 calls savefig
    pathlib.Path("./plots/{}".format(i)).mkdir(parents=True, exist_ok=True)
    plotter2(i, data)


@@ -1,90 +0,0 @@
import seaborn as sns
import pandas as pd
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
import pathlib
plt.rcParams["figure.figsize"] = (40, 5)
sns.set()
## new cell
cols = [
"TEST", "MAP", '50000', '100000', '150000', '200000', '250000', '300000', '350000', '400000',
'500000', '600000', '700000', '800000', '900000', '1000000', '2000000', '3000000',
'4000000', '5000000', '6000000', '7000000', '8000000', '9000000', '10000000', '15000000',
'20000000', '25000000', '30000000', '35000000', '40000000', '45000000', '50000000'
]
data = pd.read_csv("results.csv", quotechar="'", header=None)
data.columns = cols
# data.head()
## new cell
groups = data.groupby(["TEST", "MAP"])
groups_mean = groups.mean()
groups_std = groups.std()
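# shape note (inferred, not in the original): groups_mean and groups_std are
# indexed by the (TEST, MAP) pair with one column per size, so e.g.
# groups_mean.loc["int_insert", "std::unordered_map"] is a Series of mean
# times and groups_std.loc[...] holds the matching standard deviations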
## new cell
def max_val(hmap, test):
return groups_mean.loc[test, hmap].max()
def sort_maps(test):
maps = list(groups_mean.loc[test].index)
new = [(max_val(i, test), i) for i in maps]
new.sort()
new = [i[1] for i in new]
return new
def plot_test(test, include_error=True, log=False):
    maps = sort_maps(test)
    # walk from the slowest map (index 16) down to the fastest (index 0);
    # a figure is saved and cleared whenever count hits 11, 5 or 0, so each
    # figure holds six maps, with the boundary map re-plotted in the next
    # figure as a reference
    count = 16
    repeats = [0, 5, 11]
    while count > -1:
        if not count and count not in repeats:
            break
        mp = maps[count]
        if include_error:
            # +/- 1 standard deviation error bars -- what this commit replaces
            # with seaborn's 95% confidence bands
            plt.errorbar(groups_mean.columns, groups_mean.loc[test, mp],
                         yerr=groups_std.loc[test, mp], label=mp)
        else:
            plt.plot(groups_mean.columns, groups_mean.loc[test, mp], label=mp)
        if count in repeats:
            if log:
                plt.xscale("log")
            plt.ylabel("{} time (ns)".format(test))
            plt.legend()
            plt.title(test)
            plt.savefig("./plots/{}/{}.png".format(test, count))
            plt.clf()
            repeats.remove(count)
        else:
            count -= 1
## new cell
tests = data["TEST"].unique()
p = pathlib.Path("./plots")
if not p.is_dir():
p.mkdir()
for i in tests:
path = pathlib.Path("./plots/{}/".format(i))
if not path.is_dir():
path.mkdir()
print(i)
plot_test(i, False)
# break


@@ -1,5 +1,8 @@
 # Files and what they contain
+# HTML export for those who don't have a data science stack + python installed
+[last_attempt.html](./last_attempt.html)
 ## results.csv
 [Results.csv](./results.csv) has the raw data as outputted by the benchmark code
@@ -17,12 +20,13 @@ to
 ## new_repr_no_outlier.csv
-[New_repr.csv](./new_repr_no_outlier.csv) has the data from new_repr but with outliers removed with the modified z score test
+[new_repr_no_outlier.csv](./new_repr_no_outlier.csv) has the data from new_repr but with outliers removed with the modified z score test
 # Notebooks
+## Better_plotting
+better plotting is an incomplete and now broken notebook where we attempted to plot on the raw results
 ## Last_attempt
 Last attempt has our last attempt at plotting, plus some random stuff at the end


to.py (deleted file, 28 lines)

@@ -1,28 +0,0 @@
styles = {'absl::flat_hash_map': ["#0000cc", "--"], # blue
"absl::node_hash_map'": ["#3366ff", "--"],
'absl::node_hash_map': ["#99ccff", "--"],
'google::dense_hash_map': ["#ff0000", "-."], # reds
'google::sparse_hash_map': ["#ff6666", "-,"],
'phmap::parallel_flat_hash_map': ["#ff0066", "-."],
'ska::bytell_hash_map': ["#009933", "- "], # greens
'ska::flat_hash_map': ["#33cc33", "- "],
'ska::unordered_map': ["#99ff66", "- "],
'tsl::hopscotch_map': ["#9900cc", ":"], # purples
'tsl::robin_map': ["#cc33ff", ":"],
'tsl::sparse_map': ["#cc99ff", ":"],
'robin_hood::unordered_flat_map': ["#ffcc99", ".."],
'robin_hood::unordered_node_map': ["#ccff66", ".."],
'boost::unordered::unordered_map': ["#663300", "solid"], # brown
'emilib::HashMap': ["#9900cc", "solid"], # purple
# weird orange
'phmap::parallel_node_hash_map': ["#ffcc66", "solid"],
'std::unordered_map': ["#000000", "solid"], # black
}
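
For context, a minimal sketch of how a color/linestyle table like the styles dict above would be consumed; the sizes and times below are made-up placeholders, not benchmark data:

import matplotlib.pyplot as plt

sizes = [50000, 100000, 150000]  # hypothetical x values
times = [120, 135, 150]          # hypothetical mean times in ns

color, linestyle = styles['std::unordered_map']
plt.plot(sizes, times, color=color, linestyle=linestyle, label='std::unordered_map')
plt.legend()
plt.show()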