diff --git a/graphs-big.pdf b/graphs-big.pdf new file mode 100644 index 0000000..e501c46 Binary files /dev/null and b/graphs-big.pdf differ diff --git a/last_attempt.html b/graphs.html similarity index 81% rename from last_attempt.html rename to graphs.html index 038d69f..d78c2db 100644 --- a/last_attempt.html +++ b/graphs.html @@ -1,14038 +1,100 @@ - - + + + -last_attempt - - - - - - - - - - - - - - - - - - - - - -
-
+

Integers as keys

+

+ Deletion time +

+

Fastest 5

+ -
-
-
-

Importing libs and setting some defaults like the plot size and forcing matplotlib to use seaborn color schemes with softer, more palatable colors

-
-
-
-
-
-
In [1]:
-
-
-
import pandas as pd
-from pandas import DataFrame, Series
-from numpy import nan
-import matplotlib.pyplot as plt
-import statsmodels.api as sm
-import statsmodels.formula.api as smf
-import numpy as np
-plt.rcParams["figure.figsize"] = (24,5)
-import seaborn as sns
-sns.set()
-sns.set(font_scale=1.75)
-
+

Slowest 5

-
-
-
- -
-
-
-
-

Some default styles that will be used for some functions such as tickmarks, labels, etc

-
-
-
-
-
-
In [2]:
-
-
-
cols = [
-    "TEST","MAP","SIZE", "TIME"
-]
-onmodded = pd.read_csv("new_repr.csv", quotechar="'", header=None)
-
-onmodded = onmodded.drop([0],1)
-onmodded.columns= cols
-onmodded = onmodded.drop([0],0)
-onmodded.iloc[:,2:] = onmodded.iloc[:,2:].astype("int32")
-
-# used for plotter and plotter3, but we can't choose nice looking and distinct enough colors to plot 17 different lines.
-# ultimately abandoned in favor of the default styles from seaborn
-styles = {'absl::flat_hash_map': ["#0000cc"],  # blue
-          "absl::node_hash_map'": ["#3366ff"],
-          'absl::node_hash_map': ["#99ccff"],
-
-          'google::dense_hash_map': ["#ff0000"],  # reds
-          'google::sparse_hash_map': ["#ff6666"],
-          'phmap::parallel_flat_hash_map': ["#ff0066"],
-
-          'ska::bytell_hash_map': ["#009933"],  # greens
-          'ska::flat_hash_map': ["#33cc33"],
-          'ska::unordered_map': ["#99ff66"],
-
-          'tsl::hopscotch_map': ["#9900cc"],  # purples
-          'tsl::robin_map': ["#cc33ff"],
-          'tsl::sparse_map': ["#cc99ff"],
-
-          'robin_hood::unordered_flat_map': ["#ffcc99"],
-          'robin_hood::unordered_node_map': ["#ccff66"],
-
-          'boost::unordered::unordered_map': ["#663300"],  # brown
-
-          'emilib::HashMap': ["#9900cc"],  # purple
-
-          # weird orange
-          'phmap::parallel_node_hash_map': ["#ffcc66", "solid"],
-
-          'std::unordered_map': ["#000000", "solid"],  # black
-         }
-# where to place the x tickmarks and how to label them
-ticks = [50000, 100000, 150000, 200000, 250000, 300000, 350000, 400000, 500000,
-600000, 700000, 800000, 900000, 1000000,
-2000000, 3000000, 4000000, 5000000, 6000000, 7000000, 8000000, 9000000, 10000000,
-15000000, 20000000, 25000000, 30000000, 35000000, 40000000, 45000000, 50000000]
-ticklabels = ['50 K', '100 K',
- '150 K', '200 K', '250 K', '300 K',
- '350 K', '400 K', '0.5 M', '0.6 M',
- '0.7 M', '0.8 M', '0.9 M', '1 M',
- '2 M', '3 M', '4 M', '5 M',
- '6 M', '7 M', '8 M', '9 M',
- '10 M', '15 M', '20 M', '25 M',
- '30 M', '35 M', '40 M', '45 M', '50 M']
-# convenient way to label the plots for later
-labels = {
-    'int_delete' : ["mean int deletion time", "deletion time (ns)"],
-    'int_insert' : ["mean int insertion time", "insertion time(ns)"],
-    'int_nosucc_lookup' : ["mean int unsucessful lookup time", "unsucessful lookup time (ns)"],
-    'int_succ_lookup' : ["mean int succesful lookup time", "succesful lookup time (ns)"],
-    'string_delete' : ["mean string deletion time", "deletion time (ns)"],
-    'string_insert' : ["mean string insertion time", "insertion time(ns)"], 
-    'string_nosucc_lookup' : ["mean string unsucessful lookup time", "unsucessful lookup time (ns)"],
-    'string_succ_lookup' : ["mean string succesful lookup time", "succesful lookup time (ns)"]
-}
-
- -
-
-
- -
-
-
-
-

Function definitions

Some functions that accept and delete outliers in a range
-A function that removes outliers with the functions mentioned above
-Some helper functions
-Some plotter functions

- -
-
-
-
-
-
In [3]:
-
-
-
# outlier testing functions
-def remove_with_modified_z_score(data, treshold=3.5):
-    # https://www.itl.nist.gov/div898/handbook/eda/section3/eda35h.htm
-    data = data.astype(int)
-    stats = data.describe()
-    median_absolute_deviation = abs(data - data.median()).median()
-    if not median_absolute_deviation:
-        return data
-    modified_z_scores = abs(0.6745 * (data - data.median()) / median_absolute_deviation)
-    cutoff = modified_z_scores <= treshold
-    data = data * cutoff
-    data = data.replace(0, nan)
-    return data
-
-#function that takes one of the outlier testers and data, and removes outliers
-# originally, we had multiple outlier tests we tried out and this made it convenient to check the differences
-# now it's just the modified_z_score, which we chose
-def remove_outlier(data, function):
-    new_data = data.copy(True)
-    new_data["TIME"] = new_data["TIME"].astype(int)
-    new_data["SIZE"] = new_data["SIZE"].astype(int)
-    new_data
-    for i in range(4216):
-        start = i * 30
-        end = start+30
-        new_data.loc[start:end, "TIME"] = function(data.loc[start:end, "TIME"])
-        if not i % 420:
-            print(i / 42 , "% done")
-    return new_data
-
-
-# helpers for plot functions
-def sort_maps(test):
-    maps = data[data["TEST"]== test]["MAP"].unique()
-    new = [(gr_max.loc[test, i]["TIME"], i) for i in maps]
-    new.sort()
-    new = [i[1] for i in new]
-    return new
-
-def divider(df, maplist):
-    filters = df['MAP'].isin(maplist)
-    return df[filters]
-
-
-
-
-# These are in no way optimized
-# way too slow for normal use
-# And not particularly reusable
-
-def plotter(test, groups):
-    maps = sort_maps(test)
-    plot_now = [5,11,16]
-#     plot_now = [2,5,8,11,14,16]
-
-    for mp in range(0,17):
-        map_data = groups.get_group(('int_insert', maps[mp]))        
-        sns.lineplot(x="SIZE", y="TIME", data=map_data, err_style="bars", color=styles[maps[mp]][0], label=maps[mp], linestyle="--")
-#         sns.lmplot(x=map_data["SIZE"], y=map_data["TIME"], color=styles[maps[mp]][0], label=maps[mp])
-        if mp in plot_now:
-            plt.xscale("log")
-            plt.xticks(ticks, ticklabels)
-            plt.set_xticklabels(rotation=45)
-            plt.ylabel(labels[test][1])
-            plt.legend()
-            plt.title(labels[test][0])
-            plt.show()
-
-def plotter2(test, data):
-    mydata = data[data["TEST"] == test]
-    maps = sort_maps(test)
-    set1 = divider(mydata, maps[:5])
-    set3 = divider(mydata, maps[12:])
-    # ^ updated this to only do the top 5 and bottom 5
-    plot = sns.lineplot(x="SIZE", y="TIME", hue="MAP", data=set1, ci=95) # < updated to use 95% CI
-    plt.xscale("log")
-    plt.xticks(ticks, ticklabels)
-    plot.set_xticklabels(
-        plot.get_xticklabels(), 
-        rotation=55, 
-        horizontalalignment='center',
-        fontweight='light',
-    )
-    plt.ylabel(labels[test][1])
-    plt.legend()
-    plt.title(labels[test][0])
-    plt.show()
-    
-    plot = sns.lineplot(x="SIZE", y="TIME", hue="MAP", data=set3, ci=95) # < updated to use 95% CI
-    plt.xscale("log")
-    plt.xticks(ticks, ticklabels)
-    plot.set_xticklabels(
-        plot.get_xticklabels(), 
-        rotation=55, 
-        horizontalalignment='center',
-        fontweight='light',
-    )
-    plt.ylabel(labels[test][1])
-    plt.legend()
-    plt.title(labels[test][0])
-    plt.show()
-
-    
-def plotter3(test, groups):
-    maps = sort_maps(test)
-    plot_now = [5,11,16]
-#     plot_now = [2,5,8,11,14,16]
-    for mp in range(0,17):
-        map_data = groups.get_group(('int_insert', maps[mp]))    
-        sns.lineplot(x="SIZE", y="TIME", data=map_data, err_style="band", color=styles[maps[mp]][0], label=maps[mp], linestyle="--")
-#         sns.regplot(x="SIZE", y="TIME", color=styles[maps[mp]][0],label=maps[mp], data=map_data)
-        if mp in plot_now:
-            plt.xscale("log")
-            plt.xticks(ticks, ticklabels)
-            plt.ylabel(labels[test][1])
-            plt.legend()
-#             plt.set_xticklabels(rotation=45)
-            plt.title(labels[test][0])
-            plt.show()
-
- -
-
-
- -
-
-
-
-
    -
  • Remove the outliers
  • -
  • group the data by test and map, in case plotter or plotter3 is used
  • -
  • compute means and max of the map
  • -
- -
-
-
-
-
-
In [4]:
-
-
-
data = remove_outlier(onmodded, remove_with_modified_z_score)
-groups = data.groupby(["TEST", "MAP"])
-gr_max = groups.max()
-gr_mean = groups.mean()
-tests = data["TEST"].unique()
-
- -
-
-
- -
-
- - -
- -
- - -
-
0.0 % done
-10.0 % done
-20.0 % done
-30.0 % done
-40.0 % done
-50.0 % done
-60.0 % done
-70.0 % done
-80.0 % done
-90.0 % done
-100.0 % done
-
-
-
- -
-
- -
-
-
-
In [5]:
-
-
-
plotter2(tests[0], data)
-
- -
-
-
- -
-
- - -
- -
- - - - -
- -
- -
- -
- -
- - - - -
-
+" /> +

+ Insertion time +

+

Fastest 5

-
- -
-
- -
-
-
-
In [6]:
-
-
-
plotter2(tests[1], data)
-
- -
-
-
- -
-
- - -
- -
- - - - -
-
- -
- -
- -
- - - - -
+" /> +

Slowest 5

-
+" /> -
+

Unsuccesful lookup time

+

Fastest 5

-
-
- -
-
-
-
In [7]:
-
-
-
plotter2(tests[2], data)
-
- -
-
-
- -
-
- - -
- -
- - - - -
-
+" /> -
+

Slowest 5

-
- -
- - - - -
-
+" /> -
+

Successful lookup time

+

Fastest 5

-
-
- -
-
-
-
In [8]:
-
-
-
plotter2(tests[3], data)
-
- -
-
-
- -
-
- - -
- -
- - - - -
-
+" />, -
+

slowest 5

-
- -
- - - - -
-
+" /> -
+

Strings as keys

+

Deletion time

+

Fastest 5

-
-
- -
-
-
-
In [9]:
-
-
-
plotter2(tests[4], data)
-
- -
-
-
- -
-
- - -
- -
- - - - -
-
- -
- -
- -
+" /> +

Slowest 5

- -
-
+" /> -
+

Insertion time

+

Fastest 5

-
-
- -
-
-
-
In [10]:
-
-
-
plotter2(tests[5], data)
-
- -
-
-
- -
-
- - -
- -
- - - - -
-
- -
- -
- -
- - - - -
+" /> +

Slowest 5

-
- -
- -
-
- -
-
-
-
In [11]:
-
-
-
plotter2(tests[6], data)
-
- -
-
-
- -
-
+" /> +

Unsuccesful lookup time

+

Fastest 5

-
- -
- - - - -
-
+" /> +

Slowest 5

-
- -
- -
- - - - -
-
- -
- -
-
- -
-
-
-
In [12]:
-
-
-
plotter2(tests[7], data)
-
- -
-
-
- -
-
+" /> +

Successful lookup time

+

Fastest 5

-
- -
- - - - -
-
+" /> -
- -
- -
- - - - -
+

Slowest 5

-
+" /> -
- -
-
- -
-
-
-
In [13]:
-
-
-
data["MAP"].unique()
-
- -
-
-
- -
-
- - -
- -
Out[13]:
- - - - -
-
array(['absl::flat_hash_map', 'absl::node_hash_map',
-       'boost::unordered::unordered_map', 'emilib::HashMap',
-       'google::dense_hash_map', 'google::sparse_hash_map',
-       'phmap::parallel_flat_hash_map', 'phmap::parallel_node_hash_map',
-       'robin_hood::unordered_flat_map', 'robin_hood::unordered_node_map',
-       'ska::bytell_hash_map', 'ska::flat_hash_map', 'ska::unordered_map',
-       'std::unordered_map', 'tsl::hopscotch_map', 'tsl::robin_map',
-       'tsl::sparse_map'], dtype=object)
-
- -
- -
-
- -
-
-
-
-

Quick way of getting the mean time of each operation and map (for all sizes)
-This number is totally meaningless on it's own, and the absolute values (in ns) should just not be mentioned -I think it's meaningless to mention that the mean insertion time of a map is 50ns between sizes 50K and 50M. -It "loses" so much info.
-But it is a decent use to rank the maps on.
-Eventhough it is somewhat biased towards maps that perform decently at large sizes. Or in other words, this ranking makes the fact that a function is blazing fast for small sizes matter way less.
-This is due to the following: -lets say there are 5 measuring points below 1M, and 20 measuring points between 1M and 50M -when that is averaged, way more points will be >1M, so they will weight more for the mean.

-

But the graphs are pretty hard to get a quick overview of after a quick glance, so this is my best attempt to create a summary of the relative rankings. -And I don't have the slightest idea how you would rank them besides a mean value. A max value would be a good idea, except that there's random spiky behavour which have higher values in the middle than at 50M. And it's even less representative of the times as a whole compared to the mean. So we have this.

- -
-
-
-
-
-
In [14]:
-
-
-
test = "string_insert"
-alpha_maps = gr_mean.loc[test]["TIME"].index
-rank = list(gr_mean.loc[test]["TIME"].sort_values().index) # sort by mean
-rank = [rank.index(i) for i in alpha_maps] # get the index for each map in alphamaps
-for i in rank:
-    print(i + 1)
-
- -
-
-
- -
-
- - -
- -
- - -
-
11
-9
-13
-4
-6
-17
-12
-14
-7
-1
-8
-3
-10
-15
-5
-2
-16
-
-
-
- -
-
- -
-
-
-
In [15]:
-
-
-
alpha_maps
-
- -
-
-
- -
-
- - -
- -
Out[15]:
- - - - -
-
Index(['absl::flat_hash_map', 'absl::node_hash_map',
-       'boost::unordered::unordered_map', 'emilib::HashMap',
-       'google::dense_hash_map', 'google::sparse_hash_map',
-       'phmap::parallel_flat_hash_map', 'phmap::parallel_node_hash_map',
-       'robin_hood::unordered_flat_map', 'robin_hood::unordered_node_map',
-       'ska::bytell_hash_map', 'ska::flat_hash_map', 'ska::unordered_map',
-       'std::unordered_map', 'tsl::hopscotch_map', 'tsl::robin_map',
-       'tsl::sparse_map'],
-      dtype='object', name='MAP')
-
- -
- -
-
- -
-
-
-
In [ ]:
-
-
-
 
-
- -
-
-
- -
-
-
-
In [ ]:
-
-
-
 
-
- -
-
-
- -
-
-
-
In [ ]:
-
-
-
 
-
- -
-
-
- -
-
-
-
In [ ]:
-
-
-
 
-
- -
-
-
- -
-
-
-
In [ ]:
-
-
-
 
-
- -
-
-
- -
-
-
-
In [ ]:
-
-
-
 
-
- -
-
-
- -
-
-
-
In [ ]:
-
-
-
 
-
- -
-
-
- -
-
-
- - - - + \ No newline at end of file diff --git a/readme.md b/readme.md index 529a476..aa6f434 100644 --- a/readme.md +++ b/readme.md @@ -1,8 +1,5 @@ # Files and what they contain -# HTML export for those who don't have a data science stack + python installed -[last_attempt.html](./last_attempt.html) - ## results.csv [Results.csv](./results.csv) has the raw data as outputed by the benchmark code @@ -16,7 +13,7 @@ [New_repr.csv](./new_repr.csv) has the raw data, but transformed from `test, map, 50k, ...., 50M` to -`test, map, size, time` +`test, map, size, time` ## new_repr_no_outlier.csv @@ -28,5 +25,8 @@ to Last attempt has our last attempt at plotting, plus some random stuff at the end +# HTML export for those who don't have a data science stack + python installed +[graphs-big.pdf](./graphs-big.pdf) +