updated to do 95% CI instead of 1 std error bars. As if a difference is even noticeable

MassiveAtoms 2020-05-11 21:28:43 -03:00
parent 607cd2e1dd
commit 529fd8841a
14 changed files with 14286 additions and 5036 deletions
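For context on the change above, here is a minimal sketch (not from this repository) of how the two kinds of error bars compare for a single measurement group; the timing values are made up, and it assumes roughly 30 repeats per point, as the plotting code below does.

import numpy as np

# made-up timings for one (map, size) point, ~30 repeats as in the benchmark data
times = np.array([102, 98, 105, 97, 110, 101, 99, 103, 100, 104,
                  96, 108, 102, 99, 101, 107, 98, 100, 103, 105,
                  97, 102, 109, 100, 98, 104, 101, 99, 106, 102], dtype=float)

mean = times.mean()
std = times.std(ddof=1)              # sample standard deviation (the old error bars)
sem = std / np.sqrt(len(times))      # standard error of the mean
ci95 = 1.96 * sem                    # normal-approximation 95% CI half-width

print("mean         = {:.1f} ns".format(mean))
print("+/-1 std bar = {:.1f} ns".format(std))
print("95% CI bar   = {:.1f} ns".format(ci95))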


@@ -1,3 +0,0 @@
{
"python.pythonPath": "C:\\python3.8\\python.exe"
}

last_attempt.html (normal file, 14038 lines)


@@ -1,223 +0,0 @@
#!/usr/bin/env python
# coding: utf-8
# In[1]:
import os
import pandas as pd
from pandas import DataFrame, Series
from numpy import nan
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (24, 5)
import seaborn as sns
sns.set(font_scale=1.75)
# In[2]:
cols = ["TEST", "MAP", "SIZE", "TIME"]
# load the reshaped benchmark results
onmodded = pd.read_csv("new_repr.csv", quotechar="'", header=None)
onmodded = onmodded.drop(columns=[0])   # drop the leading unnamed column
onmodded.columns = cols
onmodded = onmodded.drop(index=[0])     # drop the first row (likely the original header, since header=None)
onmodded.iloc[:, 2:] = onmodded.iloc[:, 2:].astype("int32")
styles = {'absl::flat_hash_map': ["#0000cc"], # blue
"absl::node_hash_map'": ["#3366ff"],
'absl::node_hash_map': ["#99ccff"],
'google::dense_hash_map': ["#ff0000"], # reds
'google::sparse_hash_map': ["#ff6666"],
'phmap::parallel_flat_hash_map': ["#ff0066"],
'ska::bytell_hash_map': ["#009933"], # greens
'ska::flat_hash_map': ["#33cc33"],
'ska::unordered_map': ["#99ff66"],
'tsl::hopscotch_map': ["#9900cc"], # purples
'tsl::robin_map': ["#cc33ff"],
'tsl::sparse_map': ["#cc99ff"],
'robin_hood::unordered_flat_map': ["#ffcc99"],
'robin_hood::unordered_node_map': ["#ccff66"],
'boost::unordered::unordered_map': ["#663300"], # brown
'emilib::HashMap': ["#9900cc"], # purple
# weird orange
'phmap::parallel_node_hash_map': ["#ffcc66", "solid"],
'std::unordered_map': ["#000000", "solid"], # black
}
ticks = [50000, 100000, 150000, 200000, 250000, 300000, 350000, 400000, 500000,
600000, 700000, 800000, 900000, 1000000,
2000000, 3000000, 4000000, 5000000, 6000000, 7000000, 8000000, 9000000, 10000000,
15000000, 20000000, 25000000, 30000000, 35000000, 40000000, 45000000, 50000000]
ticklabels = ['50 K', '100 K',
'150 K', '200 K', '250 K', '300 K',
'350 K', '400 K', '0.5 M', '0.6 M',
'0.7 M', '0.8 M', '0.9 M', '1 M',
'2 M', '3 M', '4 M', '5 M',
'6 M', '7 M', '8 M', '9 M',
'10 M', '15 M', '20 M', '25 M',
'30 M', '35 M', '40 M', '45 M', '50 M']
labels = {
    'int_delete': ["mean int deletion time", "deletion time (ns)"],
    'int_insert': ["mean int insertion time", "insertion time (ns)"],
    'int_nosucc_lookup': ["mean int unsuccessful lookup time", "unsuccessful lookup time (ns)"],
    'int_succ_lookup': ["mean int successful lookup time", "successful lookup time (ns)"],
    'string_delete': ["mean string deletion time", "deletion time (ns)"],
    'string_insert': ["mean string insertion time", "insertion time (ns)"],
    'string_nosucc_lookup': ["mean string unsuccessful lookup time", "unsuccessful lookup time (ns)"],
    'string_succ_lookup': ["mean string successful lookup time", "successful lookup time (ns)"]
}
# In[3]:
# outlier-removal functions
def remove_with_modified_z_score(data, threshold=3.5):
    # modified z-score test, https://www.itl.nist.gov/div898/handbook/eda/section3/eda35h.htm
    data = data.astype(int)
    median_absolute_deviation = abs(data - data.median()).median()
    if not median_absolute_deviation:
        # MAD of 0 (more than half the values identical) would divide by zero; keep everything
        return data
    # 0.6745 scales the MAD so the score is comparable to an ordinary z-score
    modified_z_scores = abs(0.6745 * (data - data.median()) / median_absolute_deviation)
    cutoff = modified_z_scores <= threshold
    # zero out the outliers via the boolean mask, then turn those zeros into NaN
    # so they are ignored by the later groupby aggregations and by seaborn
    data = data * cutoff
    data = data.replace(0, nan)
    return data
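# Quick illustration (not in the original notebook): for Series([100, 101, 99, 250])
# the median is 100.5 and the median absolute deviation is 1.0, so 250 gets a
# modified z-score of about 100 (far above the 3.5 threshold) and comes back as NaN,
# while the other three values pass through unchanged:
#   remove_with_modified_z_score(Series([100, 101, 99, 250]))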
# applies an outlier filter to every block of 30 repeated measurements
# (8 tests x 17 maps x 31 sizes = 4216 groups, 30 repeats each)
def remove_outlier(data, function):
    new_data = data.copy(True)
    # TIME becomes float so the NaN placeholders produced by the filter can be stored
    new_data["TIME"] = new_data["TIME"].astype(float)
    new_data["SIZE"] = new_data["SIZE"].astype(int)
    time_col = new_data.columns.get_loc("TIME")
    for i in range(4216):
        start = i * 30
        end = start + 30  # iloc slicing is end-exclusive: exactly 30 rows per window
        new_data.iloc[start:end, time_col] = function(new_data.iloc[start:end, time_col])
        if not i % 420:
            print(i / 42, "% done")  # 4216 groups, so roughly 42 groups per percent
    return new_data
# helpers for the plot function below; sort_maps relies on the `data` and `gr_max`
# globals created in the cell further down
def sort_maps(test):
    # order the maps by their worst (largest) aggregated TIME, slowest last
    maps = data[data["TEST"] == test]["MAP"].unique()
    new = [(gr_max.loc[test, i]["TIME"], i) for i in maps]
    new.sort()
    new = [i[1] for i in new]
    return new

def divider(df, maplist):
    # keep only the rows belonging to the maps in maplist
    filters = df['MAP'].isin(maplist)
    return df[filters]
# plots one benchmark test, split into three panels grouped by speed (fastest 5,
# middle 6, slowest rest) so the curves and legends stay readable; sns.lineplot
# aggregates the repeated measurements at each SIZE into a mean line with a 95% CI band
def plotter2(test, data):
    os.makedirs("./plots/{}".format(test), exist_ok=True)  # make sure the output dir exists
    mydata = data[data["TEST"] == test]
    maps = sort_maps(test)
    subsets = [divider(mydata, maps[:5]),
               divider(mydata, maps[5:11]),
               divider(mydata, maps[11:])]
    for n, subset in enumerate(subsets, start=1):
        plot = sns.lineplot(x="SIZE", y="TIME", hue="MAP", data=subset)
        plt.xscale("log")
        plt.xticks(ticks, ticklabels)
        plot.set_xticklabels(
            plot.get_xticklabels(),
            rotation=55,
            horizontalalignment='center',
            fontweight='light',
        )
        plt.ylabel(labels[test][1])
        plt.legend()
        plt.title(labels[test][0])
        plt.savefig("./plots/{}/{}.png".format(test, n))
        plt.clf()
# In[4]:
# remove outliers, then group by (TEST, MAP); gr_max drives the panel ordering in sort_maps
data = remove_outlier(onmodded, remove_with_modified_z_score)
groups = data.groupby(["TEST", "MAP"])
gr_max = groups.max()
gr_mean = groups.mean()
# In[5]:
# one set of plots per benchmark test
tests = data["TEST"].unique()
for i in tests:
    plotter2(i, data)
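The plotter2 calls above get the 95% confidence band from seaborn itself: lineplot aggregates the repeated measurements at each SIZE into a mean line and, by default in the seaborn releases from around the time of this commit, a bootstrapped 95% CI band. A small sketch, reusing the data frame built above, that makes the setting explicit and shows the older standard-deviation band for comparison:

# the default, written out explicitly: mean line with a 95% confidence band
subset = data[data["TEST"] == "int_insert"]
sns.lineplot(x="SIZE", y="TIME", hue="MAP", data=subset, ci=95)
plt.xscale("log")
plt.show()

# ci="sd" draws a +/-1 standard-deviation band instead, roughly what the
# errorbar-based script below produced
sns.lineplot(x="SIZE", y="TIME", hue="MAP", data=subset, ci="sd")
plt.xscale("log")
plt.show()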


@@ -1,90 +0,0 @@
import seaborn as sns
import pandas as pd
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
import pathlib
import os
plt.rcParams["figure.figsize"] = (40, 5)
sns.set()
## new cell
cols = [
"TEST", "MAP", '50000', '100000', '150000', '200000', '250000', '300000', '350000', '400000',
'500000', '600000', '700000', '800000', '900000', '1000000', '2000000', '3000000',
'4000000', '5000000', '6000000', '7000000', '8000000', '9000000', '10000000', '15000000',
'20000000', '25000000', '30000000', '35000000', '40000000', '45000000', '50000000'
]
data = pd.read_csv("results.csv", quotechar="'", header=None)
data.columns = cols
# data.head()
## new cell
groups = data.groupby(["TEST", "MAP"])
groups_mean = groups.mean()
groups_std = groups.std()
## new cell
# order the maps by their worst (largest) mean time for a test, so that maps of
# similar speed end up in the same panel
def max_val(hmap, test):
    return groups_mean.loc[test, hmap].max()

def sort_maps(test):
    maps = list(groups_mean.loc[test].index)
    new = [(max_val(i, test), i) for i in maps]
    new.sort()
    new = [i[1] for i in new]
    return new
def plot_test(test, include_error=True, log=False):
    sizes = [50000, 100000, 150000, 200000, 250000, 300000, 350000, 400000, 500000,
             600000, 700000, 800000, 900000, 1000000,
             2000000, 3000000, 4000000, 5000000, 6000000, 7000000, 8000000, 9000000, 10000000,
             15000000, 20000000, 25000000, 30000000, 35000000, 40000000, 45000000, 50000000]
    maps = sort_maps(test)
    # print(maps)
    # walk the maps from slowest (index 16, assuming 17 maps) down to fastest (index 0),
    # splitting them into three panels; the boundary indices in `repeats` are plotted
    # again in the next panel so consecutive panels share one map for comparison
    count = 16
    repeats = [0, 5, 11]
    while count > -1:
        if not count and count not in repeats:
            # map 0 has already been plotted and its panel saved
            break
        mp = maps[count]
        if include_error:
            # +/-1 standard-deviation bars (what this commit replaces with 95% CIs)
            plt.errorbar(groups_mean.columns, groups_mean.loc[test, mp], yerr=groups_std.loc[test, mp], label=mp)
        else:
            plt.plot(groups_mean.columns, groups_mean.loc[test, mp], label=mp)
        if count in repeats:
            # panel boundary: finish and save the current figure, then start the next one
            if log:
                plt.xscale("log")
            plt.ylabel("{} time (ns)".format(test))
            plt.legend()
            plt.title(test)
            plt.savefig("./plots/{}/{}.png".format(test, count))
            plt.clf()
            # plt.show()
            repeats.pop(repeats.index(count))
        else:
            count -= 1
## new cell
tests = data["TEST"].unique()
p = pathlib.Path("./plots")
if not p.is_dir():
    p.mkdir()
for i in tests:
    path = pathlib.Path("./plots/{}/".format(i))
    if not path.is_dir():
        path.mkdir()
    print(i)
    plot_test(i, False)  # False: plain mean lines, without the +/-1 std error bars
    # break
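The include_error branch above draws +/-1 standard-deviation bars (yerr=groups_std). A hedged sketch of how those could be turned into the 95% confidence intervals the commit message describes, using the usual normal approximation; n_repeats is an assumption (roughly 30 repeats per row group), not something read from results.csv:

import numpy as np

n_repeats = 30                                  # assumed repeat count behind each mean
groups_sem = groups_std / np.sqrt(n_repeats)    # standard error of the mean
groups_ci95 = 1.96 * groups_sem                 # 95% CI half-widths

# inside plot_test, the errorbar call could then use the CI half-widths instead:
# plt.errorbar(groups_mean.columns, groups_mean.loc[test, mp],
#              yerr=groups_ci95.loc[test, mp], label=mp)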


@@ -1,5 +1,8 @@
# Files and what they contain
# HTML export for those who don't have a data science stack + python installed
[last_attempt.html](./last_attempt.html)
## results.csv
[Results.csv](./results.csv) has the raw data as output by the benchmark code
@@ -17,12 +20,13 @@ to
## new_repr_no_outlier.csv
[New_repr.csv](./new_repr_no_outlier.csv) has the data from no_repr but with outliers removed with the modified z score test
[new_repr_no_outlier.csv](./new_repr_no_outlier.csv) has the data from no_repr but with outliers removed with the modified z score test
# Notebooks
## Better_plotting
Better_plotting is an incomplete and now broken notebook in which we attempted to plot the raw results directly
## Last_attempt
Last_attempt contains our most recent attempt at plotting, plus some leftover cells at the end


to.py (28 lines)

@@ -1,28 +0,0 @@
# colour and matplotlib linestyle for each map
styles = {'absl::flat_hash_map': ["#0000cc", "--"], # blue
"absl::node_hash_map'": ["#3366ff", "--"],
'absl::node_hash_map': ["#99ccff", "--"],
'google::dense_hash_map': ["#ff0000", "-."], # reds
'google::sparse_hash_map': ["#ff6666", "-."],
'phmap::parallel_flat_hash_map': ["#ff0066", "-."],
'ska::bytell_hash_map': ["#009933", "-"], # greens
'ska::flat_hash_map': ["#33cc33", "-"],
'ska::unordered_map': ["#99ff66", "-"],
'tsl::hopscotch_map': ["#9900cc", ":"], # purples
'tsl::robin_map': ["#cc33ff", ":"],
'tsl::sparse_map': ["#cc99ff", ":"],
'robin_hood::unordered_flat_map': ["#ffcc99", ":"],
'robin_hood::unordered_node_map': ["#ccff66", ":"],
'boost::unordered::unordered_map': ["#663300", "solid"], # brown
'emilib::HashMap': ["#9900cc", "solid"], # purple
# weird orange
'phmap::parallel_node_hash_map': ["#ffcc66", "solid"],
'std::unordered_map': ["#000000", "solid"], # black