# Typed computation definitions for dataikuapi/dss/statistics.py.
#
# Reconstructed from a three-patch series by Clement Stenac (February 2020):
#   1/3  2e4189c1  Experiment creating typed classes for computation definition
#                  ("Still hard to find your children back ...")
#   2/3  bbc08be8  A few more computations
#   3/3  96f1f080  Add quantiles
#
# Patch 1/3 also changes the pre-existing static method
# `_from_computation_or_dict` (its enclosing class is not part of this
# excerpt) so that it accepts the typed objects defined below, by inserting
# these two lines at the top of its body:
#
#     if isinstance(computation_or_dict, ComputationBase):
#         computation_or_dict = computation_or_dict.to_model()


class ComputationBase(object):
    """Base class of the typed computation definitions.

    Every subclass implements :meth:`to_model`, which returns the plain
    ``dict`` describing the computation. The ``grouped_by_*`` helpers wrap
    any computation into a :class:`GroupedComputation`.
    """

    def __init__(self):
        pass

    def to_model(self):
        """Return the ``dict`` describing this computation.

        :raises NotImplementedError: always; subclasses must override.
        """
        raise NotImplementedError(
            "%s must implement to_model()" % type(self).__name__)

    def grouped_by_alphanum(self, column, max_values=10, group_others=False):
        """Wrap this computation in an ``"anum"`` grouping on ``column``.

        :param column: name of the grouping column
        :param max_values: emitted as ``maxValues`` in the grouping
        :param group_others: emitted as ``groupOthers`` in the grouping
        :returns: a :class:`GroupedComputation` wrapping ``self``
        """
        return GroupedComputation(self, {
            "type": "anum",
            "column": column,
            "maxValues": max_values,
            "groupOthers": group_others,
        })

    def grouped_by_bins(self, column, nb_bins=None, bin_size=None, keep_na=False):
        """Wrap this computation in a ``"binned"`` grouping on ``column``.

        Exactly one sizing argument is used: ``nb_bins`` (mode ``FIXED_NB``)
        takes precedence over ``bin_size`` (mode ``FIXED_SIZE``) when both
        are given.

        :param column: name of the grouping column
        :param nb_bins: emitted as ``nbBins`` when not None
        :param bin_size: emitted as ``binSize`` when ``nb_bins`` is None
        :param keep_na: emitted as ``keepNA`` in the grouping
        :returns: a :class:`GroupedComputation` wrapping ``self``
        :raises ValueError: if neither ``nb_bins`` nor ``bin_size`` is given
        """
        if nb_bins is not None:
            mode, size_key, size = "FIXED_NB", "nbBins", nb_bins
        elif bin_size is not None:
            mode, size_key, size = "FIXED_SIZE", "binSize", bin_size
        else:
            # Previously this fell through and returned None, which only
            # failed later, far from the actual mistake.
            raise ValueError(
                "grouped_by_bins requires either nb_bins or bin_size")
        return GroupedComputation(self, {
            "type": "binned",
            "column": column,
            "mode": mode,
            size_key: size,
            "keepNA": keep_na,
        })


class DescriptiveStatistics(ComputationBase):
    """A ``"multi"`` computation of the selected statistics on some columns.

    :param columns: iterable of column names
    :param mean, sum, stddev, variance, skewness, kurtosis, sem: flags
        selecting which statistics are emitted for each column
    """

    # (attribute holding the flag, "type" emitted in the model), in the
    # order the computations are emitted for each column.
    _STATISTICS = (
        ("mean", "mean"),
        ("sum", "sum"),
        ("stddev", "std_dev"),
        ("variance", "variance"),
        ("skewness", "skewness"),
        ("kurtosis", "kurtosis"),
        ("sem", "sem"),
    )

    def __init__(self, columns, mean=False, sum=False, stddev=False, variance=False,
                 skewness=False, kurtosis=False, sem=False):
        self.columns = columns
        self.mean = mean
        self.sum = sum
        self.stddev = stddev
        self.variance = variance
        self.skewness = skewness
        self.kurtosis = kurtosis
        self.sem = sem

    def to_model(self):
        """Return a ``"multi"`` model with one entry per (column, statistic).

        Entries are ordered column by column and, within a column, in the
        order of ``_STATISTICS``.
        """
        enabled = [model_type for attr, model_type in self._STATISTICS
                   if getattr(self, attr)]
        computations = [{"type": model_type, "column": col}
                        for col in self.columns
                        for model_type in enabled]
        return {"type": "multi", "computations": computations}


class Quantiles(ComputationBase):
    """A ``"quantiles"`` computation on one column.

    :param column: column name
    :param freqs: iterable of frequencies; ``None`` selects
        :attr:`DEFAULT_FREQS`
    :param confidence: emitted unchanged as ``confidence``
    """

    DEFAULT_FREQS = (0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99)

    def __init__(self, column, freqs=None, confidence=None):
        self.column = column
        # Copy into a fresh list: a shared mutable default (or a caller's
        # list) must not be altered through this instance.
        self.freqs = list(self.DEFAULT_FREQS if freqs is None else freqs)
        self.confidence = confidence

    def to_model(self):
        """Return the ``"quantiles"`` model for this column."""
        return {
            "type": "quantiles",
            "column": self.column,
            "freqs": list(self.freqs),
            "confidence": self.confidence,
        }


class TTest1Sample(ComputationBase):
    """A ``"ttest_1samp"`` computation on one column.

    :param column: column name
    :param hypothesized_mean: emitted as ``hypothesizedMean``
    """

    def __init__(self, column, hypothesized_mean):
        self.column = column
        self.hypothesized_mean = hypothesized_mean

    def to_model(self):
        """Return the ``"ttest_1samp"`` model for this column."""
        return {
            "type": "ttest_1samp",
            "column": self.column,
            "hypothesizedMean": self.hypothesized_mean,
        }


class DistributionFit(ComputationBase):
    """A ``"fit_distribution"`` computation on one column.

    :param column: column name
    :param type: distribution type, emitted as ``distribution.type``
    :param test: emitted unchanged as ``test``
    :param kwargs: extra entries merged into the ``distribution`` dict
        (a ``type`` entry here would override the ``type`` parameter)
    """

    def __init__(self, column, type="normal", test=True, **kwargs):
        self.column = column
        self.type = type
        self.test = test
        self.distribution_args = kwargs

    def to_model(self):
        """Return the ``"fit_distribution"`` model for this column."""
        distribution = {"type": self.type}
        distribution.update(self.distribution_args)
        return {
            "type": "fit_distribution",
            "column": self.column,
            "distribution": distribution,
            "test": self.test,
        }


class _BasicBivariateComputation(ComputationBase):
    """Shared implementation of two-column computations.

    :param type: value emitted as ``type`` in the model
    :param column1: emitted as ``xColumn``
    :param column2: emitted as ``yColumn``
    """

    def __init__(self, type, column1, column2):
        self.type = type
        self.column1 = column1
        self.column2 = column2

    def to_model(self):
        """Return the model naming the computation type and both columns."""
        return {
            "type": self.type,
            "xColumn": self.column1,
            "yColumn": self.column2,
        }


class Pearson(_BasicBivariateComputation):
    """Two-column computation of type ``"pearson"``."""

    def __init__(self, column1, column2):
        super(Pearson, self).__init__("pearson", column1, column2)


class Covariance(_BasicBivariateComputation):
    """Two-column computation of type ``"covariance"``."""

    def __init__(self, column1, column2):
        # Was super(Pearson, self): a TypeError, since Covariance is not a
        # subclass of Pearson.
        super(Covariance, self).__init__("covariance", column1, column2)


class Spearman(_BasicBivariateComputation):
    """Two-column computation of type ``"spearman"``."""

    def __init__(self, column1, column2):
        # Was super(Pearson, self): a TypeError, since Spearman is not a
        # subclass of Pearson.
        super(Spearman, self).__init__("spearman", column1, column2)


class GroupedComputation(ComputationBase):
    """A computation wrapped in a grouping.

    :param computation: the wrapped computation; must provide ``to_model()``
    :param grouping: ``dict`` describing the grouping, emitted unchanged
    """

    def __init__(self, computation, grouping):
        self.computation = computation
        self.grouping = grouping

    def to_model(self):
        """Return the ``"grouped"`` model nesting the wrapped computation."""
        return {
            "type": "grouped",
            "computation": self.computation.to_model(),
            "grouping": self.grouping,
        }