Skip to content

pyProbabilityDistributionFit API Documentation

distfit

Created on 2022-05-17

see https://stackoverflow.com/questions/6620471/fitting-empirical-distribution-to-theoretical-ones-with-scipy-python/37616966#37616966

@author: https://stackoverflow.com/users/832621/saullo-g-p-castro see https://stackoverflow.com/a/37616966/1497139

@author: https://stackoverflow.com/users/2087463/tmthydvnprt see https://stackoverflow.com/a/37616966/1497139

@author: https://stackoverflow.com/users/1497139/wolfgang-fahl see

BestFitDistribution

Find the best Probability Distribution Function for the given data

Source code in pdffit/distfit.py
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
class BestFitDistribution:
    """
    Find the best Probability Distribution Function for the given data

    Every candidate scipy.stats distribution is fitted to the data and the
    candidates are ranked by the sum of squared errors (SSE) between the
    fitted PDF and a histogram of the data.
    """

    def __init__(
        self,
        data,
        backend: str = "WebAgg",
        distributionNames: list = None,
        debug: bool = False,
    ):
        """
        constructor

        Args:
            data(dataFrame): the data to analyze
            backend(str): the matplotlib backend handed to matplotlib.use
            distributionNames(list): list of distributionNames to try; if None
                every name in _distn_names except "levy_stable" and
                "studentized_range" is used
            debug(bool): if True show debugging information
        """
        self.debug = debug
        self.backend = backend
        self.matplotLibParams()
        if distributionNames is None:
            excluded = ("levy_stable", "studentized_range")
            self.distributionNames = [d for d in _distn_names if d not in excluded]
        else:
            self.distributionNames = distributionNames
        self.data = data

    def matplotLibParams(self):
        """
        set matplotlib parameters

        Sets the default figure size, the "ggplot" style and the backend
        stored in self.backend.
        """
        matplotlib.rcParams["figure.figsize"] = (16.0, 12.0)
        matplotlib.style.use("ggplot")
        matplotlib.use(self.backend)

    # Create models from data
    def best_fit_distribution(self, bins: int = 200, ax=None, density: bool = True):
        """
        Model data by finding best fit distribution to data

        Args:
            bins(int): number of histogram bins used for the error calculation
            ax: optional axis; if given each fitted PDF is plotted onto it
            density(bool): passed on to np.histogram as its density argument

        Returns:
            list: (distribution, params, sse) tuples in ascending order of sse;
                entries with a non-finite sse (NaN/inf) are placed at the end
        """
        # Get histogram of original data
        y, x = np.histogram(self.data, bins=bins, density=density)
        # turn the bin edges into bin centers
        x = (x + np.roll(x, -1))[:-1] / 2.0

        # Best holders
        best_distributions = []
        distributionCount = len(self.distributionNames)
        # Estimate distribution parameters from data
        for ii, distributionName in enumerate(self.distributionNames):

            print(f"{ii+1:>3} / {distributionCount:<3}: {distributionName}")

            distribution = getattr(st, distributionName)

            # Try to fit the distribution
            try:
                # Ignore warnings from data that can't be fit
                with warnings.catch_warnings():
                    warnings.filterwarnings("ignore")

                    # fit dist to data
                    params = distribution.fit(self.data)

                    # Separate parts of parameters:
                    # shape parameters first, then loc and scale
                    arg = params[:-2]
                    loc = params[-2]
                    scale = params[-1]

                    # Calculate fitted PDF and error with fit in distribution
                    pdf = distribution.pdf(x, *arg, loc=loc, scale=scale)
                    sse = np.sum(np.power(y - pdf, 2.0))

                    # if an axis was passed in add the curve to the plot;
                    # plotting is best effort and must not discard the fit
                    if ax is not None:
                        try:
                            pd.Series(pdf, x).plot(ax=ax)
                        except Exception as plotEx:
                            if self.debug:
                                print(
                                    f"plot for {distributionName} failed:{plotEx}",
                                    file=sys.stderr,
                                )

                    best_distributions.append((distribution, params, sse))

            except Exception as ex:
                # a failing fit only removes this candidate from the result
                if self.debug:
                    trace = traceback.format_exc()
                    msg = f"fit for {distributionName} failed:{ex}\n{trace}"
                    print(msg, file=sys.stderr)

        # NaN compares False against every value, so sorting on a raw sse that
        # may be NaN yields an undefined order and could rank a broken fit
        # first; rank the finite errors and append the remaining entries
        ranked = sorted(
            (fit for fit in best_distributions if np.isfinite(fit[2])),
            key=lambda fit: fit[2],
        )
        unranked = [fit for fit in best_distributions if not np.isfinite(fit[2])]
        return ranked + unranked

    def make_pdf(self, dist, params: list, size=10000):
        """
        Generate the distribution's Probability Distribution Function

        Args:
            dist: Distribution
            params(list): fitted parameters - shape parameters followed by
                loc and scale
            size(int): number of points to evaluate

        Returns:
            pandas.Series: the PDF values indexed by their x positions

        """

        # Separate parts of parameters
        arg = params[:-2]
        loc = params[-2]
        scale = params[-1]

        # Get sane start and end points of distribution: the 1% and 99%
        # quantiles. Unpacking an empty arg is a no-op, so no special case is
        # needed for distributions without shape parameters (a truth test on
        # arg would fail for numpy arrays)
        start = dist.ppf(0.01, *arg, loc=loc, scale=scale)
        end = dist.ppf(0.99, *arg, loc=loc, scale=scale)

        # Build PDF and turn into pandas Series
        x = np.linspace(start, end, size)
        y = dist.pdf(x, *arg, loc=loc, scale=scale)
        pdf = pd.Series(y, x)

        return pdf

    def analyze(
        self,
        title,
        x_label,
        y_label,
        facecolor="b",
        alpha=0.5,
        callback: Callable = None,
        outputFilePrefix=None,
        imageFormat: str = "png",
        allBins: int = 50,
        distBins: int = 200,
        density: bool = True,
    ):
        """

        analyze the Probability Distribution Function

        Creates the "all fitted distributions" figure and, if at least one
        distribution could be fitted, the "best fit" figure. After each figure
        the callback is invoked and - if outputFilePrefix is set - the figure
        is saved and closed.

        Args:
            title(str): the title to use
            x_label(str): the label for the x-axis
            y_label(str): the label for the y-axis

            facecolor(str): the color to use
            alpha(float): the opacity to use

            callback(Callable): a function to be called for the plots;
                it is called as callback(figure, isAll=True/False)

            outputFilePrefix(str): the prefix of the outputFile
            imageFormat(str): imageFormat e.g. png,svg

            allBins(int): the number of bins for all
            distBins(int): the number of bins for the distribution
            density(bool): if True show relative density
        """
        self.allBins = allBins
        self.distBins = distBins
        self.density = density
        self.title = title
        self.x_label = x_label
        self.y_label = y_label
        self.facecolor = facecolor
        self.alpha = alpha
        self.callback = callback
        self.imageFormat = imageFormat
        self.outputFilePrefix = outputFilePrefix
        # reset so that a previous run can not leak its result
        self.best_dist = None
        self.analyzeAll()
        if self.callback:
            self.callback(self.figAll, isAll=True)
        if outputFilePrefix is not None:
            self.saveFig(f"{outputFilePrefix}All.{imageFormat}", imageFormat)
            plt.close(self.figAll)
        if self.best_dist:
            self.analyzeBest()
            if self.callback:
                self.callback(self.figBest, isAll=False)
            if outputFilePrefix is not None:
                self.saveFig(f"{outputFilePrefix}Best.{imageFormat}", imageFormat)
                plt.close(self.figBest)

    def analyzeAll(self):
        """
        analyze the given data

        Plots a histogram of the data, fits all distributions onto the same
        axis and stores the best result in self.best_dist and its PDF in
        self.pdf. Both stay unset if no distribution could be fitted.
        """
        # Plot for comparison
        figTitle = f"{self.title}\n All Fitted Distributions"
        self.figAll = plt.figure(figTitle, figsize=(12, 8))
        ax = self.data.plot(
            kind="hist",
            bins=self.allBins,
            density=self.density,
            alpha=self.alpha,
            facecolor=self.facecolor,
        )

        # Save plot limits
        dataYLim = ax.get_ylim()
        # Update plots
        # NOTE(review): re-applying the histogram's own limits presumably pins
        # the y-range before the PDF curves are added - confirm
        ax.set_ylim(dataYLim)
        ax.set_title(figTitle)
        ax.set_xlabel(self.x_label)
        ax.set_ylabel(self.y_label)

        # Find best fit distribution
        best_distributions = self.best_fit_distribution(
            bins=self.distBins, ax=ax, density=self.density
        )
        if len(best_distributions) > 0:
            self.best_dist = best_distributions[0]
            # Make PDF with best params
            self.pdf = self.make_pdf(self.best_dist[0], self.best_dist[1])

    def analyzeBest(self):
        """
        analyze the best Probability Distribution Function

        Plots self.pdf together with a histogram of the data and puts the name
        and the fitted parameters of self.best_dist into the title.
        """
        # Display
        figLabel = "PDF"
        self.figBest = plt.figure(figLabel, figsize=(12, 8))
        ax = self.pdf.plot(lw=2, label=figLabel, legend=True)
        self.data.plot(
            kind="hist",
            bins=self.allBins,
            density=self.density,
            alpha=self.alpha,
            label="Data",
            legend=True,
            ax=ax,
            facecolor=self.facecolor,
        )

        # shapes is a ", " separated string of shape parameter names
        # (or empty/None); loc and scale always follow
        param_names = (
            (self.best_dist[0].shapes + ", loc, scale").split(", ")
            if self.best_dist[0].shapes
            else ["loc", "scale"]
        )
        param_str = ", ".join(
            ["{}={:0.2f}".format(k, v) for k, v in zip(param_names, self.best_dist[1])]
        )
        dist_str = "{}({})".format(self.best_dist[0].name, param_str)

        ax.set_title(f"{self.title} with best fit distribution \n" + dist_str)
        ax.set_xlabel(self.x_label)
        ax.set_ylabel(self.y_label)

    def saveFig(self, outputFile: str = None, imageFormat="png"):
        """
        save the current Figure to the given outputFile

        Args:
            outputFile(str): the outputFile to save to
            imageFormat(str): the imageFormat to use e.g. png/svg
        """
        plt.savefig(outputFile, format=imageFormat)  # dpi

__init__(data, backend='WebAgg', distributionNames=None, debug=False)

constructor

Parameters:

Name Type Description Default
data(dataFrame)

the data to analyze

required
distributionNames(list)

list of distributionNames to try

required
debug(bool)

if True show debugging information

required
Source code in pdffit/distfit.py
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
def __init__(
    self,
    data,
    backend: str = "WebAgg",
    distributionNames: list = None,
    debug: bool = False,
):
    """
    constructor

    Stores the settings on the instance and applies the matplotlib
    parameters via matplotLibParams.

    Args:
        data(dataFrame): the data to analyze
        backend(str): the matplotlib backend name to keep in self.backend
        distributionNames(list): list of distributionNames to try; if None
            every name in _distn_names except "levy_stable" and
            "studentized_range" is used
        debug(bool): if True show debugging information
    """
    self.debug = debug
    self.backend = backend
    # matplotLibParams is called once the backend has been stored
    self.matplotLibParams()
    if distributionNames is not None:
        self.distributionNames = distributionNames
    else:
        skipped = ["levy_stable", "studentized_range"]
        self.distributionNames = [
            name for name in _distn_names if name not in skipped
        ]
    self.data = data

analyze(title, x_label, y_label, facecolor='b', alpha=0.5, callback=None, outputFilePrefix=None, imageFormat='png', allBins=50, distBins=200, density=True)

analyze the Probability Distribution Function

Parameters:

Name Type Description Default
data

pandas DataFrame or numpy array

required
title(str)

the title to use

required
x_label(str)

the label for the x-axis

required
y_label(str)

the label for the y-axis

required
facecolor(str)

the color to use

required
alpha(float)

the opacity to use

required
callback(Callable)

a function to be called for the plots

required
outputFilePrefix(str)

the prefix of the outputFile

required
imageFormat(str)

imageFormat e.g. png,svg

required
allBins(int)

the number of bins for all

required
distBins(int)

the number of bins for the distribution

required
density(bool)

if True show relative density

required
Source code in pdffit/distfit.py
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
def analyze(
    self,
    title,
    x_label,
    y_label,
    facecolor="b",
    alpha=0.5,
    callback: Callable = None,
    outputFilePrefix=None,
    imageFormat: str = "png",
    allBins: int = 50,
    distBins: int = 200,
    density: bool = True,
):
    """

    analyze the Probability Distribution Function

    Runs analyzeAll and - if it found a best distribution - analyzeBest.
    After each step the callback is invoked with the figure and, when an
    outputFilePrefix is given, the figure is saved and closed.

    Args:
        title(str): the title to use
        x_label(str): the label for the x-axis
        y_label(str): the label for the y-axis

        facecolor(str): the color to use
        alpha(float): the opacity to use

        callback(Callable): a function to be called for the plots;
            it is called as callback(figure, isAll=True/False)

        outputFilePrefix(str): the prefix of the outputFile
        imageFormat(str): imageFormat e.g. png,svg

        allBins(int): the number of bins for all
        distBins(int): the number of bins for the distribution
        density(bool): if True show relative density
    """
    # labels and look of the plots
    self.title, self.x_label, self.y_label = title, x_label, y_label
    self.facecolor, self.alpha = facecolor, alpha
    # histogram settings
    self.allBins, self.distBins, self.density = allBins, distBins, density
    # output settings
    self.callback = callback
    self.imageFormat = imageFormat
    self.outputFilePrefix = outputFilePrefix
    # analyzeAll only sets best_dist when a fit succeeded
    self.best_dist = None
    saveRequested = outputFilePrefix is not None

    self.analyzeAll()
    if self.callback:
        self.callback(self.figAll, isAll=True)
    if saveRequested:
        self.saveFig(f"{outputFilePrefix}All.{imageFormat}", imageFormat)
        plt.close(self.figAll)

    if not self.best_dist:
        return

    self.analyzeBest()
    if self.callback:
        self.callback(self.figBest, isAll=False)
    if saveRequested:
        self.saveFig(f"{outputFilePrefix}Best.{imageFormat}", imageFormat)
        plt.close(self.figBest)

analyzeAll()

analyze the given data

Source code in pdffit/distfit.py
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
def analyzeAll(self):
    """
    analyze the given data

    Draws a histogram of self.data on a new figure (self.figAll), lets
    best_fit_distribution fit the candidates onto the same axis and keeps the
    first entry of its result in self.best_dist with its PDF in self.pdf.
    Nothing is stored if the result list is empty.
    """
    # Plot for comparison
    figTitle = f"{self.title}\n All Fitted Distributions"
    self.figAll = plt.figure(figTitle, figsize=(12, 8))
    histogramOptions = {
        "kind": "hist",
        "bins": self.allBins,
        "density": self.density,
        "alpha": self.alpha,
        "facecolor": self.facecolor,
    }
    ax = self.data.plot(**histogramOptions)

    # re-apply the limits the histogram produced
    # NOTE(review): presumably this keeps the PDF curves added below from
    # rescaling the y-axis - confirm
    ax.set_ylim(ax.get_ylim())
    ax.set_title(figTitle)
    ax.set_xlabel(self.x_label)
    ax.set_ylabel(self.y_label)

    # Find best fit distribution
    ranking = self.best_fit_distribution(
        bins=self.distBins, ax=ax, density=self.density
    )
    if len(ranking) == 0:
        return
    self.best_dist = ranking[0]
    # Make PDF with best params
    bestDistribution, bestParams = self.best_dist[0], self.best_dist[1]
    self.pdf = self.make_pdf(bestDistribution, bestParams)

analyzeBest()

analyze the best Probability Distribution Function

Source code in pdffit/distfit.py
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
def analyzeBest(self):
    """
    analyze the best Probability Distribution Function

    Plots self.pdf and a histogram of self.data on a figure labelled "PDF"
    (self.figBest); the title shows the name of the distribution in
    self.best_dist together with its parameters formatted to two decimals.
    """
    # Display
    figLabel = "PDF"
    self.figBest = plt.figure(figLabel, figsize=(12, 8))
    ax = self.pdf.plot(lw=2, label=figLabel, legend=True)
    histogramOptions = {
        "kind": "hist",
        "bins": self.allBins,
        "density": self.density,
        "alpha": self.alpha,
        "label": "Data",
        "legend": True,
        "ax": ax,
        "facecolor": self.facecolor,
    }
    self.data.plot(**histogramOptions)

    # the shape parameter names (if any) are followed by loc and scale
    if self.best_dist[0].shapes:
        param_names = (self.best_dist[0].shapes + ", loc, scale").split(", ")
    else:
        param_names = ["loc", "scale"]
    formatted = []
    for k, v in zip(param_names, self.best_dist[1]):
        formatted.append(f"{k}={v:0.2f}")
    param_str = ", ".join(formatted)
    dist_str = f"{self.best_dist[0].name}({param_str})"

    ax.set_title(f"{self.title} with best fit distribution \n" + dist_str)
    ax.set_xlabel(self.x_label)
    ax.set_ylabel(self.y_label)

best_fit_distribution(bins=200, ax=None, density=True)

Model data by finding best fit distribution to data

Source code in pdffit/distfit.py
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
def best_fit_distribution(self, bins: int = 200, ax=None, density: bool = True):
    """
    Model data by finding best fit distribution to data

    Each name in self.distributionNames is looked up in scipy.stats, fitted
    to self.data and scored with the sum of squared errors (sse) between the
    fitted PDF and the histogram of the data. Distributions whose fit raises
    are left out of the result.

    Args:
        bins(int): number of histogram bins used for the error calculation
        ax: optional axis; if given each fitted PDF is plotted onto it
        density(bool): passed on to np.histogram as its density argument

    Returns:
        list: (distribution, params, sse) tuples in ascending order of sse;
            entries with a non-finite sse (NaN/inf) are placed at the end
    """
    # Get histogram of original data
    y, x = np.histogram(self.data, bins=bins, density=density)
    # turn the bin edges into bin centers
    x = (x + np.roll(x, -1))[:-1] / 2.0

    # Best holders
    best_distributions = []
    distributionCount = len(self.distributionNames)
    # Estimate distribution parameters from data
    for ii, distributionName in enumerate(self.distributionNames):

        print(f"{ii+1:>3} / {distributionCount:<3}: {distributionName}")

        distribution = getattr(st, distributionName)

        # Try to fit the distribution
        try:
            # Ignore warnings from data that can't be fit
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore")

                # fit dist to data
                params = distribution.fit(self.data)

                # Separate parts of parameters:
                # shape parameters first, then loc and scale
                arg = params[:-2]
                loc = params[-2]
                scale = params[-1]

                # Calculate fitted PDF and error with fit in distribution
                pdf = distribution.pdf(x, *arg, loc=loc, scale=scale)
                sse = np.sum(np.power(y - pdf, 2.0))

                # if an axis was passed in add the curve to the plot;
                # plotting is best effort and must not discard the fit
                if ax is not None:
                    try:
                        pd.Series(pdf, x).plot(ax=ax)
                    except Exception as plotEx:
                        if self.debug:
                            print(
                                f"plot for {distributionName} failed:{plotEx}",
                                file=sys.stderr,
                            )

                best_distributions.append((distribution, params, sse))

        except Exception as ex:
            # a failing fit only removes this candidate from the result
            if self.debug:
                trace = traceback.format_exc()
                msg = f"fit for {distributionName} failed:{ex}\n{trace}"
                print(msg, file=sys.stderr)

    # NaN compares False against every value, so sorting on a raw sse that
    # may be NaN yields an undefined order and could rank a broken fit first;
    # rank the finite errors and append the remaining entries
    ranked = sorted(
        (fit for fit in best_distributions if np.isfinite(fit[2])),
        key=lambda fit: fit[2],
    )
    unranked = [fit for fit in best_distributions if not np.isfinite(fit[2])]
    return ranked + unranked

make_pdf(dist, params, size=10000)

Generate the distribution's Probability Distribution Function

Parameters:

Name Type Description Default
dist

Distribution

required
params(list)

parameter

required
size(int)

size

required

Returns:

Name Type Description
dataframe

Probability Distribution Function

Source code in pdffit/distfit.py
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
def make_pdf(self, dist, params: list, size=10000):
    """
    Generate the distribution's Probability Distribution Function

    The PDF is evaluated at size evenly spaced points between the 1% and the
    99% quantile of the distribution.

    Args:
        dist: Distribution
        params(list): fitted parameters - shape parameters followed by
            loc and scale
        size(int): number of points to evaluate

    Returns:
        pandas.Series: the PDF values indexed by their x positions

    """

    # Separate parts of parameters
    arg = params[:-2]
    loc = params[-2]
    scale = params[-1]

    # Get sane start and end points of distribution. Unpacking an empty arg
    # is a no-op, so no special case is needed for distributions without
    # shape parameters (a truth test on arg would fail for numpy arrays)
    start = dist.ppf(0.01, *arg, loc=loc, scale=scale)
    end = dist.ppf(0.99, *arg, loc=loc, scale=scale)

    # Build PDF and turn into pandas Series
    x = np.linspace(start, end, size)
    y = dist.pdf(x, *arg, loc=loc, scale=scale)
    pdf = pd.Series(y, x)

    return pdf

matplotLibParams()

set matplotlib parameters

Source code in pdffit/distfit.py
63
64
65
66
67
68
69
def matplotLibParams(self):
    """
    set matplotlib parameters

    Sets the default figure size to 16 x 12, selects the "ggplot" style and
    switches to the backend stored in self.backend.
    """
    figureDefaults = {"figure.figsize": (16.0, 12.0)}
    matplotlib.rcParams.update(figureDefaults)
    matplotlib.style.use("ggplot")
    matplotlib.use(self.backend)

saveFig(outputFile=None, imageFormat='png')

save the current Figure to the given outputFile

Parameters:

Name Type Description Default
outputFile(str)

the outputFile to save to

required
imageFormat(str)

the imageFormat to use e.g. png/svg

required
Source code in pdffit/distfit.py
293
294
295
296
297
298
299
300
301
def saveFig(self, outputFile: str = None, imageFormat="png"):
    """
    save the current Figure to the given outputFile

    The figure is written with plt.savefig.

    Args:
        outputFile(str): the outputFile to save to
        imageFormat(str): the imageFormat to use e.g. png/svg
    """
    saveOptions = {"format": imageFormat}  # dpi
    plt.savefig(outputFile, **saveOptions)

version

Created on 2022-05-18

@author: wf

Version

Bases: object

Version handling for pyProbabilityDistributionFit

Source code in pdffit/version.py
 8
 9
10
11
12
13
14
15
16
17
class Version:
    """
    Version handling for pyProbabilityDistributionFit

    Plain container for the package metadata; all values are class
    attributes.
    """

    # name and one-line description of the package
    name = "pyProbabilityDistributionFit"
    description = "Find the best Probability Distribution Function for the given data"

    # version number
    version = "0.0.5"
    # NOTE(review): presumably the creation date and the date of the last
    # update - confirm
    date = "2022-05-18"
    updated = "2022-07-05"