Proposed modification to boxplot()

_Sajec_Mike_TQO · February 15, 2006, 1:30am

I would like to propose modifying the boxplot function as follows:

Instead of only accepting an (m x n) matrix x and creating n boxplots
from the columns of x, boxplot() would optionally accept, in place of
the matrix, a list of numeric arrays of any length and create one
boxplot for each array in the list.

The obvious benefit in doing this is that one can easily create boxplots
to compare datasets with differing numbers of data-points.

For example:

from RandomArray import normal
x1 = normal(10,3,100)
x2 = normal(10,5,150)
x3 = normal(10,1,1000)

boxplot([x1, x2, x3])

The code below implements the proposed change by modifying the boxplot
function in axes.py. It works, and serves as a proof of concept if
nothing else.

···

##--------------------------------------------------##
## modifications to the boxplot function in axes.py ##
##--------------------------------------------------##

    def boxplot(self, x, notch=0, sym='b+', vert=1, whis=1.5,
                positions=None, widths=None):
        """
        boxplot(x, notch=0, sym='b+', vert=1, whis=1.5,
                positions=None, widths=None)

        Make a box and whisker plot for each column of x, or for each
        array in the sequence x.

        The box extends from the lower to upper quartile values
        of the data, with a line at the median.  The whiskers
        extend from the box to show the range of the data.  Flier
        points are those past the end of the whiskers.

        x is either:
          (1) an array: a 1-dimensional array gives a single box, a
              2-dimensional array gives one box per column
          (2) a list or tuple of arrays, which may have different
              lengths; one box is drawn per element.  An element that
              is not 1-dimensional contributes its first column only.
        The arrays passed in are never reshaped or otherwise modified.

        notch = 0 (default) produces a rectangular box plot.
        notch = 1 will produce a notched box plot

        sym (default 'b+') is the default symbol for flier points.
        Enter an empty string ('') if you don't want to show fliers.

        vert = 1 (default) makes the boxes vertical.
        vert = 0 makes horizontal boxes.  This seems goofy, but
        that's how Matlab did it.

        whis (default 1.5) defines the length of the whiskers as
        a function of the inner quartile range.  They extend to the
        most extreme data point within ( whis*(75%-25%) ) data range.

        positions (default 1,2,...,n) sets the horizontal positions of
        the boxes.  The ticks and limits are automatically set to match
        the positions.

        widths is either a scalar or a vector and sets the width of
        each box.  The default is 0.5, or 0.15*(distance between extreme
        positions) if that is smaller.

        Returns a dictionary with the keys 'whiskers', 'caps', 'boxes',
        'medians' and 'fliers'; each value is the list of objects that
        self.plot returned for that part of the plot.

        Raises ValueError if x is an array that is not 1- or
        2-dimensional, or if x holds no datasets at all.
        """
        # Normalise x into a list of 1-D datasets.  Slicing instead of
        # assigning to .shape keeps the caller's arrays untouched.
        if isinstance(x, (list, tuple)):
            datasets = []
            for item in x:
                item = asarray(item)
                if len(item.shape) != 1:
                    # e.g. an (m, 1) column vector: use its first column
                    item = item[:, 0]
                datasets.append(item)
        else:
            x = asarray(x)
            rank = len(x.shape)
            if rank == 1:
                datasets = [x]
            elif rank == 2:
                datasets = [x[:, j] for j in range(x.shape[1])]
            else:
                raise ValueError(
                    'boxplot: x must be a 1- or 2-dimensional array '
                    'or a sequence of arrays')

        col = len(datasets)
        if col == 0:
            raise ValueError('boxplot: x contains no datasets')

        # get some plot info
        if positions is None:
            positions = list(range(1, col + 1))
        if widths is None:
            distance = max(positions) - min(positions)
            widths = min(0.15 * max(distance, 1.0), 0.5)
        if isinstance(widths, (int, float)):
            widths = ones((col,), 'd') * widths

        # vertical or horizontal plot?  Arguments arrive as
        # (x, y, format) triples; the horizontal variant swaps x and y
        # within each triple before plotting.
        if vert:
            def doplot(*args):
                return self.plot(*args)
        else:
            def doplot(*args):
                shuffled = []
                for k in range(0, len(args), 3):
                    shuffled.extend([args[k + 1], args[k], args[k + 2]])
                return self.plot(*shuffled)

        holdStatus = self._hold
        if not holdStatus:
            self.cla()

        whiskers, caps, boxes, medians, fliers = [], [], [], [], []

        # loop through the datasets, adding each to the plot; hold is
        # forced on so successive plot calls accumulate, and is always
        # restored afterwards, even if plotting fails part way through
        self.hold(True)
        try:
            for i, pos in enumerate(positions):
                d = datasets[i]
                row = len(d)

                # get median and quartiles
                q1, med, q3 = prctile(d, [25, 50, 75])
                iq = q3 - q1

                # get high extreme
                hi_val = q3 + whis * iq
                wisk_hi = compress(d <= hi_val, d)
                if len(wisk_hi) == 0:
                    wisk_hi = q3
                else:
                    wisk_hi = max(wisk_hi)

                # get low extreme
                lo_val = q1 - whis * iq
                wisk_lo = compress(d >= lo_val, d)
                if len(wisk_lo) == 0:
                    wisk_lo = q1
                else:
                    wisk_lo = min(wisk_lo)

                # get fliers - if we are showing them
                flier_hi = []
                flier_lo = []
                flier_hi_x = []
                flier_lo_x = []
                if sym:
                    flier_hi = compress(d > wisk_hi, d)
                    flier_lo = compress(d < wisk_lo, d)
                    flier_hi_x = ones(flier_hi.shape[0]) * pos
                    flier_lo_x = ones(flier_lo.shape[0]) * pos

                # get x locations for fliers, whisker, whisker cap
                # and box sides
                box_x_min = pos - widths[i] * 0.5
                box_x_max = pos + widths[i] * 0.5

                wisk_x = ones(2) * pos

                cap_x_min = pos - widths[i] * 0.25
                cap_x_max = pos + widths[i] * 0.25
                cap_x = [cap_x_min, cap_x_max]

                # get y location for median
                med_y = [med, med]

                if notch == 0:
                    # 'regular' plot: a closed rectangle from q1 to q3
                    box_x = [box_x_min, box_x_max, box_x_max,
                             box_x_min, box_x_min]
                    box_y = [q1, q1, q3, q3, q1]
                    med_x = [box_x_min, box_x_max]
                else:
                    # 'notch' plot: the notch spans
                    # med +/- 1.57*iq/sqrt(row), clipped to the box
                    notch_max = med + 1.57 * iq / sqrt(row)
                    notch_min = med - 1.57 * iq / sqrt(row)
                    if notch_max > q3:
                        notch_max = q3
                    if notch_min < q1:
                        notch_min = q1
                    box_x = [box_x_min, box_x_max, box_x_max,
                             cap_x_max, box_x_max, box_x_max,
                             box_x_min, box_x_min, cap_x_min,
                             box_x_min, box_x_min]
                    box_y = [q1, q1, notch_min, med, notch_max, q3,
                             q3, notch_max, med, notch_min, q1]
                    # the median line only spans the narrow waist
                    med_x = [cap_x_min, cap_x_max]

                whiskers.extend(doplot(wisk_x, [q1, wisk_lo], 'b--',
                                       wisk_x, [q3, wisk_hi], 'b--'))
                caps.extend(doplot(cap_x, [wisk_hi, wisk_hi], 'k-',
                                   cap_x, [wisk_lo, wisk_lo], 'k-'))
                boxes.extend(doplot(box_x, box_y, 'b-'))
                medians.extend(doplot(med_x, med_y, 'r-'))
                fliers.extend(doplot(flier_hi_x, flier_hi, sym,
                                     flier_lo_x, flier_lo, sym))

            # fix our axes/ticks up a little; uses the same truth test
            # on vert as doplot so the two cannot disagree
            if vert:
                setticks, setlim = self.set_xticks, self.set_xlim
            else:
                setticks, setlim = self.set_yticks, self.set_ylim

            newlimits = min(positions) - 0.5, max(positions) + 0.5
            setlim(newlimits)
            setticks(positions)
        finally:
            # reset hold status
            self.hold(holdStatus)

        return dict(whiskers=whiskers, caps=caps, boxes=boxes,
                    medians=medians, fliers=fliers)