/* ------------------------------------------------------------------------
Author: W. Oevel, 7.3.2005
First shipped with MuPAD 3.5
Calls:
stats::frequency(data, )
stats::frequency(data, n, )
stats::frequency(data, [n], )
stats::frequency(data, [a1..b1, a2..b2, ..], )
stats::frequency(data, [[a1, b1], [a2, b2], ..], )
stats::frequency(data, Classes = n, )
stats::frequency(data, Classes = [n], )
stats::frequency(data, Classes = [a1..b1, a2..b2, ..], )
stats::frequency(data, Classes = [[a1, b1], [a2, b2], ..], )
stats::frequency(data, Cells = n, )
stats::frequency(data, Cells = [n], )
stats::frequency(data, Cells = [a1..b1, a2..b2, ..], )
stats::frequency(data, Cells = [[a1, b1], [a2, b2], ..], )
Parameter:
data - a list, a set, a table, a matrix or an array
of numerical real data (may be exact expressions
such as PI, sin(2), 3 + sqrt(5) etc).
n - the number of cells (classes): a positive integer.
create n cells of equal length covering the range
from min(data) to max(data). For technical reasons,
however, the left most cell extends to -infinity.
If no cell specification is given, n = 10 is
used by default.
a1, b1, a2, b2, ...
- cell (class) boundaries: real numerical values (may
be exact numerical values such as PI, sqrt(2) etc.).
Must satisfy
a1 <= b1 <= a2 <= b2 <= a3 <= ....
The cell (class) given by a.i, b.i is the semi-open
interval (a.i, b.i]. With CellsClosedLeft, the cells
with boundaries a.i, b.i represend the semi-open interval
[a.i, b.i)
Data outside the union of these cells are ignored
(not tallied into the classes).
Options:
CellsClosed = Left/Right :
The class a.i..b.i is interpreted as [a.i, b.i) with
CellsClosed = Left and as (a.i, b.i] with
CellsClosed = Right
The default is CellsClosed = Right.
ClassesClosed = Left/Right is identical with CellsClosed = Left/Right
Return Value:
table(
1 = [[a1, b1], NumberOfDataElementsInThisClass, ListOfDataElementsInThisClass],
2 = [[a2, b2], NumberOfDataElementsInThisClass, ListOfDataElementsInThisClass],
3 = [[a3, b3], NumberOfDataElementsInThisClass, ListOfDataElementsInThisClass],
):
------------------------------------------------------------------------ */
unprotect(stats):
stats::frequency:= proc(data, cells)
local numericaldata, minval, maxval, i, closed_left,
fcells, classdata, bucketsort1, bucketsort2;
begin
//-----------------------------
// check the data
//-----------------------------
if args(0) < 1 or
( domtype(data) <> DOM_LIST and
domtype(data) <> DOM_SET and
domtype(data) <> DOM_TABLE and
domtype(data) <> DOM_ARRAY and
domtype(data) <> stats::sample and
not data::dom::hasProp(Cat::Matrix)
)then
error("expecting numerical data as a list, a set, a table, ".
"an array, a matrix, or a stats::sample"):
end_if;
case domtype(data)
of DOM_LIST do break;
of DOM_SET do
of DOM_ARRAY do
data:= [op(data)];
break;
of DOM_TABLE do
data:= map([op(data)], op, 2);
break;
of stats::sample do
data:= map(data::dom::convert_to(data, DOM_LIST), op);
break;
otherwise // data = matrix
data:= [op(data)];
end_case;
if {op(map(data, domtype))} subset {DOM_INT, DOM_RAT, DOM_FLOAT} then
numericaldata:= TRUE;
else // there are expressions such as sqrt(2), PI, sin(5) etc.
numericaldata:= FALSE;
//-----------------------------------------------------
// check that all data can be converted to real floats:
//-----------------------------------------------------
if {op(map(data, domtype@float))} <> {DOM_FLOAT} then
error("cannot convert all data elements to real floats"):
end_if;
end_if;
//--------------------------------
// check for the option CellsClosed = Left
//--------------------------------
closed_left:= FALSE;
if has([args(2..args(0))], CellsClosed = Left) or
has([args(2..args(0))], ClassesClosed = Left) then
closed_left:= TRUE;
end_if;
if has([args(2..args(0))], CellsClosed = Right) or
has([args(2..args(0))], ClassesClosed = Right) then
closed_left:= FALSE;
end_if;
//-----------------------------
// check the cell specification
//-----------------------------
if args(0) = 1 then
cells:= 10: // default
end_if;
if type(cells) = "_equal" then
if lhs(cells) <> Cells and lhs(cells) <> Classes then
error("2nd argument: expecting an equation 'Classes = integer' ".
"or 'Classes = [a1..b1, a2..b2, ...]' ".
"or 'Classes = [[a1, b1], [a2, b2], ..]'");
end_if;
cells:= rhs(cells);
end_if;
case domtype(cells)
of DOM_INT do
if cells <= 0 then
error("2nd argument: expecting a positive number of classes");
end_if;
break;
of DOM_LIST do
if nops(cells) = 0 then
error("expecting a specification of the classes. Got an empty list.");
end_if;
if nops(cells) = 1 and domtype(cells[1]) = DOM_INT then
cells:= cells[1];
if cells <= 0 then
error("2nd argument: expecting a positive number of classes");
end_if;
break;
end_if;
// replace +- infinity by RD_INF/RD_NINF for speed:
cells:= subs(cells, [-infinity = RD_NINF, infinity = RD_INF]):
for i from 1 to nops(cells) do
if type(cells[i]) = "_range" then
cells[i]:= [op(cells[i])];
end_if;
if type(cells[i]) <> DOM_LIST or
nops(cells[i]) <> 2 then
error("2nd argument: wrong specification of class ".expr2text(i).
"; expecting a range a..b or a list [a, b]");
end_if;
if domtype(float(cells[i][1])) <> DOM_FLOAT and
cells[i][1] <> -infinity then
error("2nd argument: wrong specification of class ".expr2text(i).
"; the left border ".expr2text(cells[i][1]).
" is not a real numerical value or -infinity");
end_if;
if type(float(cells[i][2])) <> DOM_FLOAT and
cells[i][1] <> -infinity then
error("2nd argument: wrong specification of class ".expr2text(i).
"; the right border ".expr2text(cells[i][1]).
" is not a real numerical value or infinity");
end_if;
end_for:
end_case;
//-------------------------------------------------------
// automatic generation of equidistant cells
//-------------------------------------------------------
if domtype(cells) = DOM_INT then
// a value is tallied into cell[i] by bucketsort if
// cell[i][1] < value <= cell[i][2].
// Unfortunately, this does not count the leftmost value.
// Hence, decrease the minimal value a tiny bit to the left.
minval := min(data);
maxval:= max(data);
if closed_left then
cells:= [((cells-i)/cells*minval+i/cells*maxval)$i=0..cells-1, RD_INF];
else
cells:= [RD_NINF, ((cells-i)/cells*minval+i/cells*maxval)$i=1..cells];
end_if;
cells:= zip(cells, cells[2..-1], DOM_LIST);
fcells:= float(cells);
else // check the cells
fcells:= float(cells):
if {op(map(fcells, cell -> (domtype(cell[1]), domtype(cell[2]))))} <> {DOM_FLOAT}
then
error("cannot convert all class boundaries to real floats"):
end_if;
end_if;
// check ordering of cells
for i from 1 to nops(fcells) do
if fcells[i][1] > fcells[i][2] then
error("the left boundary of class ".expr2text(i).
" is bigger than the right boundary"):
end_if;
end_for;
for i from 1 to nops(fcells) - 1 do
if fcells[i][2] > fcells[i + 1][1] then
error("the right boundary of class ".expr2text(i).
" is bigger than the left boundary of the next class");
end_if;
end_for:
//-------------------------------------------------------
// tally data into cells
//-------------------------------------------------------
// bucketsort1 is faster than bucketsort2 for small nops(cells)
if closed_left then
bucketsort1:= proc(data, buckets)
local i, dummy;
begin
for i from 1 to nops(buckets) do
if (i = 1 and buckets[i][1] <> RD_NINF) or
(i > 1 and buckets[i - 1][2] <= buckets[i][1]) then
// throw away all data below the current bucket
[dummy, data, dummy]:= split(data, _less, buckets[i][1]);
end_if;
[classdata[i], data, dummy]:= split(data, _less, buckets[i][2]);
end_for:
end_proc:
else
bucketsort1:= proc(data, buckets)
local i, dummy;
begin
for i from 1 to nops(buckets) do
if (i = 1 and buckets[i][1] <> RD_NINF) or
(i > 1 and buckets[i - 1][2] < buckets[i][1]) then
// throw away all data below the current bucket
[dummy, data, dummy]:= split(data, _leequal, buckets[i][1]);
end_if;
[classdata[i], data, dummy]:= split(data, _leequal, buckets[i][2]);
end_for:
end_proc:
end_if;
// bucketsort2 is faster than bucketsort1 for large nops(cells)
if closed_left then
bucketsort2:= proc(data, locellindex, hicellindex)
local midcellindex, midvalue, lodata, dummy;
begin
if nops(data) = 0 then
return()
end_if;
if locellindex = hicellindex then
data:= select(data, _less, op(fcells[locellindex], 2));
classdata[hicellindex]:=
select(data, _not@_less, op(fcells[locellindex], 1));
return():
end_if;
midcellindex:= (locellindex + hicellindex) div 2;
midvalue:= op(cells[midcellindex], 2):
[lodata, data, dummy]:= split(data, _less, midvalue);
bucketsort2(lodata, locellindex, midcellindex);
bucketsort2(data, midcellindex+1, hicellindex);
end_proc:
else
bucketsort2:= proc(data, locellindex, hicellindex)
local midcellindex, midvalue, lodata, dummy;
begin
if nops(data) = 0 then
return()
end_if;
if locellindex = hicellindex then
data:= select(data, _leequal, op(fcells[locellindex], 2));
classdata[hicellindex]:=
select(data, _not@_leequal, op(fcells[locellindex], 1));
return():
end_if;
midcellindex:= (locellindex + hicellindex) div 2;
midvalue:= op(cells[midcellindex], 2):
[lodata, data, dummy]:= split(data, _leequal, midvalue);
bucketsort2(lodata, locellindex, midcellindex);
bucketsort2(data, midcellindex+1, hicellindex);
end_proc:
end_if;
//-------------------------------------------------------
// do the bucketsort:
//-------------------------------------------------------
classdata:= [[] $ nops(cells)]; // initialize empty buckets
if numericaldata then
if nops(cells) < 16 then
bucketsort1(data, fcells); // sort into buckets
else
bucketsort2(data, 1, nops(cells)); // sort into buckets
end_if;
else // exaxt data
if nops(cells) < 1000 then
bucketsort1(data, fcells); // sort into buckets
else
bucketsort2(data, 1, nops(cells)); // sort into buckets
end_if;
end_if;
if numericaldata then
classdata:= map(classdata, sort);
else classdata:= map(classdata, prog::sort, float):
end_if;
//-------------------------------------------------------
// return a table
//-------------------------------------------------------
cells:= subs(cells, [RD_NINF = -infinity, RD_INF = infinity]):
table((i = [cells[i], nops(classdata[i]), classdata[i]])
$ i = 1..nops(cells));
end_proc: