Contents

function [FinalError PeakList ANorm BNorm] = NBC (ClassA, ClassB, xValReps,...
OutlierRemoval, RemovePeaks, PeaksToLeave, Normalize)
% (c) Karl Kuschner, College of William and Mary, Dept. of Physics, 2009.
%
% NBC (a wrapper) finds a feature set through forward or backward selection
%
% DESCRIPTION
%       NBC implements a Naive Bayesian Classifier to select features.
%       Given two groups of cases from two different classes, it sets up a
%       cross validation scheme and then begins to remove (or add)
%       features, looking at the ability of each feature's removal to
%       decrease error to determine if that removal should be permanent.
%       You then have the choice of removing highly misclassified samples
%       (if OutlierRemoval~=0). The integer flag RemovePeaks controls
%       whether features are removed one at a time to leave the (integer)
%       PeaksToLeave most important ones (RemovePeaks=1). Alternately, if
%       RemovePeaks=-1, features are added. Setting the flag to zero skips
%       this step.
%
%       The function returns the list of peaks that were left after feature
%       removal (or found during feature addition) as well as the
%       corresponding error rates at each step.  These error rates are from
%       nfold cross validation, repeated xValReps (integer) times. Other
%       options are listed immediately below.
%
% USAGE
%       [FinalError PeakList ANorm BNorm] = ...
%           NBC(A, B, reps, outlier, remove, leave, norm);
%
% INPUTS
%       A, B: Data is stored in two arrays of continuous data, separated by
%           class, with the cases arranged in the rows and the features, or
%           variables, in columns. A is typically the normal group and B
%           the disease group. The (i,j) value is "intensity of Peak j for
%           Case i."
%       reps: times to repeat the whole process
%       outlier: true if outlier removal is desired, otherwise false.
%       remove: 1 for backward selection, -1 for forward selection
%       leave: number of final features desired
%       norm: Logical, whether to perform total ion normalization
%
% INTERNAL SETTINGS
%       Number of Bins: for discretization, number of bins to build
%           probability distribution. Choices are 2, 4, or 6 for discrete; 0
%           for continuous (no bins). 2 works best for highly separated
%           features, 6 or 0 for overlapping groups.
%       n: For n-fold cross-validation
%       Threshold: The probability threshold for declaring the class, e.g.
%           0.5 means "if P(Class)>0.5, Class=disease"
%
% OUTPUTS
%       FinalError: List of the error rate after each removal/addition
%       PeakList: List of which of the "leave" peaks remained
%       ANorm, BNorm: Data with total ion normalization applied
%
% CALLED FUNCTIONS
%       CombineGroups: Creates a population from the two inputs
%       CrossValidate: Divides the data into test and training groups
%       PerformBayes: Performs the actual NBC analysis to find P(C|Data)
%       CountCorrect: Finds which cases were classified correctly
%       RemoveOutliers: If desired, removes cases that appear as outliers
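%
% EXAMPLE
%       A hypothetical call (data names and sizes are illustrative only):
%       classify 100 normal vs. 80 disease cases, repeat the cross
%       validation 5 times, skip outlier removal, run backward selection
%       down to 10 peaks, and apply total ion normalization:
%
%           [Err, Peaks, ANorm, BNorm] = NBC(NormalData, DiseaseData, ...
%               5, false, 1, 10, true);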

Options Section

% See explanation at top
NumberOfBins = 6;
n = 10;
Threshold = .5;

Set up Section

start = clock;
% Check the sizes of the arrays
[NumA ColsA] = size(ClassA);
[NumB ColsB] = size(ClassB);
if ColsA ~= ColsB
    error('Input arrays must have the same number of columns (features).');
end
Cols = ColsA;
clear ColsA ColsB
% Set a bound on feature removal
PeaksToRemove = max ([0 Cols-max([2 PeaksToLeave])] );
Population = CombineGroups(ClassA,ClassB); % see function at end

Normalize (Optional)

%Normalize the sum of each row of data to the average of the population
if Normalize
    TotIonCt=mean(sum(Population,2)); %Population mean total per row
    factA=sum(ClassA,2)/TotIonCt;
    factB=sum(ClassB,2)/TotIonCt;
    for i=1:NumA
        ClassA(i,:)=ClassA(i,:)/factA(i);
    end
    for i=1:NumB
        ClassB(i,:)=ClassB(i,:)/factB(i);
    end
end
ANorm=ClassA;
BNorm=ClassB;
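
% When Normalize is true, each row of ANorm and BNorm has been rescaled so
% that its total equals the population mean total TotIonCt; e.g. a case
% whose raw intensities sum to twice that mean is scaled by one half.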

Initial Classification Section

Find and display Population Classification

Population = CombineGroups(ClassA,ClassB); % see function at end
PopProbInClassB = PerformBayes (Population, ClassA, ClassB, NumberOfBins);
PopCorrectness = PopProbInClassB; clear PopProbInClassB;
PopCorrectness (1:NumA) = 1 - PopCorrectness (1:NumA);
NominalErrorRate = CountCorrect(PopCorrectness,Threshold);
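% Note: NominalErrorRate is a resubstitution error (the population is
% classified with probability tables built from that same population), so
% it is optimistic compared to the cross-validated rates computed later.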
figure();
hist(PopCorrectness*100,100); figure(gcf); axis ([0 50 0 NumA/2]);
title({'Worst Classified Patients';'All Tokens';...
    ['Nominal Error Rate =',num2str(NominalErrorRate*100),'%']});
xlabel ('Percent Correct');
ylabel('Number of Patients');

% Optionally remove cases that are badly misclassified
if OutlierRemoval
    prompt = {'Threshold (in percent) for removing misclassified patients'};
    dlg_title = 'Remove Outliers';
    num_lines = 1;
    default_answer = {'0'};
    OutlierRemovalThreshhold = str2num(cell2mat...
(inputdlg(prompt,dlg_title,num_lines,default_answer)))/100;
    clear prompt dlg_title num_lines default_answer;
    NumOut=NumA+NumB;        %Placeholder for number removed
    [ClassA ClassB NumA NumB] = RemoveOutliers (ClassA, ClassB, NumA,...
        NumB, PopCorrectness, OutlierRemovalThreshhold);
    axis ([0 100 0 NumA+NumB/2]);
    title({['Original Patients, All Tokens, ',num2str(NumberOfBins),...
        ' Bins'];['Nominal Error Rate =',...
        num2str(NominalErrorRate*100),'%']});

    % Redo Nominal Error Rate
    Population = CombineGroups(ClassA,ClassB); %see function at end
    PopProbInClassB =...
        PerformBayes (Population, ClassA, ClassB, NumberOfBins);
    PopCorrectness = PopProbInClassB; clear PopProbInClassB;
    PopCorrectness (1:NumA) = 1 - PopCorrectness (1:NumA);
    NominalErrorRate = CountCorrect(PopCorrectness,Threshold);
    xValErrorRate =...
        CrossValidate (ClassA, ClassB, n, NumberOfBins, xValReps);

    NumOut=NumOut-(NumA+NumB);
    figure(); hist(PopCorrectness*100,100);  axis ([0 100 0 NumA+NumB]);
    title({[num2str(NumOut),' Patients Removed, All Tokens, ',...
        num2str(NumberOfBins),' Bins'];...
        ['Nominal Error Rate =',num2str(NominalErrorRate*100),'%'];...
        ['Cross Validated Error Rate =',num2str(xValErrorRate*100),'%']});
    xlabel ('Percent Correct');
    ylabel('Number of Patients');
    msgbox('Outliers removed. Details saved in OutliersRemoved.mat');
end % of optional outlier removal

Start Removing Peaks (if desired)

if RemovePeaks == 1
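    % Backward selection: each pass tentatively drops every remaining peak
    % in turn, cross validates the reduced data set, and permanently
    % removes the peak whose absence gives the lowest error. PeakNumber
    % maps current column indices back to the original peak numbers.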
    % Remove Peaks one at a time, report error rates
    PeakNumber = 1:Cols;

    % Build a data set without that peak and find the error
    for RemovePeak = 1:PeaksToRemove
        NumCols = Cols+1-RemovePeak;
        PeakxValErrorRate = zeros(1,NumCols);
        for Peak = 1:NumCols
            ClassAMinus = ClassA;
            ClassAMinus (:,Peak) = [];
            ClassBMinus = ClassB;
            ClassBMinus (:,Peak) = [];
            [SampleCorrectness(Peak,:) PeakxValErrorRate(Peak)]...
                = CrossValidate (ClassAMinus,...
                ClassBMinus, n, NumberOfBins, xValReps); %#ok<*AGROW>
        end

        %  Try to capture peak discrimination
        CurrentPeakError = zeros (1,Cols);
        for p = 1:Peak
            CurrentPeakError(PeakNumber(p))=PeakxValErrorRate(p);
        end
        [LowErrorRate BestPeakToRemove] = min(PeakxValErrorRate);
        PeakRemoved = PeakNumber(BestPeakToRemove);
        ClassA(:,BestPeakToRemove) = [];
        ClassB(:,BestPeakToRemove) = [];
        PeakNumber(BestPeakToRemove)=[];
        PeaksOut(RemovePeak,1) = RemovePeak;
        PeaksOut(RemovePeak,2) = PeakRemoved;
        PeaksOut(RemovePeak,3) = LowErrorRate;
        DisplayElapsedTime (PeakRemoved, LowErrorRate, PeaksToRemove,...
            RemovePeak,start);
        FinalError(RemovePeak)=LowErrorRate;
    end
    PeakList=PeakNumber;
end

Start Adding Peaks (if desired)

if RemovePeaks == -1 % Add Peaks one at a time, report error rates
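    % Forward selection: starting from an empty feature set, each pass
    % cross validates every remaining candidate peak appended to the
    % current set and permanently adds the one giving the lowest error,
    % until PeaksToLeave peaks have been selected.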
    PeakNumber = 1:Cols;
    ClassAFinal=zeros(NumA,PeaksToLeave);
    ClassBFinal=zeros(NumB,PeaksToLeave);
    for AddPeak = 1:PeaksToLeave    % Repeat until desired number of peaks
        NumCols = Cols+1-AddPeak;   % Number of candidate peaks to test
        PeakxValErrorRate = zeros(1,NumCols); % Initialize current errors
        ClassAPlus=ClassAFinal(:,1:AddPeak);
        ClassBPlus=ClassBFinal(:,1:AddPeak);
        for Peak = 1:NumCols
            ClassAPlus(:,AddPeak) = ClassA(:,Peak);
            ClassBPlus(:,AddPeak) = ClassB(:,Peak);
            [SampleCorrectness(Peak,:) PeakxValErrorRate(Peak)]...
                = CrossValidate (ClassAPlus,...
                ClassBPlus, n, NumberOfBins, xValReps);
        end
        CurrentPeakError = zeros (1,Cols);
        for p = 1:Peak
            CurrentPeakError(PeakNumber(p))=PeakxValErrorRate(p);
        end
        [LowErrorRate BestPeakToAdd] = min(PeakxValErrorRate);
        ClassAFinal(:,AddPeak)=ClassA(:,BestPeakToAdd);
        ClassBFinal(:,AddPeak)=ClassB(:,BestPeakToAdd);
        PeakAdded = PeakNumber(BestPeakToAdd);
        ClassA(:,BestPeakToAdd) = [];
        ClassB(:,BestPeakToAdd) = [];
        PeakNumber(BestPeakToAdd)=[];
        PeaksAdded(PeakAdded,1) = AddPeak;      % Order added
        PeaksAdded(PeakAdded,2) = PeakAdded;    % Specific peak added
        PeaksAdded(PeakAdded,3) = LowErrorRate; % Error value for that peak
        DisplayElapsedTime (PeakAdded, LowErrorRate, PeaksToLeave,...
            AddPeak,start);
        PeakList(AddPeak)=PeakAdded;
        FinalError(AddPeak)=LowErrorRate;
    end
end
end  %function

Combine Groups

function population = CombineGroups (clsA, clsB)
% Combines two groups together to make a population
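% (equivalent to population = [clsA; clsB])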
rowsA = size (clsA,1);
rowsB = size (clsB,1);
population (1 : rowsA, :) = clsA;
population (rowsA+1 : rowsA+rowsB, :) = clsB;
end

Remove Outliers

function [A B nA nB]...
= RemoveOutliers(clsA, clsB, numA, numB, correct, threshhold)
% Removes rows in clsA and clsB whose corresponding correctness is below
% the threshold.
nA=numA;
nB=numB;
correctness = correct;
removed = cell (numA+numB,1);
for i = numB:-1:1
    if correct(i+numA)<threshhold
        clsB(i,:)=[];
        nB=nB-1;
        correctness(i+numA)=[];
        removed(numA+i,1) = {['removed because correctness was ',...
            num2str(100*correct(i+numA)),'%']};
    else
        removed(numA+i,1) = {'not removed'};
    end
end

for i = numA:-1:1
    if correct(i)<threshhold
        clsA(i,:)=[];
        nA=nA-1;
        correctness(i)=[];
        removed(i,1) = {['removed because correctness was ',...
            num2str(100*correct(i)),'%']};
    else
        removed(i,1) = {'not removed'};
    end
end

A=clsA;
B=clsB;
save 'OutliersRemoved' removed;
end

Count number correct

function errorrate = CountCorrect(correctvector,threshold)
% Returns the fraction of entries in correctvector that are NOT above
% threshold, i.e. the error rate
len=max(size(correctvector));
count=sum(correctvector>threshold);
errorrate=1-(count/len);
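% Example: with threshold 0.5, correctvector = [0.9 0.4 0.7] gives
% count = 2 and errorrate = 1/3.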
end

Perform the Cross Validation

function [correctnesstable xValErrorRate] = CrossValidate (clsA, clsB,...
n, NumberOfBins, reps)
% This function manages the overall cross validation, splitting the data
% into n groups and then choosing one group at a time to be the test group.
% The Bayes analysis is done inside the cross validation attempt.
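% For example, with n = 10 and 100 cases in each class, each of the n
% iterations tests about 10 cases per class against tables trained on the
% other ~90, and the per-case results are pooled into correctnesstable.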

for r = 1:reps
    % Split each class into n subgroups for nfold cross validation
    [nGroupsA RowsInGroupsA]=nfold(n, clsA); % This function appears below
    [nGroupsB RowsInGroupsB]=nfold(n, clsB); % This function appears below

    % Now we iterate the n groups, choosing n-1 groups to "learn" from and
    % the other group to classify.
    position = 0;
    for i = 1 : n
        NumClassA = RowsInGroupsA (i);
        NumClassB = RowsInGroupsB (i);

        % Combine n-1 of the groups to train on, the other to test
        % This function appears below
        [Test TrainA TrainB] = CreateValGroups(i, n, nGroupsA, nGroupsB);

        % Send these groups to the Bayesian Classifier
        PrBP = PerformBayes(Test, TrainA, TrainB, NumberOfBins);

        % Build the correctness table
        correctnesstable(position+1:position+NumClassA)=...
            1-PrBP(1:NumClassA);
        position = position+NumClassA;
        correctnesstable(position+1:position+NumClassB)=...
            PrBP(NumClassA+1:NumClassA+NumClassB);
        position = position + NumClassB;
    end

    % Call a function that returns an error rate for the correctness table
    TrialErrorRate(r) = CountCorrect(correctnesstable,.5);  %#ok<AGROW>
end
xValErrorRate = mean(TrialErrorRate);
end

Perform the Naive Bayesian Classification

function PrBP = PerformBayes (testgroup, groupa, groupb, NumBins)
% This function reads in a pair of training groups, creates a probability
% distribution table, and then uses that to classify a test group. The
% array that is returned has the probability (from 0 to 1) that the
% corresponding element in the test group is in class B

% First determine number of data sets (rows) and elements per data set
% (cols) for each class
[RowsA Cols] = size(groupa);
RowsB = size(groupb,1);
Rows = RowsA + RowsB;
Pa = RowsA/Rows;
Pb = RowsB/Rows;
Prior = Pb;
[RowsT ColsT] = size(testgroup);
PopBins = zeros (NumBins+1,Cols);
ProbDistA = zeros (NumBins+1,Cols);
ProbDistB = zeros (NumBins+1,Cols);
PopProbDist = zeros (NumBins+1,Cols);

% Now Combine the classes into a population
Population(1:RowsA,:)=groupa;
Population (RowsA + 1 : RowsA + RowsB, :) = groupb;
PrPeaksA = zeros (RowsT, Cols);
PrPeaksB = zeros (RowsT, Cols);

% Create Row Vectors with mean and standard dev for each peak
AvgColValue=mean(Population);
StandardDev=std(Population);

if NumBins ==0 % Do Continuous Case
    MeanA=mean(groupa);
    MeanB=mean(groupb);
    StDevA=std(groupa);
    StDevB=std(groupb);
    % Calculate an (unnormalized) Gaussian likelihood of getting that
    % value.  Use 1% as a minimum.
    for c = 1:Cols
        for r = 1:RowsT
            PrPeaksA(r,c)=...
                max([exp(-(testgroup(r,c)-MeanA(c))^2/StDevA(c)^2) .01]);
            PrPeaksB(r,c)=...
                max([exp(-(testgroup(r,c)-MeanB(c))^2/StDevB(c)^2) .01]);
        end
    end
else % Do Discrete case
    for i = 1 : Cols
        % Create bins - either 2, 4, or 6 (see below)
        if NumBins == 2 % Create Bins for above and below mean
            Bins=[-inf, AvgColValue(i),inf];
        elseif NumBins == 4 % Create additional bins 1 std dev above/below
            Bins=[-inf,AvgColValue(i)-StandardDev(i),AvgColValue(i),...
                AvgColValue(i)+StandardDev(i), inf];
        elseif NumBins ==6 %Create bins +/- .5 and 1.2 std devs from mean
            Bins=[-inf, AvgColValue(i)-1.2*StandardDev(i),...
                AvgColValue(i)-.5*StandardDev(i),...
                AvgColValue(i), AvgColValue(i)+.5*StandardDev(i),...
                AvgColValue(i)+1.2*StandardDev(i),inf];
        else
            error('Number of Bins must be 2, 4, or 6');
        end
        PopBins(:,i)=Bins;

        % Bin each peak in each set into the bins created above; store the
        % bin edges for each peak in PopBins and the normalized histograms
        % in ProbDistA and ProbDistB
        ClassProbDistA=histc(groupa(:,i),Bins);
        ClassProbDistB=histc(groupb(:,i),Bins);
        ProbDistA(:,i)=ClassProbDistA/RowsA;
        ProbDistB(:,i)=ClassProbDistB/RowsB;
    end % creating Bins and population distributions
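
    % For example, with 6 bins a peak whose population mean is 10 and
    % standard deviation is 2 gets the edges [-inf 7.6 9 10 11 12.4 inf]
    % from the formulas above.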

    % Find the Probability distributions
    % ProbDistA and B are NumBins x "number of peaks" arrays. The
    % probability distribution lookup table for each peak's set of bins is
    % in each column.  This is the likelihood for one peak "i",
    % Pr(Pi|Class). BinnedData has the bin that each patient's peaks fall
    % within (1-6). Prior is a scalar (0-1) that is the Pr(B) given only
    % population info.

    % Shave off the bottom row, which is all zeros
    LastRow = size(PopProbDist,1);
    ProbDistA(LastRow,:)=[];
    ProbDistB(LastRow,:)=[];

    % Bin the test data
    BinnedData = zeros (RowsT, ColsT);
    for i = 1 : RowsT
        for j = 1 : ColsT
            if testgroup (i,j) < PopBins (2,j)
                BinnedData (i,j) = 1;
            elseif testgroup (i,j) < PopBins (3,j)
                BinnedData (i,j) = 2;
            elseif testgroup (i,j) < PopBins (4,j)
                BinnedData (i,j) = 3;
            elseif testgroup (i,j) < PopBins (5,j)
                BinnedData (i,j) = 4;
            elseif testgroup (i,j) < PopBins (6,j)
                BinnedData (i,j) = 5;
            else
                BinnedData (i,j) = 6;
            end
        end
    end

    %   Now examine the test data
    for i = 1 : RowsT
        for j = 1 : ColsT
            %  Build two arrays with each entry being the Pr(that bin|Class)
            %  for all peaks for all patients.  The pointer on where to
            %  look is the array BinnedData (i,j) -- a number 1 to 6
            %  representing the bin for that peak.  It becomes the row that
            %  is looked up in the probability distribution table. Set a
            %  min of 1% to prevent zero probability.
            PrPeaksA (i,j) = max([ProbDistA(BinnedData(i,j),j) .01]);
            PrPeaksB (i,j) = max([ProbDistB(BinnedData(i,j),j) .01]);
        end
    end
end %choice of discrete or continuous

%  Now find the Pr (P|Class) by taking the product Pr(Pi|Class).  The
%  product ends up as a column vector with each row representing the
%  Pr(P|Class) for that patient.  This is the likelihood.
PrPA = prod (PrPeaksA,2);
PrPB = prod (PrPeaksB,2);

%  Compute evidence - Prob (Peaks) marginalizing across groups
PrP = (PrPA*Pa) + (Pb*PrPB);

%  Use Bayes' rule to calculate the posterior Prob (class|Peaks).  This
%  Posterior is returned as the output of the function. Only the
%  Prob of being in class B is returned. Pr(A|P) is that subtracted from 1.
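%  In symbols: Pr(B|P) = Pr(P|B)*Pr(B) / ( Pr(P|A)*Pr(A) + Pr(P|B)*Pr(B) ),
%  where Pr(P|C) = prod_i Pr(P_i|C) under the naive independence assumption.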
PrBP = zeros (RowsT,1);
for i = 1 : RowsT
    if PrP(i)==0
        PrBP (i) = 0;
    else
        PrBP(i) = (PrPB(i) * Prior)/PrP(i);
    end
end
end

Create n groups for Cross Validation

function [Test TrainA TrainB] =...
CreateValGroups(i,n,ClassAGroups,ClassBGroups)
% Given two sets of groups n x Row x Col, selects the ith group as a test
% group and removes that group from the set.
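% For example, with n = 10 the ith tenth of each class (stacked A over B)
% becomes Test, and the remaining nine tenths become TrainA and TrainB.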

% Create three 1 x C arrays
Cols = size(ClassAGroups,3);
Test = zeros(1,Cols);
TrainA = Test;
TrainB = Test;
Coltest=Cols;

% Create the Test Group from the ith group of the two input arrays
TestA (:,:) = ClassAGroups(i,:,:);
TestB (:,:) = ClassBGroups(i,:,:);
if Cols == 1
    TestA=TestA';
    TestB=TestB';
end
Test = cat (1, TestA, TestB);

%   Remove the Test Group from the mix
ClassAGroups(i,:,:) = [];
ClassBGroups(i,:,:) = [];

%Create the two training Groups from the remainder of the input arrays
for j = 1 : n-1
    GroupA (:,:) = ClassAGroups (j,:,:);
    GroupB (:,:) = ClassBGroups (j,:,:);
    if Coltest == 1 %This section makes sure MATLAB handles a single
                    % column as a column, not a row vector.
        GroupA=GroupA';
        GroupB=GroupB';
        Coltest=0;
    end
    TrainA = cat (1, TrainA, GroupA);
    TrainB = cat (1, TrainB, GroupB);
end

% Because of the uneven size of the arrays, they will have lines of all
% zero which need to be deleted using a function RemoveZeros
Test = RemoveZeros (Test); % This function appears below
TrainA = RemoveZeros (TrainA);
TrainB = RemoveZeros (TrainB);
end

Remove Zeros left by array split up

function ArrayOut = RemoveZeros (ArrayIn)
% Removes any row of ArrayIn whose sum is zero (the padding rows left by
% nfold). Note that it only checks the row sum, so a row of nonzero values
% summing to zero would also be removed.
% Build a vector Hash with the sums of each row.
Rows = size(ArrayIn,1);
Hash = sum(ArrayIn,2);

% Look through Rows and delete any with a sum of zero
for i = Rows:-1:1
    if Hash(i) == 0
        ArrayIn(i,:)=[];
    end
end
ArrayOut = ArrayIn;
end

Split up groups for cross validation

function [ArraysOut RowsInGroup] = nfold(n, ArrayIn)
% nfold splits ArrayIn into n groups of nearly equal size and returns them
% stacked in the 3-D array ArraysOut(1,:,:) through ArraysOut(n,:,:). The
% last row of some of the groups is all zeros, since the input array may
% not split evenly.
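% Example: 23 rows with n = 10 give three groups of 3 rows and seven
% groups of 2 rows (3*3 + 7*2 = 23); the 2-row groups carry one padding
% row of zeros in ArraysOut, which callers strip with RemoveZeros.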
RowsInGroup=zeros(n,1); % Stores how many are in each group

% Determine size of the array
[Rows Columns] = size(ArrayIn);

% Randomize the order of the rows prior to splitting by attaching a random
% vector (random numbers 0-1), sorting by that vector, then deleting it.
SortVector = rand(Rows,1);
AppendData = [SortVector ArrayIn];
SortData = sortrows(AppendData);
SortData (:,1) = [];

% Find out how many rows go in each final array. RowsLeft counts down as we
% pull rows out into the Output array.
RowsLeft=Rows;
NumInGroups = floor(Rows/n);
Extra = mod(Rows,n); % rows left over after an even split
ArraysOut=zeros(n,NumInGroups+1,Columns);
for i = 1:n
    % Give the first "Extra" groups one additional row so that the group
    % sizes add up to the total number of rows
    if i <= Extra
        RowsInGroup(i) = NumInGroups + 1;
    else
        RowsInGroup(i) = NumInGroups;
    end

    %   Determine how many rows are left. For however many rows are in the
    %   current group, pull a row out of the main array into a sub array
    RowsLeft=RowsLeft-RowsInGroup(i);
    for j = 1:RowsInGroup(i)
        for k = 1:Columns
            ArraysOut (i,j,k) = SortData(RowsLeft+j,k);
        end
    end
end
end

Timer Tool

function DisplayElapsedTime(PeakRemoved, LowErrorRate, PeaksToRemove,...
RemovePeak, starttime)
% This tool keeps the user informed of the progress
ElapsedTime = clock-starttime;
PeaksLeft=PeaksToRemove-RemovePeak;
%         RemainTime=((ElapsedTime(4)*3600)+(ElapsedTime(5)*60)+...
%             ElapsedTime(6))*(.75)*PeaksLeft/60;
if ElapsedTime(6)<0
    ElapsedTime(6) = ElapsedTime(6)+60;
    ElapsedTime(5) = ElapsedTime(5)-1;
end
if ElapsedTime(5)<0
    ElapsedTime(5) = ElapsedTime(5)+60;
    ElapsedTime(4) = ElapsedTime(4)-1;
end
Note = ['Removing Peak ', num2str(PeakRemoved), ' with error ',...
num2str(LowErrorRate*100),'%, ',...
num2str(PeaksLeft),...
' left, Elapsed time =', num2str(ElapsedTime(4)),...
'h,', num2str(ElapsedTime(5)), 'm,',...
num2str(int8(ElapsedTime(6))), 's.'];
disp(Note);
end