%% +---------------+
%% | ARTICLE TITLE |
%% +---------------+
\title[]{Design of robust pattern classifiers based on optimum-path forests}
%%\RunningHead{Banon and Braga-Neto}{Author instructions}
%% +---------+
%% | AUTHORS |
%% +---------+
\begin{Authors}
\Author {João P. Papa}
\Affil [IC] {State University of Campinas, Institute of Computing, Campinas, Brazil \\* \email{\{jpaulo,afalcao\}@ic.unicamp.br},pavmbr@yahoo.com.br,celso.suzuki@gmail.com}% institutional e-mail address
\Author {Alexandre X. Falcão} \Affilref[IC]
\Author {Paulo A. V. Miranda} \Affilref[IC]
\Author {Celso T. N. Suzuki} \Affilref[IC]
\Author {Nelson D. A. Mascarenhas}
\Affil{Federal University of São Carlos, Department of Computing \\* \email{nelson@dc.ufscar.br}}% institutional e-mail address
%% If two or more authors share an affiliation, pass an optional
%% argument to the command \Affil when the affiliation is referred
%% for the first time. This argument should be a key consisting
%% of one or more letters (only letters). Then use the command
%% \Affilref[] after any subsequent authors that share the
%% affiliation.
%% Example:
\end{Authors}
%% +-----------------------+
%% | ABSTRACT and KEYWORDS |
%% +-----------------------+
%% Abstract and keywords are written inside two environments,
%% the "Abstract" and the "Keywords" environments, respectively.
%% Note the uppercase "A" and "K".
%% Keywords are separated with comma. The last keyword ends
%% with a period.
\begin{Abstract}
\begin{sloppypar}
We present a supervised pattern classifier based on \emph{optimum path forests}. The samples in a training set are nodes of a complete graph, whose arcs are weighted by the distances between sample feature vectors. The training builds a classifier from key samples (prototypes) of all classes, where each prototype defines an optimum path tree whose nodes are its most strongly connected samples. The optimum paths are also used to label unseen test samples with the classes of their most strongly connected prototypes. We show how to find prototypes with no classification errors in the training set and propose a learning algorithm to improve accuracy over an evaluation set. The method is robust to outliers, handles non-separable classes, and can outperform support vector machines.
\end{sloppypar}
\end{Abstract}
\begin{Keywords}
\Index{supervised classifiers}, \Index{image foresting transform}, \Index{image analysis}, \Index{morphological pattern recognition}.
\end{Keywords}
%----------------------------------------
\section{Introduction}
\label{sec:intro}
\begin{sloppypar}
Pattern classification methods are generally divided into supervised and unsupervised according to their learning algorithms~\cite{Duda:00}. Unsupervised techniques assume no knowledge about the classes (labels) of the samples in the training set, while these labels are exploited in supervised techniques.
\end{sloppypar}
We propose a method to design supervised pattern classifiers based on \emph{optimum path forests} (OPF). The design of an OPF classifier is based on labeled samples from training and evaluation sets. A test set with unseen samples is used to assess the performance of the classifier. The training samples are nodes of a complete graph in the sample feature space (every pair of nodes is connected by an arc). See Figure~\ref{fig:example}a. The arcs are weighted by the distance between the feature vectors of their nodes. A set of prototypes (key samples) is obtained from the training set.
We define a \emph{path-cost function} based on arc weights, which assigns to any path in the graph the cost of considering all samples along the path as belonging to the same class (e.g., the function $f_{max}$, which assigns the maximum arc weight along the path). We then apply the IFT algorithm~\cite{Falcao:04} to partition the graph into an optimum path forest rooted at the prototypes (Figure~\ref{fig:example}b). That is, the prototypes compete among themselves and each prototype defines an optimum path tree, whose nodes are samples more strongly connected to that prototype than to any other root, according to that path-cost function. The training essentially consists of building this optimum path forest, where the samples in a given optimum path tree are assumed to have the same label as their root prototype.
\begin{figure}[ht]
\centering
\subfigure[]{\includegraphics[width=0.26\hsize]{\fullpaperpdirectory/images/figure1a.pdf}}
\hspace{0.002\hsize}
\subfigure[]{\includegraphics[width=0.26\hsize]{\fullpaperpdirectory/images/figure1b.pdf}}
\hspace{0.002\hsize}
\subfigure[]{\includegraphics[width=0.26\hsize]{\fullpaperpdirectory/images/figure1c.pdf}}
\hspace{0.002\hsize}
\subfigure[]{\includegraphics[width=0.26\hsize]{\fullpaperpdirectory/images/figure1d.pdf}}
\caption{(a)~Complete weighted graph for a simple training set. (b)~Resulting optimum-path forest from (a) for $f_{max}$ and two given prototypes (circled nodes). The entries $(x,y)$ over the nodes are, respectively, cost and label of the samples. (c)~Test sample (gray square) and its connections (dashed lines) with the training nodes. (d)~The optimum path from the most strongly connected prototype, its label $2$, and classification cost $0.4$ are assigned to the test sample.}
\label{fig:example}
\end{figure}
The classification of a test sample evaluates the optimum paths from the prototypes to this sample incrementally, as though it were part of the graph (Figure~\ref{fig:example}c). The optimum path from the most strongly connected prototype, its label and path cost (classification cost) are assigned to the test sample (Figure~\ref{fig:example}d). Note the difference between an OPF classifier with $f_{max}$ and the nearest neighbor approach~\cite{Duda:00}: the test sample may be assigned to a given class even when its closest labeled sample belongs to another class. The same rule is used to classify evaluation samples. Before testing, we propose a learning algorithm which swaps \emph{irrelevant} samples of the training set with samples of the evaluation set that have never been in the training set. Real problems very often limit the training set size, and the learning algorithm aims to improve accuracy under this limitation. When an evaluation sample is classified, it is assigned to some optimum path in the graph. The training samples of this path have their numbers of right or wrong classifications incremented by one, depending on the classification result. The irrelevant samples are those whose number of wrong classifications is higher than their number of right classifications. At each iteration, the learning algorithm creates new evaluation and training sets and recomputes prototypes and optimum-path forests. These prototypes guarantee no classification errors in the training set and usually improve the accuracy over the evaluation sets. The presence of outliers (samples of a given class that fall inside the region of another class) usually degrades the design of any classifier. Outliers usually become irrelevant prototypes and are moved out of the training set.
The number of prototypes will not necessarily increase during learning, and the most representative ones usually lie on the frontiers between classes. The method handles non-separable classes by estimating key prototypes within the intersection regions. Section~\ref{sec:relatedworks} discusses related works. The OPF classifier is presented in Section~\ref{sec:classifier} and Section~\ref{sec:learning} presents its learning algorithm, which outputs the last designed classifier and a learning curve showing the accuracy values of the designed classifiers along its iterations. In Section~\ref{sec:results}, we compare the OPF classifier with support vector machines (SVM)~\cite{Vapnik:92}. This comparison uses databases with outliers and multiple non-separable classes, two of which come from image analysis. One contains voxels from white and gray matter in magnetic resonance images of the human brain and the other contains 2D shapes from binary images. Conclusions and future work are discussed in Section~\ref{sec:conclusion}.
\section{Related works}
\label{sec:relatedworks}
Graph-based approaches for pattern classification are usually unsupervised. Zahn~\cite{Zahn:71} proposed an approach that computes a minimum spanning tree (MST) in the graph and removes inconsistent arcs to form clusters. Arc removal in the MST can also produce hierarchical solutions for clustering, such as the popular single-linkage approach~\cite{Hubert:74}. Other clustering techniques have been formulated as a graph-cut problem~\cite{Malik:00} with application to image segmentation, where the graph does not need to be complete. More recently, graph-cut techniques have also been used for learning~\cite{Blum:01}. Essentially, graph-based clustering methods aim to partition the graph into components (clusters), such that each component contains only samples of the same class. However, there is no guarantee that the samples in a given cluster belong to the same class, and it is hard to assign these samples to their correct class without any prior knowledge.
\begin{sloppypar}
Supervised approaches usually exploit prior knowledge to teach the machine how to solve the problem. Artificial neural networks (ANN)~\cite{Kuncheva:04} and support vector machines (SVM)~\cite{Vapnik:92} are among the most actively pursued approaches in recent years. An ANN multi-layer perceptron (ANN-MLP), trained by backpropagation for example, is an unstable classifier. Its accuracy may be improved at the computational cost of using multiple classifiers and algorithms (e.g., bagging and boosting) for training classifier collections~\cite{Kuncheva:04}. However, there seems to be an unknown limit on the number of classifiers beyond which accuracy degrades~\cite{Reyzin:06}. ANN-MLP assumes that the classes can be separated by hyperplanes in the feature space. Such an assumption is unfortunately not valid in practice. SVM was proposed to overcome this problem by assuming it is possible to separate the classes in a higher-dimensional space by optimum hyperplanes. Although SVM usually provides reasonable accuracies, its computational cost rapidly increases with the training set size and the number of support vectors. Tang and Mazzoni~\cite{Tang:06} proposed a method to reduce the number of support vectors in the multi-class problem. Their approach suffers from slow convergence and higher computational complexity, because they first minimize the number of support vectors in several binary SVMs, and then share these vectors among the machines.
Panda et al.~\cite{Panda:06} presented a method to reduce the training set size before applying the SVM algorithm. Their approach aims to identify and remove samples likely related to non-support vectors. However, in all SVM approaches, the assumption of separability may also not be valid in any space of finite dimension~\cite{Collobert:04}.
\end{sloppypar}
The role of the prototypes for the OPF classifier is very similar to the role of the support vectors for SVM. Considering this together with the fact that SVM is among the best approaches for supervised pattern classification, we have chosen the SVM code in~\cite{site-libsvm}, with a Gaussian kernel and parameters obtained by cross validation, for comparison.
\section{Optimum path classifier}
\label{sec:classifier}
Let $Z_1$, $Z_2$, and $Z_3$ be training, evaluation, and test sets with $|Z_1|$, $|Z_2|$, and $|Z_3|$ samples such as points or image elements (e.g., pixels, voxels, shapes). Let $\lambda(s)$ be the function that assigns the correct label $i$, $i=1,2,\ldots,c$, from class $i$ to any sample $s\in Z_1\cup Z_2\cup Z_3$. $Z_1$ and $Z_2$ are labeled sets used for the design of the classifier. Applications usually impose an upper limit on $|Z_1|$, so the role of $Z_2$ is to improve the accuracy of the classifier by interchanging samples with $Z_1$. $Z_3$ is used to assess the performance of the classifier and is kept unseen during the design. Let $S \subset Z_1$ be a set of prototypes of all classes (i.e., key samples that best represent the classes). Let $v$ be an algorithm which extracts $n$ attributes (color, shape or texture properties) from any sample $s\in Z_1\cup Z_2\cup Z_3$ and returns a vector $\vec{v}(s) \in \Re^n$. The distance $d(s,t)$ between two samples, $s$ and $t$, is the one between their feature vectors $\vec{v}(s)$ and $\vec{v}(t)$. One can use any valid metric (e.g., Euclidean) or a more elaborate distance function~\cite{Arica:BAS:03}. Our problem consists of using $S$, $(v,d)$, $Z_1$ and $Z_2$ to design an optimal classifier which can predict the correct label $\lambda(s)$ of any sample $s\in Z_3$. We propose a classifier which creates a discrete optimal partition of the feature space such that any sample $s\in Z_3$ can be classified according to this partition. This partition is an optimum path forest (OPF) computed in $\Re^n$ by the image foresting transform (IFT) algorithm~\cite{Falcao:04}.
Let $(Z_1,A)$ be a complete graph whose nodes are the samples in $Z_1$ and any pair of samples defines an arc in $A=Z_1 \times Z_1$ (Figure~\ref{fig:example}a). The arcs do not need to be stored and so the graph does not need to be explicitly represented. A path is a sequence of distinct samples $\pi=\langle s_1,s_2,\ldots,s_k\rangle$, where $(s_i,s_{i+1})\in A$ for $1\leq i\leq k-1$. A path is said to be \emph{trivial} if $\pi=\langle s_1\rangle$. We assign to each path $\pi$ a cost $f(\pi)$ given by a path-cost function $f$. A path $\pi$ is said to be \emph{optimum} if $f(\pi)\leq f(\pi')$ for any other path $\pi'$, where $\pi$ and $\pi'$ end at the same sample $s_k$. We also denote by $\pi \cdot \langle s,t \rangle$ the concatenation of a path $\pi$ with terminus at $s$ and an arc $(s,t)$.
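For a concrete illustration of these definitions, the Python sketch below computes the arc weights of the complete graph $(Z_1,A)$ from a toy feature matrix and evaluates the cost of a path under the maximum-arc-weight function informally described in the introduction. This is only a didactic sketch under simple assumptions (Euclidean $d$ and hypothetical feature values); it is not the implementation used in our experiments.
\begin{verbatim}
# Illustrative sketch only: samples as feature vectors, Euclidean distance
# d(s,t), and the maximum arc weight along a path (toy data, not from the paper).
import numpy as np

def d(v_s, v_t):
    """Distance between the feature vectors of two samples (Euclidean here)."""
    return float(np.linalg.norm(v_s - v_t))

def path_cost_max(path, features):
    """Maximum arc weight along a non-trivial path given as sample indices."""
    return max(d(features[a], features[b]) for a, b in zip(path, path[1:]))

# Toy training set Z1 with four 2D feature vectors (hypothetical values).
Z1_features = np.array([[0.0, 0.0], [0.3, 0.1], [1.0, 1.0], [1.2, 0.9]])

# Arc weights of the complete graph (Z1, A): all pairwise distances.
W = np.array([[d(u, v) for v in Z1_features] for u in Z1_features])
print(W)                                      # symmetric matrix, zero diagonal
print(path_cost_max([0, 1, 2], Z1_features))  # cost of the path <s_0, s_1, s_2>
\end{verbatim}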
The OPF algorithm may be used with any \emph{smooth} path-cost function which can group samples with similar properties~\cite{Falcao:04}. A function $f$ is smooth in $(Z_1,A)$ when for any sample $t\in Z_1$, there exists an optimum path $\pi$ ending at $t$ which either is trivial, or has the form $\tau \cdot\langle s,t \rangle$ where
\begin{itemize} \labelwidth 0pt
\item $f(\tau) \leq f(\pi)$,
\item $\tau$ is optimum,
\item for any optimum path $\tau'$ ending at $s$, $f(\tau' \cdot \langle s,t \rangle) = f(\pi)$.
\end{itemize}
We address the path-cost function $f_{max}$ because of its theoretical properties for estimating optimum prototypes:
\begin{eqnarray}
f_{max}(\langle s\rangle) & = & \left\{ \begin{array}{ll} 0 & \mbox{if $s\in S$,} \\ +\infty & \mbox{otherwise} \end{array}\right. \nonumber \\
f_{max}(\pi \cdot \langle s,t \rangle) & = & \max\{f_{max}(\pi),d(s,t)\}
\end{eqnarray}
such that $f_{max}(\pi)$ computes the maximum distance between adjacent samples in $\pi$, when $\pi$ is not a trivial path.
The OPF algorithm assigns one optimum path $P^{\ast}(s)$ from $S$ to every sample $s\in Z_1$, forming an optimum path forest $P$ (a function with no cycles which assigns to each $s\in Z_1\backslash S$ its predecessor $P(s)$ in $P^{\ast}(s)$ or a marker $nil$ when $s\in S$, as shown in Figure~\ref{fig:example}b). Let $R(s)\in S$ be the root of $P^{\ast}(s)$, which can be reached from $s$ by following the predecessors. The OPF algorithm computes for each $s\in Z_1$, the cost $C(s)$ of $P^{\ast}(s)$, the label $L(s)=\lambda(R(s))$, and the predecessor $P(s)$, as follows.
\begin{nicealgo}{alg:opf}
\naTITLE{OPF algorithm}
\naPREAMBLE
\naINPUT{A $\lambda$-labeled training set $Z_1$, prototypes $S\subset Z_1$ and the pair $(v,d)$ for feature vector and distance computations.}
\naOUTPUT{Optimum path forest $P$, cost map $C$ and label map $L$.}
\naAUX{Priority queue $Q$ and cost variable $cst$.}
\naBODY
\na{For each $s \in Z_1\backslash S$, set $C(s)\mget +\infty$.}
\naBEGIN{For each $s \in S$, do}
\naEND{$C(s)\mget 0$, $P(s)\mget nil$, $L(s)\mget \lambda(s)$, and insert $s$ in $Q$.}
\naBEGIN{While $Q$ is not empty, do}
\na{Remove from $Q$ a sample $s$ such that $C(s)$ is minimum.}
\naBEGIN{For each $t\in Z_1$ such that $t\neq s$ and $C(t) > C(s)$, do}
\na{Compute $cst \mget \max\{C(s), d(s,t)\}$.}
\naBEGIN{If $cst < C(t)$, then}
\na{If $C(t) \neq +\infty$, then remove $t$ from $Q$.}
\naENDN{3}{$P(t)\mget s$, $L(t)\mget L(s)$, $C(t)\mget cst$, and insert $t$ in $Q$.}
\end{nicealgo}
Lines $1-3$ initialize maps and insert prototypes in $Q$. The main loop computes an optimum path from $S$ to every sample $s$ in a non-decreasing order of cost (Lines $4-10$). At each iteration, a path of minimum cost $C(s)$ is obtained in $P$ when we remove its last node $s$ from $Q$ (Line 5). Ties are broken in $Q$ using a first-in-first-out (FIFO) policy. That is, when two optimum paths reach an ambiguous sample $s$ with the same minimum cost, $s$ is assigned to the first path that reached it. Note that $C(t) > C(s)$ in Line 6 is false when $t$ has been removed from $Q$ and, therefore, $C(t) \neq +\infty$ in Line 9 is true only when $t\in Q$. Lines $8-10$ evaluate whether the path that reaches an adjacent node $t$ through $s$ is cheaper than the current path with terminus $t$ and update the position of $t$ in $Q$, $C(t)$, $L(t)$ and $P(t)$ accordingly.
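For illustration only, the following Python sketch mirrors Algorithm~\ref{alg:opf}, replacing the priority queue $Q$ by a binary heap with lazy deletion and an insertion counter for the FIFO tie-breaking. The array names and the helper \texttt{d} from the previous sketch are our own assumptions, not part of the original implementation.
\begin{verbatim}
# Sketch of Algorithm 1 (OPF training with f_max); not the authors' code.
import heapq
import numpy as np

def opf_train(features, lam, S):
    """features: |Z1| x n array; lam: true labels of Z1; S: prototype indices."""
    n = len(features)
    C = np.full(n, np.inf)            # cost map
    L = np.full(n, -1, dtype=int)     # label map
    P = np.full(n, -1, dtype=int)     # predecessor map (-1 plays the role of nil)
    done = np.zeros(n, dtype=bool)    # samples already removed from Q
    Q, order = [], 0                  # heap entries: (cost, insertion order, sample)
    for s in S:
        C[s], L[s], P[s] = 0.0, lam[s], -1
        heapq.heappush(Q, (0.0, order, s)); order += 1
    while Q:
        cost, _, s = heapq.heappop(Q)
        if done[s] or cost > C[s]:
            continue                  # stale entry: s was re-inserted with a lower cost
        done[s] = True
        for t in range(n):
            if t != s and C[t] > C[s]:
                cst = max(C[s], d(features[s], features[t]))
                if cst < C[t]:
                    P[t], L[t], C[t] = s, L[s], cst
                    heapq.heappush(Q, (cst, order, t)); order += 1
    return P, L, C
\end{verbatim}
A call such as \texttt{P, L, C = opf\_train(Z1\_features, lam, S)} then returns the predecessor, label and cost maps used in the remainder of the paper.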
\begin{sloppypar}
The OPF algorithm for $f_{max}$ is an ``IFT-watershed transform''~\cite{LotufoMM:00} computed in the $n$-dimensional feature space. Apart from this extension, the most significant contributions are the training and learning processes, which find optimum prototypes (markers) on the frontier between classes and avoid \emph{outliers} (samples of a given class that fall inside the region of another class in the feature space) in the training set, increasing the accuracy of the classifier.
\end{sloppypar}
The label $L(s)$ may be different from $\lambda(s)$, leading to classification errors in $Z_1$. The training finds prototypes with no classification errors in $Z_1$.
\subsection{Training}
\label{ssec:training}
\begin{sloppypar}
We say that $S^\ast$ is an optimum set of prototypes when Algorithm~\ref{alg:opf} propagates the labels $L(s)=\lambda(s)$ for every $s\in Z_1$. Set $S^\ast$ can be found by exploiting the theoretical relation between the \emph{minimum spanning tree} (MST)~\cite{Cormen:90} and the optimum path tree for $f_{max}$. The training essentially consists of finding $S^\ast$ and an OPF classifier rooted at $S^\ast$.
\end{sloppypar}
By computing an MST in the complete graph $(Z_1,A)$, we obtain a connected acyclic graph whose nodes are all samples in $Z_1$ and whose arcs are undirected and weighted by the distance $d$ between adjacent sample feature vectors (Figure~\ref{fig:mst-classification}a). This spanning tree is optimum in the sense that the sum of its arc weights is minimum as compared to any other spanning tree in the complete graph. In the MST, every pair of samples is connected by a single path which is optimum according to $f_{max}$. That is, for any given sample $s\in Z_1$, it is possible to direct the arcs of the MST such that the result will be an optimum path tree $P$ for $f_{max}$ rooted at $s$.
\begin{figure}[ht]
\centering
\subfigure[]{\includegraphics[width=0.28\hsize]{\fullpaperpdirectory/images/figure2.pdf}}
\hspace{0.1\hsize}
\subfigure[]{\includegraphics[width=0.28\hsize]{\fullpaperpdirectory/images/figure3.pdf}}
\caption{(a) MST of the graph shown in Figure~\ref{fig:example}a where the optimum prototypes share the arc of weight $0.6$. (b) The classification of the test sample (gray square) $t$ as in Figure~\ref{fig:example}c assigns the optimum path $P^\ast(t)$ from $R(t)\in S^\ast$ to $t$ passing through $s^\ast$.}
\label{fig:mst-classification}
\end{figure}
The optimum prototypes are the closest elements in the MST with different labels in $Z_1$. By removing the arcs between different classes, their adjacent samples become prototypes in $S^\ast$ and Algorithm~\ref{alg:opf} can compute an optimum path forest with no classification errors in $Z_1$ (Figure~\ref{fig:example}b), which can be explained by the theoretical relation between minimum spanning trees and the optimum path tree obtained by OPF with $f_{max}$~\cite{Cousty:07}. Note that a given class may be represented by multiple prototypes (i.e., optimum path trees), and there must be at least one prototype per class.
\subsection{Classification}
For any sample $t\in Z_3$, we consider all arcs connecting $t$ with samples $s \in Z_1$, as though $t$ were part of the graph (Figure~\ref{fig:example}c). Considering all possible paths from $S^\ast$ to $t$, we wish to find the optimum path $P^\ast(t)$ from $S^\ast$ and label $t$ with the class $\lambda(R(t))$ of its most strongly connected prototype $R(t)\in S^\ast$ (Figure~\ref{fig:mst-classification}b). This path can be identified incrementally, by evaluating the optimum cost $C(t)$ as
\begin{equation}
\label{eq:classification}
C(t) = \min_{s \in Z_1}\{ \max\{C(s),d(s,t)\}\}.
\end{equation}
Let the node $s^\ast\in Z_1$ be the one that satisfies the above equation (i.e., the predecessor $P(t)$ in the optimum path $P^\ast(t)$). Given that $L(s^\ast)=\lambda(R(t))$, the classification simply assigns $L(s^\ast)$ as the class of $t$. An error occurs when $L(s^\ast)\neq \lambda(t)$. A similar procedure is applied to samples in the evaluation set $Z_2$. In this case, however, we would like to use samples of $Z_2$ to learn the distribution of the classes in the feature space and improve the performance of the OPF classifier.
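In the same illustrative setting, the classification rule of Equation~\ref{eq:classification} reduces to a single minimization over the training samples. The sketch below reuses the hypothetical maps returned by \texttt{opf\_train}; again, it is not the implementation used in our experiments.
\begin{verbatim}
# Sketch of the classification rule in Equation (2); not the authors' code.
import numpy as np

def opf_classify(t_features, Z1_features, C, L):
    """Return the predicted label and classification cost of one test sample t."""
    costs = np.array([max(C[s], d(Z1_features[s], t_features))
                      for s in range(len(Z1_features))])
    s_star = int(np.argmin(costs))   # predecessor of t in the optimum path P*(t)
    return L[s_star], costs[s_star]
\end{verbatim}
For instance, \texttt{label, cost = opf\_classify(t\_features, Z1\_features, C, L)} assigns to a test sample the label of its most strongly connected prototype, even when its nearest training sample has a different label.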
\section{Learning Algorithm}
\label{sec:learning}
The performance of the OPF classifier improves when the closest samples from different classes are included in $Z_1$, because the method finds prototypes that work as sentinels on the frontier between classes. We propose a learning algorithm to identify better prototypes from samples of $Z_2$ that have never been in $Z_1$ (Algorithm~\ref{alg:learning}). First, we give preference to replacing \emph{irrelevant} samples of $Z_1$ by errors in $Z_2$; second, other samples of $Z_2$ are replaced by the remaining \emph{irrelevant} samples of $Z_1$. In both cases, we never allow a sample that has already been in $Z_1$ to return to it. If the application did not impose any limit on $|Z_1|$, the prototypes could be found from $Z_1\cup Z_2$ with no classification errors in both sets. The learning algorithm essentially tries to identify these prototypes from a few iterations of classification over $Z_2$.
The algorithm outputs a \emph{learning curve} over $T$ iterations (Lines $2-29$), which reports the accuracy value of each instance of the classifier during learning, and the final OPF classifier. Lines $4-7$ execute training and initialize the auxiliary arrays and lists. The classification of each sample $t\in Z_2$ is performed in Lines $8-18$, updating the auxiliary arrays. The condition in Line $10$ indicates that $t$ is misclassified. In order to define irrelevant samples, we consider all right and wrong classifications in $Z_2$. When $t\in Z_2$ is correctly/incorrectly classified, we add one to the number of right/wrong classifications, $NR(r)$ or $NW(r)$, of every sample $r\in Z_1$ in the optimum path $P^\ast(t)$ from $R(t)\in S^\ast$ to $s^\ast$ (Lines $14-18$). Additionally, Lines $11-13$ update the false positive and false negative arrays, $FP$ and $FN$, for accuracy computation, and insert $t$ in the list $LE(\lambda(t))$ of errors if $t$ has never been in $Z_1$ ($t\not \in TR$).
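In Python-like form, this per-sample bookkeeping (Lines $14-18$) amounts to walking the predecessor map from $s^\ast$ up to the root prototype; the complete procedure is given in Algorithm~\ref{alg:learning} below. The helper is a sketch with our own naming conventions, assuming the maps of the earlier sketches.
\begin{verbatim}
# Sketch of the right/wrong bookkeeping along the optimum path P*(t).
def update_path_counts(s_star, correct, P, NR, NW):
    """Increment NR or NW for every training sample on the path from R(t) to s*."""
    r = s_star
    while r != -1:          # -1 plays the role of nil; the loop stops after the root
        if correct:
            NR[r] += 1
        else:
            NW[r] += 1
        r = P[r]
\end{verbatim}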
\begin{nicealgo}{alg:learning}
\naTITLE{Learning algorithm}
\naPREAMBLE
\naINPUT{Training and evaluation sets labeled by $\lambda$, $Z_1$ and $Z_2$, number $T$ of iterations, and the pair $(v,d)$ for feature vector and distance computations.}
\naOUTPUT{Learning curve ${\cal L}$ and the last OPF classifier, represented by the predecessor map $P$, cost map $C$, and label map $L$.}
\naAUX{False positive and false negative arrays, $FP$ and $FN$, of size $c$; lists $LI$ and $LE$ of irrelevant samples and error samples for each class; arrays $NR$ and $NW$ for the numbers of right and wrong classifications, of size $|Z_1|$; a sample variable $r$; and a set $TR$ to prevent samples that have already been in $Z_1$ from returning to it.}
\naBODY
\na{$TR\mget \emptyset$.}
\naBEGIN{For each iteration $I=1,2,\ldots,T$, do}
\na{$TR \mget TR \cup Z_1$.}
\na{Compute $S^\ast \subset Z_1$ as in Section~\ref{ssec:training} and $P$, $L$, $C$ by Algorithm~\ref{alg:opf}.}
\na{For each sample $s\in Z_1$, do $NR(s)\mget 0$ and $NW(s)\mget 0$.}
\naBEGIN{For each class $i=1,2,\ldots,c$, do}
\naENDN{1}{$FP(i)\mget 0$, $FN(i)\mget 0$, $LI(i)\mget \emptyset$ and $LE(i)\mget \emptyset$.}
\naBEGIN{For each sample $t\in Z_2$, do}
\na{Find $s^\ast\in Z_1$ that satisfies Equation~\ref{eq:classification} and set $r\mget s^\ast$.}
\naBEGIN{If $L(s^\ast)\neq \lambda(t)$, then}
\na{$FP(L(s^\ast)) \mget FP(L(s^\ast)) + 1$.}
\na{$FN(\lambda(t)) \mget FN(\lambda(t)) + 1$.}
\na{If $t\not \in TR$, then $LE(\lambda(t)) \mget LE(\lambda(t)) \cup \{t\}$.}
\naBEGIN{While $r \neq nil$, do}
\naENDN{2}{$NW(r)\mget NW(r) + 1$ and $r\mget P(r)$.}
\naBEGIN{Else}
\naBEGIN{While $r \neq nil$, do}
\naENDN{3}{$NR(r)\mget NR(r) + 1$ and $r\mget P(r)$.}
\na{Compute ${\cal L}(I)$ by Equation~\ref{eq:lr}.}
\naBEGIN{For each $s\in Z_1$, do}
\naBEGIN{If $NW(s) > NR(s)$, then}
\naENDN{2}{$LI(\lambda(s))\mget LI(\lambda(s))\cup \{s\}$.}
\naBEGIN{For each class $i=1,2,\ldots,c$, do}
\naBEGIN{While $|LI(i)| > 0$ and $|LE(i)| > 0$, do}
\na{$LI(i)\mget LI(i)\backslash\{s\}$ and $LE(i)\mget LE(i)\backslash\{t\}$.}
\naENDN{1}{Replace $s\in Z_1$ by $t\in Z_2$.}
\naBEGIN{While $|LI(i)| > 0$, do}
\na{$LI(i)\mget LI(i)\backslash\{s\}$.}
\naENDN{3}{Find $t\in Z_2\backslash TR$, with $\lambda(t)=i$, and replace it by $s\in Z_1$.}
\na{Compute $S^\ast\subset Z_1$ as in Section~\ref{ssec:training} and $P$, $L$, $C$ by Algorithm~\ref{alg:opf}.}
\end{nicealgo}
Line $19$ computes the accuracy at iteration $I$ and stores it in the learning curve ${\cal L}$. The accuracy ${\cal L}(I)$ of a given iteration $I$, $I=1,2,\ldots, T$, is measured by taking into account that the classes may have different sizes in $Z_2$ (a similar definition applies to $Z_3$). Let $NZ_2(i)$, $i=1,2,\ldots,c$, be the number of samples in $Z_2$ from each class $i$. We define
\begin{equation}
e_{i,1}=\frac{FP(i)}{|Z_2|-NZ_2(i)}\ \text{ and }\ e_{i,2}=\frac{FN(i)}{NZ_2(i)},\ i=1,\ldots,c
\end{equation}
where $FP(i)$ and $FN(i)$ are the false positives and false negatives, respectively. That is, $FP(i)$ is the number of samples from other classes that were classified as being from class $i$ in $Z_2$, and $FN(i)$ is the number of samples from class $i$ that were incorrectly classified as being from other classes in $Z_2$. The errors $e_{i,1}$ and $e_{i,2}$ are used to define
\begin{equation}
E(i)=e_{i,1}+e_{i,2},
\end{equation}
where $E(i)$ is the partial sum error of class $i$. Finally, the accuracy ${\cal L}(I)$ of the classification is written as
\begin{equation}
\label{eq:lr}
{\cal L}(I)=\frac{2c-\sum_{i=1}^{c}E(i)}{2c}=1-\frac{\sum_{i=1}^{c}E(i)}{2c}.
\end{equation}
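The accuracy measure of Equation~\ref{eq:lr} can be sketched in a few lines of Python; the helper below is ours (arrays indexed by class, zero-based), not part of the paper's code.
\begin{verbatim}
# Sketch of the balanced accuracy L(I) of Equation (4).
import numpy as np

def balanced_accuracy(FP, FN, NZ2):
    """FP, FN, NZ2: per-class false positives, false negatives, class sizes in Z2."""
    FP, FN, NZ2 = map(np.asarray, (FP, FN, NZ2))
    n2 = NZ2.sum()                          # |Z2|
    e1 = FP / (n2 - NZ2)                    # e_{i,1}
    e2 = FN / NZ2                           # e_{i,2}
    E = e1 + e2                             # partial sum error of each class
    return 1.0 - E.sum() / (2 * len(NZ2))   # L(I)
\end{verbatim}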
Lines $20-22$ identify as irrelevant samples in $Z_1$ those whose number of incorrect classifications is higher than their number of correct classifications. Lines $23-29$ remove elements from the lists of irrelevant samples and errors, $LI$ and $LE$, for each class: irrelevant samples are first exchanged with errors, and then the remaining irrelevant samples (if any) are exchanged with other samples of $Z_2$ that have never been in $Z_1$. Outliers degrade the design of any classifier. They are usually identified as irrelevant prototypes and moved from $Z_1$ to $Z_2$. Finally, Line $30$ performs the training over the last set $Z_1$ to output the designed OPF classifier. After learning, the classification of any sample $t\in Z_3$ is done by simply finding $s^\ast\in Z_1$ that satisfies Equation~\ref{eq:classification} and assigning label $L(s^\ast)$ as the class of $t$.
\section{Results}
\label{sec:results}
We compare the OPF classifier with support vector machines (SVM~\cite{Vapnik:92}) using four databases with outliers and non-separable classes: the Cone-torus database from~\cite{Kuncheva:04}, the Painted database (Figure~\ref{fig:painted}a), the MPEG-7 shape database~\cite{MPEG-7:02}, and the WM/GM (white matter/gray matter) database~\cite{Collins:1998}. The Cone-torus database contains 400 samples and 3 non-separable classes, while the Painted database contains 5,867 samples with outliers and 4 classes. In both cases, the feature vectors are the sample $(x,y)$ coordinates. The MPEG-7 database contains 1,400 2D shapes and 70 classes. To increase the overlap (difficulty) between classes, we simply adopt the 126 most significant coefficients of the Fourier transform of the shapes as the feature vector. The WM/GM database contains 1.5M voxels of WM and GM (2 classes) from MR-T1 images of phantoms with various levels of noise and inhomogeneity to produce outliers. The images and ground truth are available from~\cite{Collins:1998}, and the feature vector consists of the lowest and highest intensity values around the voxel and the voxel's own intensity. In all cases, the function $d$ is the Euclidean metric.
\begin{figure}[ht]
\centering
\subfigure[]{\includegraphics[height=3cm]{\fullpaperpdirectory/images/painted.pdf}}
\subfigure[]{\includegraphics[height=3.5cm]{\fullpaperpdirectory/images/accuracy_opf.pdf}}
\caption{(a) Painted database with outliers. (b) OPF learning curve on $Z_2$.}
\label{fig:painted}
\end{figure}
For all databases, we randomly selected the same percentage of samples from each class to create $Z_1$, $Z_2$ and $Z_3$. These percentages were 30\% for $Z_1$, 30\% for $Z_2$, and 40\% for $Z_3$ in the first three databases. Only the WM/GM database used 0.1\% for $Z_1$, 19.9\% for $Z_2$ and 80\% for $Z_3$. For $Z_1$ and $Z_2$, we ran 10 iterations of Algorithm~\ref{alg:learning} to output the OPF classifier for testing on $Z_3$. Figure~\ref{fig:painted}b shows the learning curves for all databases. Note the usually non-decreasing behavior of the curves after the outliers are detected as irrelevant samples and moved to $Z_2$. For SVM, we used a Gaussian kernel and computed support vectors for 10 new instances of $Z_1$ and $Z_2$, obtained by randomly exchanging samples between them while keeping the original proportions, and took the configuration with the highest accuracy for testing on $Z_3$.
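The class-wise random split just described can be sketched as follows; this hypothetical helper uses scikit-learn's \texttt{train\_test\_split} for the stratified selection and is not the code used in our experiments.
\begin{verbatim}
# Sketch of the stratified Z1/Z2/Z3 split (30%/30%/40% by default;
# the WM/GM database used p1=0.001 and p2=0.199).
from sklearn.model_selection import train_test_split

def split_Z1_Z2_Z3(X, y, p1=0.30, p2=0.30, seed=None):
    """Randomly select the same percentage of samples per class for Z1, Z2, Z3."""
    X_rest, X3, y_rest, y3 = train_test_split(
        X, y, test_size=1.0 - p1 - p2, stratify=y, random_state=seed)
    X1, X2, y1, y2 = train_test_split(
        X_rest, y_rest, test_size=p2 / (p1 + p2), stratify=y_rest,
        random_state=seed)
    return (X1, y1), (X2, y2), (X3, y3)
\end{verbatim}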
The above learning and testing processes of SVM and OPF were also repeated for 10 distinct initial sets $Z_1$, $Z_2$, and $Z_3$ to compute the mean and standard deviation of their accuracies over $Z_3$ (Table~\ref{tab:Stats}). OPF was usually more accurate and from 3 to 20 times faster than SVM.
\begin{table}[htb]
\begin{center}
\begin{tabular}{|c||c|c||c|c|}
\hline
 & \multicolumn{2}{c||}{ OPF accuracy } & \multicolumn{2}{c|}{ SVM accuracy } \\
\cline{2-5}
Database & mean & std dev & mean & std dev \\
\hline \hline
Cone-torus & 0.8757 & 0.0218 & 0.8147 & 0.0145 \\
Painted & 0.9838 & 0.0144 & 0.8763 & 0.0030 \\
MPEG-7 & 0.6925 & 0.0049 & 0.5869 & 0.0088 \\
WM/GM & 0.9088 & 0.0006 & 0.9072 & 0.0009 \\
\hline
\end{tabular}
\end{center}
\caption{Mean and standard deviation of the accuracies for each database.}
\label{tab:Stats}
\end{table}
\section{Conclusions and future work}
\label{sec:conclusion}
We use the IFT algorithm in sample feature spaces and propose pattern classifiers based on optimum path forests rooted at prototypes of training sets. The OPF classifier finds prototypes with no classification errors in the training sets and learns from errors in evaluation sets. Unseen test sets are used to assess OPF in comparison with SVM. From the learning curves of OPF and its results on the test sets, we may conclude that it is a robust classifier and usually more accurate than SVM for databases with outliers and non-separable classes. We are currently evaluating OPF on other databases, and its accuracy is usually higher than that of SVM and ANN-MLP. Future work includes reporting these results and extending OPF to unsupervised classification.
%----------------------------------------
%
% BIBLIOGRAPHY
% for details, see ftp://ftp.ams.org/pub/tex/amsrefs/amsrdoc.pdf
%
\begin{bibsection}
\begin{biblist}
\bib{Collins:1998}{article}{
 author={Collins, D.},
 author={Zijdenbos, A.},
 author={Kollokian, V.},
 author={Sled, J.},
 author={Kabani, N.},
 author={Holmes, C.},
 author={Evans, A.},
 date={1998},
 title={Design and Construction of a Realistic Digital Brain Phantom},
 journal={IEEE Trans. on Medical Imaging},
 volume={17},
 number={3},
 pages={463--468},
 note={Available from: <\url{http://www.bic.mni.mcgill.ca/brainweb}>.}
}
\bib{Duda:00}{book}{
 author={Duda, R.O.},
 author={Hart, P.E.},
 author={Stork, D.G.},
 date={2000},
 title={Pattern Classification},
 edition={2},
 publisher={Wiley-Interscience}
}
\bib{Falcao:04}{article}{
 author={Falc{\~{a}}o, A.X.},
 author={Stolfi, J.},
 author={Lotufo, R.A.},
 date={2004},
 title={The image foresting transform: theory, algorithms, and applications},
 journal={IEEE Trans. on PAMI},
 volume={26},
 number={1},
 pages={19--29}
}
\bib{Vapnik:92}{article}{
 author={Boser, B.E.},
 author={Guyon, I.M.},
 author={Vapnik, V.N.},
 title={A training algorithm for optimal margin classifiers},
 booktitle={Proc. 5th Workshop on Computational Learning Theory},
 date = {1992},
 pages = {144--152},
 publisher = {ACM Press},
 address = {New York, NY, USA}
}
\bib{Zahn:71}{article}{
 author = {Zahn, C.T.},
 title = {Graph-Theoretical Methods for Detecting and Describing Gestalt Clusters},
 journal = {IEEE Trans. on Computers},
 date={1971},
 volume={C-20},
 pages={68--86},
 number={1}
}
\bib{Hubert:74}{article}{
 author = {Hubert, L. J.},
 title = {Some applications of graph theory to clustering},
 journal = {Psychometrika},
 volume = {39},
 number = {3},
 date = {1974},
 pages = {283--309}
}
\bib{Malik:00}{article}{
 author = {Shi, Jianbo},
 author = {Malik, Jitendra},
 title = {Normalized cuts and image segmentation},
 journal = {IEEE Trans. on Pattern Analysis and Machine Intelligence},
 volume = {22},
 number = {8},
 pages = {888--905},
 date = {2000}
}
\bib{Blum:01}{article}{
 author = {Blum, A.},
 author = {Chawla, S.},
 title = {Learning from Labeled and Unlabeled Data using Graph Mincuts},
 booktitle = {ICML '01: Proceedings of the 18th international conference on Machine learning},
 date = {2001},
 pages = {19--26}
}
\bib{Kuncheva:04}{book}{
 author = {Kuncheva, L. I.},
 title = {Combining Pattern Classifiers: Methods and Algorithms},
 publisher = {Wiley-Interscience},
 date = {2004}
}
\bib{Collobert:04}{article}{
 author = {Collobert, R.},
 author = {Bengio, S.},
 title = {Links between perceptrons, MLPs and SVMs},
 booktitle = {ICML '04: Proceedings of the twenty-first international conference on Machine learning},
 date = {2004},
 pages = {23},
 publisher = {ACM Press},
 address = {New York, NY, USA}
}
\bib{Cormen:90}{book}{
 author = {Cormen, T.},
 author = {Leiserson, C.},
 author = {Rivest, R.},
 title = {Introduction to Algorithms},
 publisher = {MIT},
 date = {1990}
}
\bib{Reyzin:06}{inproceedings}{
 author = {Reyzin, L.},
 author = {Schapire, R.
E.}, title = {How boosting the margin can also boost classifier complexity}, booktitle = {ICML '06: Proceedings of the 23rd international conference on Machine learning}, year = {2006}, isbn = {1-59593-383-2}, pages = {753--760}, %location = {Pittsburgh, Pennsylvania}, % note = {doi: <\url{http://doi.acm.org/10.1145/1143844.1143939}>.}, publisher = {ACM Press}, address = {New York, NY, USA} } \bib{Tang:06}{inproceedings}{ author = {Tang, B.}, author = {Mazzoni, D.}, title = {Multiclass reduced-set support vector machines}, booktitle = {ICML '06: Proceedings of the 23rd international conference on Machine learning}, year = {2006}, isbn = {1-59593-383-2}, pages = {921--928}, %location = {Pittsburgh, Pennsylvania}, % note = {doi: <\url{http://doi.acm.org/10.1145/1143844.1143960}>.}, publisher = {ACM Press}, address = {New York, NY, USA} } \bib{Panda:06}{inproceedings}{ author = {Panda, N.}, author = {Chang, E. Y.}, author = {Wu, G.}, title = {Concept boundary detection for speeding up SVMs}, booktitle = {ICML '06: Proceedings of the 23rd international conference on Machine learning}, year = {2006}, isbn = {1-59593-383-2}, pages = {681--688}, %location = {Pittsburgh, Pennsylvania}, publisher = {ACM Press}, address = {New York, NY, USA} } \bib{MPEG-7:02}{article}{ author = {MPEG-7}, title = {MPEG-7: The Generic Multimedia Content Description Standard, Part 1}, journal = {IEEE MultiMedia}, volume = {09}, number = {2}, year = {2002}, issn = {1070-986X}, pages = {78-87}, % note = {doi: <\url{http://doi.ieeecomputersociety.org/10.1109/MMUL.2002.10016}>.}, publisher = {IEEE Computer Society}, address = {Los Alamitos, CA, USA} } \bib{LotufoMM:00}{incollection}{ author = {Lotufo, R. A.}, author = {Falc{\~{a}}o, A. X.}, booktitle = {Mathematical Morphology and its Applications to Image and Signal Processing}, title = {The ordered queue and the optimality of the watershed approaches}, publisher = {Kluwer}, volume = {18}, %month = {Jun}, pages = {341--350}, year = {2000} } \bib{site-libsvm}{misc}{ title = {LIBSVM: a Library for Support Vector Machines}, author = {Chang, C.}, author = {Lin, C.}, year = {2001}, note = {Software available at \url{http://www.csie.ntu.edu.tw/~cjlin/libsvm}} } \bib{Arica:BAS:03}{article}{ author = {Arica, N.}, author = {Vural, F. T. Y.}, title = {{BAS: A Perceptual Shape Descriptor based on the Beam Angle Statistics}}, journal = {Pattern Recognition Letters}, year = {2003}, volume = {24}, number = {9-10}, pages = {1627--1639}, % month = {Jun} } \bib{Cousty:07}{article}{ author = {Cousty, J.}, author = {Bertrand, G.}, author = {Najman, L.}, author = {Couprie, M.}, title = {Watersheds, minimum spanning forests, and the drop of water principle}, number = {IGM 2007-01}, year = {2007}, } % \bib{AbbasiMK:CSS:00}{article}{ % author = {Abbasi, S.}, % author = {Mokhtarian, F.}, % author = {Kittler, J.}, % title = {Enhancing CSS-based shape retrieval for % objects with shallow concavities}, % journal = {Image and Vision Computing}, % volume = {18}, % number = {3}, % year = {2000}, % pages = {199-211}, % month = {Feb} % } % \bib{Stehling:BIC:02}{inproceedings}{ % author = {Stehling, R. O.}, % author = {Nascimento, M. A.}, % author = {Falc{\~{a}}o, A. 
X.}, % title = {A compact and efficient image retrieval approach based on border/interior pixel classification}, % booktitle = {CIKM '02: Proceedings of the eleventh international conference on Information and knowledge management}, % year = {2002}, % isbn = {1-58113-492-4}, % pages = {102--109}, % location = {McLean, Virginia, USA}, % note = {doi: <\url{http://doi.acm.org/10.1145/584792.584812}>.}, % publisher = {ACM Press}, % address = {New York, NY, USA} % } \end{biblist} \end{bibsection}