%% +---------------+
%% | ARTICLE TITLE |
%% +---------------+
\title[]{Design of robust pattern classifiers based on optimum-path forests}
%%\RunningHead{Banon and Braga-Neto}{Author instructions}
%% +---------+
%% | AUTHORS |
%% +---------+
\begin{Authors}
\Author {João P. Papa}
\Affil [IC] {State University of Campinas, Institute of Computing, Campinas, Brazil \\* \email{\{jpaulo,afalcao\}@ic.unicamp.br},pavmbr@yahoo.com.br,celso.suzuki@gmail.com}% institutional e-mail address
\Author {Alexandre X. Falcão} \Affilref[IC]
\Author {Paulo A. V. Miranda} \Affilref[IC]
\Author {Celso T. N. Suzuki} \Affilref[IC]
\Author {Nelson D. A. Mascarenhas}
\Affil{Federal University of São Carlos, Department of Computing \\* \email{nelson@dc.ufscar.br}}% institutional e-mail address
%% If two or more authors share an affiliation, pass an optional
%% argument to the command \Affil when the affiliation is referred
%% for the first time. This argument should be a key consisting
%% of one or more letters (only letters). Then use the command
%% \Affilref[] after any subsequent authors that share the
%% affiliation.
%% Example:
\end{Authors}
%% +-----------------------+
%% | ABSTRACT and KEYWORDS |
%% +-----------------------+
%% Abstract and keywords are written inside two environments,
%% the "Abstract" and the "Keywords" environments, respectively.
%% Note the uppercase "A" and "K".
%% Keywords are separated with comma. The last keyword ends
%% with a period.
\begin{Abstract}
\begin{sloppypar}
We present a supervised pattern classifier based on \emph{optimum path forests}. The samples in a training set are nodes of a complete graph, whose arcs are weighted by the distances between sample feature vectors. The training builds a classifier from key samples (prototypes) of all classes, where each prototype defines an optimum path tree whose nodes are its most strongly connected samples. The optimum paths are also used to label unseen test samples with the classes of their most strongly connected prototypes. We show how to find prototypes with no classification errors in the training set and propose a learning algorithm to improve accuracy over an evaluation set. The method is robust to outliers, handles non-separable classes, and can outperform support vector machines.
\end{sloppypar}
\end{Abstract}
\begin{Keywords}
\Index{supervised classifiers}, \Index{image foresting transform}, \Index{image analysis}, \Index{morphological pattern recognition}.
\end{Keywords}
%----------------------------------------
\section{Introduction}
\label{sec:intro}
\begin{sloppypar}
Pattern classification methods are generally divided into supervised and unsupervised according to their learning algorithms~\cite{Duda:00}. Unsupervised techniques assume no knowledge about the classes (labels) of the samples in the training set, while these labels are exploited in supervised techniques.
\end{sloppypar}
We propose a method to design supervised pattern classifiers based on \emph{optimum path forests} (OPF). The design of an OPF classifier is based on labeled samples from training and evaluation sets. A test set with unseen samples is used to assess the performance of the classifier. The training samples are nodes of a complete graph in the sample feature space (every pair of nodes is connected by an arc). See Figure~\ref{fig:example}a. The arcs are weighted by the distance between the feature vectors of their nodes. A set of prototypes (key samples) is obtained from the training set.
We define a \emph{path-cost function} based on arc weights, which assigns to any path in the graph the cost of considering all samples along the path as belonging to the same class (e.g., the function $f_{max}$, which assigns the maximum arc weight along the path). We then apply the IFT algorithm~\cite{Falcao:04} to partition the graph into an optimum path forest rooted at the prototypes (Figure~\ref{fig:example}b). That is, the prototypes compete among themselves and each prototype defines an optimum path tree, whose nodes are samples more strongly connected to that prototype than to any other root, according to that path-cost function. The training essentially consists of building this optimum path forest, where the samples in a given optimum path tree are assumed to have the same label as their root prototype.
\begin{figure}[ht]
\centering
\subfigure[]{\includegraphics[width=0.26\hsize]{\fullpaperpdirectory/images/figure1a.pdf}}
\hspace{0.002\hsize}
\subfigure[]{\includegraphics[width=0.26\hsize]{\fullpaperpdirectory/images/figure1b.pdf}}
\hspace{0.002\hsize}
\subfigure[]{\includegraphics[width=0.26\hsize]{\fullpaperpdirectory/images/figure1c.pdf}}
\hspace{0.002\hsize}
\subfigure[]{\includegraphics[width=0.26\hsize]{\fullpaperpdirectory/images/figure1d.pdf}}
\caption{(a)~Complete weighted graph for a simple training set. (b)~Resulting optimum-path forest from (a) for $f_{max}$ and two given prototypes (circled nodes). The entries $(x,y)$ over the nodes are, respectively, cost and label of the samples. (c)~Test sample (gray square) and its connections (dashed lines) with the training nodes. (d)~The optimum path from the most strongly connected prototype, its label $2$, and classification cost $0.4$ are assigned to the test sample.}
\label{fig:example}
\end{figure}
The classification of a test sample evaluates the optimum paths from the prototypes to this sample incrementally, as though it were part of the graph (Figure~\ref{fig:example}c). The optimum path from the most strongly connected prototype, its label and path cost (classification cost) are assigned to the test sample (Figure~\ref{fig:example}d). Note the difference between an OPF classifier with $f_{max}$ and the nearest neighbor approach~\cite{Duda:00}: the test sample may be assigned to a given class even when its closest labeled sample belongs to another class. The same rule is used to classify evaluation samples. Before testing, we propose a learning algorithm which swaps \emph{irrelevant} samples of the training set with samples of the evaluation set that have never been in the training set. Real problems very often limit the training set size, and the learning algorithm aims to improve accuracy under this limitation. When an evaluation sample is classified, it is assigned to some optimum path in the graph. The training samples of this path have their numbers of right or wrong classifications incremented by one, depending on the classification result. The irrelevant samples are those whose number of wrong classifications is higher than their number of right classifications. At each iteration, the learning algorithm creates new evaluation and training sets and recomputes prototypes and optimum-path forests. These prototypes guarantee no classification errors in the training set and usually improve the accuracy over the evaluation sets. The presence of outliers (samples of a given class that fall inside the region of another class) usually degrades the design of any classifier. Outliers usually become irrelevant prototypes and are moved out of the training set.
The number of prototypes will not necessarily increase during learning, and the most representative ones usually lie on the frontiers between classes. The method handles non-separable classes by estimating key prototypes within the intersection regions. Section~\ref{sec:relatedworks} discusses related works. The OPF classifier is presented in Section~\ref{sec:classifier} and Section~\ref{sec:learning} presents its learning algorithm, which outputs the last designed classifier and a learning curve showing the accuracy values of the designed classifiers along its iterations. In Section~\ref{sec:results}, we compare the OPF classifier with support vector machines (SVM)~\cite{Vapnik:92}. This comparison uses databases with outliers and multiple non-separable classes, two of which come from image analysis. One contains voxels from white and gray matter in magnetic resonance images of the human brain and the other contains 2D shapes from binary images. Conclusions and future work are discussed in Section~\ref{sec:conclusion}.
\section{Related works}
\label{sec:relatedworks}
Graph-based approaches for pattern classification are usually unsupervised. Zahn~\cite{Zahn:71} proposed an approach that computes a minimum spanning tree (MST) in the graph and removes inconsistent arcs to form clusters. Arc removal in the MST can also produce hierarchical solutions for clustering, such as the popular single-linkage approach~\cite{Hubert:74}. Other clustering techniques have been formulated as a graph-cut problem~\cite{Malik:00} with application to image segmentation, where the graph does not need to be complete. More recently, graph-cut techniques have also been used for learning~\cite{Blum:01}. Essentially, graph-based clustering methods aim to partition the graph into components (clusters), such that each component contains only samples of the same class. However, there is no guarantee that the samples in a given cluster belong to the same class, and it is hard to assign these samples to their correct class without any prior knowledge.
\begin{sloppypar}
Supervised approaches usually exploit prior knowledge to teach the machine how to solve the problem. Artificial neural networks (ANN)~\cite{Kuncheva:04} and support vector machines (SVM)~\cite{Vapnik:92} are among the most actively pursued approaches in recent years. An ANN multi-layer perceptron (ANN-MLP), trained by backpropagation for example, is an unstable classifier. Its accuracy may be improved at the computational cost of using multiple classifiers and algorithms (e.g., bagging and boosting) for training classifier collections~\cite{Kuncheva:04}. However, there seems to be an unknown limit on the number of classifiers beyond which accuracy degrades~\cite{Reyzin:06}. ANN-MLP assumes that the classes can be separated by hyperplanes in the feature space. Such an assumption is unfortunately not valid in practice. SVM was proposed to overcome this problem by assuming it is possible to separate the classes in a higher-dimensional space by optimum hyperplanes. Although SVM usually provides reasonable accuracies, its computational cost rapidly increases with the training set size and the number of support vectors. Tang and Mazzoni~\cite{Tang:06} proposed a method to reduce the number of support vectors in the multi-class problem. Their approach suffers from slow convergence and higher computational complexity, because they first minimize the number of support vectors in several binary SVMs, and then share these vectors among the machines.
Panda et al.~\cite{Panda:06} presented a method to reduce the training set size before applying the SVM algorithm. Their approach aims to identify and remove samples likely related to non-support vectors. However, in all SVM approaches, the assumption of separability may also not be valid in any space of finite dimension~\cite{Collobert:04}.
\end{sloppypar}
The role of the prototypes for the OPF classifier is very similar to the role of the support vectors for SVM. Considering this together with the fact that SVM is among the best approaches for supervised pattern classification, we have chosen the SVM code in~\cite{site-libsvm}, with a Gaussian kernel and parameters obtained by cross validation, for comparison.
\section{Optimum path classifier}
\label{sec:classifier}
Let $Z_1$, $Z_2$, and $Z_3$ be training, evaluation, and test sets with $|Z_1|$, $|Z_2|$, and $|Z_3|$ samples such as points or image elements (e.g., pixels, voxels, shapes). Let $\lambda(s)$ be the function that assigns the correct label $i$, $i=1,2,\ldots,c$, from class $i$ to any sample $s\in Z_1\cup Z_2\cup Z_3$. $Z_1$ and $Z_2$ are labeled sets used for the design of the classifier. Applications usually impose an upper limit on $|Z_1|$, so the role of $Z_2$ is to improve the accuracy of the classifier by interchanging samples with $Z_1$. $Z_3$ is used to assess the performance of the classifier and is kept unseen during the design. Let $S \subset Z_1$ be a set of prototypes of all classes (i.e., key samples that best represent the classes). Let $v$ be an algorithm which extracts $n$ attributes (color, shape or texture properties) from any sample $s\in Z_1\cup Z_2\cup Z_3$ and returns a vector $\vec{v}(s) \in \Re^n$. The distance $d(s,t)$ between two samples, $s$ and $t$, is the one between their feature vectors $\vec{v}(s)$ and $\vec{v}(t)$. One can use any valid metric (e.g., Euclidean) or a more elaborate distance function~\cite{Arica:BAS:03}. Our problem consists of using $S$, $(v,d)$, $Z_1$ and $Z_2$ to design an optimal classifier which can predict the correct label $\lambda(s)$ of any sample $s\in Z_3$. We propose a classifier which creates a discrete optimal partition of the feature space such that any sample $s\in Z_3$ can be classified according to this partition. This partition is an optimum path forest (OPF) computed in $\Re^n$ by the image foresting transform (IFT) algorithm~\cite{Falcao:04}.
Let $(Z_1,A)$ be a complete graph whose nodes are the samples in $Z_1$ and any pair of samples defines an arc in $A=Z_1 \times Z_1$ (Figure~\ref{fig:example}a). The arcs do not need to be stored and so the graph does not need to be explicitly represented. A path is a sequence of distinct samples $\pi=\langle s_1,s_2,\ldots,s_k\rangle$, where $(s_i,s_{i+1})\in A$ for $1\leq i\leq k-1$. A path is said to be \emph{trivial} if $\pi=\langle s_1\rangle$. We assign to each path $\pi$ a cost $f(\pi)$ given by a path-cost function $f$. A path $\pi$ is said to be \emph{optimum} if $f(\pi)\leq f(\pi')$ for any other path $\pi'$, where $\pi$ and $\pi'$ end at the same sample $s_k$. We also denote by $\pi \cdot \langle s,t \rangle$ the concatenation of a path $\pi$ with terminus at $s$ and an arc $(s,t)$.
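For a concrete illustration of these definitions, the Python sketch below computes the arc weights of the complete graph $(Z_1,A)$ from a toy feature matrix and evaluates the cost of a path under the maximum-arc-weight function informally described in the introduction. This is only a didactic sketch under simple assumptions (Euclidean $d$ and hypothetical feature values); it is not the implementation used in our experiments.
\begin{verbatim}
# Illustrative sketch only: samples as feature vectors, Euclidean distance
# d(s,t), and the maximum arc weight along a path (toy data, not from the paper).
import numpy as np

def d(v_s, v_t):
    """Distance between the feature vectors of two samples (Euclidean here)."""
    return float(np.linalg.norm(v_s - v_t))

def path_cost_max(path, features):
    """Maximum arc weight along a non-trivial path given as sample indices."""
    return max(d(features[a], features[b]) for a, b in zip(path, path[1:]))

# Toy training set Z1 with four 2D feature vectors (hypothetical values).
Z1_features = np.array([[0.0, 0.0], [0.3, 0.1], [1.0, 1.0], [1.2, 0.9]])

# Arc weights of the complete graph (Z1, A): all pairwise distances.
W = np.array([[d(u, v) for v in Z1_features] for u in Z1_features])
print(W)                                      # symmetric matrix, zero diagonal
print(path_cost_max([0, 1, 2], Z1_features))  # cost of the path <s_0, s_1, s_2>
\end{verbatim}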
The OPF algorithm may be used with any \emph{smooth} path-cost function which can group samples with similar properties~\cite{Falcao:04}. A function $f$ is smooth in $(Z_1,A)$ when for any sample $t\in Z_1$, there exists an optimum path $\pi$ ending at $t$ which either is trivial, or has the form $\tau \cdot\langle s,t \rangle$ where
\begin{itemize} \labelwidth 0pt
\item $f(\tau) \leq f(\pi)$,
\item $\tau$ is optimum,
\item for any optimum path $\tau'$ ending at $s$, $f(\tau' \cdot \langle s,t \rangle) = f(\pi)$.
\end{itemize}
We address the path-cost function $f_{max}$ because of its theoretical properties for estimating optimum prototypes:
\begin{eqnarray}
f_{max}(\langle s\rangle) & = & \left\{ \begin{array}{ll} 0 & \mbox{if $s\in S$,} \\ +\infty & \mbox{otherwise} \end{array}\right. \nonumber \\
f_{max}(\pi \cdot \langle s,t \rangle) & = & \max\{f_{max}(\pi),d(s,t)\}
\end{eqnarray}
such that $f_{max}(\pi)$ computes the maximum distance between adjacent samples in $\pi$, when $\pi$ is not a trivial path.
The OPF algorithm assigns one optimum path $P^{\ast}(s)$ from $S$ to every sample $s\in Z_1$, forming an optimum path forest $P$ (a function with no cycles which assigns to each $s\in Z_1\backslash S$ its predecessor $P(s)$ in $P^{\ast}(s)$ or a marker $nil$ when $s\in S$, as shown in Figure~\ref{fig:example}b). Let $R(s)\in S$ be the root of $P^{\ast}(s)$, which can be reached from $s$ by following the predecessors. The OPF algorithm computes for each $s\in Z_1$, the cost $C(s)$ of $P^{\ast}(s)$, the label $L(s)=\lambda(R(s))$, and the predecessor $P(s)$, as follows.
\begin{nicealgo}{alg:opf}
\naTITLE{OPF algorithm}
\naPREAMBLE
\naINPUT{A $\lambda$-labeled training set $Z_1$, prototypes $S\subset Z_1$ and the pair $(v,d)$ for feature vector and distance computations.}
\naOUTPUT{Optimum path forest $P$, cost map $C$ and label map $L$.}
\naAUX{Priority queue $Q$ and cost variable $cst$.}
\naBODY
\na{For each $s \in Z_1\backslash S$, set $C(s)\mget +\infty$.}
\naBEGIN{For each $s \in S$, do}
\naEND{$C(s)\mget 0$, $P(s)\mget nil$, $L(s)\mget \lambda(s)$, and insert $s$ in $Q$.}
\naBEGIN{While $Q$ is not empty, do}
\na{Remove from $Q$ a sample $s$ such that $C(s)$ is minimum.}
\naBEGIN{For each $t\in Z_1$ such that $t\neq s$ and $C(t) > C(s)$, do}
\na{Compute $cst \mget \max\{C(s), d(s,t)\}$.}
\naBEGIN{If $cst < C(t)$, then}
\na{If $C(t) \neq +\infty$, then remove $t$ from $Q$.}
\naENDN{3}{$P(t)\mget s$, $L(t)\mget L(s)$, $C(t)\mget cst$, and insert $t$ in $Q$.}
\end{nicealgo}
Lines $1-3$ initialize maps and insert prototypes in $Q$. The main loop computes an optimum path from $S$ to every sample $s$ in a non-decreasing order of cost (Lines $4-10$). At each iteration, a path of minimum cost $C(s)$ is obtained in $P$ when we remove its last node $s$ from $Q$ (Line 5). Ties are broken in $Q$ using a first-in-first-out (FIFO) policy. That is, when two optimum paths reach an ambiguous sample $s$ with the same minimum cost, $s$ is assigned to the first path that reached it. Note that $C(t) > C(s)$ in Line 6 is false when $t$ has been removed from $Q$ and, therefore, $C(t) \neq +\infty$ in Line 9 is true only when $t\in Q$. Lines $8-10$ evaluate whether the path that reaches an adjacent node $t$ through $s$ is cheaper than the current path with terminus $t$ and update the position of $t$ in $Q$, $C(t)$, $L(t)$ and $P(t)$ accordingly.
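For illustration only, the following Python sketch mirrors Algorithm~\ref{alg:opf}, replacing the priority queue $Q$ by a binary heap with lazy deletion and an insertion counter for the FIFO tie-breaking. The array names and the helper \texttt{d} from the previous sketch are our own assumptions, not part of the original implementation.
\begin{verbatim}
# Sketch of Algorithm 1 (OPF training with f_max); not the authors' code.
import heapq
import numpy as np

def opf_train(features, lam, S):
    """features: |Z1| x n array; lam: true labels of Z1; S: prototype indices."""
    n = len(features)
    C = np.full(n, np.inf)            # cost map
    L = np.full(n, -1, dtype=int)     # label map
    P = np.full(n, -1, dtype=int)     # predecessor map (-1 plays the role of nil)
    done = np.zeros(n, dtype=bool)    # samples already removed from Q
    Q, order = [], 0                  # heap entries: (cost, insertion order, sample)
    for s in S:
        C[s], L[s], P[s] = 0.0, lam[s], -1
        heapq.heappush(Q, (0.0, order, s)); order += 1
    while Q:
        cost, _, s = heapq.heappop(Q)
        if done[s] or cost > C[s]:
            continue                  # stale entry: s was re-inserted with a lower cost
        done[s] = True
        for t in range(n):
            if t != s and C[t] > C[s]:
                cst = max(C[s], d(features[s], features[t]))
                if cst < C[t]:
                    P[t], L[t], C[t] = s, L[s], cst
                    heapq.heappush(Q, (cst, order, t)); order += 1
    return P, L, C
\end{verbatim}
A call such as \texttt{P, L, C = opf\_train(Z1\_features, lam, S)} then returns the predecessor, label and cost maps used in the remainder of the paper.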
\begin{sloppypar}
The OPF algorithm for $f_{max}$ is an ``IFT-watershed transform''~\cite{LotufoMM:00} computed in the $n$-dimensional feature space. Apart from this extension, the most significant contributions are the training and learning processes, which find optimum prototypes (markers) on the frontier between classes and avoid \emph{outliers} (samples of a given class that fall inside the region of another class in the feature space) in the training set, increasing the accuracy of the classifier.
\end{sloppypar}
The label $L(s)$ may be different from $\lambda(s)$, leading to classification errors in $Z_1$. The training finds prototypes with no classification errors in $Z_1$.
\subsection{Training}
\label{ssec:training}
\begin{sloppypar}
We say that $S^\ast$ is an optimum set of prototypes when Algorithm~\ref{alg:opf} propagates the labels $L(s)=\lambda(s)$ for every $s\in Z_1$. Set $S^\ast$ can be found by exploiting the theoretical relation between the \emph{minimum spanning tree} (MST)~\cite{Cormen:90} and the optimum path tree for $f_{max}$. The training essentially consists of finding $S^\ast$ and an OPF classifier rooted at $S^\ast$.
\end{sloppypar}
By computing an MST in the complete graph $(Z_1,A)$, we obtain a connected acyclic graph whose nodes are all samples in $Z_1$ and whose arcs are undirected and weighted by the distance $d$ between adjacent sample feature vectors (Figure~\ref{fig:mst-classification}a). This spanning tree is optimum in the sense that the sum of its arc weights is minimum as compared to any other spanning tree in the complete graph. In the MST, every pair of samples is connected by a single path which is optimum according to $f_{max}$. That is, for any given sample $s\in Z_1$, it is possible to direct the arcs of the MST such that the result will be an optimum path tree $P$ for $f_{max}$ rooted at $s$.
\begin{figure}[ht]
\centering
\subfigure[]{\includegraphics[width=0.28\hsize]{\fullpaperpdirectory/images/figure2.pdf}}
\hspace{0.1\hsize}
\subfigure[]{\includegraphics[width=0.28\hsize]{\fullpaperpdirectory/images/figure3.pdf}}
\caption{(a) MST of the graph shown in Figure~\ref{fig:example}a where the optimum prototypes share the arc of weight $0.6$. (b) The classification of the test sample (gray square) $t$ as in Figure~\ref{fig:example}c assigns the optimum path $P^\ast(t)$ from $R(t)\in S^\ast$ to $t$ passing through $s^\ast$.}
\label{fig:mst-classification}
\end{figure}
The optimum prototypes are the closest elements in the MST with different labels in $Z_1$. By removing the arcs between different classes, their adjacent samples become prototypes in $S^\ast$ and Algorithm~\ref{alg:opf} can compute an optimum path forest with no classification errors in $Z_1$ (Figure~\ref{fig:example}b), which can be explained by the theoretical relation between minimum spanning trees and the optimum path tree obtained by OPF with $f_{max}$~\cite{Cousty:07}. Note that a given class may be represented by multiple prototypes (i.e., optimum path trees), and there must be at least one prototype per class.
\subsection{Classification}
For any sample $t\in Z_3$, we consider all arcs connecting $t$ with samples $s \in Z_1$, as though $t$ were part of the graph (Figure~\ref{fig:example}c). Considering all possible paths from $S^\ast$ to $t$, we wish to find the optimum path $P^\ast(t)$ from $S^\ast$ and label $t$ with the class $\lambda(R(t))$ of its most strongly connected prototype $R(t)\in S^\ast$ (Figure~\ref{fig:mst-classification}b). This path can be identified incrementally, by evaluating the optimum cost $C(t)$ as
\begin{equation}
\label{eq:classification}
C(t) = \min_{s \in Z_1}\{ \max\{C(s),d(s,t)\}\}.
\end{equation}
Let the node $s^\ast\in Z_1$ be the one that satisfies the above equation (i.e., the predecessor $P(t)$ in the optimum path $P^\ast(t)$). Given that $L(s^\ast)=\lambda(R(t))$, the classification simply assigns $L(s^\ast)$ as the class of $t$. An error occurs when $L(s^\ast)\neq \lambda(t)$. A similar procedure is applied to samples in the evaluation set $Z_2$. In this case, however, we would like to use samples of $Z_2$ to learn the distribution of the classes in the feature space and improve the performance of the OPF classifier.
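In the same illustrative setting, the classification rule of Equation~\ref{eq:classification} reduces to a single minimization over the training samples. The sketch below reuses the hypothetical maps returned by \texttt{opf\_train}; again, it is not the implementation used in our experiments.
\begin{verbatim}
# Sketch of the classification rule in Equation (2); not the authors' code.
import numpy as np

def opf_classify(t_features, Z1_features, C, L):
    """Return the predicted label and classification cost of one test sample t."""
    costs = np.array([max(C[s], d(Z1_features[s], t_features))
                      for s in range(len(Z1_features))])
    s_star = int(np.argmin(costs))   # predecessor of t in the optimum path P*(t)
    return L[s_star], costs[s_star]
\end{verbatim}
For instance, \texttt{label, cost = opf\_classify(t\_features, Z1\_features, C, L)} assigns to a test sample the label of its most strongly connected prototype, even when its nearest training sample has a different label.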
\section{Learning Algorithm}
\label{sec:learning}
The performance of the OPF classifier improves when the closest samples from different classes are included in $Z_1$, because the method finds prototypes that work as sentinels on the frontier between classes. We propose a learning algorithm to identify better prototypes from samples of $Z_2$ that have never been in $Z_1$ (Algorithm~\ref{alg:learning}). First, we give preference to replacing \emph{irrelevant} samples of $Z_1$ by errors in $Z_2$; second, other samples of $Z_2$ are replaced by the remaining \emph{irrelevant} samples of $Z_1$. In both cases, we never allow a sample that has already been in $Z_1$ to return to it. If the application did not impose any limit on $|Z_1|$, the prototypes could be found from $Z_1\cup Z_2$ with no classification errors in both sets. The learning algorithm essentially tries to identify these prototypes from a few iterations of classification over $Z_2$.
The algorithm outputs a \emph{learning curve} over $T$ iterations (Lines $2-29$), which reports the accuracy value of each instance of the classifier during learning, and the final OPF classifier. Lines $4-7$ execute training and initialize the auxiliary arrays and lists. The classification of each sample $t\in Z_2$ is performed in Lines $8-18$, updating the auxiliary arrays. The condition in Line $10$ indicates that $t$ is misclassified. In order to define irrelevant samples, we consider all right and wrong classifications in $Z_2$. When $t\in Z_2$ is correctly/incorrectly classified, we add one to the number of right/wrong classifications, $NR(r)$ or $NW(r)$, of every sample $r\in Z_1$ in the optimum path $P^\ast(t)$ from $R(t)\in S^\ast$ to $s^\ast$ (Lines $14-18$). Additionally, Lines $11-13$ update the false positive and false negative arrays, $FP$ and $FN$, for accuracy computation, and insert $t$ in the list $LE(\lambda(t))$ of errors if $t$ has never been in $Z_1$ ($t\not \in TR$).
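In Python-like form, this per-sample bookkeeping (Lines $14-18$) amounts to walking the predecessor map from $s^\ast$ up to the root prototype; the complete procedure is given in Algorithm~\ref{alg:learning} below. The helper is a sketch with our own naming conventions, assuming the maps of the earlier sketches.
\begin{verbatim}
# Sketch of the right/wrong bookkeeping along the optimum path P*(t).
def update_path_counts(s_star, correct, P, NR, NW):
    """Increment NR or NW for every training sample on the path from R(t) to s*."""
    r = s_star
    while r != -1:          # -1 plays the role of nil; the loop stops after the root
        if correct:
            NR[r] += 1
        else:
            NW[r] += 1
        r = P[r]
\end{verbatim}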
\begin{nicealgo}{alg:learning}
\naTITLE{Learning algorithm}
\naPREAMBLE
\naINPUT{Training and evaluation sets labeled by $\lambda$, $Z_1$ and $Z_2$, number $T$ of iterations, and the pair $(v,d)$ for feature vector and distance computations.}
\naOUTPUT{Learning curve ${\cal L}$ and the last OPF classifier, represented by the predecessor map $P$, cost map $C$, and label map $L$.}
\naAUX{False positive and false negative arrays, $FP$ and $FN$, of size $c$; lists $LI$ and $LE$ of irrelevant samples and error samples for each class; arrays $NR$ and $NW$ for the numbers of right and wrong classifications, of size $|Z_1|$; a sample variable $r$; and a set $TR$ to prevent samples that have already been in $Z_1$ from returning to it.}
\naBODY
\na{$TR\mget \emptyset$.}
\naBEGIN{For each iteration $I=1,2,\ldots,T$, do}
\na{$TR \mget TR \cup Z_1$.}
\na{Compute $S^\ast \subset Z_1$ as in Section~\ref{ssec:training} and $P$, $L$, $C$ by Algorithm~\ref{alg:opf}.}
\na{For each sample $s\in Z_1$, do $NR(s)\mget 0$ and $NW(s)\mget 0$.}
\naBEGIN{For each class $i=1,2,\ldots,c$, do}
\naENDN{1}{$FP(i)\mget 0$, $FN(i)\mget 0$, $LI(i)\mget \emptyset$ and $LE(i)\mget \emptyset$.}
\naBEGIN{For each sample $t\in Z_2$, do}
\na{Find $s^\ast\in Z_1$ that satisfies Equation~\ref{eq:classification} and set $r\mget s^\ast$.}
\naBEGIN{If $L(s^\ast)\neq \lambda(t)$, then}
\na{$FP(L(s^\ast)) \mget FP(L(s^\ast)) + 1$.}
\na{$FN(\lambda(t)) \mget FN(\lambda(t)) + 1$.}
\na{If $t\not \in TR$, then $LE(\lambda(t)) \mget LE(\lambda(t)) \cup \{t\}$.}
\naBEGIN{While $r \neq nil$, do}
\naENDN{2}{$NW(r)\mget NW(r) + 1$ and $r\mget P(r)$.}
\naBEGIN{Else}
\naBEGIN{While $r \neq nil$, do}
\naENDN{3}{$NR(r)\mget NR(r) + 1$ and $r\mget P(r)$.}
\na{Compute ${\cal L}(I)$ by Equation~\ref{eq:lr}.}
\naBEGIN{For each $s\in Z_1$, do}
\naBEGIN{If $NW(s) > NR(s)$, then}
\naENDN{2}{$LI(\lambda(s))\mget LI(\lambda(s))\cup \{s\}$.}
\naBEGIN{For each class $i=1,2,\ldots,c$, do}
\naBEGIN{While $|LI(i)| > 0$ and $|LE(i)| > 0$, do}
\na{$LI(i)\mget LI(i)\backslash\{s\}$ and $LE(i)\mget LE(i)\backslash\{t\}$.}
\naENDN{1}{Replace $s\in Z_1$ by $t\in Z_2$.}
\naBEGIN{While $|LI(i)| > 0$, do}
\na{$LI(i)\mget LI(i)\backslash\{s\}$.}
\naENDN{3}{Find $t\in Z_2\backslash TR$, with $\lambda(t)=i$, and replace it by $s\in Z_1$.}
\na{Compute $S^\ast\subset Z_1$ as in Section~\ref{ssec:training} and $P$, $L$, $C$ by Algorithm~\ref{alg:opf}.}
\end{nicealgo}
Line $19$ computes the accuracy at iteration $I$ and stores it in the learning curve ${\cal L}$. The accuracy ${\cal L}(I)$ of a given iteration $I$, $I=1,2,\ldots, T$, is measured by taking into account that the classes may have different sizes in $Z_2$ (a similar definition applies to $Z_3$). Let $NZ_2(i)$, $i=1,2,\ldots,c$, be the number of samples in $Z_2$ from each class $i$. We define
\begin{equation}
e_{i,1}=\frac{FP(i)}{|Z_2|-NZ_2(i)}\ \text{ and }\ e_{i,2}=\frac{FN(i)}{NZ_2(i)},\ i=1,\ldots,c
\end{equation}
where $FP(i)$ and $FN(i)$ are the false positives and false negatives, respectively. That is, $FP(i)$ is the number of samples from other classes that were classified as being from class $i$ in $Z_2$, and $FN(i)$ is the number of samples from class $i$ that were incorrectly classified as being from other classes in $Z_2$. The errors $e_{i,1}$ and $e_{i,2}$ are used to define
\begin{equation}
E(i)=e_{i,1}+e_{i,2},
\end{equation}
where $E(i)$ is the partial sum error of class $i$. Finally, the accuracy ${\cal L}(I)$ of the classification is written as
\begin{equation}
\label{eq:lr}
{\cal L}(I)=\frac{2c-\sum_{i=1}^{c}E(i)}{2c}=1-\frac{\sum_{i=1}^{c}E(i)}{2c}.
\end{equation}
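The accuracy measure of Equation~\ref{eq:lr} can be sketched in a few lines of Python; the helper below is ours (arrays indexed by class, zero-based), not part of the paper's code.
\begin{verbatim}
# Sketch of the balanced accuracy L(I) of Equation (4).
import numpy as np

def balanced_accuracy(FP, FN, NZ2):
    """FP, FN, NZ2: per-class false positives, false negatives, class sizes in Z2."""
    FP, FN, NZ2 = map(np.asarray, (FP, FN, NZ2))
    n2 = NZ2.sum()                          # |Z2|
    e1 = FP / (n2 - NZ2)                    # e_{i,1}
    e2 = FN / NZ2                           # e_{i,2}
    E = e1 + e2                             # partial sum error of each class
    return 1.0 - E.sum() / (2 * len(NZ2))   # L(I)
\end{verbatim}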
Lines $20-22$ identify as irrelevant samples in $Z_1$ those whose number of incorrect classifications is higher than their number of correct classifications. Lines $23-29$ remove elements from the lists of irrelevant samples and errors, $LI$ and $LE$, for each class: irrelevant samples are first exchanged with errors, and then the remaining irrelevant samples (if any) are exchanged with other samples of $Z_2$ that have never been in $Z_1$. Outliers degrade the design of any classifier. They are usually identified as irrelevant prototypes and moved from $Z_1$ to $Z_2$. Finally, Line $30$ performs the training over the last set $Z_1$ to output the designed OPF classifier. After learning, the classification of any sample $t\in Z_3$ is done by simply finding $s^\ast\in Z_1$ that satisfies Equation~\ref{eq:classification} and assigning label $L(s^\ast)$ as the class of $t$.
\section{Results}
\label{sec:results}
We compare the OPF classifier with support vector machines (SVM~\cite{Vapnik:92}) using four databases with outliers and non-separable classes: the Cone-torus database from~\cite{Kuncheva:04}, the Painted database (Figure~\ref{fig:painted}a), the MPEG-7 shape database~\cite{MPEG-7:02}, and the WM/GM (white matter/gray matter) database~\cite{Collins:1998}. The Cone-torus database contains 400 samples and 3 non-separable classes, while the Painted database contains 5,867 samples with outliers and 4 classes. In both cases, the feature vectors are the sample $(x,y)$ coordinates. The MPEG-7 database contains 1,400 2D shapes and 70 classes. To increase the overlap (difficulty) between classes, we simply adopt the 126 most significant coefficients of the Fourier transform of the shapes as the feature vector. The WM/GM database contains 1.5M voxels of WM and GM (2 classes) from MR-T1 images of phantoms with various levels of noise and inhomogeneity to produce outliers. The images and ground truth are available from~\cite{Collins:1998}, and the feature vector consists of the lowest and highest intensity values around the voxel and the voxel's own intensity. In all cases, the function $d$ is the Euclidean metric.
\begin{figure}[ht]
\centering
\subfigure[]{\includegraphics[height=3cm]{\fullpaperpdirectory/images/painted.pdf}}
\subfigure[]{\includegraphics[height=3.5cm]{\fullpaperpdirectory/images/accuracy_opf.pdf}}
\caption{(a) Painted database with outliers. (b) OPF learning curve on $Z_2$.}
\label{fig:painted}
\end{figure}
For all databases, we randomly selected the same percentage of samples from each class to create $Z_1$, $Z_2$ and $Z_3$. These percentages were 30\% for $Z_1$, 30\% for $Z_2$, and 40\% for $Z_3$ in the first three databases. Only the WM/GM database used 0.1\% for $Z_1$, 19.9\% for $Z_2$ and 80\% for $Z_3$. For $Z_1$ and $Z_2$, we ran 10 iterations of Algorithm~\ref{alg:learning} to output the OPF classifier for testing on $Z_3$. Figure~\ref{fig:painted}b shows the learning curves for all databases. Note the usually non-decreasing behavior of the curves after the outliers are detected as irrelevant samples and moved to $Z_2$. For SVM, we used a Gaussian kernel and computed support vectors for 10 new instances of $Z_1$ and $Z_2$, obtained by randomly exchanging samples between them while keeping the original proportions, and took the configuration with the highest accuracy for testing on $Z_3$.
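The class-wise random split just described can be sketched as follows; this hypothetical helper uses scikit-learn's \texttt{train\_test\_split} for the stratified selection and is not the code used in our experiments.
\begin{verbatim}
# Sketch of the stratified Z1/Z2/Z3 split (30%/30%/40% by default;
# the WM/GM database used p1=0.001 and p2=0.199).
from sklearn.model_selection import train_test_split

def split_Z1_Z2_Z3(X, y, p1=0.30, p2=0.30, seed=None):
    """Randomly select the same percentage of samples per class for Z1, Z2, Z3."""
    X_rest, X3, y_rest, y3 = train_test_split(
        X, y, test_size=1.0 - p1 - p2, stratify=y, random_state=seed)
    X1, X2, y1, y2 = train_test_split(
        X_rest, y_rest, test_size=p2 / (p1 + p2), stratify=y_rest,
        random_state=seed)
    return (X1, y1), (X2, y2), (X3, y3)
\end{verbatim}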
The above learning and testing processes of SVM and OPF were also repeated for 10 distinct initial sets $Z_1$, $Z_2$, and $Z_3$ to compute the mean and standard deviation of their accuracies over $Z_3$ (Table~\ref{tab:Stats}). OPF was usually more accurate and from 3 to 20 times faster than SVM.
\begin{table}[htb]
\begin{center}
\begin{tabular}{|c||c|c||c|c|}
\hline
 & \multicolumn{2}{c||}{ OPF accuracy } & \multicolumn{2}{c|}{ SVM accuracy } \\
\cline{2-5}
Database & mean & std dev & mean & std dev \\
\hline \hline
Cone-torus & 0.8757 & 0.0218 & 0.8147 & 0.0145 \\
Painted & 0.9838 & 0.0144 & 0.8763 & 0.0030 \\
MPEG-7 & 0.6925 & 0.0049 & 0.5869 & 0.0088 \\
WM/GM & 0.9088 & 0.0006 & 0.9072 & 0.0009 \\
\hline
\end{tabular}
\end{center}
\caption{Mean and standard deviation of the accuracies for each database.}
\label{tab:Stats}
\end{table}
\section{Conclusions and future work}
\label{sec:conclusion}
We use the IFT algorithm in sample feature spaces and propose pattern classifiers based on optimum path forests rooted at prototypes of training sets. The OPF classifier finds prototypes with no classification errors in the training sets and learns from errors in evaluation sets. Unseen test sets are used to assess OPF in comparison with SVM. From the learning curves of OPF and its results on the test sets, we may conclude that it is a robust classifier and usually more accurate than SVM for databases with outliers and non-separable classes. We are currently evaluating OPF on other databases, and its accuracy is usually higher than that of SVM and ANN-MLP. Future work includes reporting these results and extending OPF to unsupervised classification.
%----------------------------------------
%
% BIBLIOGRAPHY
% for details, see ftp://ftp.ams.org/pub/tex/amsrefs/amsrdoc.pdf
%
\begin{bibsection}
\begin{biblist}
\bib{Collins:1998}{article}{
 author={Collins, D.},
 author={Zijdenbos, A.},
 author={Kollokian, V.},
 author={Sled, J.},
 author={Kabani, N.},
 author={Holmes, C.},
 author={Evans, A.},
 date={1998},
 title={Design and Construction of a Realistic Digital Brain Phantom},
 journal={IEEE Trans. on Medical Imaging},
 volume={17},
 number={3},
 pages={463--468},
 note={Available from: <\url{http://www.bic.mni.mcgill.ca/brainweb}>.}
}
\bib{Duda:00}{book}{
 author={Duda, R.O.},
 author={Hart, P.E.},
 author={Stork, D.G.},
 date={2000},
 title={Pattern Classification},
 edition={2},
 publisher={Wiley-Interscience}
}
\bib{Falcao:04}{article}{
 author={Falc{\~{a}}o, A.X.},
 author={Stolfi, J.},
 author={Lotufo, R.A.},
 date={2004},
 title={The image foresting transform: theory, algorithms, and applications},
 journal={IEEE Trans. on PAMI},
 volume={26},
 number={1},
 pages={19--29}
}
\bib{Vapnik:92}{article}{
 author={Boser, B.E.},
 author={Guyon, I.M.},
 author={Vapnik, V.N.},
 title={A training algorithm for optimal margin classifiers},
 booktitle={Proc. 5th Workshop on Computational Learning Theory},
 date = {1992},
 pages = {144--152},
 publisher = {ACM Press},
 address = {New York, NY, USA}
}
\bib{Zahn:71}{article}{
 author = {Zahn, C.T.},
 title = {Graph-Theoretical Methods for Detecting and Describing Gestalt Clusters},
 journal = {IEEE Trans. on Computers},
 date={1971},
 volume={C-20},
 pages={68--86},
 number={1}
}
\bib{Hubert:74}{article}{
 author = {Hubert, L. J.},
 title = {Some applications of graph theory to clustering},
 journal = {Psychometrika},
 volume = {39},
 number = {3},
 date = {1974},
 pages = {283--309}
}
\bib{Malik:00}{article}{
 author = {Shi, Jianbo},
 author = {Malik, Jitendra},
 title = {Normalized cuts and image segmentation},
 journal = {IEEE Trans. on Pattern Analysis and Machine Intelligence},
 volume = {22},
 number = {8},
 pages = {888--905},
 date = {2000}
}
\bib{Blum:01}{article}{
 author = {Blum, A.},
 author = {Chawla, S.},
 title = {Learning from Labeled and Unlabeled Data using Graph Mincuts},
 booktitle = {ICML '01: Proceedings of the 18th international conference on Machine learning},
 date = {2001},
 pages = {19--26}
}
\bib{Kuncheva:04}{book}{
 author = {Kuncheva, L. I.},
 title = {Combining Pattern Classifiers: Methods and Algorithms},
 publisher = {Wiley-Interscience},
 date = {2004}
}
\bib{Collobert:04}{article}{
 author = {Collobert, R.},
 author = {Bengio, S.},
 title = {Links between perceptrons, MLPs and SVMs},
 booktitle = {ICML '04: Proceedings of the twenty-first international conference on Machine learning},
 date = {2004},
 pages = {23},
 publisher = {ACM Press},
 address = {New York, NY, USA}
}
\bib{Cormen:90}{book}{
 author = {Cormen, T.},
 author = {Leiserson, C.},
 author = {Rivest, R.},
 title = {Introduction to Algorithms},
 publisher = {MIT},
 date = {1990}
}
\bib{Reyzin:06}{inproceedings}{
 author = {Reyzin, L.},
 author = {Schapire, R.
E.}, title = {How boosting the margin can also boost classifier complexity}, booktitle = {ICML '06: Proceedings of the 23rd international conference on Machine learning}, year = {2006}, isbn = {1-59593-383-2}, pages = {753--760}, %location = {Pittsburgh, Pennsylvania}, % note = {doi: <\url{http://doi.acm.org/10.1145/1143844.1143939}>.}, publisher = {ACM Press}, address = {New York, NY, USA} } \bib{Tang:06}{inproceedings}{ author = {Tang, B.}, author = {Mazzoni, D.}, title = {Multiclass reduced-set support vector machines}, booktitle = {ICML '06: Proceedings of the 23rd international conference on Machine learning}, year = {2006}, isbn = {1-59593-383-2}, pages = {921--928}, %location = {Pittsburgh, Pennsylvania}, % note = {doi: <\url{http://doi.acm.org/10.1145/1143844.1143960}>.}, publisher = {ACM Press}, address = {New York, NY, USA} } \bib{Panda:06}{inproceedings}{ author = {Panda, N.}, author = {Chang, E. Y.}, author = {Wu, G.}, title = {Concept boundary detection for speeding up SVMs}, booktitle = {ICML '06: Proceedings of the 23rd international conference on Machine learning}, year = {2006}, isbn = {1-59593-383-2}, pages = {681--688}, %location = {Pittsburgh, Pennsylvania}, publisher = {ACM Press}, address = {New York, NY, USA} } \bib{MPEG-7:02}{article}{ author = {MPEG-7}, title = {MPEG-7: The Generic Multimedia Content Description Standard, Part 1}, journal = {IEEE MultiMedia}, volume = {09}, number = {2}, year = {2002}, issn = {1070-986X}, pages = {78-87}, % note = {doi: <\url{http://doi.ieeecomputersociety.org/10.1109/MMUL.2002.10016}>.}, publisher = {IEEE Computer Society}, address = {Los Alamitos, CA, USA} } \bib{LotufoMM:00}{incollection}{ author = {Lotufo, R. A.}, author = {Falc{\~{a}}o, A. X.}, booktitle = {Mathematical Morphology and its Applications to Image and Signal Processing}, title = {The ordered queue and the optimality of the watershed approaches}, publisher = {Kluwer}, volume = {18}, %month = {Jun}, pages = {341--350}, year = {2000} } \bib{site-libsvm}{misc}{ title = {LIBSVM: a Library for Support Vector Machines}, author = {Chang, C.}, author = {Lin, C.}, year = {2001}, note = {Software available at \url{http://www.csie.ntu.edu.tw/~cjlin/libsvm}} } \bib{Arica:BAS:03}{article}{ author = {Arica, N.}, author = {Vural, F. T. Y.}, title = {{BAS: A Perceptual Shape Descriptor based on the Beam Angle Statistics}}, journal = {Pattern Recognition Letters}, year = {2003}, volume = {24}, number = {9-10}, pages = {1627--1639}, % month = {Jun} } \bib{Cousty:07}{article}{ author = {Cousty, J.}, author = {Bertrand, G.}, author = {Najman, L.}, author = {Couprie, M.}, title = {Watersheds, minimum spanning forests, and the drop of water principle}, number = {IGM 2007-01}, year = {2007}, } % \bib{AbbasiMK:CSS:00}{article}{ % author = {Abbasi, S.}, % author = {Mokhtarian, F.}, % author = {Kittler, J.}, % title = {Enhancing CSS-based shape retrieval for % objects with shallow concavities}, % journal = {Image and Vision Computing}, % volume = {18}, % number = {3}, % year = {2000}, % pages = {199-211}, % month = {Feb} % } % \bib{Stehling:BIC:02}{inproceedings}{ % author = {Stehling, R. O.}, % author = {Nascimento, M. A.}, % author = {Falc{\~{a}}o, A. 
X.}, % title = {A compact and efficient image retrieval approach based on border/interior pixel classification}, % booktitle = {CIKM '02: Proceedings of the eleventh international conference on Information and knowledge management}, % year = {2002}, % isbn = {1-58113-492-4}, % pages = {102--109}, % location = {McLean, Virginia, USA}, % note = {doi: <\url{http://doi.acm.org/10.1145/584792.584812}>.}, % publisher = {ACM Press}, % address = {New York, NY, USA} % } \end{biblist} \end{bibsection}