For this practical lesson, we need the last version of the package TDAmapper
which can be directly installed from Pault Pearson’s GitHub:
library(devtools)
devtools::install_github("paultpearson/TDAmapper")
Several examples presented in this tutorial are borrowed from Pault Pearson’s github.
For visualization we will use igraph
and the last version of networkD3
:
devtools::install_github("christophergandrud/networkD3")
library(TDAmapper)
library(ggplot2)
Note that a powerful library is available with Python : Python Mapper which comes with a nice a graphical user interface.
We simulate a point could sampled on parametric curve with two loops
First.Example.data = data.frame( x=2*cos(0.5*(1:100)), y=sin(1:100) )
qplot(First.Example.data$x,First.Example.data$y)
First we compute the matrix of distances for the point cloud for the Euclidean metric:
First.Example.dist = dist(First.Example.data)
For this first example we take for the filter the first coordinate.
We compute that mapper graph for the data with this filter as follows:
First.Example.mapper <- mapper(dist_object = First.Example.dist,
filter_values = First.Example.data$x,
num_intervals = 6,
percent_overlap = 50,
num_bins_when_clustering = 10)
Topological Methods for the Analysis of High Dimensional Data Sets and 3D Object Recognition : To find the number of clusters we use the edge length at which each cluster was merged. The heuristic is that the inter-point distance within each cluster would be smaller than the distance between clusters, so shorter edges are required to connect points within each cluster, but relatively longer edges are required to merge the clusters. If we look at the histogram of edge lengths in C, it is observed experimentally, that shorter edges which connect points within each cluster have a relatively smooth distribution and the edges which are required to merge the clusters are disjoint from this in the histogram. If we determine the histogram of C using k intervals, then we expect to find a set of empty interval(s) after which the edges which are required to merge the clusters appear.
The output First.Example.mapper
is an object of class TDAmapper
:
First.Example.mapper
## $adjacency
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] 0 1 1 0 0 0 0 0
## [2,] 1 0 0 1 0 0 0 0
## [3,] 1 0 0 1 0 0 0 0
## [4,] 0 1 1 0 1 0 0 0
## [5,] 0 0 0 1 0 1 1 0
## [6,] 0 0 0 0 1 0 0 1
## [7,] 0 0 0 0 1 0 0 1
## [8,] 0 0 0 0 0 1 1 0
##
## $num_vertices
## [1] 8
##
## $level_of_vertex
## [1] 1 2 2 3 4 5 5 6
##
## $points_in_vertex
## $points_in_vertex[[1]]
## [1] 82 57 32 7 95 70 20 45 71 8 96 33 58 83 69 6 94 31 56 81 19 44 17 18 80
## [26] 43 55 68 30 5 93
##
## $points_in_vertex[[2]]
## [1] 54 4 55 17 21 46 96 58 59
##
## $points_in_vertex[[3]]
## [1] 29 92 80 67 42 71 8 33 34 84
##
## $points_in_vertex[[4]]
## [1] 79 54 4 29 16 41 47 22 66 3 91 72 9 97 34 59 84
##
## $points_in_vertex[[5]]
## [1] 85 60 35 73 10 98 15 40 41 47 22 66 3 91 28 53 78
##
## $points_in_vertex[[6]]
## [1] 86 11 10 73 48 15 65 2 27
##
## $points_in_vertex[[7]]
## [1] 61 36 99 98 23 78 40 90 52 77
##
## $points_in_vertex[[8]]
## [1] 25 50 75 61 36 11 99 24 49 74 12 100 37 62 87 27 52 77 76
## [20] 51 26 1 89 64 14 39 88 63 13 38
##
##
## $points_in_level_set
## $points_in_level_set[[1]]
## [1] 5 6 7 8 17 18 19 20 30 31 32 33 43 44 45 55 56 57 58 68 69 70 71 80 81
## [26] 82 83 93 94 95 96
##
## $points_in_level_set[[2]]
## [1] 4 8 17 21 29 33 34 42 46 54 55 58 59 67 71 80 84 92 96
##
## $points_in_level_set[[3]]
## [1] 3 4 9 16 22 29 34 41 47 54 59 66 72 79 84 91 97
##
## $points_in_level_set[[4]]
## [1] 3 10 15 22 28 35 40 41 47 53 60 66 73 78 85 91 98
##
## $points_in_level_set[[5]]
## [1] 2 10 11 15 23 27 36 40 48 52 61 65 73 77 78 86 90 98 99
##
## $points_in_level_set[[6]]
## [1] 1 11 12 13 14 24 25 26 27 36 37 38 39 49 50 51 52 61 62
## [20] 63 64 74 75 76 77 87 88 89 99 100
##
##
## $vertices_in_level_set
## $vertices_in_level_set[[1]]
## [1] 1
##
## $vertices_in_level_set[[2]]
## [1] 2 3
##
## $vertices_in_level_set[[3]]
## [1] 4
##
## $vertices_in_level_set[[4]]
## [1] 5
##
## $vertices_in_level_set[[5]]
## [1] 6 7
##
## $vertices_in_level_set[[6]]
## [1] 8
##
##
## attr(,"class")
## [1] "TDAmapper"
A neighbor graph of the adjacency matrix First.Example.mapper$adjacency
can be computed using the function graph.adjacency()
from igraph:
library(igraph)
##
## Attachement du package : 'igraph'
## Les objets suivants sont masqués depuis 'package:stats':
##
## decompose, spectrum
## L'objet suivant est masqué depuis 'package:base':
##
## union
First.Example.graph <- graph.adjacency(First.Example.mapper$adjacency, mode="undirected")
and we can visualize the mapper graph:
plot(First.Example.graph, layout = layout.auto(First.Example.graph) )
This plot can be easily improved by incorporating additional information. For instance, it can be useful to color the vertices in function of a given co-variable and to take into account the number of points in each vertex.
Mean value of First.Example.data$y in each vertex:
y.mean.vertex <- rep(0,First.Example.mapper$num_vertices)
for (i in 1:First.Example.mapper$num_vertices){
points.in.vertex <- First.Example.mapper$points_in_vertex[[i]]
y.mean.vertex[i] <-mean((First.Example.data$y[points.in.vertex]))
}
Vertex size:
vertex.size <- rep(0,First.Example.mapper$num_vertices)
for (i in 1:First.Example.mapper$num_vertices){
points.in.vertex <- First.Example.mapper$points_in_vertex[[i]]
vertex.size[i] <- length((First.Example.mapper$points_in_vertex[[i]]))
}
Mapper graph with the vertices colored in function of First.Example.data$y and vertex size proportional to the number of points inside:
y.mean.vertex.grey <- grey(1-(y.mean.vertex - min(y.mean.vertex))/(max(y.mean.vertex) - min(y.mean.vertex) ))
V(First.Example.graph)$color <- y.mean.vertex.grey
V(First.Example.graph)$size <- vertex.size
plot(First.Example.graph,main ="Mapper Graph")
legend(x=-2, y=-1, c("y small","y medium","large y"),pch=21,
col="#777777", pt.bg=grey(c(1,0.5,0)), pt.cex=2, cex=.8, bty="n", ncol=1)
We can also get an interactive representation of the mapper graph using the networkD3 package. First we need to create a data frame for vertices and a data frame for the edges with the correct variable names:
library(networkD3)
MapperNodes <- mapperVertices(First.Example.mapper, 1:100 )
MapperLinks <- mapperEdges(First.Example.mapper)
forceNetwork(Nodes = MapperNodes, Links = MapperLinks,
Source = "Linksource", Target = "Linktarget",
Value = "Linkvalue", NodeID = "Nodename",
Group = "Nodegroup", opacity = 1,
linkDistance = 10, charge = -400)
In this example, the nodes are colored in function of the categorical variable given in the group argument: MapperNodes$Nodegroup
which corresponds to the interval label of the filter co-domain:
MapperNodes
## Nodename
## 1 V1: 82, 57, 32, 7, 95, 70, 20, 45, 71, 8, 96, 33, 58, 83, 69, 6, 94, 31, 56, 81, 19, 44, 17, 18, 80, 43, 55, 68, 30, 5, 93
## 2 V2: 54, 4, 55, 17, 21, 46, 96, 58, 59
## 3 V3: 29, 92, 80, 67, 42, 71, 8, 33, 34, 84
## 4 V4: 79, 54, 4, 29, 16, 41, 47, 22, 66, 3, 91, 72, 9, 97, 34, 59, 84
## 5 V5: 85, 60, 35, 73, 10, 98, 15, 40, 41, 47, 22, 66, 3, 91, 28, 53, 78
## 6 V6: 86, 11, 10, 73, 48, 15, 65, 2, 27
## 7 V7: 61, 36, 99, 98, 23, 78, 40, 90, 52, 77
## 8 V8: 25, 50, 75, 61, 36, 11, 99, 24, 49, 74, 12, 100, 37, 62, 87, 27, 52, 77, 76, 51, 26, 1, 89, 64, 14, 39, 88, 63, 13, 38
## Nodegroup Nodesize
## 1 1 31
## 2 2 9
## 3 2 10
## 4 3 17
## 5 4 17
## 6 5 9
## 7 5 10
## 8 6 30
We will further how to play with the arguments of the function forceNetwork()
. See here for more details on the use of the networkD3
package.
This example illustrates the interest of the Mapper algorithm when some topological information can not be identified by a standard principal component analysis. Download the ElongatedTorus file on your laptop and import the data into R:
Torus = read.table(file = "./Data/ElongatedTorus.txt",sep= "",header = FALSE)
library(scatterplot3d)
scatterplot3d(Torus[,1], Torus[,2], Torus[,3])
PCA does not see the hole in the center of the data:
library(FactoMineR)
Torus.pca <- PCA(Torus,graph = FALSE)
Torus.PCA1 <- Torus.pca$ind$coord[,1]
qplot(Torus.pca$ind$coord[,1],Torus.pca$ind$coord[,2])
Mapper with a filter equal to the first coordinate does not see the hole (you can change the parameters)
Torus.dist = dist(Torus)
torus.mapper <- mapper(
dist_object = Torus.dist,
filter_values = Torus[,1],
num_intervals = 10,
percent_overlap = 50,
num_bins_when_clustering = 10)
torus.graph <- graph.adjacency(torus.mapper$adjacency, mode="undirected")
plot(torus.graph , layout = layout.auto(torus.graph),main ="Raw visualisation of \n the Mapper Graph \nfor the Torus dataset" )
But mapper with a convenient bi-dimensional filter does, at least with the “good parameters”.
torus.mapper <- mapper(
dist_object = Torus.dist,
filter_values = list(Torus[,1],Torus[,2]),
num_intervals = c(8,8),
percent_overlap = 50,
num_bins_when_clustering = 10)
torus.graph <- graph.adjacency(torus.mapper$adjacency,mode="undirected")
plot(torus.graph,
layout = layout.auto(torus.graph),
main ="Raw visualisation of \n the Mapper Graph \nfor the Torus dataset \n filter bidim" )
MapperNodes <- mapperVertices(torus.mapper, 1:length(Torus[,1]) )
MapperLinks <- mapperEdges(torus.mapper)
forceNetwork(Nodes = MapperNodes, Links = MapperLinks,
Source = "Linksource", Target = "Linktarget",
Value = "Linkvalue", NodeID = "Nodename",
Group = "Nodegroup", opacity = 1,
linkDistance = 10, charge = -10)
Example from Topological Methods for the Analysis of High Dimensional Data Sets and 3D Object Recognition :
About the data: 145 patients who had diabetes, a family history of diabetes, who wanted a physical examination, or to participate in a scientific study participated in the study. For each patient:
The original study noted that the data set consisted of a central core, and two ``flares" emanating from it. The patients in each of the flares were regarded as suffering from essentially different diseases, which correspond to the division of diabetes into the adult onset and juvenile onset forms. One way in which we wish to use Mapper is as an automatic tool for detecting such flares in the data, even in situations where projections into two or three dimensional.
library(locfit)
## locfit 1.5-9.4 2020-03-24
data(chemdiab)
summary(chemdiab)
## rw fpg ga ina
## Min. :0.7100 Min. : 70 Min. : 269.0 Min. : 10.0
## 1st Qu.:0.8800 1st Qu.: 90 1st Qu.: 352.0 1st Qu.:118.0
## Median :0.9800 Median : 97 Median : 413.0 Median :156.0
## Mean :0.9773 Mean :122 Mean : 543.6 Mean :186.1
## 3rd Qu.:1.0800 3rd Qu.:112 3rd Qu.: 558.0 3rd Qu.:221.0
## Max. :1.2000 Max. :353 Max. :1568.0 Max. :748.0
## sspg cc
## Min. : 29.0 Chemical_Diabetic:36
## 1st Qu.:100.0 Normal :76
## Median :159.0 Overt_Diabetic :33
## Mean :184.2
## 3rd Qu.:257.0
## Max. :480.0
normdiab <- chemdiab
normdiab[,1:5] <- scale(normdiab[,1:5],center=FALSE)
normdiab.dist = dist(normdiab[,1:5])
library(ks)
##
## Attachement du package : 'ks'
## L'objet suivant est masqué depuis 'package:igraph':
##
## compare
filter.kde <- kde(normdiab[,1:5],H=diag(1,nrow = 5),eval.points = normdiab[,1:5])$estimate
diab.mapper <- mapper(
dist_object = normdiab.dist,
filter_values = filter.kde,
num_intervals = 4,
percent_overlap = 50,
num_bins_when_clustering = 20)
diab.graph <- graph.adjacency(diab.mapper$adjacency, mode="undirected")
plot(diab.graph )
number of vertices :
l = length(V(diab.graph))
Mode function
Mode <- function(x) {
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
Distribution of the cc variable in each vertex Majority vote
cc.maj.vertex <- c()
filter.kde.vertex <- c()
for (i in 1:l){
points.in.vertex <- diab.mapper$points_in_vertex[[i]]
Mode.in.vertex <- Mode(normdiab$cc[points.in.vertex])
cc.maj.vertex <- c(cc.maj.vertex,as.character(Mode.in.vertex))
filter.kde.vertex <- c(filter.kde.vertex,mean(filter.kde[points.in.vertex]))
}
Vertex size
vertex.size <- rep(0,l)
for (i in 1:l){
points.in.vertex <- diab.mapper$points_in_vertex[[i]]
vertex.size[i] <- length((diab.mapper$points_in_vertex[[i]]))
}
MapperNodes <- mapperVertices(diab.mapper, 1:nrow(normdiab) )
MapperNodes$cc.maj.vertex <- as.factor(cc.maj.vertex)
MapperNodes$filter.kde <- filter.kde.vertex
MapperNodes$Nodesize <- vertex.size
MapperLinks <- mapperEdges(diab.mapper)
forceNetwork(Nodes = MapperNodes, Links = MapperLinks,
Source = "Linksource", Target = "Linktarget",
Value = "Linkvalue", NodeID = "Nodename",
Group = "cc.maj.vertex", opacity = 1,
linkDistance = 10, charge = -400,legend = TRUE,
Nodesize = "Nodesize")
## Warning: It looks like Source/Target is not zero-indexed. This is required in
## JavaScript and so your plot may not render.
I did not manage to obtain the same mapper graph as in the original paper, probably because of the clustering algorithm which is probably different.
# we concatenate the values of cc and vertex.label for the points of all the vertices (one point can be in to different vertices)
vertex.label <- c()
cc.mapper <- c()
for (ver in 1:l){
points.in.vertex <- diab.mapper$points_in_vertex[[ver]]
vertex.label <- c(vertex.label,rep(ver,length(points.in.vertex)))
cc.mapper <- c(cc.mapper,chemdiab$cc[points.in.vertex])
}
vertex.label <- as.factor(vertex.label)
cc.mapper <- as.factor(cc.mapper)
levels(cc.mapper) <- c("Chem","Norm","Overt")
library(vcd)
## Le chargement a nécessité le package : grid
table(vertex.label,cc.mapper)
mosaic(table(vertex.label,cc.mapper),shade = TRUE)
## cc.mapper
## vertex.label Chem Norm Overt
## 1 0 0 1
## 2 0 0 3
## 3 4 0 10
## 4 4 1 1
## 5 0 0 10
## 6 0 4 0
## 7 10 2 10
## 8 0 0 1
## 9 28 74 6
Download the Trefoil dataset on your laptop and import the data into R:
Trefoil = read.table(file = "./Data/NoisyTrefoil180.txt",sep= "",header = FALSE)
summary(Trefoil)
## V1 V2 V3
## Min. :-2.77361 Min. :-3.04402 Min. :-1.07858
## 1st Qu.:-1.23060 1st Qu.:-1.26771 1st Qu.:-0.76957
## Median :-0.02093 Median : 0.29249 Median :-0.03306
## Mean :-0.01961 Mean :-0.02163 Mean :-0.02962
## 3rd Qu.: 1.18317 3rd Qu.: 1.55320 3rd Qu.: 0.63905
## Max. : 2.76019 Max. : 2.10846 Max. : 1.09504
Trefoil.dist = dist(Trefoil)
scatterplot3d(Trefoil[,1], Trefoil[,2], Trefoil[,3])
Compute the mapper graph for one given set of parameters:
Trefoil.mapper <- mapper(
dist_object = Trefoil.dist,
filter_values = Trefoil[,1],
num_intervals = 10,
percent_overlap = 30,
num_bins_when_clustering = 10)
and plot the mapper graph.
Trefoil.graph <- graph.adjacency(Trefoil.mapper$adjacency, mode="undirected")
plot(Trefoil.graph , layout = layout.auto(Trefoil.graph),main =paste("Raw visualisation of the Mapper Graph \n for the Trefoil Dataset \n " ))
Exercise: make a triple loop on the parameters values and identify the correct Mapper graph if any:
num_int_seq = seq(6,16,2)
percent_overlap_seq = seq(20,80,20)
num_bins_when_clustering_seq = seq(5,15,2)
for (num in num_int_seq){
for (perc in percent_overlap_seq){
for (num_bins in num_bins_when_clustering_seq){
Trefoil.mapper <- mapper(
dist_object= Trefoil.dist,
filter_values = Trefoil[,1],
num_intervals = num,
percent_overlap = perc,
num_bins_when_clustering = num_bins)
Trefoil.graph <- graph.adjacency(Trefoil.mapper$adjacency, mode="undirected")
mytitle = paste("num_interv",as.character(num), " perc=",as.character(perc), "num_bins",
as.character(num_bins) )
plot(Trefoil.graph , layout = layout.auto(Trefoil.graph),main =mytitle)
}}}