So my new low score as of this morning was 1339! Maybe we'll find out how good that is in class tomorrow...
I didn't get to too much today, I had to finish a presentation on Ensemble Theory and a Gaussian Mixture Model program. I ended up adding ensemble code to my GMM code and it got a bit better. One class almost done, one final remaining when this blog series ends (Thursday I think).
What I did do was add some flexibility to the partitioning balance and random shuffling of the gain buckets. The partition balancing forces a particular side if the difference becomes to great, otherwise it randomly chooses a partition. The random shuffling of the gain buckets happens during the initial fill and the roll back fill. Doing it this way allows me to insert into the buckets and randomize, once per iteration, by adding only one function:
func (b *BList) randomizeList() {
if b.nodes.Len() < 2 { return }
tmp := make( []*list.Element, b.nodes.Len() )
i := 0
for e:=b.nodes.Front(); e!=nil; e=b.nodes.Front() {
tmp[i] = e
b.nodes.Remove(e)
i++
}
for p,_ := range tmp {
r := rand.Intn(len(tmp)-p)
tmp[p],tmp[r] = tmp[r],tmp[p]
}
for p,_ := range tmp {
b.nodes.PushFront(tmp[p].Value)
}
}
The best with flexible partitions is: 1343
The best with flexible partitions and bucket shuffling is: 1335
(I'll update in the morning when the experiments finish)
For the curious, it takes around 8 hours to cover 100 random numbers with 1000 iterations each, of which each is a hill climber that takes several unknown (though not large) number of iterations.
[and that's 1 core only]
I'll let you look at my Data Mining homework, since it's past the due date now, and I'll talk about EM at a later date:
package main
import (
"fmt"
"log"
"os"
"math"
rand "math/rand"
"flag"
)
type Point struct {
val float64
lbl int
}
var seed = flag.Int("seed",23,"random seed")
var numC = flag.Int("numc",3, "number of clusters")
func main() {
flag.Parse()
fmt.Printf( "Worm's MLE/EM\n\n" )
pts,min,max := readLabeledFile( "labeled.txt" )
// upts := readUnabeledFile( "unlabeled.txt" )
m,sd,c := calcEM( pts,*numC,*seed,min,max )
err := calcClassifyError( pts,m,sd,c )
fmt.Printf( "Solo Err: %f\n\n", err)
rsrc := rand.NewSource(int64(*seed))
rgen := rand.New(rsrc)
var means,sdevs,cprobs [][]float64
I,J := 100,10
L := len(pts)
sset := make([]Point,L)
errSum := 0.0
for i:=0; i<I; i++ {
rseed := rgen.Int()
aveErr := 0.0
for j:=0; j<J; j++ {
for p:=0; p<L; p++ {
r := rgen.Intn(L)
sset[p] = pts[r]
}
m,sd,c := calcEM( pts,*numC,rseed,min,max )
err := calcClassifyError( pts,m,sd,c )
means = append(means,m)
sdevs = append(sdevs,sd)
cprobs = append(cprobs,c)
aveErr += err
errSum += err
}
// fmt.Printf( "Seed(%d): %f\n", rseed,aveErr/float64(J) )
}
fmt.Printf( "\n\nAveErr: %f\n\n", errSum/float64(I*J) )
ERR := calcClassifyErrorEnsemble(pts,means,sdevs,cprobs)
fmt.Printf( "EnsembleErr: %f\n\n", ERR )
}
func calcMLE( pts []Point ) {
fmt.Printf( "Q1: MLE of labeled points\n---------------------------\n" )
var means,sdevs [3]float64
var cnts [3]int
for _,p := range pts {
pos := p.lbl
means[pos] += p.val
cnts[pos] += 1
}
for i:=0; i<3; i++ {
means[i] /= float64(cnts[i])
}
for _,p := range pts {
pos := p.lbl
val := means[pos] - p.val
sdevs[pos] += val*val
}
for i:=0; i<3; i++ {
sdevs[i] /= float64(cnts[i]-1)
sdevs[i] = math.Sqrt(sdevs[i])
}
fmt.Printf( "cnts: %v\nmeans: %v\nsdevs: %v\n", cnts, means,sdevs )
fmt.Printf( "\n\n" )
}
func calcEM( pts []Point, k, seed int, min,max float64 ) (MEAN,SDEV,CPROB []float64) {
// fmt.Printf( "Q2: EM of unlabeled points\n---------------------------\n" )
rand.Seed(int64(seed))
means := make( []float64, k )
sdevs := make( []float64, k )
cprob := make( []float64, k )
evals := make( [][]float64, len(pts) )
for i:=0; i<len(pts); i++ {
evals[i] = make( []float64, k )
}
diff := max-min
for i:=0; i<k; i++ {
means[i] = (rand.Float64()*diff)+min
sdevs[i] = rand.Float64()+0.5
cprob[i] = 1.0/float64(k)
}
// fmt.Printf( "Initial Guesses:\nmeans: %v\nsdevs: %v\n\n", means, sdevs )
for I:=0; I<20; I++ {
// E-step
for p,P := range pts {
sum := 0.0
for c:=0; c<k; c++ {
val := calcPDF( P.val, means[c], sdevs[c], cprob[c] )
sum += val
evals[p][c] = val
}
for c:=0; c<k; c++ {
evals[p][c] /= sum
}
}
// M-step
for c:=0; c<k; c++ {
means[c] = 0.0
}
sdevs := make( []float64, k )
csums := make( []float64, k )
// update means
for p,P := range pts {
for c:=0; c<k; c++ {
csums[c] += evals[p][c]
means[c] += evals[p][c] * P.val
}
}
for c:=0; c<k; c++ {
means[c] /= csums[c]
cprob[c] = csums[c] / float64(len(pts))
}
// update std devs
for p,P := range pts {
for c:=0; c<k; c++ {
val := means[c] - P.val
sdevs[c] += val*val * evals[p][c]
}
}
for c:=0; c<k; c++ {
sdevs[c] /= float64(csums[c]-1)
sdevs[c] = math.Sqrt(sdevs[c])
}
// fmt.Printf( "Iteration%d:\nprobs: %v\nmeans: %v\nsdevs: %v\n\n", I,csums, means, sdevs )
}
// sort means & sdevs
for i:=1; i<len(means); i++ {
for j:=i; j>0 && means[j]<means[j-1]; j-- {
m,sd,c := means[j],sdevs[j],cprob[j]
means[j],sdevs[j],cprob[j] = means[j-1],sdevs[j-1],cprob[j-1]
means[j-1],sdevs[j-1],cprob[j-1] = m,sd,c
}
}
return means,sdevs,cprob
}
func calcClassifyError( pts []Point, mean,sdev,cprob []float64 ) (error float64) {
k := len(mean)
evals := make( [][]float64, len(pts) )
for i:=0; i<len(pts); i++ {
evals[i] = make( []float64, k )
}
for p,P := range pts {
sum := 0.0
for c:=0; c<k; c++ {
val := calcPDF( P.val, mean[c], sdev[c], cprob[c] )
sum += val
evals[p][c] = val
}
for c:=0; c<k; c++ {
evals[p][c] /= sum
}
}
r,w := 0,0
for p,P := range pts {
max,maxI := 0.0, 0
for c:=0; c<k; c++ {
if evals[p][c] > max {
max = evals[p][c]
maxI = c
}
}
if P.lbl == maxI {
r++
} else {
w++
}
}
err := float64(w)/float64(r+w)
return err
}
func calcClassifyErrorEnsemble( pts []Point, MEAN,SDEV,CPROB [][]float64 ) (error float64) {
evals := make( [][]float64, len(pts) )
classes := make( []map[int]int,len(MEAN[0]) )
for i:=0; i<len(MEAN[0]); i++ {
classes[i] = make(map[int]int)
}
for C:=0; C<len(MEAN); C++ {
mean := MEAN[C]
sdev := SDEV[C]
cprob := CPROB[C]
k := len(mean)
for i:=0; i<len(pts); i++ {
evals[i] = make( []float64, k )
}
for p,P := range pts {
sum := 0.0
for c:=0; c<k; c++ {
val := calcPDF( P.val, mean[c], sdev[c], cprob[c] )
sum += val
evals[p][c] = val
}
for c:=0; c<k; c++ {
evals[p][c] /= sum
}
}
for p,_ := range pts {
max,maxI := 0.0, 0
for c:=0; c<k; c++ {
if evals[p][c] > max {
max = evals[p][c]
maxI = c
}
}
classes[maxI][p]++
}
}
r,w := 0,0
for p,P := range pts {
max,maxI := 0, 0
for c:=0; c<len(MEAN[0]); c++ {
val := classes[c][p]
if val>max {
max = val
maxI = c
}
}
if P.lbl == maxI {
r++
} else {
w++
}
}
err := float64(w)/float64(r+w)
return err
}
func calcPDF( x,m,s,p float64 ) float64 {
diff := (x-m)
expo := (-1.0*diff*diff)/(s*s*2.0)
cons := 1.0/(s*math.Sqrt(2.0*math.Pi))
return cons*math.Exp(expo)
}
func readLabeledFile( fn string ) (pts []Point, min,max float64 ){
file, err := os.Open(fn) // For read access.
if err != nil {
log.Fatal(err)
}
pts = make( []Point, 0, 300 )
min = math.MaxFloat64
max = math.SmallestNonzeroFloat64
for {
var p Point
_, serr := fmt.Fscanf( file, "%f %d", &(p.val), &(p.lbl) )
if serr != nil {
// log.Fatal(serr)
break
}
if p.lbl == 2 {
p.lbl = 1
} else if p.lbl == 5 {
p.lbl = 2
}
pts = append(pts,p)
if p.val < min { min = p.val }
if p.val > max { max = p.val }
}
fmt.Printf( "min: %f\nmax: %f\n", min, max )
return
}
func readUnabeledFile( fn string ) (pts []Point ){
file, err := os.Open(fn) // For read access.
if err != nil {
log.Fatal(err)
}
pts = make( []Point, 0, 300 )
for {
var p Point
_, serr := fmt.Fscanf( file, "%f", &(p.val) )
if serr != nil {
// log.Fatal(serr)
break
}
pts = append(pts,p)
}
return
}