Monday, May 7, 2012

Graph Partitioning 3 - An Update

So my new low score as of this morning was 1339! Maybe we'll find out how good that is in class tomorrow...

I didn't get to too much today, I had to finish a presentation on Ensemble Theory and a Gaussian Mixture Model program. I ended up adding ensemble code to my GMM code and it got a bit better. One class almost done, one final remaining when this blog series ends (Thursday I think).

What I did do was add some flexibility to the partitioning balance and random shuffling of the gain buckets. The partition balancing forces a particular side if the difference becomes to great, otherwise it randomly chooses a partition. The random shuffling of the gain buckets happens during the initial fill and the roll back fill. Doing it this way allows me to insert into the buckets and randomize, once per iteration,  by adding only one function:

 func (b *BList) randomizeList() {
  if b.nodes.Len() < 2 { return }
  tmp := make( []*list.Element, b.nodes.Len() )
  i := 0
  for e:=b.nodes.Front(); e!=nil; e=b.nodes.Front() {
    tmp[i] = e
    b.nodes.Remove(e)
    i++
  }
  for p,_ := range tmp {
    r := rand.Intn(len(tmp)-p)
    tmp[p],tmp[r] = tmp[r],tmp[p]
  }
  for p,_ := range tmp {
    b.nodes.PushFront(tmp[p].Value)
  }
}


The best with flexible partitions is: 1343
The best with flexible partitions and bucket shuffling is: 1335

(I'll update in the morning when the experiments finish)
For the curious, it takes around 8 hours to cover 100 random numbers with 1000 iterations each, of which each is a hill climber that takes several unknown (though not large) number of iterations.

[and that's 1 core only]




I'll let you look at my Data Mining homework, since it's past the due date now, and I'll talk about EM at a later date:

 package main  
 import (  
  "fmt"  
  "log"  
  "os"  
  "math"  
  rand "math/rand"  
  "flag"  
 )  
 type Point struct {  
  val float64  
  lbl int  
 }  
 var seed = flag.Int("seed",23,"random seed")  
 var numC = flag.Int("numc",3, "number of clusters")  
 func main() {  
  flag.Parse()  
  fmt.Printf( "Worm's MLE/EM\n\n" )  
  pts,min,max := readLabeledFile( "labeled.txt" )  
 //  upts := readUnabeledFile( "unlabeled.txt" )  
  m,sd,c := calcEM( pts,*numC,*seed,min,max )  
  err := calcClassifyError( pts,m,sd,c )  
  fmt.Printf( "Solo Err: %f\n\n", err)  
  rsrc := rand.NewSource(int64(*seed))  
  rgen := rand.New(rsrc)  
  var means,sdevs,cprobs [][]float64  
  I,J := 100,10  
  L := len(pts)  
  sset := make([]Point,L)  
  errSum := 0.0  
  for i:=0; i<I; i++ {  
   rseed := rgen.Int()  
   aveErr := 0.0  
   for j:=0; j<J; j++ {  
    for p:=0; p<L; p++ {  
     r := rgen.Intn(L)  
     sset[p] = pts[r]  
    }  
    m,sd,c := calcEM( pts,*numC,rseed,min,max )  
    err := calcClassifyError( pts,m,sd,c )  
    means = append(means,m)  
    sdevs = append(sdevs,sd)  
    cprobs = append(cprobs,c)  
    aveErr += err  
    errSum += err  
   }  
 //   fmt.Printf( "Seed(%d): %f\n", rseed,aveErr/float64(J) )  
  }  
  fmt.Printf( "\n\nAveErr: %f\n\n", errSum/float64(I*J) )  
  ERR := calcClassifyErrorEnsemble(pts,means,sdevs,cprobs)  
  fmt.Printf( "EnsembleErr: %f\n\n", ERR )  
 }  
 func calcMLE( pts []Point ) {  
  fmt.Printf( "Q1: MLE of labeled points\n---------------------------\n" )  
  var means,sdevs [3]float64  
  var cnts [3]int  
  for _,p := range pts {  
   pos := p.lbl  
   means[pos] += p.val  
   cnts[pos] += 1  
  }  
  for i:=0; i<3; i++ {  
   means[i] /= float64(cnts[i])  
  }  
  for _,p := range pts {  
   pos := p.lbl  
   val := means[pos] - p.val  
   sdevs[pos] += val*val  
  }  
  for i:=0; i<3; i++ {  
   sdevs[i] /= float64(cnts[i]-1)  
   sdevs[i] = math.Sqrt(sdevs[i])  
  }  
  fmt.Printf( "cnts: %v\nmeans: %v\nsdevs: %v\n", cnts, means,sdevs )  
  fmt.Printf( "\n\n" )  
 }  
 func calcEM( pts []Point, k, seed int, min,max float64 ) (MEAN,SDEV,CPROB []float64) {  
  //  fmt.Printf( "Q2: EM of unlabeled points\n---------------------------\n" )  
  rand.Seed(int64(seed))  
  means := make( []float64, k )  
  sdevs := make( []float64, k )  
  cprob := make( []float64, k )  
  evals := make( [][]float64, len(pts) )  
  for i:=0; i<len(pts); i++ {  
   evals[i] = make( []float64, k )  
  }  
  diff := max-min  
  for i:=0; i<k; i++ {  
   means[i] = (rand.Float64()*diff)+min  
   sdevs[i] = rand.Float64()+0.5  
   cprob[i] = 1.0/float64(k)  
  }  
  //  fmt.Printf( "Initial Guesses:\nmeans: %v\nsdevs: %v\n\n", means, sdevs )  
  for I:=0; I<20; I++ {  
   // E-step  
   for p,P := range pts {  
    sum := 0.0  
    for c:=0; c<k; c++ {  
     val := calcPDF( P.val, means[c], sdevs[c], cprob[c] )  
     sum += val  
     evals[p][c] = val  
    }  
    for c:=0; c<k; c++ {  
     evals[p][c] /= sum  
    }  
   }  
   // M-step  
   for c:=0; c<k; c++ {  
    means[c] = 0.0  
   }  
   sdevs := make( []float64, k )  
   csums := make( []float64, k )  
   // update means  
   for p,P := range pts {  
    for c:=0; c<k; c++ {  
     csums[c] += evals[p][c]  
     means[c] += evals[p][c] * P.val  
    }  
   }  
   for c:=0; c<k; c++ {  
    means[c] /= csums[c]  
    cprob[c] = csums[c] / float64(len(pts))  
   }  
   // update std devs  
   for p,P := range pts {  
    for c:=0; c<k; c++ {  
     val := means[c] - P.val  
     sdevs[c] += val*val * evals[p][c]  
    }  
   }  
   for c:=0; c<k; c++ {  
    sdevs[c] /= float64(csums[c]-1)  
    sdevs[c] = math.Sqrt(sdevs[c])  
   }  
   //   fmt.Printf( "Iteration%d:\nprobs: %v\nmeans: %v\nsdevs: %v\n\n", I,csums, means, sdevs )  
  }  
  // sort means & sdevs  
  for i:=1; i<len(means); i++ {  
   for j:=i; j>0 && means[j]<means[j-1]; j-- {  
    m,sd,c := means[j],sdevs[j],cprob[j]  
    means[j],sdevs[j],cprob[j] = means[j-1],sdevs[j-1],cprob[j-1]  
    means[j-1],sdevs[j-1],cprob[j-1] = m,sd,c  
   }  
  }  
  return means,sdevs,cprob  
 }  
 func calcClassifyError( pts []Point, mean,sdev,cprob []float64 ) (error float64) {  
  k := len(mean)  
  evals := make( [][]float64, len(pts) )  
  for i:=0; i<len(pts); i++ {  
   evals[i] = make( []float64, k )  
  }  
  for p,P := range pts {  
   sum := 0.0  
   for c:=0; c<k; c++ {  
    val := calcPDF( P.val, mean[c], sdev[c], cprob[c] )  
    sum += val  
    evals[p][c] = val  
   }  
   for c:=0; c<k; c++ {  
    evals[p][c] /= sum  
   }  
  }  
  r,w := 0,0  
  for p,P := range pts {  
   max,maxI := 0.0, 0  
   for c:=0; c<k; c++ {  
    if evals[p][c] > max {  
     max = evals[p][c]  
     maxI = c  
    }  
   }  
   if P.lbl == maxI {  
    r++  
   } else {  
    w++  
   }  
  }  
  err := float64(w)/float64(r+w)  
  return err  
 }  
 func calcClassifyErrorEnsemble( pts []Point, MEAN,SDEV,CPROB [][]float64 ) (error float64) {  
  evals := make( [][]float64, len(pts) )  
  classes := make( []map[int]int,len(MEAN[0]) )  
  for i:=0; i<len(MEAN[0]); i++ {  
   classes[i] = make(map[int]int)  
  }  
  for C:=0; C<len(MEAN); C++ {  
   mean := MEAN[C]  
   sdev := SDEV[C]  
   cprob := CPROB[C]  
   k := len(mean)  
   for i:=0; i<len(pts); i++ {  
    evals[i] = make( []float64, k )  
   }  
   for p,P := range pts {  
    sum := 0.0  
    for c:=0; c<k; c++ {  
     val := calcPDF( P.val, mean[c], sdev[c], cprob[c] )  
     sum += val  
     evals[p][c] = val  
    }  
    for c:=0; c<k; c++ {  
     evals[p][c] /= sum  
    }  
   }  
   for p,_ := range pts {  
    max,maxI := 0.0, 0  
    for c:=0; c<k; c++ {  
     if evals[p][c] > max {  
      max = evals[p][c]  
      maxI = c  
     }  
    }  
    classes[maxI][p]++  
   }  
  }  
  r,w := 0,0  
  for p,P := range pts {  
   max,maxI := 0, 0  
   for c:=0; c<len(MEAN[0]); c++ {  
    val := classes[c][p]  
    if val>max {  
     max = val  
     maxI = c  
    }  
   }  
   if P.lbl == maxI {  
    r++  
   } else {  
    w++  
   }  
  }  
  err := float64(w)/float64(r+w)  
  return err  
 }  
 func calcPDF( x,m,s,p float64 ) float64 {  
  diff := (x-m)  
  expo := (-1.0*diff*diff)/(s*s*2.0)  
  cons := 1.0/(s*math.Sqrt(2.0*math.Pi))  
  return cons*math.Exp(expo)  
 }  
 func readLabeledFile( fn string ) (pts []Point, min,max float64 ){  
  file, err := os.Open(fn) // For read access.  
  if err != nil {  
   log.Fatal(err)  
  }  
  pts = make( []Point, 0, 300 )  
  min = math.MaxFloat64  
  max = math.SmallestNonzeroFloat64  
  for {  
   var p Point  
   _, serr := fmt.Fscanf( file, "%f %d", &(p.val), &(p.lbl) )  
   if serr != nil {  
 //    log.Fatal(serr)  
    break  
   }  
   if p.lbl == 2 {  
    p.lbl = 1  
   } else if p.lbl == 5 {  
    p.lbl = 2  
   }  
   pts = append(pts,p)  
   if p.val < min { min = p.val }  
   if p.val > max { max = p.val }  
  }  
  fmt.Printf( "min: %f\nmax: %f\n", min, max )  
  return  
 }  
 func readUnabeledFile( fn string ) (pts []Point ){  
  file, err := os.Open(fn) // For read access.  
  if err != nil {  
   log.Fatal(err)  
  }  
  pts = make( []Point, 0, 300 )  
  for {  
   var p Point  
   _, serr := fmt.Fscanf( file, "%f", &(p.val) )  
   if serr != nil {  
 //    log.Fatal(serr)  
    break  
   }  
   pts = append(pts,p)  
  }  
  return  
 }  


1 comment: