Home » Source Code » » CoForest.java

CoForest.java ( File view )

From:
Description:...
  • By d420240 2018-01-30
  • View(s):0
  • Download(s):0
  • Point(s): 1
			package coforest;

/**
 * Description: CoForest is a semi-supervised algorithm, which exploits the power of ensemble learning and available
 *              large amount of unlabeled data to produce hypothesis with better performance.
 *
 * Reference:   M. Li, Z.-H. Zhou. Improve computer-aided diagnosis with machine learning techniques using undiagnosed
 *              samples. IEEE Transactions on Systems, Man and Cybernetics - Part A: Systems and Humans, 2007, 37(6).
 *
 * ATTN:        This package is free for academic usage. You can run it at your own risk.
 *	     	For other purposes, please contact Prof. Zhi-Hua Zhou (zhouzh@nju.edu.cn).
 *
 * Requirement: To use this package, the whole WEKA environment (ver 3.4) must be available.
 *	        refer: I.H. Witten and E. Frank. Data Mining: Practical Machine Learning
 *		Tools and Techniques with Java Implementations. Morgan Kaufmann,
 *		San Francisco, CA, 2000.
 *
 * Data format: Both the input and output formats are the same as those used by WEKA.
 *
 * ATTN2:       This package was developed by Mr. Ming Li (lim@lamda.nju.edu.cn). There
 *		is a ReadMe file provided for roughly explaining the codes. But for any
 *		problem concerning the code, please feel free to contact with Mr. Li.
 *
 */


import java.io.*;
import java.util.*;

import weka.core.*;
import weka.classifiers.*;
import weka.classifiers.trees.*;

public class CoForest
{

  /** The ensemble of component classifiers (random trees). */
  protected Classifier[] m_classifiers = null;

  /** Number of component classifiers (trees) in the forest. */
  protected int m_numClassifiers = 10;

  /** Seed for the random number generator. */
  protected int m_seed = 1;

  /** Number of features to consider in random feature selection.
      If less than 1, int(log2(M) + 1) is used, where M is the number
      of attributes (see buildClassifier). */
  protected int m_numFeatures = 0;

  /** Final number of features that were actually considered in the last build. */
  protected int m_KValue = 0;

  /** Confidence threshold (0.75 by default; its use is in the co-training
      loop, not fully visible in this excerpt). */
  protected double m_threshold = 0.75;

  private int m_numOriginalLabeledInsts = 0;  // number of originally labeled instances



  /**
   * Default constructor. All configuration fields keep their declared
   * defaults (10 trees, seed 1, automatic feature count, threshold 0.75).
   */
  public CoForest()
  {
    // Nothing to initialize beyond the field defaults.
  }

  /**
   * Sets the seed for the random object used inside this class.
   *
   * @param s int -- the seed value
   */
  public void setSeed(int s)
  {
    this.m_seed = s;
  }

  /**
   * Sets the number of trees used in the Random Forest.
   *
   * @param n int -- value to assign to the number of component classifiers
   */
  public void setNumClassifiers(int n)
  {
    // Note: the original javadoc documented a parameter "s" that does not
    // exist; the parameter is "n".
    this.m_numClassifiers = n;
  }

  /**
   * Returns the number of trees used in the Random Forest.
   *
   * @return int -- the number of component classifiers
   */
  public int getNumClassifiers()
  {
    return this.m_numClassifiers;
  }

  /**
   * Sets the number of features to use in random feature selection.
   *
   * @param n int -- value to assign to m_numFeatures; values below 1
   *                 request the automatic int(log2(M) + 1) default
   */
  public void setNumFeatures(int n)
  {
    this.m_numFeatures = n;
  }

  /**
   * Returns the number of features to use in random feature selection.
   * (Fixes the "featrues" typo in the original javadoc.)
   *
   * @return int -- the configured number of features (0 means automatic)
   */
  public int getNumFeatures()
  {
    return this.m_numFeatures;
  }

  /**
   * Resamples the data set with replacement, with selection probability
   * proportional to each instance's weight (a weighted bootstrap, in the
   * style of Weka's resampleWithWeights). Instances in the returned set
   * have their weight reset to 1.
   *
   * @param data Instances -- the original data set
   * @param random Random -- the random object
   * @param sampled boolean[] -- output parameter, same length as data;
   *                            sampled[l] is set true iff instance l was
   *                            drawn at least once (i.e. is "in-bag")
   * @return Instances -- the resampled data set (same size as data)
   */
  public final Instances resampleWithWeights( Instances data, Random random, boolean[] sampled )
  {

    // Snapshot the per-instance weights.
    double[] weights = new double[data.numInstances()];
    for( int i=0; i<weights.length; i++ ) 
    {

      weights[i] = data.instance(i).weight();
    
}
    
    Instances newData = new Instances(data, data.numInstances());
    if (data.numInstances() == 0) {

      return newData;
    
}
    
    // Build a sorted list of random "draw points": probabilities[i] is the
    // cumulative sum of i+1 uniform(0,1) draws, so the sequence is increasing.
    double[] probabilities = new double[data.numInstances()];
    double sumProbs = 0, sumOfWeights = Utils.sum(weights);
    
    for(int i = 0; i < data.numInstances(); i++) {

      sumProbs += random.nextDouble();
      probabilities[i] = sumProbs;
    
}
    
    Utils.normalize(probabilities, sumProbs / sumOfWeights);// rescale the random cumulative draws onto [0, sumOfWeights] so they can be compared against the cumulative true weights below

    // Make sure that rounding errors don't mess things up
    probabilities[data.numInstances() - 1] = sumOfWeights;
    // Merge-style walk: k indexes the random draw points, l indexes the
    // instances; instance l is drawn once for every draw point that falls
    // inside its weight interval.
    int k = 0; int l = 0;
    sumProbs = 0;
    while ((k < data.numInstances() && (l < data.numInstances()))) {

      if (weights[l] < 0) {

        throw new IllegalArgumentException("Weights have to be positive.");
      
}
      sumProbs += weights[l];
      while ((k < data.numInstances()) &&// while probabilities[k] <= sumProbs, instance l is added as the k-th sampled instance, and its copy's weight is reset to 1
             (probabilities[k] <= sumProbs)) {

        newData.add(data.instance(l));
        sampled[l] = true;
        newData.instance(k).setWeight(1);
        k++;
      
}
      l++;
    
}
    return newData;
  
}

  /**
   * Returns the ensemble's class-probability distribution for a given
   * instance: each component classifier's distribution is accumulated,
   * and the summed vector is normalized before being returned.
   *
   * @param inst Instance -- the instance to be classified
   * @return double[] -- the normalized class-probability vector
   * @throws Exception -- if a component classifier fails
   */
  public double[] distributionForInstance(Instance inst) throws Exception
  {
    double[] accumulated = new double[inst.numClasses()];

    // Sum the per-class probabilities over all component classifiers.
    for( int c=0; c<m_classifiers.length; c++ )
    {
      double[] memberDistr = m_classifiers[c].distributionForInstance(inst);
      for( int cls=0; cls<accumulated.length; cls++ )
      {
        accumulated[cls] += memberDistr[cls];
      }
    }

    // Normalize so the entries sum to 1.
    Utils.normalize(accumulated);
    return accumulated;
  }

  /**
   * Classifies a given instance with the whole ensemble: the predicted
   * class is the index of the largest entry in the combined distribution.
   *
   * @param inst Instance -- the instance to classify
   * @return double -- the predicted class value (index of the max class)
   * @throws Exception -- if computing the distribution fails
   */
  public double classifyInstance(Instance inst) throws Exception
  {
    return Utils.maxIndex(distributionForInstance(inst));
  }

  /**
   * Build the classifiers using Co-Forest algorithm
   *
   * @param labeled Instances -- Labeled training set
   * @param unlabeled Instances -- unlabeled training set
   * @throws Exception -- certain exception
   */
  public void buildClassifier(Instances labeled, Instances unlabeled) throws Exception//使用Co-Forest建立分类器
  {

	  // 设置相关参数
	  double[]       err = new double[m_numClassifiers];	  // m_numClassifiers为森林中树的棵树,即分类器的个数
	  double[] err_prime = new double[m_numClassifiers];	  // 前一次每一个分类器错误率的阈值
	  double[]   s_prime = new double[m_numClassifiers];	  // 前一次迭代新标记样本置信度之和
	  boolean[][] inbags = new boolean[m_numClassifiers][];   // 标记某个样本l是否作为训练器i的训练集,resampleWithWeights()中标记
	  Random        rand = new Random(m_seed);
	  
	  m_numOriginalLabeledInsts = labeled.numInstances();	  // 标记样本数
	  
	  RandomTree          rTree = new RandomTree();			  // 生成随机树 

    // set up the random tree options
    m_KValue = m_numFeatures;								  // 就是每次分裂考虑的属性集大小,一般就是取log2(m)+1
    if( m_KValue<1 )
    {

    	m_KValue = (int)Utils.log2(labeled.numAttributes())+1;// Math.log(d)/log2    	
    
}
    
    rTree.setKValue(m_KValue);

    m_classifiers = Classifier.makeCopies(rTree, m_numClassifiers);
    Instances[] labeleds = new Instances[m_numClassifiers];
    int[]      randSeeds = new int[m_numClassifiers];

    /**
     * 步骤1、采用已标记的样本集(有放回,随机抽样)生成基础分类器(随机树)
    */
    for( int i=0; i<m_numClassifiers; i++)								// 对于每一个分类器,重新抽样构成其有标记样本,然后据此建立分类器
    {
   
    	randSeeds[i] = rand.nextInt();
    	((RandomTree)m_classifiers[i]).setSeed(randSeeds[i]);
    	
    	inbags[i]   = new boolean[labeled.numInstances()];
    	labeleds[i] = resampleWithWeights(labeled, rand, inbags[i]);	// 重新抽样构成第i个分类器的有标记样本,权重初始为1
    	
    	m_classifiers[i].buildClassifier(labeleds[i]);					// 利用随机抽样的已标记样本集去训练基础分类器
    	
    	err_prime[i] = 0.5;												// 前一次1个基础分类器的错误率
    	s_prime[i]   = 0;												// 前一次迭代新标记样本置信度之和
    
}

    /**
     * 步骤2、
   */    
    boolean bChanged = true;
    while(bChanged)
    {

    	bChanged          = false;
    	boolean[] bUpdate = new boolean[m_classifiers.length];
    	Instances[] Li    = new Instances[m_numClassifiers];    // H*-hi给无标签样本添加标记后,加入hi的有标签样本

    	for( int i=0; i<m_numClassifiers; i++ )
    	{

    		err[i] = measureError(labeled, inbags, i);			// 计算基础分类器的错误率(使用该分类器外的其他分类器和袋外样本进行验证)
    		Li[i]  = new Instances(labeled, 0);

    		/** if( e_i<e'_i ) */
    		if( err[i]<err_prime[i] ) // 当前迭代的错误率低于比上一次(寻优)
    		{

    			if( s_prime[i]==0 )
    			{

    				s_prime[i] = Math.min(unlabeled.sumOfWeights()/10, 100);	// 意义:对置信度进行初始化		
    			
}

    			/** Subsample U for each hi */
    			double weight = 0;
    			unlabeled.randomize(rand);										                // 产生未标记样本随机序列
    			
    			// 将未标记样本集中的数据加入候选扩充训练集(当加入的未标记样本权重之和>阈值,结束)
    			// 阈值计算见公式-6(保证下次迭代效果优于本次的基本条件)
    			int numWeightsAfterSubsample = (int)Math.ceil(err_prime[i]*s_prime[i]/err[i]-1);
          		for( int k=0; k<unlabeled.numInstances(); k++ )
    			{

 
...
...
(Not finished, please download and read the complete file)
			
...
Expand> <Close

Want complete source code? Download it here

Point(s): 1

Download
0 lines left, continue to read
Sponsored links

File list

Tips: You can preview the content of files by clicking file names^_^
Name Size Date
CoForest.java17.64 kB2017-05-19|16:41
...
Sponsored links

CoForest.java (5.97 kB)

Need 1 point
Your Point(s)

Your Point isn't enough.

Get point immediately by PayPal

More(Debit card / Credit card / PayPal Credit / Online Banking)

Submit your source codes. Get more point

LOGIN

Don't have an account? Register now
Need any help?
Mail to: support@codeforge.com

切换到中文版?

CodeForge Chinese Version
CodeForge English Version

Where are you going?

^_^"Oops ...

Sorry!This guy is mysterious, its blog hasn't been opened, try another, please!
OK

Warm tip!

CodeForge to FavoriteFavorite by Ctrl+D