当前位置：移动技术网 > IT编程>开发语言>Java > Java实现的决策树算法完整实例

Java实现的决策树算法完整实例

2019年07月19日 | 移动技术网IT编程 | 我要评论

本文实例讲述了Java实现的决策树算法。分享给大家供大家参考，具体如下：

决策树算法是一种逼近离散函数值的方法。它是一种典型的分类方法，首先对数据进行处理，利用归纳算法生成可读的规则和决策树，然后使用决策对新数据进行分析。本质上决策树是通过一系列规则对数据进行分类的过程。

决策树构造可以分两步进行。第一步，决策树的生成：由训练样本集生成决策树的过程。一般情况下，训练样本数据集是根据实际需要有历史的、有一定综合程度的，用于数据分析处理的数据集。第二步，决策树的剪枝：决策树的剪枝是对上一阶段生成的决策树进行检验、校正和修正的过程，主要是用新的样本数据集（称为测试数据集）中的数据校验决策树生成过程中产生的初步规则，将那些影响预测准确性的分枝剪除。

Java实现代码如下：

package demo;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
/**
 * Decision-tree demo: builds a tree from a small in-memory training set by
 * repeatedly splitting on the attribute whose branches have the lowest
 * weighted entropy, then prints the tree to standard output.
 *
 * <p>A tree node is represented as a plain {@code Object}: either a
 * {@link tree} (internal node) or a category value (leaf).
 */
public class dicisiontree {
  /**
   * Entry point: reads the built-in samples, builds the decision tree and
   * prints it.
   *
   * @param args unused
   */
  public static void main(String[] args) throws Exception {
    System.out.print("移动技术网测试结果：");
    String[] attrnames = new String[] { "age", "income", "student",
        "credit_rating" };
    // Load the training set.
    Map<Object, List<sample>> samples = readsamples(attrnames);
    // Build the decision tree.
    Object decisiontree = generatedecisiontree(samples, attrnames);
    // Print the decision tree.
    outputdecisiontree(decisiontree, 0, null);
  }

  /**
   * Reads the classified training samples.
   *
   * @param attrnames attribute names, in the same order as the columns of the
   *                  raw data (the category column is not included)
   * @return map: category -> list of samples that belong to that category
   */
  static Map<Object, List<sample>> readsamples(String[] attrnames) {
    // Attribute values followed by the category (last element of each row).
    Object[][] rawdata = new Object[][] {
        { "<30 ", "high ", "no ", "fair   ", "0" },
        { "<30 ", "high ", "no ", "excellent", "0" },
        { "30-40", "high ", "no ", "fair   ", "1" },
        { ">40 ", "medium", "no ", "fair   ", "1" },
        { ">40 ", "low  ", "yes", "fair   ", "1" },
        { ">40 ", "low  ", "yes", "excellent", "0" },
        { "30-40", "low  ", "yes", "excellent", "1" },
        { "<30 ", "medium", "no ", "fair   ", "0" },
        { "<30 ", "low  ", "yes", "fair   ", "1" },
        { ">40 ", "medium", "yes", "fair   ", "1" },
        { "<30 ", "medium", "yes", "excellent", "1" },
        { "30-40", "medium", "no ", "excellent", "1" },
        { "30-40", "high ", "yes", "fair   ", "1" },
        { ">40 ", "medium", "no ", "excellent", "0" } };
    // Turn every row into a sample object and group the samples by category.
    Map<Object, List<sample>> ret = new HashMap<Object, List<sample>>();
    for (Object[] row : rawdata) {
      sample sample = new sample();
      int i = 0;
      for (int n = row.length - 1; i < n; i++) {
        sample.setattribute(attrnames[i], row[i]);
      }
      // After the loop, i indexes the last column, i.e. the category.
      sample.setcategory(row[i]);
      List<sample> samples = ret.get(row[i]);
      if (samples == null) {
        samples = new LinkedList<sample>();
        ret.put(row[i], samples);
      }
      samples.add(sample);
    }
    return ret;
  }

  /**
   * Recursively builds a decision tree.
   *
   * @param categorytosamples map: category -> samples of that category
   * @param attrnames         attributes that may still be used for splitting
   * @return a {@link tree} for an internal node, or a category value for a
   *         leaf ({@code null} if there are no attributes and no samples)
   */
  static Object generatedecisiontree(
      Map<Object, List<sample>> categorytosamples, String[] attrnames) {
    // Only one category is present: that category is the answer.
    if (categorytosamples.size() == 1) {
      return categorytosamples.keySet().iterator().next();
    }
    // No attribute left to test: vote, i.e. pick the category that has the
    // most samples.
    if (attrnames.length == 0) {
      int max = 0;
      Object maxcategory = null;
      for (Entry<Object, List<sample>> entry : categorytosamples
          .entrySet()) {
        int cur = entry.getValue().size();
        if (cur > max) {
          max = cur;
          maxcategory = entry.getKey();
        }
      }
      return maxcategory;
    }
    // Choose the attribute to test at this node.
    Object[] rst = choosebesttestattribute(categorytosamples, attrnames);
    int bestindex = (Integer) rst[0];
    // Root of this (sub)tree; it branches on the chosen attribute.
    tree tree = new tree(attrnames[bestindex]);
    // An attribute that has been used must not be chosen again further down.
    String[] suba = new String[attrnames.length - 1];
    for (int i = 0, j = 0; i < attrnames.length; i++) {
      if (i != bestindex) {
        suba[j++] = attrnames[i];
      }
    }
    // Create one branch per value of the chosen attribute. The cast is safe
    // because choosebesttestattribute always stores this map type at index 2.
    @SuppressWarnings("unchecked")
    Map<Object, Map<Object, List<sample>>> splits =
        (Map<Object, Map<Object, List<sample>>>) rst[2];
    for (Entry<Object, Map<Object, List<sample>>> entry : splits.entrySet()) {
      Object attrvalue = entry.getKey();
      Map<Object, List<sample>> split = entry.getValue();
      Object child = generatedecisiontree(split, suba);
      tree.setchild(attrvalue, child);
    }
    return tree;
  }

  /**
   * Chooses the best attribute to test. "Best" means that, after branching on
   * the attribute, the sample-weighted sum of the entropy of the branches is
   * minimal, which is equivalent to the attribute having the largest
   * information gain.
   *
   * @param categorytosamples map: category -> samples of that category
   * @param attrnames         candidate attributes
   * @return array of three elements: index of the chosen attribute
   *         ({@code Integer}), its weighted entropy ({@code Double}), and the
   *         split it produces as map(attribute value -> (category -> samples))
   */
  static Object[] choosebesttestattribute(
      Map<Object, List<sample>> categorytosamples, String[] attrnames) {
    int minindex = -1; // index of the best attribute so far
    double minvalue = Double.MAX_VALUE; // smallest weighted entropy so far
    Map<Object, Map<Object, List<sample>>> minsplits = null; // best split
    // Evaluate every attribute as the test attribute and keep the one with
    // the smallest weighted entropy.
    for (int attrindex = 0; attrindex < attrnames.length; attrindex++) {
      int allcount = 0; // total number of samples
      // Split by the current attribute:
      // attribute value -> (category -> samples).
      Map<Object, Map<Object, List<sample>>> cursplits =
          new HashMap<Object, Map<Object, List<sample>>>();
      for (Entry<Object, List<sample>> entry : categorytosamples
          .entrySet()) {
        Object category = entry.getKey();
        List<sample> samples = entry.getValue();
        for (sample sample : samples) {
          Object attrvalue = sample
              .getattribute(attrnames[attrindex]);
          Map<Object, List<sample>> split = cursplits.get(attrvalue);
          if (split == null) {
            split = new HashMap<Object, List<sample>>();
            cursplits.put(attrvalue, split);
          }
          List<sample> splitsamples = split.get(category);
          if (splitsamples == null) {
            splitsamples = new LinkedList<sample>();
            split.put(category, splitsamples);
          }
          splitsamples.add(sample);
        }
        allcount += samples.size();
      }
      // Weighted entropy over all branches of the current attribute.
      double curvalue = 0.0;
      for (Map<Object, List<sample>> splits : cursplits.values()) {
        // Held as double so the divisions below are floating-point.
        double persplitcount = 0;
        for (List<sample> list : splits.values()) {
          persplitcount += list.size(); // samples in this branch
        }
        double persplitvalue = 0.0; // entropy (base 2) of this branch
        for (List<sample> list : splits.values()) {
          // Lists are only created when a sample is added, so p > 0.
          double p = list.size() / persplitcount;
          persplitvalue -= p * (Math.log(p) / Math.log(2));
        }
        curvalue += (persplitcount / allcount) * persplitvalue;
      }
      // Strict comparison: on a tie the earlier attribute wins.
      if (minvalue > curvalue) {
        minindex = attrindex;
        minvalue = curvalue;
        minsplits = cursplits;
      }
    }
    return new Object[] { minindex, minvalue, minsplits };
  }

  /**
   * Prints a decision tree to standard output, one node per line.
   *
   * @param obj   node to print: a {@link tree} or a leaf category
   * @param level depth of the node; controls the indentation prefix
   * @param from  label of the branch leading to this node, or {@code null}
   *              for the root
   */
  static void outputdecisiontree(Object obj, int level, Object from) {
    for (int i = 0; i < level; i++) {
      System.out.print("|-----");
    }
    if (from != null) {
      System.out.printf("(%s):", from);
    }
    if (obj instanceof tree) {
      tree tree = (tree) obj;
      String attrname = tree.getattribute();
      System.out.printf("[%s = ?]\n", attrname);
      for (Object attrvalue : tree.getattributevalues()) {
        Object child = tree.getchild(attrvalue);
        outputdecisiontree(child, level + 1, attrname + " = "
            + attrvalue);
      }
    } else {
      System.out.printf("[category = %s]\n", obj);
    }
  }

  /**
   * A sample: a set of named attribute values plus the category the sample
   * belongs to.
   */
  static class sample {
    private Map<String, Object> attributes = new HashMap<String, Object>();
    private Object category;

    /** Returns the value of the named attribute, or null if it is not set. */
    public Object getattribute(String name) {
      return attributes.get(name);
    }

    /** Sets the value of the named attribute. */
    public void setattribute(String name, Object value) {
      attributes.put(name, value);
    }

    /** Returns the category of this sample. */
    public Object getcategory() {
      return category;
    }

    /** Sets the category of this sample. */
    public void setcategory(Object category) {
      this.category = category;
    }

    /** Returns the string form of the attribute map. */
    @Override
    public String toString() {
      return attributes.toString();
    }
  }

  /**
   * Internal (non-leaf) node of a decision tree. It holds the attribute it
   * branches on and one child per attribute value; a child is either another
   * {@code tree} or a leaf category.
   */
  static class tree {
    private String attribute;
    private Map<Object, Object> children = new HashMap<Object, Object>();

    /** Creates a node that branches on the given attribute. */
    public tree(String attribute) {
      this.attribute = attribute;
    }

    /** Returns the attribute this node branches on. */
    public String getattribute() {
      return attribute;
    }

    /** Returns the child for the given attribute value, or null if none. */
    public Object getchild(Object attrvalue) {
      return children.get(attrvalue);
    }

    /** Attaches a child (subtree or leaf category) to an attribute value. */
    public void setchild(Object attrvalue, Object child) {
      children.put(attrvalue, child);
    }

    /** Returns the attribute values that have a branch. */
    public Set<Object> getattributevalues() {
      return children.keySet();
    }
  }
}

运行结果：

更多关于java算法相关内容感兴趣的读者可查看本站专题：《java数据结构与算法教程》、《java操作dom节点技巧总结》、《java文件与目录操作技巧汇总》和《java缓存操作技巧汇总》

希望本文所述对大家java程序设计有所帮助。

您可能感兴趣的文章:

如对本文有疑问，点击进行留言回复！！

云服务和SOA架构以及微服务架构的区别及联系

截止目前，如果之前有看我文章的，关于SSM框架的原理，应该都差不多理解了，毕竟都是看我写过源码的人了，接下来会进... [阅读全文]
大厂Java面试题解(45)-设计一个高并发系统

1 为什么面试官爱问这种面试题？因为招聘中大家都有这个要求。技术强的人，在互联网公司肯定负责过高并发模块，那夺取... [阅读全文]
Github上收藏83.5K的Java学习+面试指南，你不来学习一下？

这是这位大佬整理的所有Java学习指南的目录，学习Java,这一套足够了。一次性给你佩奇~基础容器并发JVM其他... [阅读全文]
Java之JSTL的基础运用

Java之JSTL的基础运用jstl简介jstl的jar包的导入if标签choose标签foreach标签1.普... [阅读全文]
Gitee多人协作项目开发使用的git指令以及合并分支

Gitee多人协作进行项目开发使用的git指令：多人仓库创建完成之后就可以开始进行多人项目开发的工作了git指令... [阅读全文]
Python第三章异常处理

Python第三章异常处理异常处理语句try - except 语句try-except-elsetry-ex... [阅读全文]
Java多线程CAS操作原理代码实例解析

cas操作号称无锁优化，也叫作自旋；对于一些常见的操作需要加锁，然后jdk就提供了一些以atomic开头的类，这些类内部自动带了锁，当然这里的锁并非是用sync... [阅读全文]
详解Java 包扫描实现和应用(Jar篇)

如果你曾经使用过 spring, 那你已经配过包扫描路径吧，那包扫描是怎么实现的呢？让我们自己写个包扫描上篇文章中介绍了使用 file 遍历的方式去进行包扫描... [阅读全文]
使用jenkins+maven+git发布jar包过程详解

1、新建maven项目2、配置git仓库3、在远程机器上执行脚本，这一步需要先配置能ssh远程机器a、安装publish over ssh 插件b、jenkin... [阅读全文]
Javaweb resin4如何配置端口虚拟目录

在java web容器大家族中，resin可以算的上最轻巧最快速的服务器了。我个人非常喜欢在产品开发阶段使用resin来测试和调试，因为开发阶段需要频繁地重启服... [阅读全文]

网友评论


验证码：

Java实现的决策树算法完整实例

2019年07月19日 | 移动技术网IT编程 | 我要评论

您可能感兴趣的文章:

相关文章:

网友评论