R语言进修系列(数据挖掘之决定树算法实现–ID3代码篇)
1、帮助类,用于计较进程和功效存储/// <summary> /// 决定树节点. /// </summary> public class DecisionTreeNode { /// <summary> /// 范例:分支或叶子 /// </summary> public string Type { get; set; } /// <summary> /// 要害字一般存当前属性因子 /// </summary> public string Key { get; set; } /// <summary> /// 判定值,叶子节点有效. /// </summary> public string DecisionValue { get; set; } /// <summary> /// 前一个属性因子,可以看作是分支条件. /// </summary> public string ParentFactor { get; set; } /// <summary> /// 当前节点的样本数量, /// </summary> public int CalcCount { get; set; } /// <summary> /// 当前节点的样本索引荟萃. /// </summary> public List<int> DataIndexes {get;set;} /// <summary> /// 分支节点荟萃. /// </summary> public Dictionary<string, DecisionTreeNode> Children { get; private set; } /// <summary> /// 父节点 /// </summary> public DecisionTreeNode Parent { get; set; } public DecisionTreeNode() { DataIndexes = new List<int>(); Children = new Dictionary<string, DecisionTreeNode>(); } } /// <summary> /// 用于计较进程存放数据.用数组不是很利便,这里回收字典,可以淘汰轮回次数. /// </summary> public class CalcNode { public string Key { get; set; } public string Type { get; set; } public int CalcCount { get; set; } public List<int> DataIndexes {get;set;} public Dictionary<string, CalcNode> Children { get; private set; } public CalcNode() { DataIndexes = new List<int>(); Children = new Dictionary<string, CalcNode>(); } public void AddChildren(string Key,string AType,int AIndex, int Count = 1) { if (Children.ContainsKey(Key) == false) { Children.Add(Key, new CalcNode()); } Children[Key].Key = Key; Children[Key].Type = AType; Children[Key].CalcCount += Count; Children[Key].DataIndexes.Add(AIndex); } }2、算法类,注释较量具体,有时间再写一篇道理文章
/// <summary> /// 决定树算法类,不适合持续性值。 /// </summary> public class DecisionTreeAlg { private string PrefixString = ” “; /// <summary> /// 构建决定树,决定分类属性约定放在第1列。 /// </summary> /// <param name=”Inputs”>行暗示属性,列为值,留意列等长</param> /// <param name=”PNode”>父节点</param> /// <param name=”PropertyNames”>测试属性名称</param> /// <param name=”TestProperties”>当前可用测试属性索引</param> /// <param name=”DefaultClassFactor”>缺省鉴别决定分类因子</param> /// <param name=”CallLevel”>用来测试输出节制,无实际浸染</param> /// <param name=”OutContents”>输出内容,为调试用</param> /// <param name=”PropertyFactors”>属性因子</param> public void BuildDecisionTree(int CallLevel, ref string OutContents, string[][] Inputs, DecisionTreeNode PNode, string[] PropertyNames, List<int> TestProperties, string DefaultClassFactor, Dictionary<string, List<string>> PropertyFactors) { string thePrefix = PrefixString.Substring(0, CallLevel * 2); CallLevel++; //假如没有测试属性,将当前节点设为叶子节点,选择高概率分类,然后返回 if (TestProperties.Count <= 1) { PNode.Type = “叶子”; PNode.DecisionValue = DefaultClassFactor; return; } //假如没有进修样本集,将当前节点设为叶子节点,选择高概率分类,然后返回 if (PNode.DataIndexes.Count <= 0) { PNode.Type = “叶子”; PNode.DecisionValue = DefaultClassFactor; return; }
if (PropertyFactors == null) { PropertyFactors = new Dictionary<string, List<string>>(); } //筹备存储遍历时的计数存储布局 Dictionary<string, CalcNode> thePropertyCount = new Dictionary<string, CalcNode>(); foreach (var theProIndex in TestProperties) { thePropertyCount.Add(PropertyNames[theProIndex], new CalcNode() { Key = PropertyNames[theProIndex] }); if (PropertyFactors.ContainsKey(PropertyNames[theProIndex]) == false) { PropertyFactors.Add(PropertyNames[theProIndex], new List<string>()); } } //遍历当前可遍历的数据,举办统计,为计较各属性熵做筹备 for (int n = 0; n < PNode.DataIndexes.Count; n++) { int theI = PNode.DataIndexes[n]; for (int k = 0; k < TestProperties.Count; k++) { int theJ = TestProperties[k]; var thePropertyCalcNode = thePropertyCount[PropertyNames[theJ]]; //对当前属性计数 thePropertyCalcNode.CalcCount++; //对第j个属性的当前因子计数 thePropertyCalcNode.AddChildren(Inputs[theJ][theI], “测试属性因子”, theI, 1); //对第j个属性的当前因子的主分类因子计数 thePropertyCalcNode.Children[Inputs[theJ][theI]].AddChildren(Inputs[0][theI], “主分类因子”, theI, 1); //统计归纳各属性因子,回收这种方法可以淘汰轮回. if (PropertyFactors[PropertyNames[theJ]].Contains(Inputs[theJ][theI]) == false) { PropertyFactors[PropertyNames[theJ]].Add(Inputs[theJ][theI]); } } } //计较信息增益量,获取具有较大信息增益属性 string theDefaultClassFactor = DefaultClassFactor; //初始化较大测试属性熵值. double theMaxEA = double.MinValue; //记录具有较大熵值属性的索引位置 int theMaxPropertyIndex = TestProperties[1]; //总信息熵值,其实就是分类属性的熵值. double theTotalEA = 0.0; //记录总的样本数,用于估算概率. double theTotalSimple = 0;
for(int theI=0;theI<TestProperties.Count;theI++) { int thePIndex_1 = TestProperties[theI]; if (thePIndex_1 == 0) { //主分类熵值计较,计较公式与测试属性有所差异. CalcNode theCalcNode = thePropertyCount[PropertyNames[thePIndex_1]]; double theCount = theCalcNode.CalcCount; theTotalSimple = theCount; double theMaxSubCount = -1; theTotalEA = 0.0; //求和(-Pj*log2(Pj)) foreach (var theSubNode in theCalcNode.Children) { if (theSubNode.Value.CalcCount > 0) { double thePj = theSubNode.Value.CalcCount / theCount; theTotalEA += 0 – thePj * Math.Log(thePj, 2); } if (theMaxSubCount < theSubNode.Value.CalcCount) { theMaxSubCount = theSubNode.Value.CalcCount; theDefaultClassFactor = theSubNode.Key; } //测试输出,跟踪计较路径. OutContents += “\r\n” + thePrefix + theCalcNode.CalcCount + “:: ” + PropertyNames[thePIndex_1] + “:: ” + theSubNode.Value.Type + ” :: ” + theSubNode.Key + ” :: ” + theSubNode.Value.CalcCount;
} } else { //测试属性熵值计较。 CalcNode theCalcNode = thePropertyCount[PropertyNames[thePIndex_1]]; double theJEA = 0.0; foreach (var theSubNode_1 in theCalcNode.Children) { if (theSubNode_1.Value.CalcCount > 0) { double theSjCount = theSubNode_1.Value.CalcCount; double theSj_1 = theSjCount / theTotalSimple; double theSj_2 = 0.0; foreach (var theSubNode_2 in theSubNode_1.Value.Children) { if (theSubNode_2.Value.CalcCount > 0) { double thePj_1 = Convert.ToDouble(theSubNode_2.Value.CalcCount) / theSjCount; theSj_2 += 0.0 – thePj_1 * Math.Log(thePj_1, 2); } OutContents += “\r\n” + thePrefix + theCalcNode.CalcCount + “:: ” + PropertyNames[thePIndex_1] + ” :: ” + theSubNode_1.Value.Type + ” :: ” + theSubNode_1.Key + ” :: ” + theSubNode_1.Value.CalcCount + theSubNode_2.Value.Type + ” :: ” + theSubNode_2.Key + ” :: ” + theSubNode_2.Value.CalcCount; } theJEA += theSj_1 * theSj_2; } } theJEA = theTotalEA – theJEA; //只记录较大熵值属性信息. if (theMaxEA < theJEA) { theMaxEA = theJEA; theMaxPropertyIndex = thePIndex_1; } } } //假如分类因子只有一个,则置当前节点为叶子节点,配置鉴定为当前分类因子,然后返回 if (thePropertyCount[PropertyNames[0]].Children.Count <= 1) { PNode.Type = “叶子”; PNode.DecisionValue = theDefaultClassFactor; return; } //具有多个分类因子,还剩有测试属性,则设当前节点为分支节点,筹备分支. PNode.Type = “分支”; //1选取较大增益信息量测试属性,做分支处理惩罚,做处理惩罚,留意属性一旦处理惩罚,将不在后续节点中再处理惩罚 //因此需要在测试属性荟萃中删除所选测试属性.留意保持分类属性在开始索引处(0). PNode.Key = PropertyNames[theMaxPropertyIndex];
CalcNode theCalcNode_2 = thePropertyCount[PropertyNames[theMaxPropertyIndex]]; List<string> theFactors = PropertyFactors[PropertyNames[theMaxPropertyIndex]]; List<int> theAvailableTestPs = new List<int>(); for (int i = 0; i < TestProperties.Count; i++) { if (theMaxPropertyIndex != TestProperties[i]) { theAvailableTestPs.Add(TestProperties[i]); } } //对所选测试属性的所有因子举办处理惩罚. foreach (var theFactor_1 in theFactors) { //假如当前因子不在计较中,则添加一个叶子节点,鉴定为高概率分类。 if (theCalcNode_2.Children.ContainsKey(theFactor_1) == false) { DecisionTreeNode theNode_1 = new DecisionTreeNode(); theNode_1.ParentFactor = theFactor_1; theNode_1.CalcCount = 0; theNode_1.DecisionValue = theDefaultClassFactor; theNode_1.Parent = PNode; theNode_1.Key = theFactor_1; theNode_1.Type = “叶子”; PNode.Children.Add(theFactor_1, theNode_1); continue; } //假如当前因子存在,但不存在样本,则添加一个叶子节点,鉴定为高概率分类。 if (theCalcNode_2.Children[theFactor_1].CalcCount<=0) { DecisionTreeNode theNode_1 = new DecisionTreeNode(); theNode_1.ParentFactor = theFactor_1; theNode_1.CalcCount = 0; theNode_1.DecisionValue = theDefaultClassFactor; theNode_1.Parent = PNode; theNode_1.Type = “叶子”; theNode_1.Key = theFactor_1; PNode.Children.Add(theFactor_1, theNode_1); continue; } //假如存在,且有进修样本,则添加一个节点,并以此节点递归处理惩罚. DecisionTreeNode theNode_2 = new DecisionTreeNode(); theNode_2.ParentFactor = theFactor_1; theNode_2.Parent = PNode; theNode_2.Key = theFactor_1; theNode_2.CalcCount = theCalcNode_2.Children[theFactor_1].CalcCount; theNode_2.DataIndexes.AddRange(theCalcNode_2.Children[theFactor_1].DataIndexes); PNode.Children.Add(theFactor_1, theNode_2); BuildDecisionTree(CallLevel, ref OutContents, Inputs, theNode_2, PropertyNames, theAvailableTestPs, theDefaultClassFactor, PropertyFactors); } }
}3、测试代码:
private void button1_Click(object sender, EventArgs e) { DecisionTreeAlg theAlg = new DecisionTreeAlg(); string[][] theInputs = new string[4][]; theInputs[0] = new string[] { “no”, “yes”, “yes”, “yes”, “yes”, “yes”, “no”, “yes”, “yes”, “no” }; theInputs[1] = new string[] { “s”, “s”, “l”, “m”, “l”, “m”, “m”, “l”, “m”, “s” }; theInputs[2] = new string[] { “s”, “l”, “m”, “m”, “m”, “l”, “s”, “m”, “s”, “s” }; theInputs[3] = new string[] { “no”, “yes”, “yes”, “yes”, “no”, “no”, “no”, “no”, “no”, “yes” };
string[] thePropertyName = new string[] {“是否真实帐号”,”日志密度”,”挚友密度”,”是否真实头像” };
DecisionTreeNode theRootNode = new DecisionTreeNode(); theRootNode.DataIndexes.AddRange(new List<int>() { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 });
List<int> theTestPs = new List<int>() { 0, 1, 2, 3 }; string theOuts = “”; theAlg.BuildDecisionTree(0,ref theOuts, theInputs, theRootNode, thePropertyName, theTestPs, “”, null); this.treeView1.Nodes.Clear(); TreeNode theRoot = new TreeNode(); this.treeView1.Nodes.Add(theRoot); VisitTree(theRoot, theRootNode); this.textBox1.Text = theOuts; } private void VisitTree(TreeNode PNode, DecisionTreeNode PDNode) { PNode.Text = PDNode.Key + “(” + PDNode.Type + “)[鉴定:”+PDNode.DecisionValue +”]”; foreach (var theNode in PDNode.Children.Values) { TreeNode theTmpNode = new TreeNode(); PNode.Nodes.Add(theTmpNode); VisitTree(theTmpNode, theNode); } }