1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
import sys

import pandas as pd
def c1_create(dataset):
    """Build the sorted list of candidate 1-itemsets from *dataset*.

    Collects every distinct item across all transactions (e.g. parts
    A, B, C, D) and returns them as single-element frozensets, sorted
    by item.  frozenset (unlike set) is hashable, so these itemsets
    can later be used as dictionary keys in the support table.

    Was: a list with an O(n) ``[item] in c1`` membership test per item
    (quadratic overall); a set comprehension gives the same result in
    O(n log n) dominated by the final sort.
    """
    items = {item for transaction in dataset for item in transaction}
    return [frozenset([item]) for item in sorted(items)]
def apriori_gen(lk, k):
    """Join frequent (k-1)-itemsets in *lk* into candidate k-itemsets.

    Two itemsets are merged when their first k-2 sorted elements agree
    (the classic Apriori join step), which guarantees the union has
    exactly k elements and avoids generating duplicates.
    """
    candidates = []
    count = len(lk)
    for i in range(count):
        # Hoist the invariant prefix of the outer itemset out of the inner loop.
        prefix_i = sorted(lk[i])[: k - 2]
        for j in range(i + 1, count):
            if prefix_i == sorted(lk[j])[: k - 2]:
                candidates.append(lk[i] | lk[j])
    return candidates
def support_cal(d, ck, minsupport):
    """Score candidate itemsets *ck* against the transaction counts *d*.

    *d* maps each distinct transaction (a frozenset) to its occurrence
    count.  Every candidate that reaches *minsupport* is collected (most
    recent first, mirroring the original insert-at-front order), while
    the support of every counted candidate is recorded in the returned
    support dictionary.

    Returns (frequent_itemsets, support_by_itemset).
    """
    occurrence = {}
    for transaction in d:
        for candidate in ck:
            if candidate.issubset(transaction):
                occurrence[candidate] = occurrence.get(candidate, 0) + d[transaction]
    total = float(sum(d.values()))
    frequent = []
    supportdata = {}
    for itemset, hits in occurrence.items():
        ratio = hits / total
        if ratio >= minsupport:
            frequent.insert(0, itemset)
        supportdata[itemset] = ratio
    return frequent, supportdata
def conf_cal(freqset, supportdata, rulelist, minconf):
    """Derive single-consequent rules from one frequent itemset.

    For each item in *freqset*, tries the rule (freqset - item) -> item,
    computing confidence = sup(freqset) / sup(lhs) and
    lift = sup(freqset) / (sup(rhs) * sup(lhs)).  Rules meeting
    *minconf* with lift strictly above 1 (i.e. positively correlated)
    are appended to *rulelist* as
    (lhs, rhs, support rounded to 4 places, confidence, lift).

    Mutates and returns *rulelist*.
    """
    whole_support = supportdata[freqset]
    for element in freqset:
        rhs = frozenset([element])
        lhs = freqset - rhs
        confidence = whole_support / supportdata[lhs]
        lift = whole_support / (supportdata[rhs] * supportdata[lhs])
        if confidence >= minconf and lift > 1:
            rulelist.append((lhs, rhs, round(whole_support, 4), confidence, lift))
    return rulelist
def rules_gen(l, supportdata, minconf):
    """Generate association rules from the frequent itemset levels *l*.

    *l* is a list of levels (l[0] = 1-itemsets, l[1] = 2-itemsets, ...);
    only levels >= 2 can yield rules, so iteration starts at index 1.
    Progress lines ("stage#percent") are printed and flushed so a
    parent process can track completion.

    Returns the accumulated rule list from conf_cal.

    Fix: when no itemset met the support threshold, apriori_main
    returns an empty list and ``int(40 / len(l))`` raised
    ZeroDivisionError; guard that case by returning no rules.
    """
    bigrulelist = []
    if not l:
        # No frequent itemsets at all -> no rules (and avoids 40 / 0).
        return bigrulelist
    step = int(40 / len(l))
    for i in range(1, len(l)):
        for freqset in l[i]:
            conf_cal(freqset, supportdata, bigrulelist, minconf)
        print('正在进行部件组合的关联分析:' + str(i) + '/' + str(len(l)) + '#' + str(50 + i * step))
        sys.stdout.flush()
    return bigrulelist
def apriori_main(dataset, minsupport):
    """Mine frequent itemsets from *dataset* with the Apriori algorithm.

    *dataset* maps each distinct transaction (frozenset) to its count.
    Levels of frequent itemsets are grown with apriori_gen/support_cal
    until a level comes back empty; that empty level is dropped before
    returning.  Progress lines are printed and flushed per iteration.

    Returns (levels, support_by_itemset).

    Fix: previously read ``maxconflen`` from a global defined only in
    the __main__ block, so the function crashed when imported.  The
    longest transaction bounds the deepest possible frequent itemset,
    so compute that bound locally from *dataset* (same value the main
    block computed).
    """
    maxconflen = max((len(transaction) for transaction in dataset), default=1)
    step = max(int(40 / maxconflen), 1)
    c1 = c1_create(dataset)
    l1, supportdata = support_cal(dataset, c1, minsupport)
    print('正在进行迭代分析:1/' + str(maxconflen) + '#' + str(10 + step))
    sys.stdout.flush()
    la = [l1]
    k = 2
    while len(la[k - 2]) > 0:
        ck = apriori_gen(la[k - 2], k)
        lk, supk = support_cal(dataset, ck, minsupport)
        supportdata.update(supk)
        la.append(lk)
        print('正在进行迭代分析:' + str(k) + '/' + str(maxconflen) + '#' + str(10 + k * step))
        sys.stdout.flush()
        k += 1
    # The loop always appends one empty level before terminating; drop it.
    del la[-1]
    return la, supportdata
if __name__ == '__main__':
    # Mining thresholds: minimum support and minimum rule confidence.
    support = 0.3
    confidence = 0.7
    dataSource = [['r', 'z', 'h', 'j', 'p'],
                  ['z', 'y', 'x', 'w', 'v', 'u', 't', 's'],
                  ['z'],
                  ['r', 'x', 'n', 'o', 's'],
                  ['y', 'r', 'x', 'z', 'q', 't', 'p'],
                  ['y', 'z', 'x', 'e', 'q', 's', 't', 'm']]
    # Collapse identical transactions into frozenset -> count, and track
    # the longest transaction (module-level maxconflen is read by
    # apriori_main as the iteration-depth bound).
    dataSet = {}
    maxconflen = 0
    for order in dataSource:
        key = frozenset(order)
        dataSet[key] = dataSet.get(key, 0) + 1
        maxconflen = max(maxconflen, len(order))

    L, supportData = apriori_main(dataSet, support)
    rules = pd.DataFrame(rules_gen(L, supportData, confidence))
    rules.rename(index=str,
                 columns={0: 'LHS', 1: 'RHS', 2: 'Support', 3: 'Confidence', 4: 'Lift'},
                 inplace=True)
    print(rules)
|