1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236
| from multiprocessing import Pool from numpy import linspace from math import ceil import pandas as pd import sys
POOL_NUM = 4
class TreeNode:
    """A node of an FP-tree.

    Each node records one item, its occurrence count along the path from
    the root, its parent node, its children (keyed by item name), and a
    `link` pointer chaining together all tree nodes that carry the same item.
    """

    def __init__(self, name, count, parent):
        self.name = name        # item identifier carried by this node
        self.count = count      # occurrences accumulated on this path
        self.parent = parent    # parent TreeNode; None only for the root
        self.children = {}      # item name -> child TreeNode
        self.link = None        # next node with the same item (node-link chain)

    def increase(self, count):
        """Add `count` further occurrences to this node."""
        self.count += count

    def display(self, index=1):
        """Print this subtree, indenting each level by one extra space."""
        print(' ' * index, self.name, ' ', self.count)
        for subtree in self.children.values():
            subtree.display(index + 1)
def update_header(node, target): """更新频繁集链表指针""" while node.link is not None: node = node.link node.link = target
def update_fptree(items, intree, headertable, count): """添加新的配置(items,count)到FP树(intree)""" if items[0] in intree.children: intree.children[items[0]].increase(count) else: intree.children[items[0]] = TreeNode(items[0], count, intree) if headertable[items[0]][1] is None: headertable[items[0]][1] = intree.children[items[0]] else: update_header(headertable[items[0]][1], intree.children[items[0]]) if len(items) > 1: update_fptree(items[1::], intree.children[items[0]], headertable, count)
def create_fptree(dataset, minnum):
    """Build an FP-tree from `dataset` ({frozenset(transaction): multiplicity}),
    keeping only items whose total count reaches `minnum`.

    Returns (root, headertable), where headertable maps
    item -> [total count, head of node-link chain], or (None, None) when no
    item is frequent enough.
    """
    # First pass: total occurrences per item across all transactions.
    totals = {}
    for transet, occ in dataset.items():
        for item in transet:
            totals[item] = totals.get(item, 0) + occ
    # Prune infrequent items; value becomes [count, chain head placeholder].
    headertable = {item: [c, None] for item, c in totals.items() if c >= minnum}
    if not headertable:
        return None, None
    freq_itemset = set(headertable)
    rettree = TreeNode('Null Set', 1, None)
    # Second pass: insert each transaction, items ordered by (count, item)
    # descending so equal-count ties break deterministically.
    for transet, occ in dataset.items():
        filtered = {item: headertable[item][0] for item in transet if item in freq_itemset}
        if filtered:
            ordered = [pair[0] for pair in
                       sorted(filtered.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)]
            update_fptree(ordered, rettree, headertable, occ)
    return rettree, headertable
def ascend_fptree(leafnode, prefixpath): """递归回溯FP数,补全一条前缀路径,挖掘条件模式基的内部函数""" if leafnode.parent is not None: prefixpath.append(leafnode.name) ascend_fptree(leafnode.parent, prefixpath)
def find_prefixpath(basepart, myheadertab): """挖掘条件模式基""" node = myheadertab[basepart][1] condparts = {} while node is not None: prefixpath = [] ascend_fptree(node, prefixpath) if len(prefixpath) > 1: condparts[frozenset(prefixpath[1:])] = node.count node = node.link return condparts
def mine_fptree(headertable, minnums, prefix, freq_itemlist, needprint): """递归挖掘频繁项集,preFix,freq_itemlist存储已挖掘到的频繁集,留空就好""" biglist = [(v[0], v[1][0]) for v in sorted(headertable.items(), key=lambda q: (q[1][0], q[0]))] biglistnum = len(biglist) printdict = {} if needprint: checkpoints = linspace(11, 49, biglistnum, dtype=int) printdict[0] = checkpoints[0] for i in range(1, biglistnum): if checkpoints[i] != checkpoints[i - 1]: printdict[i] = checkpoints[i] for i in range(biglistnum): if i in printdict.keys(): print('正在由条件模式基挖掘频繁项#' + str(printdict[i])) basepart = biglist[i] newfreqset = prefix.copy() newfreqset.add(basepart) freq_itemlist.append(newfreqset) condpartbases = find_prefixpath(basepart[0], headertable) mycondtree, myhead = create_fptree(condpartbases, minnums) if myhead is not None: mine_fptree(myhead, minnums, newfreqset, freq_itemlist, False)
def conf_cal(freqset, supportdata, partsnum, rulelist, minconf): """计算置信度和提升度,生成规则的内部函数""" sd = supportdata[0] supportdata0 = supportdata[partsnum - 2] supportdata1 = supportdata[partsnum - 1] for item in freqset: conseq = frozenset([item]) conf = supportdata1[freqset] / supportdata0[freqset - conseq] lift = supportdata1[freqset] / (sd[conseq] * supportdata0[freqset - conseq]) if conf >= minconf and lift > 1: rulelist.append((freqset - conseq, conseq, round(supportdata1[freqset], 4), conf, lift)) return rulelist
def rules_gen(l, supportdata, minconf, partsnum): """生成规则""" ln = l[partsnum - 1] maxnum = len(l) bigrulelist = [] for freqset in ln: conf_cal(freqset, supportdata, partsnum, bigrulelist, minconf) print('完成' + str(partsnum) + '部件组合的关联分析#' + str(50 + int(partsnum / maxnum * 40))) sys.stdout.flush() return bigrulelist
def encode_dataset(datasource): """编码保证在建立和挖掘fp树的过程中item排序的一致性""" dataset = {} encodedict = {} decodelist = [] maxconflen = 0 i = 0 for order in datasource: maxconflen = max(maxconflen, len(order)) for j in range(len(order)): try: order[j] = encodedict[order[j]] except KeyError: encodedict[order[j]] = i decodelist.append(order[j]) order[j] = i i += 1 if frozenset(order) in dataset: dataset[frozenset(order)] += 1 else: dataset[frozenset(order)] = 1 return dataset, maxconflen, decodelist
def convert_freqitems(decodelist, freqitems, totalnums, maxconflen): """解码频繁项集""" la = [] supportdata = {} for i in range(maxconflen): la.append([]) supportdata[i] = {} for item in freqitems: conf = frozenset([decodelist[q[0]] for q in item]) la[len(conf) - 1].append(conf) supportdata[len(conf) - 1][conf] = min([q[1] for q in item]) / totalnums ll = [l1 for l1 in la if l1 != []] for key in list(supportdata.keys()): if supportdata[key] == {}: del (supportdata[key]) return ll, supportdata
if __name__ == '__main__':
    # Relative support / confidence thresholds for the FP-growth run.
    support = 0.3
    confidence = 0.7
    # Sample transaction database (classic FP-growth demo data).
    dataSource = [['r', 'z', 'h', 'j', 'p'],
                  ['z', 'y', 'x', 'w', 'v', 'u', 't', 's'],
                  ['z'],
                  ['r', 'x', 'n', 'o', 's'],
                  ['y', 'r', 'x', 'z', 'q', 't', 'p'],
                  ['y', 'z', 'x', 'e', 'q', 's', 't', 'm']]
    # Encode items as integers for a stable ordering, then build the FP-tree
    # using the absolute support count derived from the relative threshold.
    dataSet, maxConfLen, decodeList = encode_dataset(dataSource)
    supportNum = ceil(support * sum(dataSet.values()))
    myFPtree, myHeaderTab = create_fptree(dataSet, supportNum)
    # Mine frequent itemsets (progress printing enabled at the top level),
    # then decode them back to item labels grouped by itemset size.
    freqItems = []
    mine_fptree(myHeaderTab, supportNum, set([]), freqItems, True)
    L, supportData = convert_freqitems(decodeList, freqItems, sum(dataSet.values()), maxConfLen)
    # Fan rule generation out over a process pool: one async task per
    # itemset size >= 2; results are collected in size order below.
    resLst = [[]] * (len(L) - 1)
    p = Pool(POOL_NUM)
    for n in range(1, len(L)):
        res = p.apply_async(rules_gen, args=(L, supportData, confidence, n + 1,))
        resLst[n - 1] = res
    p.close()
    p.join()
    # Gather every worker's rule list into one flat list.
    rule = []
    for n in range(len(resLst)):
        rule += resLst[n].get()
    # Present the rules as a DataFrame with named columns.
    rules = pd.DataFrame(rule)
    rules.rename(index=str, columns={0: 'LHS', 1: 'RHS', 2: 'Support', 3: 'Confidence', 4: 'Lift'}, inplace=True)
    print(rules)
|