首页 » Python » python pandas 实现RFM模型[转]

python pandas 实现RFM模型[转]

 

原文地址:https://www.cnblogs.com/Erick-L/p/10755956.html

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import pandas as pd
import numpy as np


df = pd.read_csv('./zue_164466.csv')

df['ptdate'] = pd.to_datetime(df['ptdate'],format='%Y-%m-%d')
df['dateDiff'] = pd.to_datetime('today')-df['ptdate']
df['dateDiff'] = df['dateDiff'].dt.days

R_Agg = df.groupby(by=['user_email','product_name'])['dateDiff'].agg({'RecencyAgg': np.min})

F_Agg = df.groupby(by=['user_email','product_name'])['ptdate'].agg({'FrequencyAgg': np.size})

M_Agg = df.groupby(by=['user_email','product_name'])['totalcost'].agg({'MonetaryAgg': np.sum})

aggData = R_Agg.join(F_Agg).join(M_Agg)

bins = aggData.RecencyAgg.quantile(q=[0, 0.2, 0.4, 0.6, 0.8, 1],interpolation='nearest')
bins[0] = 0
labels = [5, 4, 3, 2, 1]
R_S = pd.cut(aggData.RecencyAgg, bins, labels=labels)

#
bins = aggData.FrequencyAgg.quantile(q=[0, 0.2, 0.4, 0.6, 0.8, 1],interpolation='nearest')
bins[0] = 0
labels = [1, 2, 3, 4, 5]
F_S = pd.cut(aggData.FrequencyAgg, bins, labels=labels)


bins = aggData.MonetaryAgg.quantile(q=[0, 0.2, 0.4, 0.6, 0.8, 1],interpolation='nearest')
bins[0] = 0
labels = [1, 2, 3, 4, 5]
M_S = pd.cut(aggData.MonetaryAgg,bins, labels=labels)


aggData['R_S']=R_S
aggData['F_S']=F_S
aggData['M_S']=M_S


aggData['RFM'] = 100*R_S.astype(int) + 10*F_S.astype(int) + 1*M_S.astype(int)


bins = aggData.RFM.quantile(q=[0, 0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 1],interpolation='nearest')
bins[0] = 0
labels = [1, 2, 3, 4, 5, 6, 7, 8]
aggData['level'] = pd.cut(aggData.RFM, bins, labels=labels)

aggData = aggData.reset_index()

fe = aggData.sort_values(['level', 'RFM'], ascending=[0, 0])

dd=aggData.groupby(by=['level'])['user_email','product_name'].agg({'size':np.size})

print(fe.head())
fe.to_csv('./rfm_data.csv',index=False)
print("---------------")
print(dd)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# -*- coding: utf-8 -*-

'''
描述:案例-基于RFM的用户价值度模型
程序输入:sales.csv
程序输出:RFM得分数据写本地文件sales_rfm_score.csv和数据表(sales_rfm_score)
'''
# 导入库
import time  # 导入时间库
import numpy as np  # 导入numpy库
import pandas as pd  # 导入pandas库
import mysql.connector  # 导入mysql连接库

# 读取数据
dtypes = {'ORDERDATE': object, 'ORDERID': object, 'AMOUNTINFO': np.float32}  # 设置每列数据类型
raw_data = pd.read_csv('sales.csv', dtype=dtypes, index_col='USERID')  # 读取数据文件

# 数据审查和校验
# 数据概览
print ('Data Overview:')
print (raw_data.head(4))  # 打印原始数据前4条
print ('-' * 30)
print ('Data DESC:')
print (raw_data.describe())  # 打印原始数据基本描述性信息
print ('-' * 60)

# 缺失值审查
na_cols = raw_data.isnull().any(axis=0)  # 查看每一列是否具有缺失值
print ('NA Cols:')
print (na_cols)  # 查看具有缺失值的列
print ('-' * 30)
na_lines = raw_data.isnull().any(axis=1)  # 查看每一行是否具有缺失值
print ('NA Recors:')
print ('Total number of NA lines is: {0}'.format(na_lines.sum()))  # 查看具有缺失值的行总记录数
print (raw_data[na_lines])  # 只查看具有缺失值的行信息
print ('-' * 60)

# 数据异常、格式转换和处理
# 异常值处理
sales_data = raw_data.dropna()  # 丢弃带有缺失值的行记录
sales_data = sales_data[sales_data['AMOUNTINFO'] > 1]  # 丢弃订单金额<=1的记录

# 日期格式转换
sales_data['ORDERDATE'] = pd.to_datetime(sales_data['ORDERDATE'], format='%Y-%m-%d')  # 将字符串转换为日期格式
print ('Raw Dtypes:')
print (sales_data.dtypes)  # 打印输出数据框所有列的数据类型
print ('-' * 60)

# 数据转换
recency_value = sales_data['ORDERDATE'].groupby(sales_data.index).max()  # 计算原始最近一次订单时间
frequency_value = sales_data['ORDERDATE'].groupby(sales_data.index).count()  # 计算原始订单频率
monetary_value = sales_data['AMOUNTINFO'].groupby(sales_data.index).sum()  # 计算原始订单总金额

# 计算RFM得分
# 分别计算R、F、M得分
deadline_date = pd.datetime(2017, 01, 01)  # 指定一个时间节点,用于计算其他时间与该时间的距离
r_interval = (deadline_date - recency_value).dt.days  # 计算R间隔
r_score = pd.cut(r_interval, 5, labels=[5, 4, 3, 2, 1])  # 计算R得分
f_score = pd.cut(frequency_value, 5, labels=[1, 2, 3, 4, 5])  # 计算F得分
m_score = pd.cut(monetary_value, 5, labels=[1, 2, 3, 4, 5])  # 计算M得分

# R、F、M数据合并
rfm_list = [r_score, f_score, m_score]  # 将r、f、m三个维度组成列表
rfm_cols = ['r_score', 'f_score', 'm_score']  # 设置r、f、m三个维度列名
rfm_pd = pd.DataFrame(np.array(rfm_list).transpose(), dtype=np.int32, columns=rfm_cols,
                      index=frequency_value.index)  # 建立r、f、m数据框
print ('RFM Score Overview:')
print (rfm_pd.head(4))
print ('-' * 60)

# 计算RFM总得分
# 方法一:加权得分
rfm_pd['rfm_wscore'] = rfm_pd['r_score'] * 0.6 + rfm_pd['f_score'] * 0.3 + rfm_pd['m_score'] * 0.1
# 方法二:RFM组合
rfm_pd_tmp = rfm_pd.copy()
rfm_pd_tmp['r_score'] = rfm_pd_tmp['r_score'].astype('string')
rfm_pd_tmp['f_score'] = rfm_pd_tmp['f_score'].astype('string')
rfm_pd_tmp['m_score'] = rfm_pd_tmp['m_score'].astype('string')
rfm_pd['rfm_comb'] = rfm_pd_tmp['r_score'].str.cat(rfm_pd_tmp['f_score']).str.cat(rfm_pd_tmp['m_score'])

# 打印输出和保存结果
# 打印结果
print ('Final RFM Scores Overview:')
print (rfm_pd.head(4))  # 打印数据前4项结果
print ('-' * 30)
print ('Final RFM Scores DESC:')
print (rfm_pd.describe())

# 保存RFM得分到本地文件
rfm_pd.to_csv('sales_rfm_score.csv')  # 保存数据为csv

# 保存RFM得分到MySQL数据库
# 设置要写库的数据库连接信息
table_name = 'sales_rfm_score'  # 要写库的表名
# 数据库基本信息
config = {'host': '127.0.0.1',  # 默认127.0.0.1
          'user': 'root',  # 用户名
          'password': '123456',  # 密码
          'port': 3306,  # 端口,默认为3306
          'database': 'python_data',  # 数据库名称
          'charset': 'gb2312'  # 字符编码
          }
con = mysql.connector.connect(**config)  # 建立mysql连接
cursor = con.cursor()  # 获得游标
# 查找数据库是否存在目标表,如果没有则新建
cursor.execute("show tables")  #
table_object = cursor.fetchall()  # 通过fetchall方法获得所有数据
table_list = []  # 创建库列表
for t in table_object:  # 循环读出所有库
    table_list.append(t[0])  # 每个每个库追加到列表
if not table_name in table_list:  # 如果目标表没有创建
    cursor.execute('''
    CREATE TABLE %s (
    userid               VARCHAR(20),
    r_score               int(2),
    f_score              int(2),
    m_score              int(2),
    rfm_wscore              DECIMAL(10,2),
    rfm_comb              VARCHAR(10),
    insert_date              VARCHAR(20)
    )ENGINE=InnoDB DEFAULT CHARSET=gb2312
    ''' % table_name)  # 创建新表
# 将数据写入数据库
user_id = rfm_pd.index  # 索引列
rfm_wscore = rfm_pd['rfm_wscore']  # RFM加权得分列
rfm_comb = rfm_pd['rfm_comb']  # RFM组合得分列
timestamp = time.strftime('%Y-%m-%d', time.localtime(time.time()))  # 写库日期
print ('Begin to insert data into table {0}...'.format(table_name))  # 输出开始写库的提示信息
for i in range(rfm_pd.shape[0]):  # 设置循环次数并依次循环
    insert_sql = "INSERT INTO `%s` VALUES ('%s',%s,%s,%s,%s,'%s','%s')" % \
                 (table_name, user_id[i], r_score.iloc[i], f_score.iloc[i], m_score.iloc[i], rfm_wscore.iloc[i],
                  rfm_comb.iloc[i], timestamp)  # 写库SQL依据
    cursor.execute(insert_sql)  # 执行SQL语句,execute函数里面要用双引号
    con.commit()  # 提交命令
cursor.close()  # 关闭游标
con.close()  # 关闭数据库连接
print ('Finish inserting, total records is: %d' % (i + 1))  # 打印写库结果
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import pandas as pd
import numpy as np


df = pd.read_excel('./zue_edu.xlsx',sheet_name='Q2未消费的',index_col='user_id')
df['ptdate'] = pd.to_datetime(df['ptdate'],format='%Y-%m-%d')

recency_value = df['ptdate'].groupby(df['product_name']).max()  # 计算原始最近一次订单时间
r_interval = (pd.to_datetime('2019-05-06',format='%Y-%m-%d') - recency_value).dt.days  # 计算R间隔
frequency_value = df['ptdate'].groupby([df['user_email'],df['product_name']]).count()  # 计算原始订单频率
monetary_value = df['cost'].groupby(df['product_name']).sum()  # 计算原始订单总金额
frequency_value = frequency_value.groupby('product_name').max()

data_list = [recency_value,r_interval, frequency_value, monetary_value]
data_pd = pd.DataFrame(np.array(data_list).transpose(), columns=['上次消费时间','已停投天数', '有消费天数', '消费金额'] ,
                      index=frequency_value.index)  # 建立原始数据框


# 计算RFM得分
# 分别计算R、F、M得分
r_score = pd.cut(r_interval, 5, labels=[5, 4, 3, 2, 1])  # 计算R得分
f_score = pd.cut(frequency_value, 5, labels=[1, 2, 3, 4, 5])  # 计算F得分
m_score = pd.cut(monetary_value, 5, labels=[1, 2, 3, 4, 5])  # 计算M得分

# R、F、M数据合并
rfm_list = [r_score, f_score, m_score]  # 将r、f、m三个维度组成列表
rfm_cols = ['r_score', 'f_score', 'm_score']  # 设置r、f、m三个维度列名
rfm_pd = pd.DataFrame(np.array(rfm_list).transpose(), dtype=np.int32, columns=rfm_cols,
                      index=frequency_value.index)  # 建立r、f、m数据框

# 计算RFM总得分
# 方法一:加权得分
rfm_pd['rfm_wscore'] = rfm_pd['r_score'] * 0.6 + rfm_pd['f_score'] * 0.3 + rfm_pd['m_score'] * 0.1

# 方法二:RFM组合
rfm_pd_tmp = rfm_pd.copy()
rfm_pd_tmp['r_score'] = rfm_pd_tmp['r_score'].astype('str')
rfm_pd_tmp['f_score'] = rfm_pd_tmp['f_score'].astype('str')
rfm_pd_tmp['m_score'] = rfm_pd_tmp['m_score'].astype('str')
rfm_pd['rfm_comb'] = rfm_pd_tmp['r_score'].str.cat(rfm_pd_tmp['f_score']).str.cat(rfm_pd_tmp['m_score'])

rfm_pd['rfm_comb'] = rfm_pd['rfm_comb'].astype('int')

rfm_pd['最近级别'] = rfm_pd['r_score'].replace({5:'优',4:'良',3:'一般',2:'差',1:'非常差'})
rfm_pd['频次'] = rfm_pd['f_score'].replace({5:'优',4:'良',3:'一般',2:'差',1:'非常差'})
rfm_pd['金额'] = rfm_pd['m_score'].replace({5:'优',4:'良',3:'一般',2:'差',1:'非常差'})

bins = rfm_pd.rfm_wscore.quantile(q=[0, 0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 1],interpolation='nearest')
bins[0] = 0  #起点为0
labels = [1, 2, 3, 4, 5, 6, 7, 8]
rfm_pd['level'] = pd.cut(rfm_pd.rfm_wscore, bins, labels=labels)

rfm_pd = rfm_pd.reset_index()
data_pd = data_pd.reset_index()
fe = rfm_pd.sort_values('rfm_comb', ascending=0)
fina = fe.merge(data_pd,on='product_name')

# 打印结果
print(fina.head())
fina.to_excel('./rfm_edu_Q2non.xlsx',index=False)

原文链接:python pandas 实现RFM模型[转],转载请注明来源!

0