1.1. Pandas分析步骤
- 载入数据
- 将 外链点击数 进行 COUNT。类似如下SQL:
1
2
3
4
5
6
|
SELECT reference_url,
count(*)
FROM log
GROUP BY reference_url
ORDER BY count(*) DESC
LIMIT 0, 100;
|
1.2. 代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
|
cat pd_ng_log_stat.py
#!/usr/bin/env python
#-*- coding: utf-8 -*-
from ng_line_parser import NgLineParser
import pandas as pd
import socket
import struct
class PDNgLogStat(object):
    """Load parsed access-log lines into a pandas DataFrame and summarise them.

    Each log line is handed to an ``NgLineParser`` instance; the value
    returned by its ``to_dict()`` becomes one row of ``self.df``.
    """

    def __init__(self):
        # One parser instance is reused for every line of every file.
        self.ng_line_parser = NgLineParser()

    def _log_line_iter(self, pathes):
        """Yield one parsed record per line of each file in ``pathes``.

        ``pathes`` is iterated, so it must be an iterable of file paths
        (a bare string would be walked character by character).
        Files are read lazily, one line at a time.
        """
        for path in pathes:
            with open(path, 'r') as f:
                for line in f:
                    # The parser keeps state: parse() consumes the line and
                    # to_dict() exposes the result (presumably a dict of
                    # fields -- confirm against NgLineParser).
                    self.ng_line_parser.parse(line)
                    yield self.ng_line_parser.to_dict()

    def load_data(self, path):
        """Build ``self.df`` from the log files named in ``path``.

        Despite the singular name, ``path`` is forwarded unchanged to
        ``_log_line_iter`` and therefore must be an iterable of paths.
        """
        self.df = pd.DataFrame(self._log_line_iter(path))

    def url_ref_stat(self):
        """Count rows per ``reference_url``, most frequent first.

        Returns a DataFrame indexed by ``reference_url`` with a single
        ``count`` column, sorted by that column in descending order.
        """
        # Group by column name rather than by an external Series so the
        # result does not depend on how pandas matches the key to the frame.
        counts = self.df.groupby('reference_url')['reference_url'].count()
        return counts.to_frame('count').sort_values(by='count', ascending=False)
def main():
    """Load the sample access log and print click counts per referring URL."""
    file_pathes = ['www.trustauth.cn.access.log']
    pd_ng_log_stat = PDNgLogStat()
    pd_ng_log_stat.load_data(file_pathes)
    # Report how often each referring URL appears in the log.
    # The parenthesised single-argument form prints identically on Python 2
    # and is also valid on Python 3.
    print(pd_ng_log_stat.url_ref_stat())
# Run the report only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
|
运行统计和输出结果
1
2
3
4
5
6
7
8
9
10
11
12
13
|
python pd_ng_log_stat.py
count
reference_url
- 574546
136
[231 rows x 1 columns]
|
文章转载来自:trustauth.cn