1.1. Pandas分析步骤
- 载入数据
- 按浏览器(browser)分组进行 COUNT,并按访问次数从高到低排名。类似如下SQL:
1
2
3
4
5
6
|
-- Count requests per browser and keep the 100 most frequent ones.
-- DESC is required: an ascending sort would return the least-used browsers.
SELECT browser,
       count(*)
FROM log
GROUP BY browser
ORDER BY count(*) DESC
LIMIT 0, 100;
|
1.2. 代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
|
cat pd_ng_log_stat.py
#!/usr/bin/env python
#-*- coding: utf-8 -*-
from ng_line_parser import NgLineParser
import pandas as pd
import socket
import struct
class PDNgLogStat(object):
    """Load access-log files into a pandas DataFrame and compute statistics.

    Every log line is fed to an ``NgLineParser``; the value returned by its
    ``to_dict()`` becomes one row of ``self.df``.
    """

    def __init__(self):
        self.ng_line_parser = NgLineParser()

    def _log_line_iter(self, pathes):
        """Yield one parsed record per line of every file in *pathes*.

        Args:
            pathes: iterable of log file paths, read in order.

        Yields:
            Whatever ``self.ng_line_parser.to_dict()`` returns after parsing
            the current line.
        """
        for path in pathes:
            with open(path, 'r') as f:
                for line in f:
                    self.ng_line_parser.parse(line)
                    yield self.ng_line_parser.to_dict()

    def load_data(self, path):
        """Build ``self.df`` from the log file(s) given by *path*.

        Args:
            path: a single file path or an iterable of file paths.
        """
        # A bare string would otherwise be iterated character by character.
        if isinstance(path, str):
            path = [path]
        self.df = pd.DataFrame(self._log_line_iter(path))

    def browser_stat(self):
        """Count how many requests each browser made.

        Returns:
            A DataFrame indexed by ``browser`` with a single ``count``
            column, limited to the 100 largest counts in descending order.
        """
        # Group by the column label rather than by an external Series so the
        # result does not depend on how pandas treats a key that is also a
        # data column.
        counts = self.df.groupby('browser').size().to_frame('count')
        return counts.nlargest(100, 'count')
def main():
    """Load the sample access log and print the per-browser request counts."""
    file_pathes = ['www.trustauth.cn.access.log']
    pd_ng_log_stat = PDNgLogStat()
    pd_ng_log_stat.load_data(file_pathes)
    # Print the number of requests made by each browser.
    # The call form of print works on both Python 2 and Python 3.
    print(pd_ng_log_stat.browser_stat())
# Run the statistics only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
|
运行统计和输出结果
1
2
3
4
5
6
7
8
9
10
11
12
13
|
python pd_ng_log_stat.py
count
browser
......
P1 4.1.2) 613
601
[100 rows x 1 columns]
|
文章转载来自:trustauth.cn