有这么一堆数据:
#test.log
a, 1.324171
b, 0.000126
c, 1.970941
a, 1.469649
b, 0.000124
c, 0.512929
a, 1.290920
b, 0.000118
c, 0.259524
a, 0.495958
b, 0.000123
c, 0.910949
a, 1.268038
b, 0.000118
c, 1.016419
a, 1.856081
b, 0.000120
c, 1.400075
a, 1.314131
b, 0.000140
想要用 python 把左边的 key 一样的合并,但 value 要取它所有的和,还有平均值
搞了半天,发现搞不定,也是尴尬,以下还是个半成品,搞不下去了,报错,求大神指点一些简单方法
def two(file):
    """Sum and average the values in *file*, grouped by key.

    Every data line has the form ``key, value``. Blank lines and lines
    starting with ``#`` are skipped.

    Args:
        file: Path of the log file to read (decoded as UTF-8).

    Returns:
        A dict mapping each key to a ``(total, average)`` tuple. One
        summary line per key is also printed.

    Raises:
        ValueError: If the value part of a data line is not a number.
    """
    totals = {}
    counts = {}
    with open(file, "r", encoding="utf-8") as f:
        for raw in f:
            line = raw.strip()
            if not line or line.startswith("#"):
                continue
            # partition() splits on the first comma only; float() ignores
            # the blank that follows the comma.
            key, _, value = line.partition(",")
            key = key.strip()
            totals[key] = totals.get(key, 0.0) + float(value)
            counts[key] = counts.get(key, 0) + 1

    result = {key: (total, total / counts[key]) for key, total in totals.items()}
    for key, (total, average) in result.items():
        print("{}: sum={}, avg={}".format(key, total, average))
    return result


if __name__ == "__main__":
    two("test.log")
1
linxzh1989 2018-06-15 10:16:56 +08:00
pandas groupby sum?
|
2
aborigine 2018-06-15 10:18:07 +08:00 via iPhone
了解一下 pandas ?导入生成个 dataframe 就行了
|
5
deepreader 2018-06-15 10:21:27 +08:00
Dictionary 会用么?
|
6
wsds OP @deepreader 抱歉,字典我知道,但以我的水平真心实现不了
|
7
mentalkiller 2018-06-15 10:25:19 +08:00
@wsds #6 dict 是 python 内置的数据结构啊,不需要你自己实现
|
8
wsds OP @mentalkiller 我的意思不是实现字典的功能,我的意思是用字典,我实现不了我的需求
|
9
hahastudio 2018-06-15 10:28:58 +08:00 1
|
10
lixm 2018-06-15 10:30:15 +08:00 1
|
11
ybping 2018-06-15 10:30:23 +08:00 via iPhone
dict 了解一下
|
12
scriptB0y 2018-06-15 10:30:33 +08:00 1
In [18]: data = """
...: a, 1.324171 ...: b, 0.000126 ...: c, 1.970941 ...: a, 1.469649 ...: b, 0.000124 ...: c, 0.512929 ...: a, 1.290920 ...: b, 0.000118 ...: c, 0.259524 ...: a, 0.495958 ...: b, 0.000123 ...: c, 0.910949 ...: a, 1.268038 ...: b, 0.000118 ...: c, 1.016419 ...: a, 1.856081 ...: b, 0.000120 ...: c, 1.400075 ...: a, 1.314131 ...: b, 0.000140 ...: """ In [19]: result = {} ...: for line in data.splitlines(): ...: if not line: continue ...: key, value = line.split(",") ...: result.setdefault(key, []).append(float(value)) ...: In [20]: for key, values in result.items(): ...: print(f"{key}: avg: {sum(values) / len(values)}, sum: {sum(values)}") ...: a: avg: 1.2884211428571428, sum: 9.018948 b: avg: 0.00012414285714285714, sum: 0.000869 c: avg: 1.0118061666666667, sum: 6.070837 https://gist.github.com/laixintao/f4a186cea6c28fcf3dc696100458c410 |
13
wsds OP |
14
wplct 2018-06-15 10:35:15 +08:00
def two(file):
    """Collect per-key count, sum and average from a ``key, value`` file.

    Blank lines and lines starting with ``#`` are skipped, and whitespace
    around the key and the value is ignored, so both ``a, 1.5`` and
    ``a,1.5`` are accepted.

    Args:
        file: Path of the file to read.

    Returns:
        A dict mapping each key to ``{'num': count, 'sum': total,
        'avg': mean}``. The same dict is also printed.

    Raises:
        ValueError: If a data line has no comma or its value is not a
            number.
    """
    data = {}
    with open(file, "r") as f:
        for raw in f:
            line = raw.strip()
            if not line or line.startswith("#"):
                continue
            k, v = (part.strip() for part in line.split(",", 1))
            v = float(v)
            if k not in data:
                data[k] = {'num': 1, 'sum': v, 'avg': v}
            else:
                data[k]['num'] += 1
                data[k]['sum'] += v
                data[k]['avg'] = data[k]['sum'] / data[k]['num']
    print(data)
    return data


if __name__ == "__main__":
    two('test.txt')
16
araraloren 2018-06-15 10:43:41 +08:00 3
|
17
bufpay 2018-06-15 10:51:56 +08:00
其实一个 for 循环就可以了
|
18
E1n 2018-06-15 11:03:54 +08:00 via Android
@araraloren perl 好用,用 awk 能实现吗。。
|
19
araraloren 2018-06-15 11:05:43 +08:00
@E1n awk 肯定能实现,不过我只懂基本的 awk 脚本
|
20
imagechans 2018-06-15 11:22:38 +08:00
def stand(file):
    """Print and return the overall sum and mean of every value in *file*.

    Values are pooled across all keys: the part before the comma is not
    used for grouping. Blank lines and lines starting with ``#`` are
    skipped.

    Args:
        file: Path of the ``key, value`` file to read.

    Returns:
        A ``(total, mean)`` tuple; the same two numbers are printed.

    Raises:
        IndexError: If a data line contains no comma.
        ValueError: If a value is not a number.
        ZeroDivisionError: If the file contains no data lines.
    """
    # The with-block closes the file; a bare open() in the comprehension
    # left that to the garbage collector.
    with open(file) as f:
        values = [
            float(line.split(",")[1])
            for line in f
            if line.strip() and not line.lstrip().startswith("#")
        ]
    s = sum(values)
    m = s / len(values)
    print(s, m)
    return s, m


if __name__ == "__main__":
    stand("test.log")
21
imagechans 2018-06-15 11:24:19 +08:00
@imagechans 这是按照我自己的习惯写的
|
22
billgreen1 2018-06-15 11:43:25 +08:00 1
# Give awk the file name directly: in bash/sh a bare "< test.log |" runs no command, so nothing reaches the pipe.
# "NF < 2 {next}" skips blank lines and lines without a comma (such as a "#test.log" header).
awk -F"," 'NF < 2 {next} {total[$1]+=$2; occurence[$1]+=1} END {for (key in total) printf("%s\t %s \t %s\n", key, total[key], total[key]/occurence[key])}' test.log
|
23
slimbloody 2018-06-15 12:09:58 +08:00
collections.defaultdict
|
24
arthasgxy 2018-06-15 12:18:33 +08:00
|
25
arthasgxy 2018-06-15 12:36:07 +08:00
|
27
xpresslink 2018-06-15 13:59:48 +08:00
>>> data = """a, 1.324171
b, 0.000126 c, 1.970941 a, 1.469649 b, 0.000124 c, 0.512929 a, 1.290920 b, 0.000118 c, 0.259524 a, 0.495958 b, 0.000123 c, 0.910949 a, 1.268038 b, 0.000118 c, 1.016419 a, 1.856081 b, 0.000120 c, 1.400075 a, 1.314131 b, 0.000140""" >>> import csv >>> from itertools import groupby >>> from operator import itemgetter as ig >>> {k:sum(map(lambda x:float(ig(1)(x)), v)) for k, v in groupby(sorted(csv.reader(iter(data.splitlines())), key=ig(0)), key=ig(0))} {'a': 9.018948, 'b': 0.000869, 'c': 6.070837} >>> |
28
Xiaobaixiao 2018-06-15 15:14:39 +08:00
from collections import defaultdict

logData = '''a, 1.324171
b, 0.000126
c, 1.970941
a, 1.469649
b, 0.000124
c, 0.512929
a, 1.290920
b, 0.000118
c, 0.259524
a, 0.495958
b, 0.000123
c, 0.910949
a, 1.268038
b, 0.000118
c, 1.016419
a, 1.856081
b, 0.000120
c, 1.400075
a, 1.314131
b, 0.000140'''


def solve(logData):
    """Print the sum and average of the values in *logData* for each key.

    Args:
        logData: Text with one ``key, value`` pair per line. Blank lines
            are ignored.

    Raises:
        ValueError: If the value part of a line is not a number.
    """
    totals = defaultdict(float)
    # Count occurrences while summing, so no list has to be re-scanned
    # for every key afterwards.
    counts = defaultdict(int)
    for line in logData.splitlines():
        if not line.strip():
            continue
        k, _, raw_value = line.partition(',')
        k = k.strip()
        totals[k] += float(raw_value)
        counts[k] += 1
    for k, v in totals.items():
        avg = v / counts[k]
        print("{0} 总和:{1} , 平均值:{2}".format(k, v, avg))


# >>> solve(logData)
# a 总和:9.018948 , 平均值:1.2884211428571428
# b 总和:0.000869 , 平均值:0.00012414285714285714
# c 总和:6.070837 , 平均值:1.0118061666666667
29
zhang0320 2018-06-15 15:20:00 +08:00
# Sum the values of data.txt per key and print the resulting dict.
with open('data.txt', 'r') as f:
    data = f.read()

dict_data = {}
for i in data.split('\n'):
    # Skip blank lines (including the empty string that split() leaves
    # after a trailing newline) and '#' comment lines.
    if not i.strip() or i.lstrip().startswith('#'):
        continue
    fields = i.split(',')
    key = fields[0]
    # Starting from 0.0 adds the first value of a key exactly once; the
    # earlier form stored it and then added it again.
    dict_data[key] = dict_data.get(key, 0.0) + float(fields[1])
print(dict_data)
# Poster's note: I'm a beginner, so I'm not sure whether this approach is right...
30
zhang0320 2018-06-15 15:24:06 +08:00
# Sum the values of data.txt per key and print the resulting dict.
with open('data.txt', 'r') as f:
    data = f.read()

dict_data = {}
for i in data.split('\n'):
    # Skip blank lines (including the empty string that split() leaves
    # after a trailing newline) and '#' comment lines.
    if not i.strip() or i.lstrip().startswith('#'):
        continue
    # Split once and reuse the pieces instead of re-splitting the line
    # for every access.
    fields = i.split(',')
    key = fields[0]
    value = float(fields[1])
    if key not in dict_data:
        dict_data[key] = value
    else:
        dict_data[key] = dict_data[key] + value
print(dict_data)
# Poster's note: sorry, I had forgotten an "else:".
31
Alexhex 2018-06-15 15:26:18 +08:00
拷到 Excel 里一个数据透视表搞定美滋滋。
|
32
ful1v1dcker 2018-06-15 15:31:59 +08:00
什么鬼,V 站 markdown 失效了?全是代码坨啊。。。
|
33
reself 2018-06-15 15:33:22 +08:00 via Android
@araraloren 哈哈 perl 自带代码混淆
|
34
araraloren 2018-06-15 15:37:12 +08:00
@ful1v1dcker 本来就是这样,只支持楼主的。。不然还需要什么 chrome markdown 插件(外面那个帖子)
|
35
JCZ2MkKb5S8ZX9pq 2018-06-15 15:57:18 +08:00
d = {}
逐行 k, v = (i.strip() for i in text.split(',')) d.setdefault(k, []) # 建个列表 d[k].append(float(v)) 这样直观一点,然后求值啥的慢慢折腾呗。 |
37
gnozix 2018-06-15 17:58:27 +08:00
我意思一下:
temp = {}
# "dict or tuple" is the post's placeholder for the data source;
# substitute any iterable of (key, value) pairs.
for i, j in dict or tuple:
    if i in temp:
        # Key already seen: add to its running total.
        temp[i] += float(j)
    else:
        # First occurrence: start the total.
        temp[i] = float(j)
38
ful1v1dcker 2018-06-15 17:58:45 +08:00
@araraloren 噢好吧,果然没有最垃圾,只有更垃圾
|
39
yaorc 2018-06-15 18:26:46 +08:00
def pivot_table(path='data.txt'):
    """Print and return the per-key count, sum and average of a data file.

    The first line of the file is always skipped (treated as a header);
    every following non-blank line must have the form ``key, value``.

    Args:
        path: File to read. Defaults to ``'data.txt'``.

    Returns:
        A dict mapping each key to a ``(sum, average)`` tuple.

    Raises:
        IndexError: If a data line contains no comma.
        ValueError: If a value is not a number.
    """
    result = {}
    count = {}
    with open(path, 'r') as f:
        next(f, None)  # drop the header line, if there is one
        for data in f:
            if not data.strip():
                continue
            content = data.split(',')
            key = content[0]
            value = float(content[1].strip())
            if key not in result:
                result[key] = value
                count[key] = 1
            else:
                result[key] += value
                count[key] += 1

    # The dict already records each key once, so it doubles as the key list.
    keys = list(result)
    print('元素:', keys)
    print('元素个数:', count)
    print('和:', result)

    print('\n 统计信息(元素,和,平均值):')
    stats = {}
    for k, v in result.items():
        avg = v / count[k]
        print(k, v, avg)
        stats[k] = (v, avg)
    return stats


# ---------- output ----------
# 元素: ['a', 'b', 'c']
# 元素个数: {'a': 7, 'b': 7, 'c': 6}
# 和: {'a': 9.018948, 'b': 0.000869, 'c': 6.070837}
#
#  统计信息(元素,和,平均值):
# a 9.018948 1.2884211428571428
# b 0.000869 0.00012414285714285714
# c 6.070837 1.0118061666666667
40
gpj22pYlv2qYiZ8U 2018-06-15 20:53:16 +08:00
def two(file):
    """Compute ``[sum, count, average]`` for every key of a ``key, value`` file.

    Blank lines and lines starting with ``#`` are skipped.

    Args:
        file: Path of the file to read (decoded as UTF-8).

    Returns:
        A dict mapping each key to the list ``[sum, count, average]``.
        The same dict is also printed.

    Raises:
        IndexError: If a data line contains no comma.
        ValueError: If a value is not a number.
    """
    num_dict = {}
    with open(file, "r", encoding="utf-8") as f:
        for raw in f:
            line = raw.strip()
            if not line or line.startswith("#"):
                continue
            fields = line.split(",")
            key = fields[0]
            # Convert right away: storing the raw string for a key's first
            # value made the later division fail for keys seen only once.
            value = float(fields[1])
            if key not in num_dict:
                num_dict[key] = [value, 1]
            else:
                entry = num_dict[key]
                num_dict[key] = [entry[0] + value, entry[1] + 1]
    for entry in num_dict.values():
        entry.append(entry[0] / entry[1])
    print(num_dict)
    return num_dict


if __name__ == "__main__":
    two("/Users/yourname/program/test/test.log")
41
UnluckyNinja 2018-06-15 21:49:22 +08:00
def text = '''a, 1.324171
b, 0.000126 c, 1.970941 a, 1.469649 b, 0.000124 c, 0.512929 a, 1.290920 b, 0.000118 c, 0.259524 a, 0.495958 b, 0.000123 c, 0.910949 a, 1.268038 b, 0.000118 c, 1.016419 a, 1.856081 b, 0.000120 c, 1.400075 a, 1.314131 b, 0.000140 ''' // file.readLines().collect{ text.readLines().collect{ it.split(',')*.trim() }.groupBy{ it[0] }.collectEntries{k, vList -> [(k): [sum: def sum = vList.sum{ it[1] as BigDecimal }, average: sum / vList.size()]] } /* result: [a:[sum:9.018948, average:1.2884211429], b:[sum:0.000869, average:0.0001241429], c:[sum:6.070837, average:1.0118061667]] groovy 写的,groovy 有的 python 肯定有,语法方法名啥的改一下应该就差不多了 */ |
42
Binb 2018-06-16 10:16:44 +08:00
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Print ``{key: [sum, average]}`` for the ``key, value`` lines of test.log."""
a = {}
with open('test.log', 'r') as f:
    for i in f:
        # Skip blank lines and '#' comment lines; they carry no value.
        if not i.strip() or i.lstrip().startswith('#'):
            continue
        j = i.split(',')
        a.setdefault(j[0], []).append(float(j[1]))

result = {}
for k, v in a.items():
    s = sum(v)
    result[k] = [s, s / len(v)]
# The call form of print works on Python 3 and still prints the dict on Python 2.
print(result)
# {'a': [9.018948, 1.2884211428571428], 'c': [6.070837, 1.0118061666666667], 'b': [0.000869, 0.00012414285714285714]}
43
NICCEEEE 2018-06-16 17:21:39 +08:00
import collections

d = """a, 1.324171
b, 0.000126
c, 1.970941
a, 1.469649
b, 0.000124
c, 0.512929
a, 1.290920
b, 0.000118
c, 0.259524
a, 0.495958
b, 0.000123
c, 0.910949
a, 1.268038
b, 0.000118
c, 1.016419
a, 1.856081
b, 0.000120
c, 1.400075
a, 1.314131
b, 0.000140"""

# Split each line on its comma instead of slicing at fixed offsets, so
# keys longer than one character and other spacing parse correctly too.
L = [
    (key.strip(), value.strip())
    for key, _, value in (line.partition(',') for line in d.split('\n'))
]

# Per-key running total of the values.
data_dict = collections.defaultdict(int)
for i, j in L:
    data_dict[i] += float(j)
print(data_dict)
44
yangxiaoyong 2018-06-16 20:33:05 +08:00 via Android
讲下原理的东西,map reduce 可以了解一下,首先把数据分组归类
map (lambda x: { value: x.value, key: x.key, count: 1}) 按上面的把数据按 key 分组放好 然后执行归约函数,将数据集合归约为一个最终结果 reduce(lambda acc, curr: merge(acc,curr), mapdata ) merge 根据 key 将相同 key 的数值相加得到总和,count 相加得到次数,总和除以次数可以得平均值 最后的结果应该是 {a: { value,count,avg}} 手机码字,凑合看吧 |
45
bugcoder 2018-06-17 08:24:41 +08:00
没人贴 pandas 的,我就献个丑吧:
import pandas as pd

data_file = 'data.txt'
# comment='#' makes the parser ignore everything from a '#' onwards;
# names= supplies the two column labels.
data_df = pd.read_csv(data_file, comment='#', names=['key', 'value'])

# Group once and derive both aggregates from the same grouping.
grouped = data_df.groupby('key')
sums = grouped.sum()
means = grouped.mean()
47
biglazycat 2020-09-06 22:09:24 +08:00
# Collect the values of test.log per key, then print each key's sum and
# average (sum first, average on the next line).
convert_list = {}
# The with-block closes the file; open() in the for-header never did.
with open('test.log') as f:
    for line in f:
        # Skip blank lines and '#' comment lines; they carry no value.
        if not line.strip() or line.lstrip().startswith('#'):
            continue
        k, v = line.split(',')
        convert_list.setdefault(k, []).append(float(v.strip()))

for k, v in convert_list.items():
    total_sum = sum(v)
    avg = total_sum / len(v)
    print(total_sum)
    print(avg)