从这里找的一个宝贝源码,可以大大缓解内存问题。https://www.kaggle.com/arjanso/reducing-dataframe-memory-size-by-65/code
# @from: https://www.kaggle.com/arjanso/reducing-dataframe-memory-size-by-65/code
# @license: Apache 2.0
# @author: weijian
def reduce_mem_usage(props, *, fill_value=-999, float_dtype="float16",
                     int_tolerance=0.01, verbose=True):
    """Downcast the numeric columns of a DataFrame to narrower dtypes.

    The frame is modified in place and also returned. Only columns backed
    by a plain numpy integer, unsigned or float dtype are touched; object,
    string, bool, datetime, categorical and other extension columns are
    left unchanged.

    For every processed column:

    * missing values are replaced by ``fill_value`` (NaN would otherwise
      force the column to stay floating point) and the column name is
      recorded;
    * if the values are whole numbers (the summed distance to their
      truncated value is below ``int_tolerance``), the column is cast to
      the narrowest signed/unsigned integer dtype that holds its range,
      measured *after* the fill so the sentinel itself always fits;
    * otherwise the column is cast to ``float_dtype``, or to the next
      wider float type when its finite values would overflow it.

    A column is never cast to a dtype wider than the one it already has.

    Args:
        props: pandas DataFrame to shrink (mutated in place).
        fill_value: sentinel written over missing values.
        float_dtype: preferred dtype for non-integer columns. The default
            ``float16`` keeps only about three significant decimal digits;
            pass ``"float32"`` when precision matters.
        int_tolerance: maximum summed absolute fractional part for a float
            column to be treated as integer-valued. Task dependent.
        verbose: print per-column and overall memory diagnostics.

    Returns:
        A ``(props, na_columns)`` tuple where ``na_columns`` lists the
        columns whose missing values were replaced by ``fill_value``.

    Raises:
        ValueError: if ``float_dtype`` is not a floating-point dtype.
    """
    # Imported locally so the function does not depend on the surrounding
    # module having imported numpy under the name ``np``.
    import numpy as np

    target_float = np.dtype(float_dtype)
    if target_float.kind != "f":
        raise ValueError("float_dtype must be a floating-point dtype")

    def log(*parts):
        if verbose:
            print(*parts)

    def smallest_int_dtype(low, high):
        """Return the narrowest integer dtype covering [low, high], or None."""
        unsigned = (np.uint8, np.uint16, np.uint32, np.uint64)
        signed = (np.int8, np.int16, np.int32, np.int64)
        for candidate in (unsigned if low >= 0 else signed):
            bounds = np.iinfo(candidate)
            if bounds.min <= low and high <= bounds.max:
                return np.dtype(candidate)
        return None

    def smallest_float_dtype(magnitude):
        """Return the narrowest float dtype >= float_dtype holding magnitude."""
        for candidate in (np.float16, np.float32, np.float64):
            candidate = np.dtype(candidate)
            if (candidate.itemsize >= target_float.itemsize
                    and magnitude <= float(np.finfo(candidate).max)):
                return candidate
        return None

    start_mem_usg = props.memory_usage().sum() / 1024 ** 2
    log("Memory usage of the dataframe is :", start_mem_usg, "MB")

    na_columns = []
    for col in props.columns:
        dtype = props[col].dtype
        # Skip anything that is not a plain numpy int/uint/float column:
        # min/max, isfinite and the integer casts below are not meaningful
        # (or raise) for strings, datetimes, categoricals, etc.
        if not isinstance(dtype, np.dtype) or dtype.kind not in "iuf":
            continue

        log("**************************")
        log("columns: ", col)
        log("dtype before", dtype)

        # Integer dtypes cannot hold NaN, so missing values are replaced
        # first. Assign back instead of a chained in-place fillna, which
        # does not update the frame under pandas Copy-on-Write.
        if props[col].isna().any():
            na_columns.append(col)
            props[col] = props[col].fillna(fill_value)

        values = props[col].to_numpy()
        target = None
        if values.size == 0:
            # No data to measure a range from; keep the dtype as it is.
            pass
        elif dtype.kind in "iu":
            target = smallest_int_dtype(values.min().item(),
                                        values.max().item())
        else:
            finite = np.isfinite(values)
            # Columns containing +/-inf (or NaN, if fill_value is NaN) can
            # never be integers.
            whole = bool(finite.all()) and (
                np.abs(values - np.trunc(values)).sum() < int_tolerance
            )
            if whole:
                # Range is taken after the fill so the sentinel is included.
                target = smallest_int_dtype(values.min().item(),
                                            values.max().item())
            else:
                # Guard against finite values overflowing to inf.
                if finite.any():
                    magnitude = np.abs(values[finite]).max().item()
                else:
                    magnitude = 0.0
                target = smallest_float_dtype(magnitude)

        # Never widen a column: the goal is to reduce memory.
        if target is not None and target.itemsize <= dtype.itemsize:
            props[col] = props[col].astype(target)

        log("dtype after", props[col].dtype)
        log("********************************")

    log("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024 ** 2
    log("Memory usage is: ", mem_usg, " MB")
    log("This is ", 100 * mem_usg / start_mem_usg, "% of the initial size")
    return props, na_columns
原文地址:https://www.cnblogs.com/duoba/p/12431544.html
时间: 2024-10-07 23:16:40