1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53
|
"""
@File    :   txt_deduplication.py
@Contact :   wangweiqing@pachiratech.com

@Modify Time      @Author    @Version    @Description
------------      -------    --------    -----------
2020/11/16 17:54  wangwq     1.0         Remove duplicate lines from a text file
"""
import logging
# Path of the text file to de-duplicate; distinct_txt() reads it as UTF-8.
# NOTE(review): the literal uses doubled slashes and starts with '//User'
# -- confirm this is the intended location on the target machine.
SOURCE_FILE = '//User//william//Desktop//allname.txt'
# Output file that receives the lines of SOURCE_FILE that are kept
# (the first occurrence of each distinct line).
TARGET_FILE = SOURCE_FILE + '.new'
# Output file that records the row number and content of every removed
# (repeated) line.
RM_FILE = SOURCE_FILE + '.remove'
def distinct_txt(source_file=None, target_file=None, rm_file=None):
    """Remove duplicate lines from a text file, keeping first occurrences.

    The source file is streamed line by line.  The first occurrence of each
    distinct line is copied to the target file in its original order; every
    later occurrence is dropped and recorded, together with its 1-based row
    number, in the removal report.

    Args:
        source_file: Path of the UTF-8 text file to de-duplicate.  When
            omitted, the module-level ``SOURCE_FILE`` is used.
        target_file: Path of the de-duplicated output.  When omitted it is
            ``TARGET_FILE`` if ``source_file`` was omitted too, otherwise
            ``source_file + '.new'``.
        rm_file: Path of the removal report.  When omitted it is ``RM_FILE``
            if ``source_file`` was omitted too, otherwise
            ``source_file + '.remove'``.

    Returns:
        None.  Results are written to ``target_file`` and ``rm_file`` and
        ``success`` is printed on completion.

    Raises:
        FileNotFoundError: If ``source_file`` does not exist.
        OSError: If any of the files cannot be opened, read or written.
        UnicodeDecodeError: If ``source_file`` is not valid UTF-8.
    """
    # Only fall back to the module constants when no source was supplied, so
    # an explicit source never writes next to the hard-coded default path.
    use_defaults = source_file is None
    if use_defaults:
        source_file = SOURCE_FILE
    if target_file is None:
        target_file = TARGET_FILE if use_defaults else source_file + '.new'
    if rm_file is None:
        rm_file = RM_FILE if use_defaults else source_file + '.remove'

    seen = set()
    duplicates = []  # (row number, content) pairs, in file order
    total_rows = 0   # stays 0 for an empty file; otherwise the last row number

    # Outputs use the same encoding as the input: the platform default may
    # be unable to represent the text (or would silently re-encode it).
    with open(source_file, 'r', encoding='utf-8') as src, \
            open(target_file, 'w', encoding='utf-8') as dst:
        for total_rows, line in enumerate(src, start=1):
            # Compare without the line terminator so that a final line
            # lacking a trailing newline still matches earlier duplicates.
            content = line.rstrip('\n')
            if content in seen:
                duplicates.append((total_rows, content))
            else:
                seen.add(content)
                dst.write(line)

    # Right-align row numbers; never narrower than the 'row' column header.
    width = max(len(str(total_rows + 1)), 3)
    with open(rm_file, 'w', encoding='utf-8') as report:
        report.write("此文件记录了重复行消息。\nrow | content \n")
        # Every record is terminated explicitly, so a duplicate found on an
        # unterminated last line still gets its own complete report line.
        report.writelines(
            "{0:>{1}} | {2}\n".format(row, width, content)
            for row, content in duplicates
        )
    print('success')
if __name__ == '__main__':
    # Script entry point: run the de-duplication with the module-level paths
    # and tell the user where the input and the two output files are.
    try:
        print('去重开始。。。')
        distinct_txt()
        print('去重结束。success')
        report_paths = (SOURCE_FILE, TARGET_FILE, RM_FILE)
        print('请查看如下文件,原文件:%s,去重后的文件:%s, 删除行的文件:%s' % report_paths)
    except Exception as err:
        # FileNotFoundError is a subclass of Exception and was logged with
        # the very same message, so a single handler covers both cases.
        logging.error("去重失败,原因:%s", err)
|