两文本取交集

近期又接触到pandas, 简直相见恨晚

import pandas as pd

df1 = pd.read_excel("aa.xlsx")
df2 = pd.read_excel("bb.xlsx")

up1 = df1.loc[df1['regulated'] == "up"]  # 筛选出regulated列为up的行
up1.dropna(subset=['gene'], inplace=True)  # 删除gene为空的行

up2 = df2.loc[df2['regulated'] == "up"]
up2.dropna(subset=['gene'], inplace=True)

down1 = df1.loc[df1['regulated'] == "down"]
down1.dropna(subset=['gene'], inplace=True)

down2 = df2.loc[df2['regulated'] == "down"]
down2.dropna(subset=['gene'], inplace=True)

up12 = up1.loc[up1['gene'].isin(up2['gene'])].drop_duplicates(['gene']).sort_values(by='gene')  # 去重排序
up12.to_excel("upa.xlsx", index=False)

up21 = up2.loc[up2['gene'].isin(up1['gene'])].drop_duplicates(['gene']).sort_values(by='gene')
up21.to_excel("upb.xlsx", index=False)

down12 = down1.loc[down1['gene'].isin(down2['gene'])].drop_duplicates(['gene']).sort_values(by='gene')
down12.to_excel("downa.xlsx", index=False)

down21 = down2.loc[down2['gene'].isin(down1['gene'])].drop_duplicates(['gene']).sort_values(by='gene')
down21.to_excel("downb.xlsx", index=False)

简直神器~


额, python做这个真心效果大赞! aa.txt 和 bb.txt 中间有一列内容有交集, 需要把这个交集和差集提出来

with open("aa.txt") as f1, open("bb.txt") as f2:
    a_dict = {}
    for line in f1.readlines():
        _lst = line.strip().split("@")
        if _lst[1] and _lst[1] != "gene":
            a_dict[_lst[1]] = line

    b_dict = {}
    for line in f2.readlines():
        _lst = line.strip().split("@")
        if _lst[1] and _lst[1] != "gene":
            b_dict[_lst[1]] = line

    new_dict = dict((i, a_dict[i]) for i in a_dict if i in b_dict)
    with open("12.csv", "w") as fh:
        fh.write(
            "\tgene\t#ID\tT01_Count\tT02_Count\tT01_FPKM\tT02_FPKM\tFDR\tlog2FC\tregulated\tCOG_class\tCOG_class_annotation\tGO_annotation\tKEGG_annotation\tSwissprot_annotation\tnr_annotation\n")
        for key in new_dict:
            fh.write(new_dict[key].replace("@", "\t"))

    new_dict = dict((i, a_dict[i]) for i in a_dict if i not in b_dict)
    with open("12_diff.csv", "w") as fh:
        fh.write(
            "\tgene\t#ID\tT01_Count\tT02_Count\tT01_FPKM\tT02_FPKM\tFDR\tlog2FC\tregulated\tCOG_class\tCOG_class_annotation\tGO_annotation\tKEGG_annotation\tSwissprot_annotation\tnr_annotation\n")
        for key in new_dict:
            fh.write(new_dict[key].replace("@", "\t"))

    new_dict = dict((i, b_dict[i]) for i in a_dict if i in b_dict)
    with open("13.csv", "w") as fh:
        fh.write(
            "\tgene\t#ID\tT01_Count\tT03_Count\tT01_FPKM\tT03_FPKM\tFDR\tlog2FC\tregulated\tCOG_class\tCOG_class_annotation\tGO_annotation\tKEGG_annotation\tSwissprot_annotation\tnr_annotation\n")
        for key in new_dict:
            fh.write(new_dict[key].replace("@", "\t"))
    new_dict = dict((i, b_dict[i]) for i in b_dict if i not in a_dict)
    with open("13_diff.csv", "w") as fh:
        fh.write(
            "\tgene\t#ID\tT01_Count\tT03_Count\tT01_FPKM\tT03_FPKM\tFDR\tlog2FC\tregulated\tCOG_class\tCOG_class_annotation\tGO_annotation\tKEGG_annotation\tSwissprot_annotation\tnr_annotation\n")
        for key in new_dict:
            fh.write(new_dict[key].replace("@", "\t"))

基本上就是先解析到需要比较的这一列, 已此列值为key, 每整行为value, 所以这两个文件可以分别建两个字典, 然后取字典key的交集, 输出value写入结果文件~