python代码到pubmed上搜索关键字,统计每年发文章数量

python代码到pubmed上搜索关键字,统计每年发文章数量



代码如下:

from Bio import Entrez
import time
import pandas as pd
import matplotlib.pyplot as plt
from concurrent.futures import ThreadPoolExecutor, as_completed

# 设置你的邮箱(NCBI 要求)
Entrez.email = "your_email@example.com"
Entrez.tool = "PubMedYearStatsTool"

def search_pubmed(keyword, retmax=10000):
    print(f"Searching PubMed for '{keyword}'...")
    handle = Entrez.esearch(db="pubmed", term=keyword, retmax=retmax, datetype="pdat")
    record = Entrez.read(handle)
    id_list = record["IdList"]
    print(f"Found {len(id_list)} articles.")
    return id_list

def fetch_summary_batch(article_ids):
    """获取一个批次的摘要信息"""
    try:
        handle = Entrez.esummary(db="pubmed", id=",".join(article_ids), retmode="xml")
        summaries = Entrez.read(handle)
        time.sleep(0.5)  # 控制请求频率
        return summaries
    except Exception as e:
        print(f"Error fetching batch: {e}")
        return []

def fetch_all_summaries(id_list, batch_size=200):
    """并发获取所有摘要信息"""
    batches = [id_list[i:i + batch_size] for i in range(0, len(id_list), batch_size)]
    all_summaries = []

    with ThreadPoolExecutor(max_workers=4) as executor:  # 最多并发 4 个线程
        future_to_batch = {executor.submit(fetch_summary_batch, batch): batch for batch in batches}
        for future in as_completed(future_to_batch):
            summaries = future.result()
            if summaries:
                all_summaries.extend(summaries)

    return all_summaries

def extract_years(summaries):
    year_count = {}
    for summary in summaries:
        try:
            pub_date = summary.get("PubDate", "")
            if pub_date:
                year = pub_date.split(" ")[0][:4]  # 提取前四位作为年份
                if year.isdigit() and len(year) == 4:
                    year_int = int(year)
                    if 2015 <= year_int <= 2025:  # 只保留 2015 - 2025 的数据
                        year_count[year] = year_count.get(year, 0) + 1
        except Exception as e:
            continue
    return year_count

def plot_yearly_trend(year_count, keyword):
    df = pd.DataFrame.from_dict(year_count, orient='index', columns=['Count'])
    df.index = df.index.astype(int)
    df.sort_index(inplace=True)

    # 过滤 2015 - 2025
    df = df[(df.index >= 2015) & (df.index <= 2025)]

    plt.figure(figsize=(7, 6))
    plt.bar(df.index, df['Count'], color='skyblue')
    plt.title(f"Number of PubMed Articles on '{keyword}' by Year (2015–2025)")
    plt.xlabel('Year')
    plt.ylabel('Number of Articles')
    plt.xticks(range(2015, 2026))
    plt.grid(True, linestyle='--', alpha=0.5)
    plt.tight_layout()
    plt.show()

def main():
    keyword = input("Enter your search keyword: ")
    id_list = search_pubmed(keyword, retmax=10000)
    print("Fetching metadata (summary)...")
    summaries = fetch_all_summaries(id_list)
    print("Extracting publication years (2015–2025)...")
    year_count = extract_years(summaries)
    
    print("\nArticles per year (2015–2025):")
    for year in sorted(year_count):
        print(f"{year}: {year_count[year]}")

    plot_yearly_trend(year_count, keyword)

if __name__ == '__main__':
    main()


结果:



attachments-2025-05-gwnyLYWq681b2d2cdf782.png


手动绘制:

import pandas as pd
import matplotlib.pyplot as plt
# 数据
# data = {
#     'Year': [2025, 2024, 2023, 2022, 2021, 2020],
#     'Count': [1198, 2297, 1571, 962, 733, 502]
# }

data = {
    'Year': [2025, 2024, 2023, 2022, 2021, 2020, 2019, 2018, 2017, 2016],
    'Count': [3261, 7026, 5379, 4339, 4298, 3043, 2128, 1665, 1342, 1183]
}

# 创建 DataFrame
df = pd.DataFrame(data)
# 绘制柱状图
plt.figure(figsize=(7, 6))
bars = plt.bar(df['Year'], df['Count'], color='skyblue')
# 设置标题和坐标轴标签
keyword="single cell transcriptomics"
plt.title(f"Number of PubMed Articles on '{keyword}' by Year (2016–2025)")
plt.xlabel('Year')
plt.ylabel('Count')
# 显示网格线
plt.grid(axis='y', linestyle='--', alpha=0.7)
# 确保所有年份都显示出来
plt.xticks(df['Year'], rotation=45)  # 横轴标签倾斜角度为45度,防止重叠
# 显示数值在柱子上方
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2, yval + 10, int(yval), ha='center')
# 自动调整布局,防止标签被截断
plt.tight_layout()
# 显示图表
plt.show()


attachments-2025-05-6Hcr9wXP681b353a63eab.png

  • 发表于 2025-05-07 17:52
  • 阅读 ( 315 )
  • 分类:其他

你可能感兴趣的文章

相关问题

0 条评论

请先 登录 后评论
omicsgene
omicsgene

生物信息

720 篇文章

作家榜 »

  1. omicsgene 720 文章
  2. 安生水 359 文章
  3. Daitoue 167 文章
  4. 生物女学霸 120 文章
  5. xun 87 文章
  6. rzx 83 文章
  7. 红橙子 79 文章
  8. CORNERSTONE 72 文章