代码如下:
from Bio import Entrez
import time
import pandas as pd
import matplotlib.pyplot as plt
from concurrent.futures import ThreadPoolExecutor, as_completed
# Set your e-mail address (required by NCBI).
Entrez.email = "your_email@example.com"
# Tool name set alongside the e-mail; presumably lets NCBI identify this client (confirm in Biopython docs).
Entrez.tool = "PubMedYearStatsTool"
def search_pubmed(keyword, retmax=10000):
    """Search PubMed for *keyword* and return the matching article IDs.

    Args:
        keyword: Search term passed to ESearch as ``term``.
        retmax: Maximum number of IDs to request from ESearch.

    Returns:
        The ``IdList`` entry of the parsed ESearch response.
    """
    print(f"Searching PubMed for '{keyword}'...")
    handle = Entrez.esearch(db="pubmed", term=keyword, retmax=retmax, datetype="pdat")
    try:
        record = Entrez.read(handle)
    finally:
        # Always release the underlying connection, even if parsing fails.
        handle.close()

    id_list = record["IdList"]
    print(f"Found {len(id_list)} articles.")

    # ESearch returns at most `retmax` IDs; warn when the total hit count is
    # larger so the caller knows the per-year statistics will be incomplete.
    total = record.get("Count")
    if total is not None and int(total) > len(id_list):
        print(
            f"Warning: {total} articles match in total; "
            f"only the first {len(id_list)} IDs were retrieved."
        )
    return id_list
def fetch_summary_batch(article_ids):
    """Fetch the ESummary records for one batch of PubMed IDs.

    Args:
        article_ids: Sequence of PubMed ID strings; joined with commas for
            the request.

    Returns:
        The parsed ESummary result, or an empty list if the batch is empty
        or the request/parsing fails (the error is printed, not raised).
    """
    # Nothing to fetch: avoid sending a request with an empty id parameter.
    if not article_ids:
        return []

    try:
        handle = Entrez.esummary(db="pubmed", id=",".join(article_ids), retmode="xml")
        try:
            summaries = Entrez.read(handle)
        finally:
            # Release the connection whether or not parsing succeeded.
            handle.close()
        time.sleep(0.5)  # throttle the request rate
        return summaries
    except Exception as e:
        # Best-effort: a failed batch is reported and skipped by the caller.
        print(f"Error fetching batch: {e}")
        return []
def fetch_all_summaries(id_list, batch_size=200):
    """Fetch summaries for every ID in *id_list*, batch by batch, concurrently.

    The IDs are cut into consecutive slices of at most *batch_size* items and
    each slice is handed to ``fetch_summary_batch`` on a pool of up to four
    worker threads. Results are appended in completion order, so the order of
    the returned list is not tied to the order of *id_list*.

    Returns:
        A flat list with the summaries of all non-empty batch results.
    """
    chunks = [
        id_list[start:start + batch_size]
        for start in range(0, len(id_list), batch_size)
    ]
    collected = []
    with ThreadPoolExecutor(max_workers=4) as pool:  # at most 4 concurrent threads
        pending = [pool.submit(fetch_summary_batch, chunk) for chunk in chunks]
        for finished in as_completed(pending):
            batch_result = finished.result()
            if batch_result:
                collected.extend(batch_result)
    return collected
def extract_years(summaries):
    """Count summaries per publication year, restricted to 2015-2025.

    The year is taken from the first four characters of the first
    space-separated token of each summary's ``"PubDate"`` value. Entries with
    a missing/empty date, a non-numeric year, or a year outside 2015-2025 are
    ignored, as is any entry that raises while being processed.

    Returns:
        A dict mapping the year (as a 4-character string) to its count.
    """
    tally = {}
    for entry in summaries:
        try:
            raw_date = entry.get("PubDate", "")
            if not raw_date:
                continue
            year_token = raw_date.split(" ")[0][:4]  # first four characters are the year
            if not (year_token.isdigit() and len(year_token) == 4):
                continue
            if not 2015 <= int(year_token) <= 2025:  # keep 2015-2025 only
                continue
            tally[year_token] = tally.get(year_token, 0) + 1
        except Exception:
            # Malformed entries are skipped on purpose.
            continue
    return tally
def plot_yearly_trend(year_count, keyword):
    """Show a bar chart of article counts per year for 2015-2025.

    Args:
        year_count: Mapping of year to article count; keys are converted to
            integers before plotting.
        keyword: Search keyword, shown in the chart title.
    """
    counts = pd.DataFrame.from_dict(year_count, orient='index', columns=['Count'])
    counts.index = counts.index.astype(int)
    counts = counts.sort_index()

    # Keep only 2015-2025.
    within_range = (counts.index >= 2015) & (counts.index <= 2025)
    counts = counts[within_range]

    plt.figure(figsize=(7, 6))
    plt.bar(counts.index, counts['Count'], color='skyblue')

    plt.title(f"Number of PubMed Articles on '{keyword}' by Year (2015–2025)")
    plt.xlabel('Year')
    plt.ylabel('Number of Articles')
    plt.xticks(range(2015, 2026))  # one tick per year, including empty years
    plt.grid(True, linestyle='--', alpha=0.5)

    plt.tight_layout()
    plt.show()
def main():
    """Prompt for a keyword, collect PubMed summaries, and report yearly counts.

    Runs the pipeline end to end: search, fetch summaries, count publication
    years, print the per-year table, then display the bar chart.
    """
    query = input("Enter your search keyword: ")
    pubmed_ids = search_pubmed(query, retmax=10000)

    print("Fetching metadata (summary)...")
    records = fetch_all_summaries(pubmed_ids)

    print("Extracting publication years (2015–2025)...")
    per_year = extract_years(records)

    print("\nArticles per year (2015–2025):")
    for year, count in sorted(per_year.items()):
        print(f"{year}: {count}")

    plot_yearly_trend(per_year, query)


if __name__ == '__main__':
    main()
结果:

手动绘制:
import pandas as pd
import matplotlib.pyplot as plt
# Data: yearly article counts entered by hand.
# An alternative, shorter dataset that was left commented out:
# data = {
#     'Year': [2025, 2024, 2023, 2022, 2021, 2020],
#     'Count': [1198, 2297, 1571, 962, 733, 502]
# }
data = {
    'Year': [2025, 2024, 2023, 2022, 2021, 2020, 2019, 2018, 2017, 2016],
    'Count': [3261, 7026, 5379, 4339, 4298, 3043, 2128, 1665, 1342, 1183],
}

# Build the DataFrame that backs the chart.
df = pd.DataFrame(data)

# Draw the bar chart.
plt.figure(figsize=(7, 6))
bars = plt.bar(df['Year'], df['Count'], color='skyblue')

# Title and axis labels.
keyword = "single cell transcriptomics"
plt.title(f"Number of PubMed Articles on '{keyword}' by Year (2016–2025)")
plt.xlabel('Year')
plt.ylabel('Count')

# Horizontal grid lines only.
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Show a tick for every year; rotate the labels 45 degrees to avoid overlap.
plt.xticks(df['Year'], rotation=45)

# Print each bar's value just above it, centred on the bar.
for bar in bars:
    height = bar.get_height()
    x_center = bar.get_x() + bar.get_width() / 2
    plt.text(x_center, height + 10, int(height), ha='center')

# Adjust the layout so labels are not cut off, then display the figure.
plt.tight_layout()
plt.show()

如果觉得我的文章对您有用，请随意打赏。您的支持将鼓励我继续创作！
