Source code for mcot.bibtex.pubmed

from metapub import PubMedArticle, PubMedFetcher, PubMedAuthor, pubmedcentral
from . import entry, file
from typing import List
import requests
from xml.etree import ElementTree
import time
from selenium import webdriver, common
import atexit
import os.path as op
from functools import lru_cache


@lru_cache()
def get_fetcher():
    """
    Returns the PubMedFetcher used for pubmed queries

    The function takes no arguments and is wrapped in `lru_cache`, so the
    fetcher is built on the first call and that same object is handed back
    on later calls.

    Returns:
        the cached PubMedFetcher
    """
    fetcher = PubMedFetcher()
    return fetcher
@lru_cache()
def get_firefox():
    """
    Returns a headless Firefox webdriver

    The function takes no arguments and is wrapped in `lru_cache`, so the
    browser is started on the first call and the same driver object is
    returned afterwards. Its `quit` method is registered with `atexit`.

    The driver log is directed to ``~/.log/geckodriver.log``; the directory
    is created if it does not exist yet.

    Returns:
        the cached Firefox webdriver
    """
    import os  # local import: the module itself only imports os.path (as `op`)

    log_path = op.expanduser('~/.log/geckodriver.log')
    # Make sure the folder holding the log file exists before the driver is
    # handed the path; a missing ~/.log would otherwise break start-up.
    os.makedirs(op.dirname(log_path), exist_ok=True)

    options = webdriver.FirefoxOptions()
    options.headless = True
    firefox = webdriver.Firefox(options=options, log_path=log_path, timeout=5)
    # Close the browser when the interpreter exits.
    atexit.register(firefox.quit)
    return firefox
def query_mult(pmids) -> List[PubMedArticle]:
    """
    Creates a list of pubmed articles from a single query

    Requests of more than 250 ids are split into several queries. The raw
    efetch response is cut into one XML document per article by looking for
    the lines holding the opening/closing ``PubmedArticle`` or
    ``PubmedBookArticle`` tags; everything before the first article and after
    the last one is re-attached to every single-article document.

    Args:
        pmids: sequence of pubmed ids (a single id string is also accepted)

    Returns:
        pubmed article of each pubmed id, in the order of `pmids`.
        Positions for which the response contained no article are None.
        An empty input gives an empty list without querying pubmed.

    Raises:
        ValueError: if a returned article has a pmid that was not requested
    """
    if isinstance(pmids, str):
        pmids = [pmids]
    if len(pmids) == 0:
        # nothing to look up; skip the request altogether
        return []
    if len(pmids) > 250:
        return query_mult(pmids[:250]) + query_mult(pmids[250:])

    res = get_fetcher().qs.efetch({'db': 'pubmed', 'id': ','.join(pmids)})
    lines = res.split(b'\n')
    open_markers = (b'<PubmedArticle>', b'<PubmedBookArticle>')
    close_markers = (b'</PubmedArticle>', b'</PubmedBookArticle>')
    starts = [idx for idx, line in enumerate(lines) if any(m in line for m in open_markers)]
    ends = [idx + 1 for idx, line in enumerate(lines) if any(m in line for m in close_markers)]

    articles = [None] * len(pmids)
    if not starts or not ends:
        return articles

    # pmid -> every position at which it was requested, so that a repeated id
    # gets its article in all of its slots
    positions = {}
    for idx, pmid in enumerate(pmids):
        positions.setdefault(pmid, []).append(idx)

    # the part of the response around the articles is the same for all of them
    header = lines[:starts[0]]
    footer = lines[ends[-1]:]
    for idx1, idx2 in zip(starts, ends):
        article = PubMedArticle(b'\n'.join(header + lines[idx1:idx2] + footer))
        if article.pmid not in positions:
            raise ValueError(f'{article.pmid!r} is not in list')
        for idx in positions[article.pmid]:
            articles[idx] = article
    return articles
def to_bibtex(article: PubMedArticle) -> entry.BibTexEntry:
    """
    Converts a PubMedArticle to a bibtex entry

    Every attribute of the article that is not None is copied into a bibtex
    tag (authors joined with ``' and '``, other lists joined with spaces,
    ``%`` escaped as ``\\%``). Required tags of the entry type that are still
    missing are set to ``'?'``. The key is ``<last name of first author>:<year>``,
    with ``UNK`` standing in when there is no first author or the first
    author has no last name.

    Args:
        article: article metadata from pubmed

    Returns:
        bibtex entry
    """
    # bibtex tag name -> attribute name on the article, where the two differ
    convert = {
        'author': 'author_list',
        'number': 'issue',
        'editor': 'book_editors',
        'publisher': 'book_publisher',
    }
    tag_names = (
        'pmid', 'doi', 'abstract', 'number', 'pages', 'editor', 'author',
        'volume', 'publisher', 'year', 'journal', 'title',
    )

    tags = {}
    for tag_name in tag_names:
        value = getattr(article, convert.get(tag_name, tag_name))
        if value is None:
            continue
        if tag_name == 'author':
            value = ' and '.join(format_author(val) for val in value)
        elif isinstance(value, list):
            value = ' '.join(value)
        # str() keeps a non-string attribute (e.g. a numeric year) from
        # failing on .replace
        tags[tag_name] = str(value).replace('%', r'\%')

    # only plain-string requirements are filled in; other kinds of items in
    # entry.required_tags are left alone
    for tag_name in entry.required_tags[article.pubmed_type]:
        if isinstance(tag_name, str) and tag_name not in tags:
            tags[tag_name] = '?'

    # A first author without last name (e.g. a collective author) would
    # otherwise produce the key "None:<year>".
    authors = article.author_list
    name = (authors[0].last_name if authors else None) or 'UNK'
    key = f'{name}:{article.year}'
    return entry.BibTexEntry(article.pubmed_type, key, **tags)
def doi_to_bibtex(doi: str) -> entry.BibTexEntry:
    """
    Looks up the bibtex record of a DOI on doi.org

    The DOI is requested from ``https://doi.org/`` with an ``Accept`` header
    asking for bibtex, and the response text is parsed with
    `file.BibTexSet.from_string`.

    NOTE(review): the annotation promises a BibTexEntry, but the value
    returned is whatever `BibTexSet.from_string` produces - confirm which of
    the two is intended.

    :param doi: doi
    :return: parsed bibtex for the DOI
    """
    url = f'https://doi.org/{doi}'
    bibtex_headers = {'Accept': 'application/x-bibtex'}
    response = requests.get(url, headers=bibtex_headers)
    return file.BibTexSet.from_string(response.text)
def format_author(author: PubMedAuthor):
    """
    Renders one author as text for the bibtex author field

    A collective name, when present, is used as is. Otherwise the result is
    ``"<last name>, <fore name>"``, falling back to the initials when there
    is no fore name and to the bare last name when there is neither.

    :param author: individual author
    :return: text for in bibtex
    :raises ValueError: if the author has neither a last name nor a collective name
    """
    last = author.last_name
    collective = author.collective_name
    if not (last or collective):
        raise ValueError("Author name is ill-defined")
    if collective:
        return collective

    # the fore name takes precedence over the initials
    given = author.fore_name or author.initials
    if given:
        return f'{last}, {given}'
    return last
def pmid_from_doi(identifiers):
    """
    Converts a sequence of identifiers into pmids

    Identifiers are sent to the id conversion service in groups of at most
    25, with a pause of 0.33 s before every request. When the service
    returns fewer records than identifiers were sent, the identifier at the
    position of the first missing record is reported as bad and the
    identifiers after it are requested again.

    Records are matched back to the input through their ``doi`` attribute,
    so only identifiers that equal the ``doi`` of a returned record get a
    pmid; all other positions stay None.

    :param identifiers: list of dois or pcmids (a single string is also accepted)
    :return: list of pmids
    """
    if isinstance(identifiers, str):
        identifiers = [identifiers]
    if len(identifiers) > 25:
        head, tail = identifiers[:25], identifiers[25:]
        return pmid_from_doi(head) + pmid_from_doi(tail)

    def lookup(ids):
        # one throttled request; returns the <record> elements of the answer
        time.sleep(0.33)
        response = requests.get(pubmedcentral.PMC_ID_CONVERSION_URI % ' '.join(ids))
        return ElementTree.fromstring(response.content).findall('record')

    records = lookup(identifiers)
    while len(records) < len(identifiers):
        # the identifier right after the last record received is taken to be
        # the faulty one: give it a placeholder and ask again for the rest
        print('bad ID:', identifiers[len(records)])
        records.append(None)
        records.extend(lookup(identifiers[len(records):]))

    pmids = [None] * len(identifiers)
    for record in records:
        if record is None:
            continue
        doi = record.get('doi')
        if doi is None or doi not in identifiers:
            continue
        pmids[identifiers.index(doi)] = record.get('pmid')
    return pmids
def _doi_after_marker(text):
    """Returns the token following the first stand-alone ``doi:`` token in `text`, or None."""
    tokens = text.split()
    for marker, candidate in zip(tokens, tokens[1:]):
        if marker == 'doi:':
            return candidate
    return None


def biorxiv_to_doi(bioarxiv_url: str) -> str:
    """
    Returns doi associated with article

    The page is loaded in the shared headless Firefox. The text of the
    ``pub_jnl`` element is searched first for a ``doi:`` token; if it holds
    none, the lines of the ``highwire-cite-metadata`` element are searched.
    The word following ``doi:`` is returned.

    A ``doi:`` that is not a separate word, or that is the last word of the
    text, counts as "not found" rather than raising an error.

    :param bioarxiv_url: Url of the website (e.g., https://www.biorxiv.org/content/early/2018/06/14/266627)
    :return: DOI, or None if loading the page timed out or no DOI was found
    """
    firefox = get_firefox()
    try:
        firefox.get(bioarxiv_url)
    except common.exceptions.TimeoutException:
        print(f'timeout for {bioarxiv_url}')
        return None

    doi = _doi_after_marker(firefox.find_element_by_class_name('pub_jnl').text)
    if doi is not None:
        return doi

    metadata = firefox.find_element_by_class_name('highwire-cite-metadata').text
    for line in metadata.splitlines():
        doi = _doi_after_marker(line)
        if doi is not None:
            return doi
    return None