Commit 6b4e10b0 authored by pleasure2cu

wip initial commit

parent 4d1efc2d
import datetime
import json
import os
import string
from typing import List, Tuple

import langdetect
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

path = './data/biorxiv_medrxiv/pdf_json/'


class Paper:
    title: str
    abstract: str
    abstract_processed: str
    body: str
    detected_language: str

    def __init__(self, path: str):
        with open(path, 'r') as j:
            parsed_json = json.load(j)
        self.title = parsed_json['metadata']['title']
        self.abstract = " ".join([abstract['text'] for abstract in parsed_json['abstract']])
        self.body = " ".join([body['text'] for body in parsed_json['body_text']])
        self.detected_language = langdetect.detect(self.abstract + " " + self.body)

    def is_english(self) -> bool:
        return self.detected_language == 'en'

    def perform_pre_processing(self):
        # fit_transform expects an iterable of documents, so the single abstract is wrapped in a list
        self.abstract_processed = TfidfVectorizer(
            stop_words=stopwords.words('english') + list(string.punctuation)
        ).fit_transform([self.abstract])


def extract_from_json(file_handle):
    parsed_json = json.load(file_handle)
    file_handle.close()
    return (
        " ".join([abstract['text'] for abstract in parsed_json['abstract']]),
        " ".join([body['text'] for body in parsed_json['body_text']]),
    )


def load_basic_data() -> Tuple[List[str], List[str]]:
    file_names = os.listdir(path)
    file_handles = map(lambda n: open(path + n, 'r'), file_names)
    data = map(extract_from_json, file_handles)
    # zip(*data) transposes the (abstract, body) pairs into two parallel sequences
    return tuple(zip(*data))


def load_data() -> List[Paper]:
    file_names = os.listdir(path)[:50]
    return [Paper(path + file_name) for file_name in file_names]


def stemming_tokenizer(text):
    stemmer = PorterStemmer()
    return [stemmer.stem(w) for w in word_tokenize(text)]
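# Note: stemming_tokenizer is not wired into the pipeline yet. One way to use it
# (a sketch, not what main() below actually does) would be to hand it to the
# vectorizer, e.g. TfidfVectorizer(tokenizer=stemming_tokenizer), so that terms
# are stemmed before TF-IDF weighting.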


def main():
    start_time = datetime.datetime.now()
    abstracts, text_bodies = load_basic_data()
    print("loading done:", datetime.datetime.now() - start_time)
    tfidf_vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'), ngram_range=(2, 2))
    abstract_tfidf = tfidf_vectorizer.fit_transform(abstracts)
    print("all done:", datetime.datetime.now() - start_time)
    print(abstract_tfidf.shape)


if __name__ == '__main__':
    main()
from most_similar_papers_via_tfidf import extract_data_from_single_json


def main():
    names = ['data/noncomm_use_subset/pdf_json\\022abf5f202c04e2bbf88d829db9844746e0fb71.json', 'data/noncomm_use_subset/pdf_json\\1999185dad27d52562c1dfe77bb72f06bdaf5084.json', 'data/noncomm_use_subset/pdf_json\\2dfdbf2d6b77426866feaf93486327d372fd27c7.json', 'data/noncomm_use_subset/pdf_json\\7ed89e5691a56b6757c21984c3bbb733b5a2dfdc.json', 'data/noncomm_use_subset/pdf_json\\817a885e9613363e08ef920a9e5826a4cb9e1c1e.json', 'data/noncomm_use_subset/pdf_json\\839df627ece5b5fc7bb1ce3b4f96127677fd0494.json', 'data/noncomm_use_subset/pdf_json\\9d36c9a5c87380ec6bb1cee77ced91fd6265d343.json', 'data/noncomm_use_subset/pdf_json\\b3c71d9d7dd9758f8328933f47d7d460bf24c98e.json', 'data/noncomm_use_subset/pdf_json\\b41638f869301e2af9eef7913301d55516fcf4ce.json', 'data/noncomm_use_subset/pdf_json\\be179ef4eaf04a9b155ac385eeabc06620a791b2.json', 'data/noncomm_use_subset/pdf_json\\cb690769762bb2fc4b4d9b898b03623b589fe8c1.json', 'data/noncomm_use_subset/pdf_json\\ea08d7bb1c95436e9ed7af4ed5419cc8fc56e7b7.json', 'data/noncomm_use_subset/pdf_json\\f0e75f3b4aeda66ce88ca4a58a785c8fdf32b9ab.json', 'data/noncomm_use_subset/pdf_json\\f5faed882955e964aa9ea7ac455746bcfec521ba.json']
    for name in names:
        paper_id, _, abstract, body = extract_data_from_single_json(name)
        print(paper_id)
        print(abstract)
        print(body)
        print('\n')


if __name__ == '__main__':
    main()
import csv
import json
import os
from typing import Tuple, List

import langdetect
import numpy as np
from nltk.corpus import stopwords
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def extract_data_from_single_json(file_name: str) -> Tuple[str, str, str, str]:
    with open(file_name, 'r', encoding='utf-8') as file_handle:
        parsed_json = json.load(file_handle)
    paper_id, title, abstract, body = '', '', '', ''
    if 'paper_id' in parsed_json:
        paper_id = parsed_json['paper_id']
    if 'metadata' in parsed_json and 'title' in parsed_json['metadata']:
        title = parsed_json['metadata']['title']
    if 'abstract' in parsed_json:
        abstract = " ".join([abstract['text'] for abstract in parsed_json['abstract']])
    if 'body_text' in parsed_json:
        body = " ".join([body['text'] for body in parsed_json['body_text']])
    # some papers were parsed directly from LaTeX and carry a lot of noise -> filter those
    if '\\usepackage' in abstract or '\\usepackage' in body:
        paper_id, title, abstract, body = '', '', '', ''
    # drop really short papers
    if len(abstract) + len(body) < 100:
        paper_id, title, abstract, body = '', '', '', ''
    return paper_id, title, abstract, body
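# For reference, the minimal JSON shape this extractor expects (inferred from the
# fields accessed above; the dataset's real files carry many more keys) looks like:
#
#     {
#         "paper_id": "...",
#         "metadata": {"title": "..."},
#         "abstract": [{"text": "..."}],
#         "body_text": [{"text": "..."}, {"text": "..."}]
#     }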
def get_data_in_one_folder(path: str, filter_non_english: bool = True) \
        -> Tuple[List[str], List[str], List[str], List[str]]:
    file_names = os.listdir(path)
    file_names = map(lambda n: os.path.join(path, n), file_names)
    data = map(extract_data_from_single_json, file_names)
    # an empty paper_id means extraction rejected the file, so drop it here
    data = filter(lambda t: t[0].strip() != '', data)
    if filter_non_english:
        data = filter(lambda t: langdetect.detect(" ".join(t[2:])) == 'en', data)
    return tuple(zip(*data))


def get_all_data(filter_non_english: bool = True) -> Tuple[List[str], List[str], List[str], List[str]]:
    all_paths = [
        'data/biorxiv_medrxiv/pdf_json',
        # 'data/comm_use_subset/pdf_json',
        # 'data/comm_use_subset/pmc_json',
        # 'data/custom_license/pdf_json',
        # 'data/custom_license/pmc_json',
        # 'data/noncomm_use_subset/pdf_json',
        # 'data/noncomm_use_subset/pmc_json'
    ]
    all_paper_ids, all_titles, all_abstracts, all_bodies = [], [], [], []
    for path in all_paths:
        i, t, a, b = get_data_in_one_folder(path, filter_non_english)
        all_paper_ids += i
        all_titles += t
        all_abstracts += a
        all_bodies += b
        print("{} has been processed".format(path))
    return all_paper_ids, all_titles, all_abstracts, all_bodies


def get_tfidf_matrix(raw_data):
    # ignore terms that appear in more than half of the papers or in fewer than 5 of them
    return TfidfVectorizer(stop_words=stopwords.words('english'), max_df=0.5, min_df=5).fit_transform(raw_data)


def get_indices_with_highest_similarity(query_tfidf_vector, tfidf_matrix, k: int = 5) -> List[Tuple[float, int]]:
    cosine_similarities = cosine_similarity(tfidf_matrix, query_tfidf_vector).reshape(-1)
    # fetch k + 1 candidates because the query itself is always its own best match
    top_indices = np.argpartition(cosine_similarities, -k - 1)[-k - 1:]
    indices_with_highest_similarity = [(cosine_similarities[i], i) for i in top_indices]
    indices_with_highest_similarity.sort(reverse=True)
    # drop the query itself (similarity ~1.0) and return the remaining k hits
    return indices_with_highest_similarity[1:]
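# Minimal usage sketch for the helper above (illustrative only, not part of the
# pipeline; the toy corpus and variable names are made up). With a tiny matrix the
# query row comes back as its own top hit and is dropped, leaving the true neighbour:
#
#     demo_matrix = TfidfVectorizer().fit_transform([
#         "coronavirus spike protein binding",
#         "structure of the coronavirus spike protein",
#         "economic impact of lockdown measures",
#     ])
#     get_indices_with_highest_similarity(demo_matrix[0], demo_matrix, k=1)
#     # -> [(<similarity to document 1>, 1)]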
def main():
    csv_file_name = 'ids_and_titles.csv'
    tfidf_matrix_file_name = 'tfidf_matrix.npz'
    if tfidf_matrix_file_name not in os.listdir('./') or csv_file_name not in os.listdir('./'):
        paper_ids, titles, abstracts, bodies = get_all_data(filter_non_english=False)
        tfidf_matrix = get_tfidf_matrix([a + " " + b for a, b in zip(abstracts, bodies)])
        # save the expensive intermediate results for future runs
        # (newline='' keeps csv.writer from inserting blank rows on Windows,
        #  so each writerow comes back as exactly one row below)
        with open(csv_file_name, 'w', encoding='utf-8', newline='') as f:
            wr = csv.writer(f)
            wr.writerow(paper_ids)
            wr.writerow(titles)
        sparse.save_npz(tfidf_matrix_file_name, tfidf_matrix)
    else:
        tfidf_matrix = sparse.load_npz(tfidf_matrix_file_name)
        with open(csv_file_name, 'r', encoding='utf-8', newline='') as f:
            rd = csv.reader(f)
            paper_ids = next(rd)
            titles = next(rd)
    print("Shape of tfidf matrix: {}".format(tfidf_matrix.shape))
    print("Number of titles: {}".format(len(titles)))
    print("Number of unique titles: {}".format(len(set(titles))))
    while True:
        query = input("\nEnter a paper-id: ")
        index = -1 if query not in paper_ids else paper_ids.index(query)
        if index < 0:
            print("The entered paper-id isn't in our archive.")
            continue
        best_papers_scores_and_indices = get_indices_with_highest_similarity(tfidf_matrix[index], tfidf_matrix, k=5)
        print("The most similar papers are:")
        for score, index in best_papers_scores_and_indices:
            print("{}\t(title: {}\tsimilarity score: {})".format(paper_ids[index], titles[index], score))


if __name__ == '__main__':
    main()
import json
import os
from typing import List, Dict, Tuple

from most_similar_papers_via_tfidf import extract_data_from_single_json


def get_data_in_folder(path: str, dict_of_titles: Dict[str, List[str]]) -> Dict[str, List[str]]:
    print(path)
    file_names = os.listdir(path)
    file_names = map(lambda n: os.path.join(path, n), file_names)
    for file_name in file_names:
        with open(file_name, 'r') as file_handle:
            parsed_json = json.load(file_handle)
        title = parsed_json['metadata']['title']
        if title in dict_of_titles:
            dict_of_titles[title].append(file_name)
        else:
            dict_of_titles[title] = [file_name]
    return dict_of_titles


def get_all_data() -> Dict[str, List[str]]:
    all_paths = [
        'data/biorxiv_medrxiv/pdf_json',
        'data/comm_use_subset/pdf_json',
        'data/comm_use_subset/pmc_json',
        'data/custom_license/pdf_json',
        'data/custom_license/pmc_json',
        'data/noncomm_use_subset/pdf_json',
        'data/noncomm_use_subset/pmc_json'
    ]
    dict_of_titles: Dict[str, List[str]] = {}
    for path in all_paths:
        dict_of_titles = get_data_in_folder(path, dict_of_titles)
    return dict_of_titles


def count_the_number_of_deep_duplicates(dict_of_titles: Dict[str, List[str]]):
    duplicates = get_info_of_duplicates(dict_of_titles)
    total_diff_bodies = 0
    total_diff = 0
    for title, file_names, _ in duplicates:
        extracted = map(extract_data_from_single_json, file_names)
        _, _, abstracts, bodies = tuple(zip(*extracted))
        first = abstracts[0] + " " + bodies[0]
        total_diff += 0 if all(first == i for i in map(" ".join, zip(abstracts[1:], bodies[1:]))) else 1
        total_diff_bodies += 0 if all(bodies[0] == i for i in bodies[1:]) else 1
    print("{} of {} titles with multiple files have different bodies".format(total_diff_bodies, len(duplicates)))
    print("{} of {} titles with multiple files have a different abstract + body".format(total_diff, len(duplicates)))


def get_info_of_duplicates(dict_of_titles: Dict[str, List[str]]) -> List[Tuple[str, List[str], int]]:
    # keep only titles that occur in more than one file: (title, file_names, count)
    duplicates = map(lambda t: (t[0], t[1], len(t[1])), dict_of_titles.items())
    duplicates = list(filter(lambda t: t[2] > 1, duplicates))
    return duplicates


def how_many_papers_have_duplicates_and_latex_information(dict_of_titles: Dict[str, List[str]]):
    duplicates = get_info_of_duplicates(dict_of_titles)
    duplicate_file_names = map(lambda x: x[1], duplicates)
    duplicate_file_names = sum(duplicate_file_names, [])  # flatten the lists of file names
    total_using_latex = 0
    for file_name in duplicate_file_names:
        _, _, abstract, body = extract_data_from_single_json(file_name)
        text = abstract + ' ' + body
        total_using_latex += 1 if '\\usepackage' in text else 0
    print('{} of them contain LaTeX markup'.format(total_using_latex))


def main():
    if False:  # flip to True to rebuild title_information.json from the raw data
        dict_of_titles = get_all_data()
        with open('title_information.json', 'w', encoding='utf-8') as f:
            json.dump(dict_of_titles, f, ensure_ascii=False, indent=4)
    else:
        with open('title_information.json', 'r', encoding='utf-8') as f:
            dict_of_titles = json.load(f)
    how_many_papers_have_duplicates_and_latex_information(dict_of_titles)
    # count_the_number_of_deep_duplicates(dict_of_titles)
    print('hello')


if __name__ == '__main__':
    main()