Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
64 commits
Select commit Hold shift + click to select a range
73adb1b
Merge pull request #10 from TheDataRideAlongs/fix(50m)
lmeyerov Mar 31, 2020
ec478f3
refactor(print): now via Python logger
lmeyerov Apr 1, 2020
eb4e7c6
fix(fh missing cols): handle
lmeyerov Apr 1, 2020
771c1fb
feat(fixed fh arrow schema): and jsonify nested cols bc pq writer can…
lmeyerov Apr 1, 2020
5c1d0ff
Merge pull request #11 from TheDataRideAlongs/dev/fix-fh
lmeyerov Apr 1, 2020
4bab6db
Changed the print() statements for actual python logging statements
bechbd Apr 1, 2020
fc851da
Merge pull request #12 from TheDataRideAlongs/logging_fix
bechbd Apr 1, 2020
262e8d8
adding gitignore
007vasy Apr 2, 2020
f2b6543
Merge branch 'master' into Issue-#1
007vasy Apr 2, 2020
c331889
prints changed to logger
007vasy Apr 2, 2020
cecb571
Added methods to get_from_neo and get_tweets_by_id
bechbd Apr 2, 2020
1436b8d
Merge pull request #14 from TheDataRideAlongs/logging_fix
bechbd Apr 2, 2020
207d32c
Use pandas when cudf does not exist
ZiyaoWei Apr 2, 2020
14cb467
Merge pull request #15 from TheDataRideAlongs/wzy/no_gpu
ZiyaoWei Apr 2, 2020
f1426b6
Prototype rehydrate pipeline
ZiyaoWei Apr 2, 2020
f14dffe
Fix bug
ZiyaoWei Apr 2, 2020
325db35
docs(README): add calendar
lmeyerov Apr 2, 2020
c48ef17
Merge branch 'master' into Issue-#1
007vasy Apr 2, 2020
3c24774
Fixed issue with limit on get_from_neo, added timeout parameter, and …
bechbd Apr 2, 2020
a323358
Merge pull request #13 from TheDataRideAlongs/Issue-#1
007vasy Apr 2, 2020
83e7dcd
Merge branch 'master' into switch_python_neo_driver
bechbd Apr 2, 2020
ba2ea00
Merge pull request #17 from TheDataRideAlongs/switch_python_neo_driver
bechbd Apr 2, 2020
83a2294
docs(tightening)
lmeyerov Apr 3, 2020
328c997
Skip when there is no data
ZiyaoWei Apr 2, 2020
b72eb75
Fix logging, add parameter for saving to Neo4j
ZiyaoWei Apr 3, 2020
256df48
Merge pull request #20 from TheDataRideAlongs/wzy/rehydratePipeline
ZiyaoWei Apr 3, 2020
4c2b1ab
docs(issue tracker): link gh projects on README
lmeyerov Apr 3, 2020
a89528a
docs(volunteers): Add legal
lmeyerov Apr 3, 2020
8ec9061
docs(README): project tracker links
lmeyerov Apr 3, 2020
80c8622
Got initial version of the unit tests working for Neo
bechbd Apr 3, 2020
609b95a
Add docker-compose.yml for prefect UI
ZiyaoWei Apr 4, 2020
8e7a88b
Merge pull request #37 from TheDataRideAlongs/wzy/dockerizePrefect
ZiyaoWei Apr 4, 2020
dea4f03
fixed getting international trials, with utf8 encoding
007vasy Apr 6, 2020
55958d2
urls to config
007vasy Apr 6, 2020
51ab12f
removed console usage from scraping data
007vasy Apr 6, 2020
22d418a
Dockerize pipeline and add instructions
ZiyaoWei Apr 5, 2020
70f196e
Merge pull request #40 from TheDataRideAlongs/wzy/dockerizePipelines
ZiyaoWei Apr 6, 2020
07409e4
comfortable neo4j import setup
007vasy Apr 6, 2020
8cf111b
comfortable edge inserting into neo4j
007vasy Apr 6, 2020
4e6c3ec
flexible insertion into neo4j
007vasy Apr 6, 2020
a4aa10f
add config
007vasy Apr 6, 2020
f1831f6
Made minor tweaks to get the prefect ui stuff to run correctly on the…
bechbd Apr 7, 2020
4a4970e
Merge pull request #48 from TheDataRideAlongs/update-prefect-ui-files
bechbd Apr 7, 2020
c96adf3
data scraping into class
007vasy Apr 7, 2020
c9c89f1
updated gitignore
007vasy Apr 7, 2020
3f6f5c5
all drugs and synonyms are imported
007vasy Apr 7, 2020
52be179
cleanup
007vasy Apr 7, 2020
6e554ef
refactor
007vasy Apr 7, 2020
7a030d6
filtering international studies
007vasy Apr 7, 2020
31db102
docs(README.md): infra links
lmeyerov Apr 7, 2020
530c087
Merge pull request #52 from TheDataRideAlongs/add_neo_unit_tests
bechbd Apr 8, 2020
9b137f1
Added metrics configuration
bechbd Apr 8, 2020
d73186b
Added metrics configuration
bechbd Apr 8, 2020
0d7e2e5
drug analysis
007vasy Apr 8, 2020
6df6b15
table merging WIP
007vasy Apr 9, 2020
1567927
studies normalized into one table from 2 different sources
007vasy Apr 9, 2020
98c9e19
#39 - Added method to allow for adding enrichment properties to a node
bechbd Apr 9, 2020
2a9afb0
studies to neo4j is done
007vasy Apr 9, 2020
4c17c8c
WF
007vasy Apr 9, 2020
6c4bc2b
study import fix
007vasy Apr 9, 2020
46fdf2b
drug-study links
007vasy Apr 9, 2020
9aa0a13
cudf set up
007vasy Apr 9, 2020
212a9e2
dict to cypher property code merge for make it available for others
007vasy Apr 9, 2020
d3f626e
Merge pull request #56 from TheDataRideAlongs/DictToCypherProperties
bechbd Apr 9, 2020
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
studies to neo4j is done
  • Loading branch information
007vasy committed Apr 9, 2020
commit 2a9afb0d746e2947810167fbf728a4bc79b8fa95
25 changes: 24 additions & 1 deletion modules/TempNB/DrugSynonymDataToNeo4j.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from neo4j import GraphDatabase
from typing import Optional
from pandas import DataFrame
from numpy import isnan
import logging
logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO)
logger = logging.getLogger('ds-neo4j')
def property_type_checker(property_value):
    """Normalise a single value for inline use in a Cypher property map.

    Numbers pass through untouched, strings are double-quoted with inner
    quotes escaped, and falsy non-numeric values (e.g. None) collapse to
    an empty string; anything else is returned as-is.
    """
    if isinstance(property_value, (int, float)):
        return property_value
    if isinstance(property_value, str):
        escaped = property_value.replace('"', r"\"")
        return '"' + escaped + '"'
    if not property_value:
        return ""
    return property_value

resp:str = ""
Expand All @@ -31,7 +35,26 @@ def __init__(self, uri="bolt://localhost:7687", user="neo4j", password="letmein"

def close(self):
self._driver.close()

def upload_studies(self, studies: DataFrame):
    """Merge each row of *studies* into Neo4j as a node of type "Study".

    Rows are taken from the DataFrame (one property dict per row) and
    written one transaction at a time; progress is logged roughly every
    100 merged nodes.
    """
    merge_node = self._merge_node
    with self._driver.session() as session:
        logger.info("> Importing Studies Job is Started")
        imported = 0
        last_logged = 0
        for record in studies.T.to_dict().values():
            # Each row's column/value mapping becomes the node's property map.
            session.write_transaction(merge_node, "Study", record)
            imported += 1
            if imported > last_logged + 100:
                last_logged = imported
                logger.info("> {} nodes already imported".format(imported))
        logger.info("> Importing Studies Job is >> Done << with {} nodes imported".format(imported))


def upload_drugs_and_synonyms(self,drug_vocab):
node_merging_func = self._merge_node
edge_merging_func = self._merge_edge
Expand Down
69 changes: 58 additions & 11 deletions modules/TempNB/IngestDrugSynonyms.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,25 +38,25 @@ def api(query,from_study,to_study,url):
response = requests.request("GET", url)
return response.json()

def api_wrapper(self, query, from_study):
    """Fetch one page (100 studies) from the US registry starting at *from_study*."""
    last_study = from_study + 99
    return self.api(query, from_study, last_study, self.url_USA)

def getAllStudiesByQuery(self, query: str) -> list:
    """Collect every full study matching *query*, paging the API 100 at a time.

    Returns an empty list when the keyword matches nothing.
    """
    first_page = self.api_wrapper(query, 1)
    total = first_page['FullStudiesResponse']['NStudiesFound']
    logger.info("> {} studies found by '{}' keyword".format(total, query))
    if total <= 0:
        return []
    collected = first_page['FullStudiesResponse']['FullStudies']
    # Remaining pages start right after the first 100 results.
    for page_start in range(101, total, 100):
        page = self.api_wrapper(query, page_start)
        collected.extend(page['FullStudiesResponse']['FullStudies'])
    return collected

@staticmethod
def xlsHandler(r):
def xls_handler(r):
df = pd.DataFrame()
with tempfile.NamedTemporaryFile("wb") as xls_file:
xls_file.write(r.content)
Expand All @@ -79,7 +79,7 @@ def xlsHandler(r):
return df

@staticmethod
def csvZipHandler(r):
def csvzip_handler(r):
df = pd.DataFrame()
with tempfile.NamedTemporaryFile("wb",suffix='.csv.zip') as file:
file.write(r.content)
Expand All @@ -92,14 +92,47 @@ def urlToDF(url:str,respHandler) -> pd.DataFrame:
r = requests.get(url, allow_redirects=True)
return respHandler(r)

def scrapeData(self):
self.internationalstudies = self.urlToDF(self.url_international,self.xlsHandler)
self.drug_vocab_df = self.urlToDF(self.url_drugbank,self.csvZipHandler)
@staticmethod
def _convert_US_studies(US_studies:dict) -> pd.DataFrame:
list_of_US_studies:list = []
for key in US_studies.keys():
for study in US_studies[key]:
temp_dict:dict = {}

temp_dict["trial_id"] = study["Study"]["ProtocolSection"]["IdentificationModule"]["NCTId"]
temp_dict["study_url"] = "https://clinicaltrials.gov/show/" + temp_dict["trial_id"]

try:
temp_dict["intervention"] = study["Study"]["ProtocolSection"]["ArmsInterventionsModule"]["ArmGroupList"]["ArmGroup"][0]["ArmGroupInterventionList"]["ArmGroupInterventionName"][0]
except:
temp_dict["intervention"] = ""
try:
temp_dict["study_type"] = study["Study"]["ProtocolSection"]["DesignModule"]["StudyType"]
except:
temp_dict["study_type"] = ""
try:
temp_dict["target_size"] = study["Study"]["ProtocolSection"]["DesignModule"]["EnrollmentInfo"]["EnrollmentCount"]
except:
temp_dict["target_size"] = ""
try:
if "OfficialTitle" in study["Study"]["ProtocolSection"]["IdentificationModule"].keys():
temp_dict["public_title"] = study["Study"]["ProtocolSection"]["IdentificationModule"]["OfficialTitle"]
else:
temp_dict["public_title"] = study["Study"]["ProtocolSection"]["IdentificationModule"]["BriefTitle"]
except:
temp_dict["public_title"] = ""
list_of_US_studies.append(temp_dict)
US_studies_df:pd.DataFrame = pd.DataFrame(list_of_US_studies)
return US_studies_df

def _scrapeData(self):
    """Download every raw input: the international-trials spreadsheet, the
    drug-vocabulary csv.zip, and the US studies for each query keyword."""
    self.internationalstudies = self.urlToDF(self.url_international, self.xls_handler)
    self.drug_vocab_df = self.urlToDF(self.url_drugbank, self.csvzip_handler)
    self.all_US_studies_by_keyword: dict = {
        keyword: self.getAllStudiesByQuery(keyword)
        for keyword in self.query_keywords
    }

def filterData(self):
def _filterData(self):
self.drug_vocab_reduced = self.drug_vocab_df[['Common name', 'Synonyms']]
self.internationalstudies_reduced = self.internationalstudies[['TrialID', 'Intervention','Study type','web address','Target size', "Public title"]]
self.internationalstudies_reduced.columns = [col.replace(" ","_").lower() for col in self.internationalstudies_reduced.columns]
Expand All @@ -113,10 +146,24 @@ def filterData(self):
for index, row in self.drug_vocab_reduced.iterrows():
self.drug_vocab[row['Common name']] = row["Synonyms"].split("|") if isinstance(row["Synonyms"],str) else row["Synonyms"]

def saveDataToFile(self):
self.US_studies_df = self._convert_US_studies(self.all_US_studies_by_keyword)

self.all_studies_df = pd.concat([self.US_studies_df,self.internationalstudies_reduced])
self.all_studies_df.drop_duplicates(subset="trial_id",inplace=True)
self.all_studies_df.fillna("",inplace=True)
logger.info("> {} distinct studies found".format(len(self.all_studies_df)))

def save_data_to_fiile(self):
    """Dump the scraped raw datasets to local files (debug helper only).

    NOTE(review): the method name has a typo ("fiile") — kept unchanged
    for caller compatibility.
    """
    logger.warning("Only Use it for debug purposes!!!")
    self.internationalstudies.to_csv("internationalstudies.csv")
    self.drug_vocab_df.to_csv("drug_vocab.csv")
    with open('all_US_studies_by_keyword.json', 'w', encoding='utf-8') as dump_file:
        json.dump(self.all_US_studies_by_keyword, dump_file, ensure_ascii=False, indent=4)

def auto_get_and_clean_data(self):
    """Convenience entry point: download all sources, then normalise them.

    Order matters — filtering operates on the attributes the scrape sets.
    """
    self._scrapeData()
    self._filterData()

def create_drug_study_link(self):
    """Placeholder for linking drugs to studies — not implemented yet (no-op)."""
    pass
5 changes: 2 additions & 3 deletions modules/TempNB/IngestDrugSynonymsWF.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@
from DrugSynonymDataToNeo4j import DrugSynonymDataToNeo4j

# Workflow entry point: scrape and clean the drug/synonym and study data,
# then push drugs and synonyms into Neo4j.
# (Reconstructed post-commit state: the old scrapeData()/filterData() calls
# were replaced by auto_get_and_clean_data(), and the upload call was
# commented out in this commit.)
drugSynonym = IngestDrugSynonyms()
drugSynonym.auto_get_and_clean_data()

neo4jBridge = DrugSynonymDataToNeo4j()
# neo4jBridge.upload_drugs_and_synonyms(drugSynonym.drug_vocab)