Skip to content

Commit 3e258e1

Browse files
authored
Merge pull request #81 from TheDataRideAlongs/dev/fix-twint
Dev/fix twint
2 parents a9bdd5c + 8454f55 commit 3e258e1

46 files changed

Lines changed: 621 additions & 3819 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

‎.idea/ProjectDomino.iml‎

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

‎.idea/misc.xml‎

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
FROM python:3.7
2+
RUN apt-get update \
3+
&& apt-get install -y --no-install-recommends git vim tor \
4+
&& apt-get clean \
5+
&& rm -rf /var/lib/apt/lists/*
6+
7+
8+
9+
RUN pip install prefect==0.10.1 simplejson twarc neo4j boto3==1.12.39 \
10+
pandas pyarrow urlextract git+https://github.com/homm/yurl.git@1943161973aeb3b3cf2e1e9de6671673b8356161
11+
12+
RUN echo "ok6" && pip install git+https://github.com/TheDataRideAlongs/twint.git
13+
#RUN pip install git+https://github.com/twintproject/twint.git
14+
#git+https://github.com/lmeyerov/twint.git@patch-1#egg=twint
15+
16+
#FIXME this should be part of entrypoint / service config?
17+
#RUN service tor start
18+
19+
COPY ./modules /app/ProjectDomino
20+
COPY ./infra/pipelines/docker/jobs /app
21+
22+
HEALTHCHECK --interval=60s --timeout=15s --start-period=20s \
23+
CMD curl -sf --socks5-hostname localhost:9050 https://check.torproject.org | grep Congrat
24+
25+
WORKDIR /app
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
########
2+
#
3+
# Run from git root's parent: with .env in local folder and ProjectDomino/ inside
4+
#
5+
# $ touch .env
6+
# $ sudo docker-compose -f ./ProjectDomino/infra/pipelines/docker/docker-compose.yml up -d prefect-agent
7+
#
8+
########
9+
10+
version: '3'
11+
12+
services:
13+
data-stream:
14+
build:
15+
context: ../../../
16+
dockerfile: infra/pipelines/docker/datastream-Dockerfile
17+
tty: true
18+
network_mode: 'bridge'
19+
command: sh -c "pwd && ls && service tor start && python3 /app/search_by_date_job.py"
20+
volumes:
21+
- /home/codywebb/ProjectDomino/infra/pipelines/docker/jobs/neo4jcreds.json:/secrets/neo4jcreds.json:ro
22+
environment:
23+
PREFECT__SERVER__HOST: ${PREFECT__SERVER__HOST:-http://host.docker.internal}
24+
PREFECT__SERVER__PORT: ${PREFECT__SERVER__PORT:-4200}
25+
PREFECT__SERVER__UI__HOST: ${PREFECT__SERVER__UI__HOST:-http://host.docker.internal}
26+
PREFECT__SERVER__UI__PORT: ${PREFECT__SERVER__UI__PORT:-8080}
27+
AWS_ACCESS_KEY_ID: ${AWS_ACCESS_KEY_ID:-}
28+
AWS_SECRET_ACCESS_KEY: ${AWS_SECRET_ACCESS_KEY:-}
29+
logging:
30+
options:
31+
tag: 'ImageName:{{.ImageName}}/Name:{{.Name}}/ID:{{.ID}}/ImageFullID:{{.ImageFullID}}'

‎infra/pipelines/docker/docker-compose.yml‎

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ services:
1414
image: prefect-agent:0.0.5
1515
build:
1616
context: ../../../
17-
dockerfile: ./infra/pipelines/docker/nonrapids-Dockerfile
17+
dockerfile: ./infra/pipelines/docker/Dockerfile
1818
container_name: prefect-agent
1919
network_mode: 'bridge'
2020
restart: unless-stopped
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
#!/usr/bin/env python
2+
# coding: utf-8
3+
4+
# In[26]:
5+
6+
7+
8+
9+
10+
# In[27]:
11+
12+
13+
import logging
14+
logger = logging.getLogger()
15+
logger.setLevel(logging.INFO) #DEBUG, INFO, WARNING, ERROR, CRITICAL
16+
17+
18+
19+
20+
# In[28]:
21+
22+
23+
import json, pandas as pd
24+
from ProjectDomino.Neo4jDataAccess import Neo4jDataAccess
25+
from ProjectDomino.FirehoseJob import FirehoseJob
26+
from ProjectDomino.TwintPool import TwintPool
27+
from prefect.environments.storage import S3
28+
from prefect import Flow,task
29+
from prefect.schedules import IntervalSchedule
30+
from datetime import timedelta, datetime
31+
from random import randrange
32+
from prefect.engine.executors import DaskExecutor
33+
import time
34+
import random
35+
36+
37+
# In[29]:
38+
39+
40+
41+
42+
43+
# In[30]:
44+
45+
46+
S3_BUCKET = "wzy-project-domino"
47+
48+
49+
# In[31]:
50+
51+
52+
pd.set_option('display.max_colwidth', None)
53+
pd.set_option('display.max_rows', 500)
54+
pd.set_option('display.max_columns', 500)
55+
pd.set_option('display.width', 1000)
56+
57+
58+
# ## task
59+
60+
# In[33]:
61+
62+
63+
def random_date(start, end):
64+
delta = end - start
65+
int_delta = (delta.days * 24 * 60 * 60) + delta.seconds
66+
random_second = randrange(int_delta)
67+
return start + timedelta(seconds=random_second)
68+
69+
def get_creds():
70+
neo4j_creds = None
71+
with open('/secrets/neo4jcreds.json') as json_file:
72+
neo4j_creds = json.load(json_file)
73+
return neo4j_creds
74+
75+
@task(log_stdout=True, skip_on_upstream_skip=True)
76+
def run_stream():
77+
creds = get_creds()
78+
start = datetime.strptime("2020-03-11 20:00:00", "%Y-%m-%d %H:%M:%S")
79+
current = datetime.strptime(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "%Y-%m-%d %H:%M:%S")
80+
#rand_dt=random_date(start, current)
81+
#2020-10-10 16:07:30
82+
#2020-10-11 06:29:00 to 2020-10-11 06:29:30:
83+
tp = TwintPool(is_tor=True)
84+
fh = FirehoseJob(neo4j_creds=creds, PARQUET_SAMPLE_RATE_TIME_S=30, save_to_neo=True, writers={})
85+
try:
86+
search = "covid OR corona OR virus OR pandemic"
87+
job_name = "covid multi test"
88+
limit = 10000000
89+
for df in fh.search_time_range(tp=tp, Search=search, Since=str(start), Until=str(current), job_name=job_name, Limit=10000000, stride_sec=30):
90+
logger.info('got: %s', len(df) if not (df is None) else 'None')
91+
logger.info('proceed to next df')
92+
except Exception as e:
93+
logger.error("job exception", exc_info=True)
94+
raise e
95+
logger.info("job finished")
96+
97+
# In[ ]:
98+
99+
100+
schedule = IntervalSchedule(
101+
start_date=datetime(2020, 9, 5),
102+
interval=timedelta(seconds=10),
103+
)
104+
storage = S3(bucket=S3_BUCKET)
105+
106+
#with Flow("covid-19 stream", storage=storage, schedule=schedule) as flow:
107+
with Flow("covid-19 stream-single") as flow:
108+
run_stream()
109+
flow.run()
110+
111+
112+
# In[ ]:
113+
114+
115+
116+

‎infra/pipelines/docker/nonrapids-Dockerfile‎

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ RUN apt-get update \
66

77
COPY . .
88
RUN pip install prefect==0.10.1 simplejson twarc neo4j boto3==1.12.39 \
9-
pandas twint \
9+
pandas git+https://github.com/twintproject/twint.git@origin/master#egg=twint \
1010
&& ( prefect agent install local > supervisord.conf )
1111
RUN prefect backend server
1212
RUN ["chmod","+x","./infra/pipelines/docker/nonrapids-entrypoint.sh"]

‎infra/pipelines/docker/nonrapids-docker-compose.yml‎

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -7,23 +7,16 @@
77
#
88
########
99

10-
version: '3.4'
10+
version: '3'
1111

1212
services:
13-
##############################################################################
14-
tor:
15-
build: ./tor
16-
network_mode: 'bridge'
17-
restart: always
18-
ports:
19-
- 127.0.0.1:9050:9050
13+
2014
##############################################################################
2115
prefect-agent:
2216
image: prefect-agent:0.0.5
2317
build:
2418
context: ../../../
2519
dockerfile: ./infra/pipelines/docker/nonrapids-Dockerfile
26-
container_name: prefect-agent
2720
network_mode: 'bridge'
2821
restart: unless-stopped
2922
environment:
@@ -35,4 +28,6 @@ services:
3528
AWS_SECRET_ACCESS_KEY: ${AWS_SECRET_ACCESS_KEY:-}
3629
logging:
3730
options:
38-
tag: 'ImageName:{{.ImageName}}/Name:{{.Name}}/ID:{{.ID}}/ImageFullID:{{.ImageFullID}}'
31+
tag: 'ImageName:{{.ImageName}}/Name:{{.Name}}/ID:{{.ID}}/ImageFullID:{{.ImageFullID}}'
32+
#############################################################################################
33+

‎infra/pipelines/docker/search.sh‎

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
#!/bin/bash
2+
3+
docker-compose -f datastream-docker-compose.yml build && docker-compose -f datastream-docker-compose.yml up -d && docker-compose -f datastream-docker-compose.yml logs -f -t --tail=1

‎infra/pipelines/docker/tor/Dockerfile‎

Lines changed: 0 additions & 23 deletions
This file was deleted.

0 commit comments

Comments
 (0)