TheDataRideAlongs
diff --git a/‎.idea/ProjectDomino.iml‎
Lines changed: 1 addition & 1 deletion b/‎.idea/ProjectDomino.iml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.idea/misc.xml‎
Lines changed: 1 addition & 0 deletions b/‎.idea/misc.xml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎infra/pipelines/docker/datastream-Dockerfile‎
Lines changed: 25 additions & 0 deletions b/‎infra/pipelines/docker/datastream-Dockerfile‎
Lines changed: 25 additions & 0 deletions
diff --git a/‎infra/pipelines/docker/datastream-docker-compose.yml‎
Lines changed: 31 additions & 0 deletions b/‎infra/pipelines/docker/datastream-docker-compose.yml‎
Lines changed: 31 additions & 0 deletions
diff --git a/‎infra/pipelines/docker/docker-compose.yml‎
Lines changed: 1 addition & 1 deletion b/‎infra/pipelines/docker/docker-compose.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎infra/pipelines/docker/jobs/search_by_date_job.py‎
Lines changed: 116 additions & 0 deletions b/‎infra/pipelines/docker/jobs/search_by_date_job.py‎
Lines changed: 116 additions & 0 deletions
diff --git a/‎infra/pipelines/docker/nonrapids-Dockerfile‎
Lines changed: 1 addition & 1 deletion b/‎infra/pipelines/docker/nonrapids-Dockerfile‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎infra/pipelines/docker/nonrapids-docker-compose.yml‎
Lines changed: 5 additions & 10 deletions b/‎infra/pipelines/docker/nonrapids-docker-compose.yml‎
Lines changed: 5 additions & 10 deletions
diff --git a/‎infra/pipelines/docker/search.sh‎
Lines changed: 3 additions & 0 deletions b/‎infra/pipelines/docker/search.sh‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎infra/pipelines/docker/tor/Dockerfile‎
Lines changed: 0 additions & 23 deletions b/‎infra/pipelines/docker/tor/Dockerfile‎
Lines changed: 0 additions & 23 deletions
@@ -0,0 +1,25 @@
+# Image for the data-stream job: Python 3.7 with git, vim and a Tor client installed.
+FROM python:3.7
+# git is used by the pip "git+https" installs below; tor is the proxy probed by the
+# HEALTHCHECK. The apt lists are removed in the same layer to keep the image small.
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends git vim tor \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+
+
+# Python dependencies of the job; yurl is pinned to a specific commit.
+RUN pip install prefect==0.10.1 simplejson twarc neo4j boto3==1.12.39 \
+    pandas  pyarrow urlextract git+https://github.com/homm/yurl.git@1943161973aeb3b3cf2e1e9de6671673b8356161
+
+# twint is installed from the TheDataRideAlongs fork (unpinned).
+# NOTE(review): the echo "ok6" looks like a manual cache-busting marker (edit the
+# string to force this layer to rebuild) -- confirm before removing.
+RUN echo "ok6" && pip install git+https://github.com/TheDataRideAlongs/twint.git
+# Alternative twint sources, currently disabled:
+#RUN pip install git+https://github.com/twintproject/twint.git
+#git+https://github.com/lmeyerov/twint.git@patch-1#egg=twint
+
+#FIXME this should be part of entrypoint / service config?
+# (Tor is not started at build time; the compose command runs "service tor start".)
+#RUN service tor start 
+
+# The modules directory becomes /app/ProjectDomino and the job scripts land in /app.
+COPY ./modules /app/ProjectDomino
+COPY ./infra/pipelines/docker/jobs /app
+
+# Healthy only when a request sent through the SOCKS5 proxy on localhost:9050 reaches
+# check.torproject.org and the response contains "Congrat".
+HEALTHCHECK --interval=60s --timeout=15s --start-period=20s \
+    CMD curl -sf --socks5-hostname localhost:9050 https://check.torproject.org | grep Congrat
+
+WORKDIR /app
@@ -0,0 +1,31 @@
+########
+#
+# Run from git root's parent: with .env in local folder and ProjectDomino/ inside
+#
+# $ touch .env
+# $ sudo docker-compose -f ./ProjectDomino/infra/pipelines/docker/datastream-docker-compose.yml up -d data-stream
+#
+# Set NEO4J_CREDS_FILE to the host path of the Neo4j credentials JSON.
+# When unset it defaults to ./jobs/neo4jcreds.json, resolved relative to this file.
+#
+########
+
+version: '3'
+
+services:
+  data-stream:
+      build:
+        context: ../../../
+        dockerfile: infra/pipelines/docker/datastream-Dockerfile
+      tty: true
+      network_mode: 'bridge'
+      # Tor must be running inside the container before the job starts.
+      command: sh -c "pwd && ls && service tor start && python3 /app/search_by_date_job.py"
+      volumes:
+        # Credentials are mounted read-only at the path the job opens (/secrets/neo4jcreds.json).
+        - ${NEO4J_CREDS_FILE:-./jobs/neo4jcreds.json}:/secrets/neo4jcreds.json:ro
+      environment:
+        PREFECT__SERVER__HOST: ${PREFECT__SERVER__HOST:-http://host.docker.internal}
+        PREFECT__SERVER__PORT: ${PREFECT__SERVER__PORT:-4200}
+        PREFECT__SERVER__UI__HOST: ${PREFECT__SERVER__UI__HOST:-http://host.docker.internal}
+        PREFECT__SERVER__UI__PORT: ${PREFECT__SERVER__UI__PORT:-8080}
+        AWS_ACCESS_KEY_ID: ${AWS_ACCESS_KEY_ID:-}
+        AWS_SECRET_ACCESS_KEY: ${AWS_SECRET_ACCESS_KEY:-}
+      logging:
+        options:
+          tag: 'ImageName:{{.ImageName}}/Name:{{.Name}}/ID:{{.ID}}/ImageFullID:{{.ImageFullID}}'
@@ -14,7 +14,7 @@ services:
     image:  prefect-agent:0.0.5
     build:
       context: ../../../
-      dockerfile: ./infra/pipelines/docker/nonrapids-Dockerfile
+      dockerfile: ./infra/pipelines/docker/Dockerfile
     container_name: prefect-agent
     network_mode: 'bridge'
     restart: unless-stopped
 
@@ -0,0 +1,116 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+# In[26]:
+
+
+
+
+
+# In[27]:
+
+
+import logging
+# getLogger() with no name returns the root logger; its threshold is set to INFO.
+logger = logging.getLogger()
+logger.setLevel(logging.INFO) #DEBUG, INFO, WARNING, ERROR, CRITICAL
+
+
+
+
+# In[28]:
+
+
+import json, pandas as pd
+from ProjectDomino.Neo4jDataAccess import Neo4jDataAccess
+from ProjectDomino.FirehoseJob import FirehoseJob
+from ProjectDomino.TwintPool import TwintPool
+from prefect.environments.storage import S3
+from prefect import Flow,task
+from prefect.schedules import IntervalSchedule
+from datetime import timedelta, datetime
+from random import randrange
+from prefect.engine.executors import DaskExecutor
+import time
+import random
+
+
+# In[29]:
+
+
+
+
+
+# In[30]:
+
+
+# Bucket name handed to the Prefect S3 storage object created further down.
+S3_BUCKET = "wzy-project-domino"
+
+
+# In[31]:
+
+
+# Relax pandas' display limits: no column-width cap, up to 500 rows and 500 columns,
+# and a display width of 1000 characters.
+pd.set_option('display.max_colwidth', None)
+pd.set_option('display.max_rows', 500)
+pd.set_option('display.max_columns', 500)
+pd.set_option('display.width', 1000)
+
+
+# ## task
+
+# In[33]:
+
+
+def random_date(start, end):
+    """Return a random datetime in ``[start, end)`` at whole-second offsets from *start*.
+
+    Args:
+        start: lower bound (inclusive) of the window.
+        end: upper bound (exclusive) of the window.
+
+    Returns:
+        ``start`` plus a random whole number of seconds smaller than the
+        window length. When the window is shorter than one second
+        (including ``start == end``) ``start`` itself is returned.
+
+    Raises:
+        ValueError: if *end* is earlier than *start*.
+    """
+    span = end - start
+    # Whole seconds only; the microseconds component of the span is ignored.
+    span_seconds = (span.days * 24 * 60 * 60) + span.seconds
+    if span_seconds < 0:
+        raise ValueError("end must not be earlier than start")
+    if span_seconds == 0:
+        # randrange(0) would raise; start is the only whole-second candidate.
+        return start
+    return start + timedelta(seconds=randrange(span_seconds))
+
+def get_creds():
+    """Parse and return the JSON credentials stored at /secrets/neo4jcreds.json."""
+    with open('/secrets/neo4jcreds.json') as creds_file:
+        return json.load(creds_file)
+
+@task(log_stdout=True, skip_on_upstream_skip=True)
+def run_stream():
+    """Search COVID-related tweets from 2020-03-11 20:00:00 up to the current time.
+
+    Loads the Neo4j credentials with get_creds(), then iterates
+    FirehoseJob.search_time_range (created with save_to_neo=True) using a
+    Tor-enabled TwintPool, in 30-second strides, logging the length of each
+    item it yields.
+
+    Raises:
+        Exception: any error raised during the search is logged with its
+            traceback and re-raised unchanged.
+    """
+    creds = get_creds()
+    start = datetime.strptime("2020-03-11 20:00:00", "%Y-%m-%d %H:%M:%S")
+    # Drop microseconds so str(current) renders as "YYYY-MM-DD HH:MM:SS", like str(start).
+    current = datetime.now().replace(microsecond=0)
+    tp = TwintPool(is_tor=True)
+    fh = FirehoseJob(neo4j_creds=creds, PARQUET_SAMPLE_RATE_TIME_S=30, save_to_neo=True, writers={})
+    search = "covid OR corona OR virus OR pandemic"
+    job_name = "covid multi test"
+    limit = 10000000
+    try:
+        for df in fh.search_time_range(tp=tp, Search=search, Since=str(start), Until=str(current), job_name=job_name, Limit=limit, stride_sec=30):
+            logger.info('got: %s', len(df) if df is not None else 'None')
+            logger.info('proceed to next df')
+    except Exception:
+        # Log at ERROR level with the traceback, then re-raise the same exception.
+        logger.exception("job exception")
+        raise
+    logger.info("job finished")
+
+# In[ ]:
+
+
+# The schedule (every 10 seconds, starting 2020-09-05) and the S3 storage are built
+# here, but only the commented-out Flow definition below refers to them.
+schedule = IntervalSchedule(
+    start_date=datetime(2020, 9, 5),
+    interval=timedelta(seconds=10),
+)
+storage = S3(bucket=S3_BUCKET)
+
+# Scheduled, S3-stored variant -- currently disabled:
+#with Flow("covid-19 stream", storage=storage, schedule=schedule) as flow:
+with Flow("covid-19 stream-single") as flow:
+    run_stream()
+# NOTE(review): this runs whenever the module is executed or imported (there is no
+# __main__ guard) -- confirm the file is only ever used as a script.
+flow.run()
+
+
+# In[ ]:
+
+
+
+
@@ -6,7 +6,7 @@ RUN apt-get update \
 
 COPY . . 
 RUN pip install prefect==0.10.1 simplejson twarc neo4j boto3==1.12.39 \
-    pandas twint \
+    pandas git+https://github.com/twintproject/twint.git@origin/master#egg=twint \
 && ( prefect agent install local > supervisord.conf )
 RUN prefect backend server
 RUN ["chmod","+x","./infra/pipelines/docker/nonrapids-entrypoint.sh"]
 
@@ -7,23 +7,16 @@
 #
 ########
 
-version: '3.4'
+version: '3'
 
 services:
-##############################################################################
-  tor:
-    build: ./tor
-    network_mode: 'bridge'
-    restart: always
-    ports:
-      - 127.0.0.1:9050:9050
+
 ##############################################################################
   prefect-agent:
     image:  prefect-agent:0.0.5
     build:
       context: ../../../
       dockerfile: ./infra/pipelines/docker/nonrapids-Dockerfile
-    container_name: prefect-agent
     network_mode: 'bridge'
     restart: unless-stopped
     environment:
@@ -35,4 +28,6 @@ services:
       AWS_SECRET_ACCESS_KEY: ${AWS_SECRET_ACCESS_KEY:-}
     logging:
       options:
-        tag: 'ImageName:{{.ImageName}}/Name:{{.Name}}/ID:{{.ID}}/ImageFullID:{{.ImageFullID}}'
+        tag: 'ImageName:{{.ImageName}}/Name:{{.Name}}/ID:{{.ID}}/ImageFullID:{{.ImageFullID}}'
+#############################################################################################
+
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+# Build the datastream image, start it detached, then follow its timestamped logs.
+# The compose file is located relative to this script, so the script can be
+# invoked from any working directory.
+COMPOSE_FILE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/datastream-docker-compose.yml"
+
+docker-compose -f "$COMPOSE_FILE" build \
+    && docker-compose -f "$COMPOSE_FILE" up -d \
+    && docker-compose -f "$COMPOSE_FILE" logs -f -t --tail=1
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+#!/bin/bash`
	`2`	`+`
	`3`	`+docker-compose -f datastream-docker-compose.yml build && docker-compose -f datastream-docker-compose.yml up -d && docker-compose -f datastream-docker-compose.yml logs -f -t --tail=1`