Skip to content

Commit 4a84f63

Browse files
authored
Merge pull request #69 from TheDataRideAlongs/twint
WIP: Twint
2 parents 52bee10 + 1ae176f commit 4a84f63

2 files changed

Lines changed: 124 additions & 2 deletions

File tree

‎modules/Neo4jDataAccess.py‎

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -322,7 +322,9 @@ def __save_df_to_graph(self, df, job_name, job_id=None):
322322
for index, row in df.iterrows():
323323
# determine the type of tweet
324324
tweet_type = 'TWEET'
325-
if row["in_reply_to_status_id"] is not None and row["in_reply_to_status_id"] > 0:
325+
if row['tweet_type_twint']:
326+
tweet_type = row['tweet_type_twint']
327+
elif row["in_reply_to_status_id"] is not None and row["in_reply_to_status_id"] > 0:
326328
tweet_type = "REPLY"
327329
elif "quoted_status_id" in row and row["quoted_status_id"] is not None and row["quoted_status_id"] > 0:
328330
tweet_type = "QUOTE_RETWEET"
@@ -347,8 +349,11 @@ def __save_df_to_graph(self, df, job_name, job_id=None):
347349
'user_created_at': pd.Timestamp(row['user_created_at'], unit='s').to_pydatetime(),
348350
'user_profile_image_url': row['user_profile_image_url'],
349351
'reply_tweet_id': row['in_reply_to_status_id'],
352+
'conversation_id': row['conversation_id'] if 'conversation_id' in row else None,
350353
'quoted_status_id': row['quoted_status_id'],
351354
'retweet_id': row['retweet_id'] if 'retweet_id' in row else None,
355+
'geo': row['geo'] if 'geo' in row else None,
356+
'ingest_method': row['ingest_method']
352357
})
353358
except Exception as e:
354359
logging.error('params.append exn', e)
@@ -437,4 +442,4 @@ def __parse_urls(self, row, url_params, job_name, job_id=None):
437442
logging.error(inst.args) # arguments stored in .args
438443
# __str__ allows args to be printed directly,
439444
logging.error(inst)
440-
return url_params
445+
return url_params

‎modules/Twint.py‎

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
import pyarrow as pa
2+
import twint
3+
from urlextract import URLExtractor
4+
from datetime import datetime, timedelta
5+
6+
7+
class TwintPool:
    """Wrapper around one shared ``twint.Config`` that pulls tweets into pandas.

    The same ``self.config`` object is mutated and reused by every method, so
    settings applied by one call (search term, username, extra keyword
    overrides) remain in place for subsequent calls.
    """

    def __init__(self, fh_job=None, job_name="noname"):
        """Build the default twint configuration.

        Args:
            fh_job: Opaque job handle, stored on ``self.fh``. No method of
                this class reads it.
            job_name: Accepted for interface compatibility; currently unused.
        """
        self.fh = fh_job
        self.config = twint.Config()
        self.config.Limit = 100
        # Results are read back from ``twint.storage.panda.Tweets_df`` below,
        # which is why pandas storage is switched on.
        self.config.Pandas = True
        self.config.User_full = True
        self.config.Hide_output = True

    @staticmethod
    def _parse_timestamp(time_str):
        """Parse a ``"%Y-%m-%d %H:%M:%S"`` string into a naive ``datetime``."""
        return datetime.strptime(time_str, "%Y-%m-%d %H:%M:%S")

    @staticmethod
    def _row_to_tweet_type(row):
        """Classify one twint row as QUOTE_RETWEET, RETWEET, TWEET or REPLY.

        ``row`` is any mapping exposing ``quote_url``, ``retweet``, ``id`` and
        ``conversation_id``.
        """
        quote_url = row["quote_url"]
        # A quote tweet is one that *has* a quote URL.
        if isinstance(quote_url, str) and quote_url:
            return "QUOTE_RETWEET"
        if row["retweet"]:
            return "RETWEET"
        # Compare as strings so an int id still matches a str conversation_id.
        if str(row["id"]) == str(row["conversation_id"]):
            return "TWEET"
        return "REPLY"

    @staticmethod
    def _row_to_quoted_status_id(row):
        """Return the last path segment of ``row["quote_url"]``, or ``None``."""
        quote_url = row["quote_url"]
        if isinstance(quote_url, str) and quote_url:
            return quote_url.split("/")[-1]
        return None

    def twint_loop(self, since, until, stride_sec=600, limit=None):
        """Run the configured search over consecutive time windows.

        Args:
            since: Start of the range, formatted ``"%Y-%m-%d %H:%M:%S"``.
            until: End of the range, same format. No window *starts* at or
                after this instant.
            stride_sec: Width of each window in seconds; must be positive.
            limit: Stop opening new windows once at least this many tweets
                have been returned in total. ``None`` means no cap.

        Yields:
            ``(tweets_df, window_start, window_end)`` for every window, where
            ``tweets_df`` is ``twint.storage.panda.Tweets_df`` as read right
            after the search for that window.

        Raises:
            ValueError: If ``stride_sec`` is not positive (the loop would
                never advance).
        """
        if stride_sec <= 0:
            raise ValueError("stride_sec must be positive")

        window_start = self._parse_timestamp(since)
        range_end = self._parse_timestamp(until)
        tweets_returned = 0

        # NOTE(review): the final window may extend past ``until`` when the
        # range is not a multiple of ``stride_sec`` -- confirm this is wanted.
        while window_start < range_end and (
            tweets_returned == 0 or limit is None or tweets_returned < limit
        ):
            window_end = window_start + timedelta(seconds=stride_sec)
            self.config.Since = str(window_start)
            self.config.Until = str(window_end)
            twint.run.Search(self.config)
            tweets_df = twint.storage.panda.Tweets_df
            tweets_returned += len(tweets_df)
            yield (tweets_df, window_start, window_end)
            window_start = window_end

    def _get_term(
        self,
        Search="IngSoc",
        Since="1984-04-20 13:00:00",
        Until="1984-04-20 13:30:00",
        stride_sec=600,
        **kwargs
    ):
        """Search for a term window by window.

        Args:
            Search: Search term assigned to ``self.config.Search``.
            Since: Range start, ``"%Y-%m-%d %H:%M:%S"``.
            Until: Range end, same format.
            stride_sec: Window width in seconds.
            **kwargs: Extra attributes set verbatim on ``self.config`` before
                searching (they may override ``Search``, ``Limit``, ...).

        Yields:
            The ``(tweets_df, window_start, window_end)`` tuples produced by
            ``twint_loop``, capped by ``self.config.Limit``.
        """
        self.config.Search = Search
        self.config.Retweets = True
        for key, value in kwargs.items():
            setattr(self.config, key, value)
        yield from self.twint_loop(Since, Until, stride_sec, self.config.Limit)

    def _get_timeline(self, username="lmeyerov"):
        """Run one search restricted to ``username`` and return its DataFrame.

        Returns:
            ``twint.storage.panda.Tweets_df`` as read right after the search.
        """
        self.config.Username = username
        self.config.Retweets = True
        twint.run.Search(self.config)
        return twint.storage.panda.Tweets_df

    def twint_df_to_neo4j_df(self, df):
        """Reshape a twint tweets DataFrame into the columns the Neo4j writer reads.

        Args:
            df: DataFrame with at least the twint columns ``id``, ``tweet``,
                ``hashtags``, ``quote_url``, ``retweet`` and
                ``conversation_id``. It is not modified.

        Returns:
            A renamed copy of ``df`` with the derived columns
            ``tweet_type_twint``, ``hashtags``, ``quoted_status_id``, ``urls``
            and ``ingest_method``, plus placeholder columns (``None`` or empty
            list) for fields this conversion does not populate.
        """
        # Imported locally under the name this method actually calls.
        # NOTE(review): the module-level ``from urlextract import URLExtractor``
        # does not match that name and should be reconciled.
        from urlextract import URLExtract

        # NOTE(review): twint ``username`` -> ``user_name`` and ``name`` ->
        # ``user_screen_name`` looks swapped relative to Twitter's own
        # name/screen_name meaning -- confirm against the Neo4j schema.
        neo4j_df = df.rename(
            columns={
                "id": "status_id",
                "tweet": "full_text",
                "created_at": "created_at",  # needs to be datetime
                "nlikes": "favorite_count",
                "nretweets": "retweet_count",
                "user_id_str": "user_id",
                "username": "user_name",
                "name": "user_screen_name",
            }
        )

        # Plain dict rows (rather than ``df.apply(..., axis=1)``) so that a
        # zero-row frame yields empty columns instead of failing on assignment.
        records = df.to_dict("records")

        neo4j_df["user_location"] = None
        neo4j_df["tweet_type_twint"] = [
            self._row_to_tweet_type(row) for row in records
        ]
        neo4j_df["hashtags"] = df["hashtags"].apply(
            lambda tags: [{"text": tag} for tag in tags]
        )
        neo4j_df["user_followers_count"] = None
        neo4j_df["user_friends_count"] = None
        neo4j_df["user_created_at"] = None
        neo4j_df["user_profile_image_url"] = None
        neo4j_df["in_reply_to_status_id"] = None
        # One fresh empty list per row; a bare ``[]`` is a length-0 column and
        # is rejected for any non-empty frame.  TODO: extract real mentions.
        neo4j_df["user_mentions"] = [[] for _ in range(len(neo4j_df))]
        # neo4j_df['retweet_id'] is suspiciously empty (always)

        neo4j_df["quoted_status_id"] = [
            self._row_to_quoted_status_id(row) for row in records
        ]

        # Build the extractor once instead of once per row.
        extractor = URLExtract()
        neo4j_df["urls"] = df["tweet"].apply(
            lambda text: list(extractor.gen_urls(text))
        )

        neo4j_df["ingest_method"] = 'twint'

        return neo4j_df

    def to_arrow(self, tweets_df):
        """Placeholder: Arrow conversion is not implemented; returns ``None``."""
        pass

0 commit comments

Comments
 (0)