Skip to content
Merged
Changes from 1 commit
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Add files via upload
  • Loading branch information
webcoderz authored Jul 1, 2020
commit 25c1419b4946b2da1c934e77e00428068dbe77fc
125 changes: 125 additions & 0 deletions Twint.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
import pyarrow as pa
import twint
from urlextract import URLExtract
from datetime import datetime, timedelta

import logging
# Module-scoped logger. Using a named logger (instead of the root logger)
# lets applications filter or silence this module's debug output on its
# own; records still propagate to whatever handlers the root logger has.
logger = logging.getLogger(__name__)

class TwintPool:
    """Run twint searches through one shared ``twint.Config``.

    Every search method mutates ``self.config`` (search term, username,
    time window, ...) and those settings persist between calls, so an
    instance should be driven by one caller at a time.
    """

    def __init__(self, fh_job=None, job_name='noname'):
        """Build the default twint configuration.

        Args:
            fh_job: Job handle stored on ``self.fh``; no method of this
                class reads it.
            job_name: Label stored on ``self.job_name``; no method of this
                class reads it.
        """
        self.fh = fh_job
        self.job_name = job_name
        self.config = twint.Config()
        self.config.Limit = 100000
        self.config.Pandas = True
        self.config.User_full = True
        self.config.Hide_output = True

    def twint_loop(self, since, until, stride_sec=600, limit=None):
        """Search consecutive time windows between ``since`` and ``until``.

        For each window ``self.config.Since`` / ``self.config.Until`` are
        set and ``twint.run.Search`` is executed; the resulting
        ``twint.storage.panda.Tweets_df`` is yielded when it has rows.

        Args:
            since: Range start as a ``'%Y-%m-%d %H:%M:%S'`` string.
            until: Range end in the same format.
            stride_sec: Window length in seconds; must be positive.
            limit: Stop opening new windows once at least this many tweets
                have been collected. ``None`` means no cap.

        Yields:
            ``(tweets_df, t0, t1)`` for each window with at least one row,
            where ``t0`` / ``t1`` are the window bounds as ``datetime``.

        Raises:
            ValueError: If ``stride_sec`` is not positive (the loop could
                never reach ``until``), or if a time string does not match
                the expected format.
        """
        if stride_sec <= 0:
            raise ValueError(
                'stride_sec must be positive, got %r' % (stride_sec,))

        time_format = '%Y-%m-%d %H:%M:%S'
        t = datetime.strptime(since, time_format)
        until = datetime.strptime(until, time_format)
        stride = timedelta(seconds=stride_sec)
        tweets_returned = 0

        # `limit is None` must be checked first: comparing an int with None
        # raises TypeError as soon as the first tweets have been counted.
        while t < until and (
                limit is None
                or not tweets_returned
                or tweets_returned < limit):
            t0 = t
            # Clamp the final window so the search never runs past `until`.
            t1 = min(t + stride, until)
            self.config.Since = str(t0)
            self.config.Until = str(t1)
            logger.debug('Search step: %s-%s', t0, t1)
            twint.run.Search(self.config)
            tweets_df = twint.storage.panda.Tweets_df
            hits = len(tweets_df)
            tweets_returned += hits
            if hits > 0:
                logger.debug('Search hit, len %s', hits)
                yield (tweets_df, t0, t1)
            else:
                logger.debug('not hits on %s - %s, continuing', t0, t1)
            t = t1
        logger.debug('twint_loop done, hits: %s', tweets_returned)

    def _get_term(self, Search="IngSoc", Since="1984-04-20 13:00:00",
                  Until="1984-04-20 13:30:00", stride_sec=600, **kwargs):
        """Search a term over a time range, one window at a time.

        Args:
            Search: Search term assigned to ``self.config.Search``.
            Since: Range start as a ``'%Y-%m-%d %H:%M:%S'`` string.
            Until: Range end in the same format.
            stride_sec: Window length in seconds.
            **kwargs: Extra attributes set verbatim on ``self.config``
                (applied after ``Search`` and ``Retweets``, so they can
                override both, as well as ``Limit``).

        Yields:
            The ``(tweets_df, t0, t1)`` tuples produced by ``twint_loop``,
            capped by ``self.config.Limit``.
        """
        self.config.Search = Search
        self.config.Retweets = True
        for key, value in kwargs.items():
            setattr(self.config, key, value)
        logger.debug('Search seq: %s-%s of %s', Search, Since, Until)
        yield from self.twint_loop(
            Since, Until, stride_sec, self.config.Limit)

    def _get_timeline(self, username="lmeyerov"):
        """Search tweets for ``username`` and return the resulting frame.

        Sets ``self.config.Username`` (and ``Retweets``) and runs a single
        ``twint.run.Search``. Any other settings already present on
        ``self.config`` from earlier calls are left in place.

        Returns:
            ``twint.storage.panda.Tweets_df`` as populated by the search.
        """
        self.config.Username = username
        self.config.Retweets = True
        twint.run.Search(self.config)
        tweets_df = twint.storage.panda.Tweets_df
        return tweets_df

    @staticmethod
    def _tweet_type(quote_url, is_retweet, tweet_id, conversation_id):
        """Classify a tweet as QUOTE_RETWEET, RETWEET, TWEET or REPLY."""
        # A non-empty quote URL is what marks a quote tweet.
        if isinstance(quote_url, str) and quote_url:
            return "QUOTE_RETWEET"
        if is_retweet:
            return "RETWEET"
        if tweet_id == conversation_id:
            return "TWEET"
        return "REPLY"

    @staticmethod
    def _quoted_status_id(quote_url):
        """Return the last path segment of ``quote_url``, or None if unset."""
        if isinstance(quote_url, str) and quote_url:
            return quote_url.split('/')[-1]
        return None

    def twint_df_to_neo4j_df(self, df):
        """Convert a twint tweets frame to the column layout used for neo4j.

        Columns are renamed (``id`` -> ``status_id``, ``tweet`` ->
        ``full_text``, ...), derived columns are added, and fields this
        conversion has no data for are filled with ``None``.

        Args:
            df: Tweets frame; must provide the columns ``id``, ``tweet``,
                ``created_at``, ``quote_url``, ``retweet``,
                ``conversation_id`` and ``hashtags``. ``created_at`` is
                treated as epoch milliseconds.

        Returns:
            A new DataFrame; ``df`` itself is not modified.
        """
        neo4j_df = df.rename(columns={
            'id': 'status_id',
            'tweet': 'full_text',
            'created_at': 'created_at',  # converted to datetime below
            'nlikes': 'favorite_count',
            'nretweets': 'retweet_count',
            'user_id_str': 'user_id',
            'username': 'user_name',
            'name': 'user_screen_name',
        })

        # One extractor for the whole frame instead of one per row.
        extractor = URLExtract()

        neo4j_df['user_location'] = None
        # Column-wise iteration instead of DataFrame.apply(axis=1), which
        # returns a DataFrame (not a Series) when the frame is empty.
        neo4j_df['tweet_type_twint'] = [
            self._tweet_type(quote_url, is_retweet, tweet_id, conversation_id)
            for quote_url, is_retweet, tweet_id, conversation_id in zip(
                df['quote_url'], df['retweet'],
                df['id'], df['conversation_id'])
        ]
        neo4j_df['hashtags'] = df['hashtags'].apply(
            lambda tags: [{'text': tag} for tag in tags])
        neo4j_df['user_followers_count'] = None
        neo4j_df['user_friends_count'] = None
        neo4j_df['user_created_at'] = None
        neo4j_df['user_profile_image_url'] = None
        neo4j_df['reply_tweet_id'] = None
        neo4j_df['user_mentions'] = df['tweet'].str.findall(r'@[\w]+')
        # neo4j_df['retweet_id'] is suspiciously empty (always)
        neo4j_df['retweeted_status'] = None

        # Epoch milliseconds -> naive datetime in the local timezone.
        neo4j_df['created_at'] = (neo4j_df['created_at'] / 1000).apply(
            datetime.fromtimestamp)

        neo4j_df['quoted_status_id'] = df['quote_url'].apply(
            self._quoted_status_id)
        # `series != None` is True for every row in pandas; notna() is the
        # actual "has a quoted status" test.
        neo4j_df['is_quote_status'] = neo4j_df['quoted_status_id'].notna()
        neo4j_df['in_reply_to_status_id'] = False
        neo4j_df['urls'] = df['tweet'].apply(
            lambda text: list(extractor.gen_urls(text)))

        return neo4j_df

    def to_arrow(self, tweets_df):
        """Placeholder: not implemented, always returns ``None``."""
        pass