Skip to content
Closed
Changes from 1 commit
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
d6dcaff
fixing search username function due to twint deprecation
webcoderz Aug 18, 2020
a80fd10
Merge branch 'master' of https://github.com/TheDataRideAlongs/Project…
webcoderz Aug 18, 2020
1952dec
Merge branch 'master' of https://github.com/TheDataRideAlongs/Project…
webcoderz Dec 26, 2020
dace204
user info, replies, and user timeline inline with twint and bugfix fo…
webcoderz Dec 26, 2020
a21425d
user info, replies, and user timeline inline with twint and bugfix fo…
webcoderz Dec 26, 2020
22703c3
user info, replies, and user timeline inline with twint and bugfix fo…
webcoderz Dec 26, 2020
1b25c1d
user info, replies, and user timeline inline with twint and bugfix fo…
webcoderz Dec 26, 2020
7be53b0
user info, replies, and user timeline inline with twint and bugfix fo…
webcoderz Dec 26, 2020
c15740e
changed user_Created_At twint df inferrence inline with twints user c…
webcoderz Dec 27, 2020
89ed4d6
changed user_Created_At twint df inferrence inline with twints user c…
webcoderz Dec 27, 2020
f6f2923
changed user_Created_At twint df inferrence inline with twints user c…
webcoderz Dec 27, 2020
7ea6ca5
adding quo
webcoderz Dec 28, 2020
53367f3
removed user created at from initial conversion
webcoderz Dec 28, 2020
b967545
Update Neo4jDataAccess.py
webcoderz Dec 28, 2020
604e5a6
enrich_user_tl_and_info function to grab a users timeline and info an…
webcoderz Dec 28, 2020
d57ee6d
enrich_user_tl_and_info function to grab a users timeline and info an…
webcoderz Dec 28, 2020
c5d6096
enrich_user_tl_and_info function to grab a users timeline and info an…
webcoderz Dec 28, 2020
dbfa95e
enrich_user_tl_and_info function to grab a users timeline and info an…
webcoderz Dec 28, 2020
08838d4
enrich_user_tl_and_info function to grab a users timeline and info an…
webcoderz Dec 28, 2020
a39bdc5
enrich_user_tl_and_info function to grab a users timeline and info an…
webcoderz Dec 28, 2020
91eeed0
timestamp to date time conversion for neo
webcoderz Dec 28, 2020
8b1ae3d
timestamp to date time conversion for neo
webcoderz Dec 28, 2020
6b81244
timestamp to date time conversion for neo
webcoderz Dec 28, 2020
cebae1f
timestamp to date time conversion for neo
webcoderz Dec 30, 2020
43c2ac8
timeline writer for enrich_user_tl_and_info functionality.
webcoderz Dec 30, 2020
72843ab
timeline writer for enrich_user_tl_and_info functionality.
webcoderz Dec 30, 2020
f6a89b8
timeline writer for enrich_user_tl_and_info functionality.
webcoderz Dec 31, 2020
f5b70c9
timeline writer for enrich_user_tl_and_info functionality.
webcoderz Jan 3, 2021
21cd16f
adding flag to pull retweets
webcoderz Jan 19, 2021
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
fixing search username function due to twint deprecation
  • Loading branch information
webcoderz committed Aug 18, 2020
commit d6dcaff0da7b6e3880e2d805d9211fdddeb18093
83 changes: 37 additions & 46 deletions modules/TwintPool.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,36 +4,34 @@
from datetime import datetime, timedelta

import logging

logger = logging.getLogger()


class TwintPool:

def __init__(self, fh_job=None, job_name='noname'):
    """Create the pool and its shared twint configuration.

    Args:
        fh_job: Stored unchanged on ``self.fh``; not otherwise used here.
        job_name: Accepted for caller compatibility; this constructor does
            not read it.

    The configuration built here is reused (and mutated) by the query
    methods of this class, so the values below are only the starting point.
    """
    self.fh = fh_job

    config = twint.Config()
    config.Limit = 100
    config.Pandas = True
    config.User_full = True
    config.Hide_output = True
    # The original assigned True and then immediately None; only the final
    # value ever took effect, so the dead store is dropped.
    config.Verified = None
    config.Username = None
    # Route requests through the SOCKS5 proxy at host "tor", port "9050".
    config.Proxy_host = "tor"
    # Fixed: this was written to ``self.self.config``, which raises
    # AttributeError because the instance has no attribute named ``self``.
    config.Proxy_port = "9050"
    config.Proxy_type = "socks5"

    self.config = config


# self.config.User_full = True

def twint_loop(self, since, until, stride_sec=600, limit=None):
def get_unix_time(time_str):
return datetime.strptime(time_str, '%Y-%m-%d %H:%M:%S')

since = get_unix_time(since)
until = get_unix_time(until)
t = since
tweets_returned = 0

while t < until and (not tweets_returned or tweets_returned < limit):
t0 = t
t1 = t+timedelta(seconds=stride_sec)
t1 = t + timedelta(seconds=stride_sec)
self.config.Since = str(t0)
self.config.Until = str(t1)
logger.debug('Search step: %s-%s', t0, t1)
Expand All @@ -46,48 +44,43 @@ def get_unix_time(time_str):
logger.debug('not hits on %s - %s, continuing', t0, t1)
t = t1
logger.debug('twint_loop done, hits: %s', tweets_returned)


def _get_term(self, Search="IngSoc", Since="1984-04-20 13:00:00", Until="1984-04-20 13:30:00", stride_sec=600,
              **kwargs):
    """Search for a term over a time window, yielding results step by step.

    Args:
        Search: Query string assigned to ``self.config.Search``.
        Since: Window start, formatted ``'%Y-%m-%d %H:%M:%S'`` (the format
            ``twint_loop`` parses).
        Until: Window end, same format as ``Since``.
        stride_sec: Step width in seconds, forwarded to ``twint_loop``.
        **kwargs: Extra attributes set verbatim on ``self.config`` before
            searching; they persist on the shared configuration afterwards.

    Yields:
        ``(df, t0, t1)`` tuples exactly as produced by ``self.twint_loop``.
    """
    self.config.Search = Search
    self.config.Retweets = True
    for k, v in kwargs.items():
        setattr(self.config, k, v)
    logger.debug('Search seq: %s-%s of %s', Search, Since, Until)
    # Limit is read after the kwargs are applied, so a caller-supplied
    # ``Limit=...`` override is the one handed to twint_loop.
    for df, t0, t1 in self.twint_loop(Since, Until, stride_sec, self.config.Limit):
        yield (df, t0, t1)

def _get_timeline(self, username="lmeyerov"):
    """Run a twint search for tweets from ``username`` and return the result.

    The shared configuration is updated in place (``Username``, ``Retweets``
    and a ``from:<username>`` search string) before the search runs.

    Args:
        username: Account name used both as ``config.Username`` and in the
            ``from:`` search query.

    Returns:
        ``twint.storage.panda.Tweets_df`` as it stands after the search.
    """
    cfg = self.config
    cfg.Username = username
    cfg.Retweets = True
    cfg.Search = "from:" + username
    twint.run.Search(cfg)
    return twint.storage.panda.Tweets_df



def _get_user_info(self, username):
    """Look up a single user's profile via twint.

    Args:
        username: Account name assigned to ``config.Username`` for the
            duration of the lookup.

    Returns:
        ``twint.storage.panda.User_df`` as it stands after the lookup.
    """
    cfg = self.config
    # The configuration is shared with the search methods. Previously this
    # method left Limit at 1 (and Username set) for good, so every later
    # search on the same pool was throttled to the lookup's limit. Save the
    # prior values and put them back even if the lookup raises.
    saved_username, saved_limit = cfg.Username, cfg.Limit
    cfg.Username = username
    cfg.Limit = 1
    try:
        twint.run.Lookup(cfg)
    finally:
        cfg.Username, cfg.Limit = saved_username, saved_limit
    return twint.storage.panda.User_df



def twint_df_to_neo4j_df(self, df):
neo4j_df = df.rename(columns={
'id': 'status_id',
'tweet': 'full_text',
'created_at': 'created_at', # needs to be datetime
'created_at': 'created_at', # needs to be datetime
'nlikes': 'favorite_count',
'nretweets': 'retweet_count',
'user_id_str': 'user_id',
'username': 'user_name',
'name': 'user_screen_name'
})


})

def row_to_tweet_type(row):
if row['quote_url'] is None or row['quote_url'] == '':
return "QUOTE_RETWEET"
Expand All @@ -98,18 +91,18 @@ def row_to_tweet_type(row):
elif row['id'] != row['conversation_id']:
return "REPLY"
else:
raise('wat')
def row_to_quoted_status_id(row):
if row['quote_url'] and len(row['quote_url']) > 0:
return row['quote_url'].split('/')[-1]
else:
return None
raise ('wat')

# def row_to_quoted_status_id(row):
# if row['quote_url'] and len(row['quote_url']) > 0:
# return row['quote_url'].split('/')[-1]
# else:
# return None

def row_tweet_to_urls(row):
extractor = URLExtract()
return list(extractor.gen_urls(row['tweet']))

neo4j_df['user_location'] = None
neo4j_df['tweet_type_twint'] = df.apply(row_to_tweet_type, axis=1)
neo4j_df['hashtags'] = df['hashtags'].apply(lambda x: [{'text': ht} for ht in x])
Expand All @@ -119,21 +112,19 @@ def row_tweet_to_urls(row):
neo4j_df['user_profile_image_url'] = None
neo4j_df['reply_tweet_id'] = None
neo4j_df['user_mentions'] = df['tweet'].str.findall('@[\w]+')
#neo4j_df['retweet_id'] is suspiciously empty (always)
# neo4j_df['retweet_id'] is suspiciously empty (always)
neo4j_df['retweeted_status'] = None

neo4j_df['created_at'] = (neo4j_df['created_at'] / 1000).apply(lambda n: datetime.fromtimestamp(n))
neo4j_df['quoted_status_id'] = df.apply(row_to_quoted_status_id, axis=1)
neo4j_df['is_quote_status'] = neo4j_df['quoted_status_id'] != None

# neo4j_df['quoted_status_id'] = df.apply(row_to_quoted_status_id, axis=1)
# neo4j_df['is_quote_status'] = neo4j_df['quoted_status_id'] != None
neo4j_df['in_reply_to_status_id'] = False
neo4j_df['urls'] = df.apply(row_tweet_to_urls, axis=1)

return neo4j_df


def to_arrow(self, tweets_df):
    """Placeholder for an Arrow conversion; currently does nothing.

    Args:
        tweets_df: Ignored by the current stub.

    Returns:
        None.
    """
    return None