# -*- coding: utf-8 -*-

# This script searches Twitter for 페미니스트 (feminist) OR 페미니즘 (feminism)
# and saves the collected tweets to JSON files.

import time
import datetime
import random
import json

from selenium import webdriver
from bs4 import BeautifulSoup as BS

# 페미니스트 OR 페미니즘 (URL-encoded). This query does not get the hashtag.
url = "https://twitter.com/search?f=tweets&q=%ED%8E%98%EB%AF%B8%EB%8B%88%EC%8A%A4%ED%8A%B8%20OR%20%ED%8E%98%EB%AF%B8%EB%8B%88%EC%A6%98%20"

# Date window read by scrape(); the driver code below rebinds these globals
# before each call.
day = datetime.date(2006, 3, 22)
nextday = datetime.date(2006, 3, 23)

# Accumulates one [id, permalink, screen_name, name, text, timestamp] row
# per scraped tweet, across all scrape() calls.
datalist = []


def _scroll_to_bottom(browser):
    """Scroll until the page height stops growing (no more lazy-loaded tweets).

    Makes two full passes: Twitter sometimes loads more results after a pause
    even when the height looked stable on the first pass.
    """
    previous_height = browser.execute_script("return document.body.scrollHeight;")
    for attempt in range(0, 2):
        browser.execute_script("window.scrollTo(0, 0);")
        time.sleep(random.random())
        while True:
            browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            # Randomized delay: give the lazy loader time to fetch, and look
            # a little less like a bot.
            time.sleep(2 + random.random())
            current_height = browser.execute_script("return document.body.scrollHeight;")
            if previous_height != current_height:
                previous_height = current_height
            else:
                break
        print("I think I reached the bottom. Will try to scroll down "
              + str(2 - attempt) + " more time(s)")
        time.sleep(2 + random.random())


def _extract_tweets(html):
    """Parse a fully-scrolled results page and return one data row per tweet.

    Each row is [tweet id, permalink path, screen name, display name,
    tweet text, local timestamp string].
    """
    # Explicit parser: the default is install-dependent and bs4 warns about it.
    bsObj = BS(html, "html.parser")
    rows = []
    for t in bsObj.find_all('div', {'data-tweet-id': True}):
        currentdata = [
            t['data-tweet-id'],
            t['data-permalink-path'],
            t['data-screen-name'],
            t['data-name'],
            t.find("p", {"class": "tweet-text"}).text,
        ]
        # The tweet time is a unix epoch stored on the timestamp link's child.
        d = t.find("a", {"class": "tweet-timestamp"}).findChild()['data-time']
        currentdata.append(str(datetime.datetime.fromtimestamp(float(d))))
        rows.append(currentdata)
    return rows


# our primitive scraper
def scrape(url):
    """Scrape all tweets matching `url` posted between the global `day`
    (inclusive) and `nextday` (exclusive) and append them to `datalist`.

    Opens a fresh Chrome instance per call and always shuts it down.
    """
    period = "since%3A" + str(day) + "%20until%3A" + str(nextday)
    browser = webdriver.Chrome()
    try:
        browser.get(url + period)
        time.sleep(4 + random.random())
        _scroll_to_bottom(browser)
        print("Yup, that's it for " + str(day) + " till " + str(nextday))
        # Retrieve tweets from the rendered HTML and save them to the list.
        html = browser.page_source.encode('utf8')
        datalist.extend(_extract_tweets(html))
        print("Added " + str(day) + " to list")
    finally:
        # quit(), not close(): close() only closes the window and leaks one
        # chromedriver process per call (~480 calls in this script).
        browser.quit()


# Pass 1: everything until 2011-01-01 in a single window (very few tweets).
day = datetime.date(2006, 3, 22)
nextday = datetime.date(2011, 1, 1)
scrape(url)

# Pass 2: since 2011-01-01 until 2015-02-09, in contiguous 30-day windows.
day = datetime.date(2011, 1, 1)
nextday = datetime.date(2011, 1, 31)
for i in range(0, 49):
    scrape(url)
    day += datetime.timedelta(days=30)
    nextday += datetime.timedelta(days=30)

# encoding='utf-8' is required: ensure_ascii=False writes the Korean tweet
# text raw, which fails on platforms whose default encoding is not UTF-8.
with open('feminist-OR-feminism-tweets-before-2015-02-09.txt', 'w', encoding='utf-8') as f:
    json.dump(datalist, f, ensure_ascii=False)

# Pass 3: since 2015-02-09 until now, day by day (tweet volume is high here).
day = datetime.date(2015, 2, 9)
nextday = datetime.date(2015, 2, 10)
for i in range(0, 430):
    scrape(url)
    day += datetime.timedelta(days=1)
    nextday += datetime.timedelta(days=1)

with open('feminist-OR-feminism-tweets.txt', 'w', encoding='utf-8') as f:
    json.dump(datalist, f, ensure_ascii=False)