Using Selenium to capture images

- 3 mins

Using Selenium to capture images

When scraping a dataset from the web, it is sometimes necessary to capture some visual content as well. In this snippet I present a way to do that.

import base64
import os
import re
import time
from io import BytesIO
from shutil import copyfile
from time import sleep

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm.notebook import tqdm

# --- Configuration ----------------------------------------------------------
# Path of the ChromeDriver executable handed to the driver service below.
# NOTE(review): '.chromedriver' has no path separator -- confirm it is not
# meant to be './chromedriver'.
DRIVER_LOCATION = '.chromedriver'
# CSV file loaded with pandas; its 'url' column lists the pages to capture.
DATASET_PATH = './statista/datast_len_100586_free.csv'
# Slice bound on the URL list; the very large value effectively means "all".
N_STATISTICS_TO_PROCESS = 9999999999
# Directory that receives the PNG captures (and the collapsed.txt log).
OUT_PATH = './statista/captures_all'

# --- Browser setup ----------------------------------------------------------
options = Options()
options.add_argument('--disable-notifications')
options.add_argument("--headless")
# Large fixed window so the chart element fits entirely in the viewport.
options.add_argument("--window-size=3072,1920")
options.add_argument("--disable-infobars")
options.add_argument("--disable-extensions")
options.add_argument("--disable-gpu")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--no-sandbox")

# Selenium 4 takes the driver path through a Service object and the options
# through `options=`; the positional path and the `chrome_options=` keyword
# are deprecated and no longer accepted by recent releases.
browser = webdriver.Chrome(service=Service(DRIVER_LOCATION), options=options)

# Visit every URL in the dataset and save a PNG of its chart element into
# OUT_PATH, skipping URLs whose capture already exists.
dataset = pd.read_csv(DATASET_PATH)

# Create the output directory up front so neither the listing below nor the
# later file writes fail on a fresh run.
os.makedirs(OUT_PATH, exist_ok=True)

# List the existing captures once; calling os.listdir() for every URL rescans
# the whole directory on each iteration.
already_captured = set(os.listdir(OUT_PATH))

try:
    for url in tqdm(dataset['url'].to_list()[:N_STATISTICS_TO_PROCESS]):

        # The fifth '/'-separated piece of the URL names the output file.
        # A missing value (NaN is a float) or a short URL is reported and
        # skipped instead of aborting the whole run.
        try:
            num = url.split('/')[4]
        except (AttributeError, IndexError):
            print(f'skipping malformed url: {url!r}')
            continue

        png_name = f'{num}.png'
        if png_name in already_captured:
            continue

        try:
            # Inside the try so a single failed page load only skips this URL.
            browser.get(url)

            # take care of cookies button (best effort: it is often absent)
            try:
                browser.find_element(By.ID, 'onetrust-accept-btn-handler').click()
                sleep(1)
            except Exception:
                pass

            # take care of "expand statistic button" (best effort); pages
            # where the click succeeded are logged to collapsed.txt
            try:
                browser.find_element(By.XPATH, '//*[@id="statisticContainer"]/div[1]/article/div/div/div[2]/div[2]').click()
                with open(f'{OUT_PATH}/collapsed.txt', 'a') as f:
                    f.write(f'\n{num}')
            except Exception:
                pass

            # based on https://stackoverflow.com/a/55676925
            element = browser.find_element(By.CLASS_NAME, 'highchart-container')

            # center the figure: scroll by the distance between the element's
            # vertical midpoint and the viewport's current vertical midpoint
            desired_y = element.location['y'] + (element.size['height'] / 2)
            window_h = browser.execute_script('return window.innerHeight')
            window_y = browser.execute_script('return window.pageYOffset')
            current_y = (window_h / 2) + window_y
            scroll_y_by = desired_y - current_y
            browser.execute_script("window.scrollBy(0, arguments[0]);", scroll_y_by)

            sleep(1.2)  # this is how long it takes for the figure animations to load
            screenshot_as_bytes = element.screenshot_as_png

            with open(f'{OUT_PATH}/{png_name}', 'wb') as f:
                f.write(screenshot_as_bytes)

            # Keep the snapshot current so a repeated URL is not re-captured.
            already_captured.add(png_name)

        except Exception as ex:
            print(ex)
finally:
    # Always shut the browser down, even on Ctrl-C or an unexpected error,
    # so no Chrome / chromedriver process is left running.
    browser.quit()
comments powered by Disqus
rss facebook twitter github gitlab youtube mail spotify lastfm instagram linkedin google google-plus pinterest medium vimeo stackoverflow reddit quora quora