How to scrape a website that requires login first with Python


This works for me:

##################################### Method 1
import mechanize
import cookielib

# Browser
br = mechanize.Browser()

# Cookie Jar
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)

# Browser options
br.set_handle_equiv(True)
br.set_handle_gzip(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

br.addheaders = [('User-agent', 'Chrome')]

# The site we will navigate to, handling its session
br.open('https://github.com/login')

# View available forms
for f in br.forms():
    print(f)

# Select the second (index one) form (the first form is a search query box)
br.select_form(nr=1)

# User credentials
br.form['login'] = 'mylogin'
br.form['password'] = 'mypass'

# Login
br.submit()

print(br.open('https://github.com/settings/emails').read())

You were not far off at all!
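
For comparison, a similar login flow can be written with requests alone; requests.Session keeps cookies across requests much like the cookie jar above. This is only a minimal sketch, not the method above: the field names ('login', 'password', 'authenticity_token') and the form action ('/session') are assumptions read from GitHub's login page and may change.

import requests
from bs4 import BeautifulSoup

session = requests.Session()  # keeps cookies between requests

# Fetch the login page and pull out the hidden CSRF token
# ('authenticity_token' is an assumption about GitHub's form)
login_page = session.get('https://github.com/login')
soup = BeautifulSoup(login_page.text, 'html.parser')
token = soup.find('input', {'name': 'authenticity_token'})['value']

# Post the credentials; '/session' is the form action on GitHub's login page
session.post('https://github.com/session', data={
    'login': 'mylogin',
    'password': 'mypass',
    'authenticity_token': token,
})

print(session.get('https://github.com/settings/emails').text)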

I'd love to add my solution alongside the others. This answer follows the hacky/lazy approach I take with everything I do; I went with it mainly because I was too lazy to handle the cookies, session data, etc.

This solution is most useful if you want to scrape multiple pages of a website after logging in with a single account's credentials (e.g. all your Pinterest boards), not if you want to automate authentication across multiple accounts.

So my solution is Selenium together with Firefox profiles.

  • Create a new Firefox profile: note the location where it is stored, open Firefox with that profile, and log in to the website manually (Mozilla's documentation covers the details of Firefox profiles).
  • Now use Selenium with this profile: the Selenium session will reuse the cookies and session data from the Firefox profile, so your authentication persists.

I devised this mechanism when I needed to scrape a few Pinterest pages. Below are a few lines of code from that sample showing how to use the profile; adapt the code to your needs.

from selenium import webdriver


# Replace with the path to your own Firefox profile
fp = webdriver.FirefoxProfile('C:/Users/SJ/AppData/Roaming/Mozilla/Firefox/Profiles/hlsfrs2o.scrape')
# Enter your URL here
url = ""
driver = webdriver.Firefox(firefox_profile=fp)
driver.get(url)

html_source = driver.page_source
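
Once the profile-based session is authenticated, the same driver can fetch several pages in a row, which is the multi-page use case mentioned above. A minimal sketch; the board URLs below are hypothetical placeholders:

# Hypothetical board URLs; replace with the pages you actually want to scrape
board_urls = [
    'https://www.pinterest.com/example/board-one/',
    'https://www.pinterest.com/example/board-two/',
]

pages = {}
for board in board_urls:
    driver.get(board)
    pages[board] = driver.page_source  # raw HTML, ready for parsing

driver.quit()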

The classic way to approach this problem is:

  1. launch a browser, go to site and search for the login page
  2. inspect the source code of the page to find out:
     I. which one is the login form (a page can have many forms, but usually one of them is the login form)
     II. which are the field names used for username and password (these could vary a lot)
     III. if there are other fields that must be submitted (like an authentication token)
  3. write the Scrapy spider to replicate the form submission using FormRequest

Being fans of automation, we figured we could write some code to automate point 2 (which is actually the most time-consuming part), and the result is loginform, a library that automatically fills in login forms given the login page, username, and password. Here is the code of a simple spider that uses loginform to log in to sites automatically.

githubloginspider.py

from scrapy.spider import BaseSpider
from scrapy.http import FormRequest
from scrapy import log
from loginform import fill_login_form
from items import ArticleItem  # ArticleItem is defined in items.py below


class GitHubLogin(BaseSpider):

    name = 'GitHubLogin'
    allowed_domains = ['github.com']
    start_urls = ['http://github.com/login']
    login_user = 'ranvijay5686'
    login_pass = ''

    def parse(self, response):
        (args, url, method) = fill_login_form(response.url,
                response.body, self.login_user, self.login_pass)
        return FormRequest(url, method=method, formdata=args,
                           callback=self.after_login)

    def after_login(self, response):

        # for link in response.xpath("//*[@id='site-container']/div[2]/div[4]/p/a/@href").extract():

        item = ArticleItem()
        item['title'] = 'ranvijay'
        log.msg('***************    :   '
                + str(response.xpath("//form[@class='subnav-search left']/input/@value"
                ).extract()))
        item['url'] = \
            response.xpath("//*[@id='site-container']/div[1]/div/div/span/span/text()"
                           ).extract()
        yield item
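
Assuming an older Scrapy release (one that still ships scrapy.spider.BaseSpider and scrapy.log) is installed, the spider can be run from the directory containing these files with scrapy runspider githubloginspider.py.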

items.py

from scrapy.item import Item, Field

class ArticleItem(Item):
    title = Field()
    url = Field()

loginform.py

import sys
from argparse import ArgumentParser
from collections import defaultdict
from lxml import html

__version__ = '1.0'  # also update setup.py


def _form_score(form):
    score = 0

    # In case of user/pass or user/pass/remember-me

    if len(form.inputs.keys()) in (2, 3):
        score += 10

    typecount = defaultdict(int)
    for x in form.inputs:
        type_ = x.type if isinstance(x, html.InputElement) else 'other'
        typecount[type_] += 1

    if typecount['text'] > 1:
        score += 10
    if not typecount['text']:
        score -= 10

    if typecount['password'] == 1:
        score += 10
    if not typecount['password']:
        score -= 10

    if typecount['checkbox'] > 1:
        score -= 10
    if typecount['radio']:
        score -= 10

    return score


def _pick_form(forms):
    """Return the form most likely to be a login form"""

    return sorted(forms, key=_form_score, reverse=True)[0]


def _pick_fields(form):
    """Return the most likely field names for username and password"""

    userfield = passfield = emailfield = None
    for x in form.inputs:
        if not isinstance(x, html.InputElement):
            continue

        type_ = x.type
        if type_ == 'password' and passfield is None:
            passfield = x.name
        elif type_ == 'text' and userfield is None:
            userfield = x.name
        elif type_ == 'email' and emailfield is None:
            emailfield = x.name

    return (userfield or emailfield, passfield)


def submit_value(form):
    """Return the value for the submit input, if any"""

    for x in form.inputs:
        if x.type == 'submit' and x.name:
            return [(x.name, x.value)]
    return []


def fill_login_form(url, body, username, password):
    doc = html.document_fromstring(body, base_url=url)
    form = _pick_form(doc.xpath('//form'))
    (userfield, passfield) = _pick_fields(form)
    form.fields[userfield] = username
    form.fields[passfield] = password
    form_values = form.form_values() + submit_value(form)
    return (form_values, form.action or form.base_url, form.method)


def main():
    ap = ArgumentParser()
    ap.add_argument('-u', '--username', default='username')
    ap.add_argument('-p', '--password', default='secret')
    ap.add_argument('url')
    args = ap.parse_args()

    try:
        import requests
    except ImportError:
        print 'requests library is required to use loginform as a tool'
        return 1

    r = requests.get(args.url)
    (values, action, method) = fill_login_form(args.url, r.text,
            args.username, args.password)
    print '''url: {0}
method: {1}
payload:'''.format(action, method)
    for (k, v) in values:
        print '- {0}: {1}'.format(k, v)


if __name__ == '__main__':
    sys.exit(main())