Building a Job Scrapper-2

Building a Job Scrapper-2

Python 공부 5일차

Building a Job Scrapper-2

Extracting Indeed Pages part Two

import requests
from bs4 import BeautifulSoup

# Download the first page of Indeed results for the query "python".
indeed_result = requests.get("https://www.indeed.com/jobs?q=python&limit=50")

indeed_soup = BeautifulSoup(indeed_result.text, "html.parser")

# The page links live inside <div class="pagination">, one <a> per link.
pagination = indeed_soup.find("div", {"class" : "pagination"})

links = pagination.find_all("a")

pages = []
for link in links:
  pages.append(link.find("span").string)
  # pages.append(link.string) gives the same result

# Drop the final link and print only once, after the loop has finished.
# Doing this inside the loop would remove each entry right after it was
# appended and print an empty list on every iteration.
# NOTE(review): the dropped last link is presumably the "Next" button - confirm.
pages = pages[0:-1]
print(pages)

// output(페이지 숫자 전부 가져오기)
['2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20']

Requesting Each Page

# main.py
from indeed import extract_indeed_pages, extract_indeed_jobs

# Highest page number found in the pagination bar of the search results.
last_indeed_pages = extract_indeed_pages()

# Request that many result pages; the function's returned list is kept here.
indeed_jobs = extract_indeed_jobs(last_indeed_pages)

# indeed.py
import requests
from bs4 import BeautifulSoup

# Value of the "limit" query parameter; also multiplied by the page index
# to build the "start" offset of each request.
LIMIT = 50
# Indeed search URL for the query "python" with the limit above applied.
URL = f"https://www.indeed.com/jobs?q=python&limit={LIMIT}"

def extract_indeed_pages():
  """Return the last page number listed in the search results' pagination bar.

  Downloads ``URL``, collects every ``<a>`` inside
  ``<div class="pagination">``, skips the final link and converts the text
  of the remaining links to ``int``. The last converted value is returned.
  """
  response = requests.get(URL)
  document = BeautifulSoup(response.text, "html.parser")
  pagination_bar = document.find("div", {"class" : "pagination"})
  anchors = pagination_bar.find_all("a")

  # The final anchor is left out before the int() conversion
  # (presumably the non-numeric "Next" link - confirm against the page).
  page_numbers = [int(anchor.string) for anchor in anchors[:-1]]
  return page_numbers[-1]

def extract_indeed_jobs(last_page):
  """Request each result page and print the HTTP status code of the response.

  Pages are addressed through the ``start`` query parameter, which advances
  by ``LIMIT`` per page. Nothing is parsed yet, so the returned list is
  always empty.
  """
  jobs = []
  for page in range(last_page):
    page_url = f"{URL}&start={page*LIMIT}"
    response = requests.get(page_url)
    print(response.status_code)
  return jobs

// output
200
200
200
...
각 페이지를 돌면서 값을 가져오는 것을 확인할 수 있다.

Extracting Titles

# indeed.py
import requests
from bs4 import BeautifulSoup

# Value of the "limit" query parameter; also used when computing the
# "start" offset of a request.
LIMIT = 50
# Indeed search URL for the query "python" with the limit above applied.
URL = f"https://www.indeed.com/jobs?q=python&limit={LIMIT}"

def extract_indeed_pages():
  """Return the last page number listed in the search results' pagination bar.

  Downloads ``URL``, collects every ``<a>`` inside
  ``<div class="pagination">``, skips the final link and converts the text
  of the remaining links to ``int``. The last converted value is returned.
  """
  result = requests.get(URL)

  soup = BeautifulSoup(result.text, "html.parser")

  pagination = soup.find("div", {"class" : "pagination"})

  links = pagination.find_all("a")

  pages = []
  # The final link is skipped before the int() conversion
  # (presumably the non-numeric "Next" link - confirm against the page).
  for link in links[:-1]:
    pages.append(int(link.string))

  # Runs once, after the loop: the last collected number is the maximum page.
  max_page = pages[-1]
  return max_page

def extract_indeed_jobs(last_page):
  """Print the title of every job card found on the first results page.

  ``last_page`` is accepted but not used yet: the loop over pages is
  disabled below, so only the page at ``start=0`` is requested. Nothing is
  appended to ``jobs``, so the returned list is always empty at this stage.
  """
  jobs = []
  # Page loop disabled while title extraction is being worked out:
  # for page in range(last_page):
  response = requests.get(f"{URL}&start={0*LIMIT}")

  # Parse the downloaded HTML.
  soup = BeautifulSoup(response.text, "html.parser")

  # Find every <div> whose class is "jobsearch-SerpJobCard".
  cards = soup.find_all("div", {"class" : "jobsearch-SerpJobCard"})

  for card in cards:
    # Inside the card, take the <div class="title">, then its <a>,
    # and read that link's "title" attribute.
    title = card.find("div", {"class" : "title"}).find("a")["title"]

    print(title)
  return jobs

// output
Python Developer
Android/ Mobile Application Developer
Software Engineer
Senior Advanced Analytics Analyst
Software Engineer - Direct Hire

참고

range - 연속된 숫자를 생성해주는 함수(마지막 숫자는 포함되지 않는다). 파라미터 수에 따라 생성하는 값이 다르다.(참고)
range(10)
// output
0, 1, 2, 3, 4, 5, 6, 7, 8, 9

댓글

가장 많이 본 글