Building a Job Scrapper-3

Building a Job Scrapper-3

Python 공부 6일차

Building a Job Scrapper-3

Extracting Companies

# Fetch the first results page and print the company name of every job card.
result = requests.get(f"{URL}&start={0*LIMIT}")
soup = BeautifulSoup(result.text, "html.parser")
results = soup.find_all("div", {"class": "jobsearch-SerpJobCard"})
for result in results:
  title = result.find("div", {"class": "title"}).find("a")["title"]
  company = result.find("span", {"class": "company"})
  company_anchor = company.find("a")
  # The company name is rendered either as <span>...</span> or as
  # <span><a>...</a></span>, so read the text from the anchor when one exists.
  if company_anchor is not None:
    company = str(company_anchor.string)
  else:
    company = str(company.string)
  # `company` is a plain str at this point, so surrounding whitespace can be
  # stripped (calling strip() on the Tag itself would fail).
  company = company.strip()
  print(company)
# output
Piper Companies
Amazon Data Services, Inc.
inficare technologies
Capgemini
...

Extracting Locations and Finishing up

#indeed.py
def extract_job(html):
  """Build a job record from a single search-result card.

  Args:
    html: The card element (the code only relies on ``find(name, attrs)``
      and ``[...]`` attribute access, as offered by a BeautifulSoup Tag).

  Returns:
    A dict with the keys ``'title'``, ``'company'``, ``'location'`` and
    ``'link'``. ``'company'`` is ``None`` when the card has no company
    element.
  """
  title = html.find("div", {"class": "title"}).find("a")["title"]

  company_tag = html.find("span", {"class": "company"})
  if company_tag is None:
    # Some cards have no company element at all.
    company = None
  else:
    # The name is either directly inside the <span> or wrapped in an <a>;
    # read it from the anchor when one exists.
    company_anchor = company_tag.find("a")
    name_source = company_anchor if company_anchor is not None else company_tag
    # Convert to a plain str first, then drop the surrounding whitespace.
    company = str(name_source.string).strip()

  # The location is stored in the data-rc-loc attribute of the recJobLoc div.
  location = html.find("div", {"class": "recJobLoc"})["data-rc-loc"]
  job_id = html["data-jk"]

  return {
    'title': title,
    'company': company,
    'location': location,
    'link': f"https://www.indeed.com/viewjob?jk={job_id}"
  }

def extract_indeed_jobs(last_page):
  """Collect the job records from every results page.

  Args:
    last_page: Number of result pages to fetch; pages 0 .. last_page - 1
      are requested.

  Returns:
    A list with one entry per job card found, each produced by
    ``extract_job``.
  """
  jobs = []
  for page in range(last_page):
    # The start offset must advance with the page number; a constant 0
    # would download the first page over and over.
    result = requests.get(f"{URL}&start={page*LIMIT}")
    soup = BeautifulSoup(result.text, "html.parser")
    results = soup.find_all("div", {"class": "jobsearch-SerpJobCard"})

    # Keep every card on the page, not only the last one. This also copes
    # with a page that contains no cards.
    jobs.extend(extract_job(card) for card in results)
  return jobs

#main.py
from indeed import extract_indeed_pages, extract_indeed_jobs

# Look up how many result pages there are, scrape each of them, and print
# the collected jobs.
indeed_jobs = extract_indeed_jobs(extract_indeed_pages())
print(indeed_jobs)

find_all vs. find

find_all - 찾은 모든 결과를 가져옴
find - 찾은 결과의 첫 번째를 가져옴

참고

댓글

가장 많이 본 글