데이터 공부를 기록하는 공간

[crawling] 보도자료 list up 본문

STUDY/MLOPS

[crawling] 보도자료 list up

BOTTLE6 2022. 3. 14. 22:16
# version_2
import datetime
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
# Run date as a compact YYYYMMDD stamp (local time).
today = f"{datetime.datetime.now():%Y%m%d}"

####################################################################
######### Ministry of Trade, Industry and Energy (MOTIE) ###########
####################################################################
# Scrape the ministry's press-release list page and build df_1 with one row
# per release: title, absolute link, date, and a source label.
산업부 = "https://www.motie.go.kr/motie/ne/presse/press2/bbs/bbsList.do?bbs_cd_n=81"
# The timeout keeps the script from hanging forever on a stalled connection;
# raise_for_status() stops an HTTP error page from being parsed as if it were
# the listing, which would otherwise silently yield an empty table.
res = requests.get(산업부, timeout=30)
res.raise_for_status()
soup = BeautifulSoup(res.content, 'html.parser')

# Title / link: both are read from the same anchor elements, so the two
# lists always have the same length.
titles = soup.select("tr > td.al > div > a")
href_front = "https://www.motie.go.kr/motie/ne/presse/press2/bbs/"
# urljoin resolves a relative href against the board directory exactly as
# plain concatenation would, and additionally handles root-relative or
# already-absolute hrefs correctly.
href = [urljoin(href_front, anchor.get('href', '')) for anchor in titles]
title = [anchor.get('title') for anchor in titles]
# Date: <td> cells that carry neither a class nor a data-device attribute.
# NOTE(review): this relies on BeautifulSoup treating an empty-string
# attribute filter as "attribute absent" - confirm with the installed bs4.
dates = soup.find_all("td", attrs={'class':"", 'data-device':""})
date = [cell.get_text() for cell in dates]
# dataframe
# pd.DataFrame raises ValueError if the three lists differ in length.
df_1 = pd.DataFrame({"title":title, "href":href, "date":date} )
# Label every row with its source ministry.
df_1['구분'] = '산업부'
# Bare expression: shows the frame when this is the last line of a notebook
# cell; it has no effect when the file runs as a script.
df_1
####################################################################
#################### Ministry of Environment #######################
####################################################################
# Scrape the ministry's press-release list page and build df_2 with one row
# per release: title, absolute link, date, and a source label.
환경부 = "https://me.go.kr/home/web/board/list.do?menuId=286&boardMasterId=1&boardCategoryId=39"
# The timeout keeps the script from hanging forever on a stalled connection;
# raise_for_status() stops an HTTP error page from being parsed as if it were
# the listing, which would otherwise silently yield an empty table.
res = requests.get(환경부, timeout=30)
res.raise_for_status()
soup = BeautifulSoup(res.content, 'html.parser')

# Title: the stripped text of every td.al cell.
titles = soup.select("td.al")
title = [cell.get_text().strip() for cell in titles]
# Link: only anchors that are direct children of td.al and have an href.
# NOTE(review): titles come from every td.al cell, links only from cells
# with a direct <a href> child; if a cell has no link the lists differ in
# length and pd.DataFrame below raises ValueError.
hrefs = soup.select("td.al > a[href]")
# urljoin avoids the doubled slash that plain concatenation produces when
# the href is root-relative ("/home/..."), and leaves other hrefs resolved
# against the site root as before.
href = [urljoin("https://me.go.kr/", anchor.get('href')) for anchor in hrefs]
# Date: <td> cells with neither a class nor a span attribute.
dates = soup.find_all("td", attrs={"class":"",'span':""})
# Take indexes 3, 8, 13, ... i.e. the 4th of every 5 matched cells.
# NOTE(review): an earlier comment here said the date is the 3rd of every
# 5 cells, but the code has always taken the 4th; that behaviour is kept -
# confirm against the live page.
date = [cell.get_text().strip() for cell in dates[3::5]]
# dataframe
df_2 = pd.DataFrame({"title":title, "href":href, "date":date})
# Label every row with its source ministry.
df_2['구분'] = '환경부'

# Stack the rows of both ministries and order them by the 'date' column,
# descending. The dates are scraped text, so the ordering is a string
# comparison.
df = pd.concat([df_1, df_2], ignore_index=True).sort_values(by='date', ascending=False)
# The cp949 encoding is kept as-is. errors='replace' writes '?' for any
# character cp949 cannot represent, instead of aborting the whole export
# with UnicodeEncodeError and leaving no file behind.
df.to_csv(f"{today}_보도자료.csv", index=False, encoding='cp949', errors='replace')

#####################################################################
############################ Keywords ###############################
#####################################################################
# Print every collected title that mentions at least one watched keyword.
keywords = ['원자력','탄소중립']
for x in df['title']:
    # A title may be missing (None/NaN) rather than a string; skip it
    # instead of failing on the substring test below.
    if not isinstance(x, str):
        continue
    # any() prints a title once even when it contains several keywords;
    # a nested loop would print it once per matching keyword.
    if any(keyword in x for keyword in keywords):
        print(x)
# Bare expression: shows the combined frame when this is the last line of a
# notebook cell; it has no effect when the file runs as a script.
df

 

<결과 엑셀> 

 

Comments