# -*- coding: utf-8 -*-
'''
Created on 2011. 9. 14.
@author: Taejun Park
'''
import urllib
from datetime import datetime
def getHtml(url):
    """
    Download the page at `url` and return its body.

    Args:
        url: address of the channel guide page to fetch.
    Returns:
        The response body exactly as read from the connection; no decoding
        is applied here.
    """
    sock = urllib.urlopen(url)
    try:
        # read inside try/finally so the connection is released even when
        # the transfer fails half-way
        return sock.read()
    finally:
        sock.close()
def getPrograms(broadcastingChannel):
    """
    Fetch the KBS program guide of one channel and parse it into a list.

    Args:
        broadcastingChannel: "KBS1" or "KBS2". They map to
            "http://www.kbs.co.kr/plan_table/channel/1tv/index.html" and
            "http://www.kbs.co.kr/plan_table/channel/2tv/index.html".
    Returns:
        List of [title, startTime] pairs. title is the stripped slice of the
        fetched HTML (it is not decoded or re-encoded here); startTime is a
        datetime built from the current local time with hour and minute
        replaced by the listed broadcast time. Listed times up to 04:00 are
        treated as belonging to the next calendar day.
    Raises:
        ValueError: if broadcastingChannel is not a supported channel, or if
            a listed time cannot be parsed as HH:MM.

    The parser is tailored to the KBS guide markup. Every program row carries
    a cell with class="gd_list_date" holding the time, followed by a cell
    holding the title. Two layouts occur:

    type1 - the title is wrapped in a link:
        <td height="25" align="center" class="gd_list_date">17:35</td>
        <td height="25"><a href="http://www.kbs.co.kr/1tv/enter/openconcert/index.html" target="_blank">TITLE </a>
    type2 - the program has no web site, so the title is bare text:
        <td height="25" align="center" class="gd_list_date">19:00</td>
        <td height="25">TITLE <img src="http://img.kbs.co.kr/cms/plan_table/channel/images/icon_hd.jpg" width="13" height="11" hspace="2">

    The two are told apart by the character right after the opening <td> of
    the title cell.
    """
    # local import: only the "next day" computation needs timedelta
    from datetime import timedelta

    guideUrls = {
        "KBS1": "http://www.kbs.co.kr/plan_table/channel/1tv/index.html",
        "KBS2": "http://www.kbs.co.kr/plan_table/channel/2tv/index.html",
    }
    # compare by value (dict lookup), not by identity: `is` on strings only
    # works when both happen to be the same interned object
    if broadcastingChannel not in guideUrls:
        raise ValueError("unsupported channel: %r (expected 'KBS1' or 'KBS2')"
                         % (broadcastingChannel,))
    htmlSource = getHtml(guideUrls[broadcastingChannel])

    # every program row has this class attribute, so it marks each program
    markOfProgram = 'class="gd_list_date"'  # 20 chars
    # distance from the mark to the content of the title cell; found
    # empirically on the guide page by the original author
    titleOffset = 60

    programStartingPoint = htmlSource.find(markOfProgram)
    # no mark at all means the page layout changed and the mark needs revising
    if programStartingPoint == -1:
        print('you need to arrange the startingPoint')

    programList = []
    now = datetime.now()
    # adding a timedelta rolls over month and year ends correctly, which
    # replace(day=now.day + 1) does not
    tomorrow = now + timedelta(days=1)

    while programStartingPoint != -1:  # until there is no more program
        # 20 chars of the mark + ">" , then the 5-char time (e.g. 12:30)
        programTime = htmlSource[programStartingPoint + 21:programStartingPoint + 26]

        titleStart = programStartingPoint + titleOffset
        if htmlSource[titleStart:titleStart + 1] == '<':  # type 1
            # title sits between the end of the opening <a ...> and the next tag
            tagEnd = htmlSource.find('>', titleStart)
            if tagEnd == -1:
                # document ends inside the tag; nothing further to parse
                break
            textStart = tagEnd + 1
        else:  # type 2
            textStart = titleStart
        textEnd = htmlSource.find('<', textStart)
        if textEnd == -1:
            # no following tag: the title runs to the end of the document
            textEnd = len(htmlSource)
        title = htmlSource[textStart:textEnd]

        programTimeFormatted = datetime.strptime(programTime, "%H:%M")
        # zero-padded HH:MM strings order the same way as the times they spell,
        # so a plain string comparison picks out the after-midnight programs
        broadcastDay = tomorrow if programTime < '04:01' else now
        programList.append([
            title.strip(),  # remove surrounding whitespace from the title
            broadcastDay.replace(hour=programTimeFormatted.hour,
                                 minute=programTimeFormatted.minute),
        ])

        # continue with the next program row after the title just consumed
        programStartingPoint = htmlSource.find(markOfProgram, textEnd)
    return programList
이 프로그램은 그나마 주석을 적당히 달았다고 생각하는데요, getHtml 함수가 html 파일을 받아오고, getPrograms 함수가 파싱작업을 거쳐 프로그램들의 리스트를 리턴하게 됩니다.
특별히 복잡한 부분은 없고, kbs의 편성표 페이지를 보고 이런 저런 조건을 만들었습니다. 어떤 프로그램은 링크가 달려 있고 어떤 프로그램은 그렇지 않아 그에 맞춰 작성했고, 방송시간은 datetime 클래스로 저장하였습니다. 편성표에서 4시 이하의 시간은 다음날 방송으로 처리하도록 했습니다.
programList.append([title.strip(), now.replace(day=now.day+1, hour=programTimeFormatted.hour, minute=programTimeFormatted.minute)])
이 부분의 day=now.day+1을 day=(now+timedelta(days=1)).day로 정정했습니다. 앞의 코드는 월말에 정상 동작하지 않네요.