更新:Python:获取全国旅客列车车次及其始发终点站(更新)


最近在尝试爬取12306的种种信息,就从爬全国火车车次下手了,用HttpWatch抓包后发现可能比想象中的简单,这里记录一下。

HttpWatch抓包:

选择出发站、目的站和出发日期后(我这里选的是2015年5月30日宜昌东到上海),点击查询实际上是发送了一个GET请求:

https://kyfw.12306.cn/otn/leftTicket/query?leftTicketDTO.train_date=2015-05-30&leftTicketDTO.from_station=HAN&leftTicketDTO.to_station=SHH&purpose_codes=ADULT

想必是在给leftTicketDTO传参,包括train_date即出发日期、from_station即出发站、to_station即目的站,后面的purpose_codes即区分成人、学生之类的。

这个GET请求返回的值很有意思,直接就是JSON格式的列车信息:

{
    "validateMessagesShowId":"_validatorMessage",
    "status":true,
    "httpstatus":200,
    "data":[
        {
            "queryLeftNewDTO":{
                "train_no":"42000D300806",
                "station_train_code":"D3008",
                "start_station_telecode":"HAN",
                "start_station_name":"宜昌东",
                "end_station_telecode":"AOH",
                "end_station_name":"上海虹桥",
                "from_station_telecode":"HAN",
                "from_station_name":"宜昌东",
                "to_station_telecode":"AOH",
                "to_station_name":"上海虹桥",
                "start_time":"06:45",
                "arrive_time":"15:00",
                "day_difference":"0",
                "train_class_name":"动车",
                "lishi":"08:15",
                "canWebBuy":"Y",
                "lishiValue":"495",
                "yp_info":"O034750930O034753210M041600036",
                "control_train_day":"20301231",
                "start_train_date":"20150530",
                "seat_feature":"O3M3W3",
                "yp_ex":"O0O0M0",
                "train_seat_feature":"3",
                "seat_types":"OOM",
                "location_code":"N2",
                "from_station_no":"01",
                "to_station_no":"20",
                "control_day":59,
                "sale_time":"1430",
                "is_support_card":"0",
                "gg_num":"--",
                "gr_num":"--",
                "qt_num":"--",
                "rw_num":"--",
                "rz_num":"--",
                "tz_num":"--",
                "wz_num":"有",
                "yb_num":"--",
                "yw_num":"--",
                "yz_num":"--",
                "ze_num":"有",
                "zy_num":"有",
                "swz_num":"--"
            },
            "secretStr":"MjAxNS0wNS0zMCMwMCNEMzAwOCMwODoxNSMwNjo0NSM0MjAwMEQzMDA4MDYjSEFOI0FPSCMxNTowMCPlrpzmmIzkuJwj5LiK5rW36Jm55qGlIzAxIzIwI08wMzQ3NTA5MzBPMDM0NzUzMjEwTTA0MTYwMDAzNiNOMiMxNDMyNTIxMTkwNDA5IzE0Mjc4Njk4MDAwMDAjQkZCNENGOTlDRkQ3Mzg1MjBEQ0JGQTQ4QjVBMDZGNEU1MzI4Nzk4QzVBQzA1RTVFM0QyREFBQjg%3D",
            "buttonTextInfo":"预订"
        },
        {
            "queryLeftNewDTO":{
                "train_no":"77000D221400",
                "station_train_code":"D2214",
                "start_station_telecode":"CUW",
                "start_station_name":"重庆北",
                "end_station_telecode":"AOH",
                "end_station_name":"上海虹桥",
                "from_station_telecode":"HAN",
                "from_station_name":"宜昌东",
                "to_station_telecode":"AOH",
                "to_station_name":"上海虹桥",
                "start_time":"11:46",
                "arrive_time":"20:14",
                "day_difference":"0",
                "train_class_name":"动车",
                "lishi":"08:28",
                "canWebBuy":"Y",
                "lishiValue":"508",
                "yp_info":"O034750086M041600021O034753000",
                "control_train_day":"20301231",
                "start_train_date":"20150530",
                "seat_feature":"O3M3W3",
                "yp_ex":"O0M0O0",
                "train_seat_feature":"3",
                "seat_types":"OMO",
                "location_code":"W1",
                "from_station_no":"05",
                "to_station_no":"21",
                "control_day":59,
                "sale_time":"1430",
                "is_support_card":"0",
                "gg_num":"--",
                "gr_num":"--",
                "qt_num":"--",
                "rw_num":"--",
                "rz_num":"--",
                "tz_num":"--",
                "wz_num":"无",
                "yb_num":"--",
                "yw_num":"--",
                "yz_num":"--",
                "ze_num":"有",
                "zy_num":"有",
                "swz_num":"--"
            },
            "secretStr":"MjAxNS0wNS0zMCMwMCNEMjIxNCMwODoyOCMxMTo0NiM3NzAwMEQyMjE0MDAjSEFOI0FPSCMyMDoxNCPlrpzmmIzkuJwj5LiK5rW36Jm55qGlIzA1IzIxI08wMzQ3NTAwODZNMDQxNjAwMDIxTzAzNDc1MzAwMCNXMSMxNDMyNTIxMTkwNDA5IzE0Mjc4Njk4MDAwMDAjMjI3NTg5NDdGMTI3NkJCRDcxNEQwRTRDQUEwODkxOTMxREIzNTgxNERCRjRCOTQ1OENBQjQyMDQ%3D",
            "buttonTextInfo":"预订"
        },
        {
            "queryLeftNewDTO":{
                "train_no":"77000D221803",
                "station_train_code":"D2218",
                "start_station_telecode":"CUW",
                "start_station_name":"重庆北",
                "end_station_telecode":"AOH",
                "end_station_name":"上海虹桥",
                "from_station_telecode":"HAN",
                "from_station_name":"宜昌东",
                "to_station_telecode":"AOH",
                "to_station_name":"上海虹桥",
                "start_time":"13:07",
                "arrive_time":"21:22",
                "day_difference":"0",
                "train_class_name":"动车",
                "lishi":"08:15",
                "canWebBuy":"Y",
                "lishiValue":"495",
                "yp_info":"O034750024M041600064O034753000",
                "control_train_day":"20301231",
                "start_train_date":"20150530",
                "seat_feature":"O3M3W3",
                "yp_ex":"O0M0O0",
                "train_seat_feature":"3",
                "seat_types":"OMO",
                "location_code":"W1",
                "from_station_no":"06",
                "to_station_no":"23",
                "control_day":59,
                "sale_time":"1430",
                "is_support_card":"0",
                "gg_num":"--",
                "gr_num":"--",
                "qt_num":"--",
                "rw_num":"--",
                "rz_num":"--",
                "tz_num":"--",
                "wz_num":"无",
                "yb_num":"--",
                "yw_num":"--",
                "yz_num":"--",
                "ze_num":"有",
                "zy_num":"有",
                "swz_num":"--"
            },
            "secretStr":"MjAxNS0wNS0zMCMwMCNEMjIxOCMwODoxNSMxMzowNyM3NzAwMEQyMjE4MDMjSEFOI0FPSCMyMToyMiPlrpzmmIzkuJwj5LiK5rW36Jm55qGlIzA2IzIzI08wMzQ3NTAwMjRNMDQxNjAwMDY0TzAzNDc1MzAwMCNXMSMxNDMyNTIxMTkwNDA5IzE0Mjc4Njk4MDAwMDAjNzk2QUE0OENBREQyMjA4NDNDNTkwQUJDRjA3QjZEMTEyQkY2RjA4MjAwRUM5Q0IzNjc2NDExRTM%3D",
            "buttonTextInfo":"预订"
        },
        {
            "queryLeftNewDTO":{
                "train_no":"42000D307405",
                "station_train_code":"D3074",
                "start_station_telecode":"HAN",
                "start_station_name":"宜昌东",
                "end_station_telecode":"AOH",
                "end_station_name":"上海虹桥",
                "from_station_telecode":"HAN",
                "from_station_name":"宜昌东",
                "to_station_telecode":"AOH",
                "to_station_name":"上海虹桥",
                "start_time":"13:33",
                "arrive_time":"21:40",
                "day_difference":"0",
                "train_class_name":"动车",
                "lishi":"08:07",
                "canWebBuy":"Y",
                "lishiValue":"487",
                "yp_info":"O034750947M041600167O034753180",
                "control_train_day":"20301231",
                "start_train_date":"20150530",
                "seat_feature":"O3M3W3",
                "yp_ex":"O0M0O0",
                "train_seat_feature":"3",
                "seat_types":"OMO",
                "location_code":"N3",
                "from_station_no":"01",
                "to_station_no":"17",
                "control_day":59,
                "sale_time":"1430",
                "is_support_card":"0",
                "gg_num":"--",
                "gr_num":"--",
                "qt_num":"--",
                "rw_num":"--",
                "rz_num":"--",
                "tz_num":"--",
                "wz_num":"有",
                "yb_num":"--",
                "yw_num":"--",
                "yz_num":"--",
                "ze_num":"有",
                "zy_num":"有",
                "swz_num":"--"
            },
            "secretStr":"MjAxNS0wNS0zMCMwMCNEMzA3NCMwODowNyMxMzozMyM0MjAwMEQzMDc0MDUjSEFOI0FPSCMyMTo0MCPlrpzmmIzkuJwj5LiK5rW36Jm55qGlIzAxIzE3I08wMzQ3NTA5NDdNMDQxNjAwMTY3TzAzNDc1MzE4MCNOMyMxNDMyNTIxMTkwNDA5IzE0Mjc4Njk4MDAwMDAjODMxRTA5ODZEMjlCMTYxNTM5QkZERENEMkMyOTNFNUYxQkU3QTc0QzFEQjlBODJCRjAyNUZGRTI%3D",
            "buttonTextInfo":"预订"
        },
        {
            "queryLeftNewDTO":{
                "train_no":"420000K25405",
                "station_train_code":"K254",
                "start_station_telecode":"HAN",
                "start_station_name":"宜昌东",
                "end_station_telecode":"SNH",
                "end_station_name":"上海南",
                "from_station_telecode":"HAN",
                "from_station_name":"宜昌东",
                "to_station_telecode":"SNH",
                "to_station_name":"上海南",
                "start_time":"13:33",
                "arrive_time":"13:14",
                "day_difference":"1",
                "train_class_name":"",
                "lishi":"23:41",
                "canWebBuy":"Y",
                "lishiValue":"1421",
                "yp_info":"1020603192405560001610206003253035300261",
                "control_train_day":"20301231",
                "start_train_date":"20150530",
                "seat_feature":"W3431333",
                "yp_ex":"10401030",
                "train_seat_feature":"3",
                "seat_types":"1413",
                "location_code":"N2",
                "from_station_no":"01",
                "to_station_no":"21",
                "control_day":59,
                "sale_time":"1430",
                "is_support_card":"0",
                "gg_num":"--",
                "gr_num":"--",
                "qt_num":"--",
                "rw_num":"16",
                "rz_num":"--",
                "tz_num":"--",
                "wz_num":"有",
                "yb_num":"--",
                "yw_num":"有",
                "yz_num":"有",
                "ze_num":"--",
                "zy_num":"--",
                "swz_num":"--"
            },
            "secretStr":"MjAxNS0wNS0zMCMwMCNLMjU0IzIzOjQxIzEzOjMzIzQyMDAwMEsyNTQwNSNIQU4jU05IIzEzOjE0I%2BWunOaYjOS4nCPkuIrmtbfljZcjMDEjMjEjMTAyMDYwMzE5MjQwNTU2MDAwMTYxMDIwNjAwMzI1MzAzNTMwMDI2MSNOMiMxNDMyNTIxMTkwNDA5IzE0Mjc4Njk4MDAwMDAjNDQ2RTM1NDkzNzFBRENERDBDMzgwRTkxNjZDODE5ODU5MEY3NDUzQkU5QzM4OEVCNjRERDg2RkQ%3D",
            "buttonTextInfo":"预订"
        },
        {
            "queryLeftNewDTO":{
                "train_no":"760000D63801",
                "station_train_code":"D638",
                "start_station_telecode":"ICW",
                "start_station_name":"成都东",
                "end_station_telecode":"AOH",
                "end_station_name":"上海虹桥",
                "from_station_telecode":"HAN",
                "from_station_name":"宜昌东",
                "to_station_telecode":"AOH",
                "to_station_name":"上海虹桥",
                "start_time":"14:20",
                "arrive_time":"22:35",
                "day_difference":"0",
                "train_class_name":"动车",
                "lishi":"08:15",
                "canWebBuy":"Y",
                "lishiValue":"495",
                "yp_info":"O034750071M041600000O034753000",
                "control_train_day":"20301231",
                "start_train_date":"20150530",
                "seat_feature":"O3M3W3",
                "yp_ex":"O0M0O0",
                "train_seat_feature":"3",
                "seat_types":"OMO",
                "location_code":"W2",
                "from_station_no":"09",
                "to_station_no":"25",
                "control_day":59,
                "sale_time":"1430",
                "is_support_card":"0",
                "gg_num":"--",
                "gr_num":"--",
                "qt_num":"--",
                "rw_num":"--",
                "rz_num":"--",
                "tz_num":"--",
                "wz_num":"无",
                "yb_num":"--",
                "yw_num":"--",
                "yz_num":"--",
                "ze_num":"有",
                "zy_num":"无",
                "swz_num":"--"
            },
            "secretStr":"MjAxNS0wNS0zMCMwMCNENjM4IzA4OjE1IzE0OjIwIzc2MDAwMEQ2MzgwMSNIQU4jQU9IIzIyOjM1I%2BWunOaYjOS4nCPkuIrmtbfombnmoaUjMDkjMjUjTzAzNDc1MDA3MU0wNDE2MDAwMDBPMDM0NzUzMDAwI1cyIzE0MzI1MjExOTA0MDkjMTQyNzg2OTgwMDAwMCMwMEM3QTE4MDhGRDYxOTc1QkRFRDcyQjUwNDRCODgzRENEQkZFODg1NDcyN0NEOTI2OUYwMThDNQ%3D%3D",
            "buttonTextInfo":"预订"
        },
        {
            "queryLeftNewDTO":{
                "train_no":"76000D220803",
                "station_train_code":"D2208",
                "start_station_telecode":"ICW",
                "start_station_name":"成都东",
                "end_station_telecode":"AOH",
                "end_station_name":"上海虹桥",
                "from_station_telecode":"HAN",
                "from_station_name":"宜昌东",
                "to_station_telecode":"AOH",
                "to_station_name":"上海虹桥",
                "start_time":"15:00",
                "arrive_time":"23:15",
                "day_difference":"0",
                "train_class_name":"动车",
                "lishi":"08:15",
                "canWebBuy":"Y",
                "lishiValue":"495",
                "yp_info":"O034750092M041600051O034753000",
                "control_train_day":"20301231",
                "start_train_date":"20150530",
                "seat_feature":"O3M3W3",
                "yp_ex":"O0M0O0",
                "train_seat_feature":"3",
                "seat_types":"OMO",
                "location_code":"W2",
                "from_station_no":"09",
                "to_station_no":"24",
                "control_day":59,
                "sale_time":"1430",
                "is_support_card":"0",
                "gg_num":"--",
                "gr_num":"--",
                "qt_num":"--",
                "rw_num":"--",
                "rz_num":"--",
                "tz_num":"--",
                "wz_num":"无",
                "yb_num":"--",
                "yw_num":"--",
                "yz_num":"--",
                "ze_num":"有",
                "zy_num":"有",
                "swz_num":"--"
            },
            "secretStr":"MjAxNS0wNS0zMCMwMCNEMjIwOCMwODoxNSMxNTowMCM3NjAwMEQyMjA4MDMjSEFOI0FPSCMyMzoxNSPlrpzmmIzkuJwj5LiK5rW36Jm55qGlIzA5IzI0I08wMzQ3NTAwOTJNMDQxNjAwMDUxTzAzNDc1MzAwMCNXMiMxNDMyNTIxMTkwNDEwIzE0Mjc4Njk4MDAwMDAjM0Q3N0Q3RTA3OEM3Nzc1RkQwNTE1MUU0OEE0NzM2RDA0REM0RTE4MjEwMUREMjQyNkFDRTU0MTQ%3D",
            "buttonTextInfo":"预订"
        }
    ],
    "messages":[

    ],
    "validateMessages":{

    }
}

提取需要的信息:

上面的内容很容易懂,但我这里只需要"train_no"、"start_station_telecode"和"end_station_telecode"这三个字段,直接用正则表达式就能搞定:

'"train_no":"(.*?)".*?"start_station_telecode":"([A-Z]*)".*?"end_station_telecode":"([A-Z]*)"'

得到的结果:

('42000D300806', 'HAN', 'AOH')
('77000D221400', 'CUW', 'AOH')
('77000D221803', 'CUW', 'AOH')
('42000D307405', 'HAN', 'AOH')
('420000K25405', 'HAN', 'SNH')
('760000D63801', 'ICW', 'AOH')
('76000D220803', 'ICW', 'AOH')

这样思路就清晰了,通过穷举出发站和目的站发送GET请求,可以得到全国列车车次及其始发站、终点站。


获取火车站代号:

先回过头来,注意到上面那个GET请求里,出发站和目的站都是代号,所以为了能够遍历站点,首先要得到所有火车站的代号。其实这些信息就在12306网站的一个JavaScript文件里,URL如下:

https://kyfw.12306.cn/otn/resources/js/framework/station_name.js

每个火车站名字代号等通过|分隔,火车站之间通过@分隔,自然还是用正则表达式:

'@([a-z]*)\|(.*?)\|([A-Z]*)\|([a-z]*)\|([a-z]*)\|([0-9]*)'

得到的部分结果如下:

bjb 北京北 VAP beijingbei bjb 0
bjd 北京东 BOP beijingdong bjd 1
bji 北京 BJP beijing bj 2
bjn 北京南 VNP beijingnan bjn 3
bjx 北京西 BXP beijingxi bjx 4
gzn 广州南 IZQ guangzhounan gzn 5
cqb 重庆北 CUW chongqingbei cqb 6
cqi 重庆 CQW chongqing cq 7
cqn 重庆南 CRW chongqingnan cqn 8
gzd 广州东 GGQ guangzhoudong gzd 9
sha 上海 SHH shanghai sh 10
shn 上海南 SNH shanghainan shn 11
shq 上海虹桥 AOH shanghaihongqiao shhq 12
shx 上海西 SXH shanghaixi shx 13
tjb 天津北 TBP tianjinbei tjb 14
tji 天津 TJP tianjin tj 15
tjn 天津南 TIP tianjinnan tjn 16
tjx 天津西 TXP tianjinxi tjx 17
cch 长春 CCT changchun cc 18

在EXCEL导入文本文档,用空格作分隔符就很清楚啦!
然后在EXCEL里把火车站代号复制出来到文本文档就行了,观察上面的代号可以发现实际上火车站分了三级,第一级是枢纽火车站,一共72个;第二级是重要火车站,一共451个;剩下的就是一般的啦。

穷举出发站目的站,把得到的列车信息存到MySQL

把存储的火车站代号加载到数组:

# Load the station telecodes (one per line) into a list.  The with-block
# closes the file as soon as it has been read, and blank lines are skipped
# so that no empty station code ends up in a query.
with open('stationcode.txt', 'r') as f:
    station = [line.strip() for line in f if line.strip()]

连接数据库:

import MySQLdb

# Connection parameters for the local MySQL server (password masked).
db_params = {
    'host': 'localhost',
    'port': 3306,
    'user': 'root',
    'passwd': '*****',
    'db': '12306',
}
conn = MySQLdb.connect(**db_params)
# Cursor through which the SQL statements are executed.
cur = conn.cursor()

禁止SSL证书验证:

因为12306 SSL证书问题,需要取消验证SSL证书:

import ssl

# Turn off HTTPS certificate verification for the whole process because of
# the 12306 certificate problem described above.
# NOTE: this affects every HTTPS request made through the default context
# and removes protection against man-in-the-middle attacks.
try:
    _unverified_context = ssl._create_unverified_context
except AttributeError:
    # Older Python builds (before 2.7.9, see PEP 476) have no such function
    # and do not verify certificates by default, so there is nothing to patch.
    pass
else:
    ssl._create_default_https_context = _unverified_context

穷举:

date = '2015-05-30'
# Query URL with placeholders for date, origin code and destination code.
url_template = ('https://kyfw.12306.cn/otn/leftTicket/query?'
                'leftTicketDTO.train_date=%s'
                '&leftTicketDTO.from_station=%s'
                '&leftTicketDTO.to_station=%s'
                '&purpose_codes=ADULT')
for i in range(start, end):
    for j in range(i, len(station)):
        # Show the index pair currently being queried.
        print("%d %d" % (i, j))
        # Pause between requests.
        time.sleep(0.3)
        r = urllib.urlopen(url_template % (date, station[i], station[j]))

对每次GET得到的JSON数据用正则表达式提取:

# Capture (train_no, start_station_telecode, end_station_telecode) for every
# train entry found in the raw response text.
train_pattern = ('"train_no":"(.*?)".*?"'
                 'start_station_telecode":"([A-Z]*)".*?"'
                 'end_station_telecode":"([A-Z]*)"')
f = re.findall(train_pattern, r.read())

将列车车次、始发站、终点站存到数据库:

# One parameterized INSERT per extracted tuple; IGNORE makes MySQL skip rows
# that would duplicate an existing key instead of raising an error.
insert_sql = "insert ignore into traininfo values(%s,%s,%s)"
for train in f:
    cur.execute(insert_sql, train)
    # Commit right away so the row is persisted.
    conn.commit()

几个问题:

  1. 还没有加入HTTP错误处理,程序可能随时因请求出错而中断,所以每插入一条记录就调用一次conn.commit()来保存已抓取的数据,但这样会造成MySQL进程占用很多内存资源
  2. 因为12306会封禁频繁发送GET请求的IP,所以如果不采用多IP多进程抓取,按每0.3秒一次的频率GET,会花很多时间
  3. 代码中用到的正则表达式都是我自己临时写的,想必效果不太好

完整代码:

获取火车站代号

# -*- coding: utf-8 -*-
# 用来格式化12306存储的火车站信息,包括火车站名、火车站代号等信息
# 火车站信息来源于https://kyfw.12306.cn/otn/resources/js/framework/station_name.js
# 此js文件存储于外部文件station.txt中
# 格式化后格式如:bjb 北京北 VAP beijingbei bjb 0
import re

# Read the saved copy of station_name.js; the with-block closes the file.
with open('station.txt', 'r') as f:
    text = f.read()

# One record per station: "@" followed by six "|"-separated fields
# (lowercase letters, station name, uppercase telecode, lowercase letters,
# lowercase letters, number).  A raw string keeps the "\|" escapes intact
# for the regex engine.
station_pattern = r'@([a-z]*)\|(.*?)\|([A-Z]*)\|([a-z]*)\|([a-z]*)\|([0-9]*)'
for fields in re.findall(station_pattern, text):
    # Print the six fields of one station on a line, separated by spaces.
    print(' '.join(fields))

穷举并存入MySQL

# -*- coding: utf-8 -*-
# 通过穷举出发站、目的站向12306发送GET请求,将得到的列车信息存储至MySQL数据库
import urllib
import re
import ssl
import MySQLdb
import time

# Because of the 12306 certificate problem, disable SSL verification.
# NOTE: this affects every HTTPS request made through the default context
# and removes protection against man-in-the-middle attacks.
try:
    _unverified_context = ssl._create_unverified_context
except AttributeError:
    # Older Python builds (before 2.7.9, see PEP 476) have no such function
    # and do not verify certificates by default, so there is nothing to patch.
    pass
else:
    ssl._create_default_https_context = _unverified_context


def get(station, start, end, date='2015-05-30'):
    """Query 12306 for station pairs and store the trains found in MySQL.

    For every ``i`` in ``range(start, end)`` and every ``j`` in
    ``range(i, len(station))`` a left-ticket query from ``station[i]`` to
    ``station[j]`` is sent, and each (train_no, start_station_telecode,
    end_station_telecode) tuple found in the response is inserted into the
    ``traininfo`` table.

    Uses the module-level ``cur`` and ``conn`` objects, which must exist
    before this function is called.

    Args:
        station: list of station telecodes.
        start: first origin index (inclusive).
        end: last origin index (exclusive).
        date: departure date as 'YYYY-MM-DD'.  Per the original author's
            note it must be a date within the next few days.
    """
    url_template = ('https://kyfw.12306.cn/otn/leftTicket/query?'
                    'leftTicketDTO.train_date=%s'
                    '&leftTicketDTO.from_station=%s'
                    '&leftTicketDTO.to_station=%s'
                    '&purpose_codes=ADULT')
    # Compiled once, outside the loops, instead of on every request.
    train_pattern = re.compile('"train_no":"(.*?)".*?"'
                               'start_station_telecode":"([A-Z]*)".*?"'
                               'end_station_telecode":"([A-Z]*)"')
    # Table columns are "trainno", "startstation", "endstation".  IGNORE
    # lets rows with an already-present primary key be skipped.
    insert_sql = "insert ignore into traininfo values(%s,%s,%s)"

    for i in range(start, end):
        # Can be seen as the upper-right half of the station matrix.
        for j in range(i, len(station)):
            print("%d %d" % (i, j))  # progress indicator
            # 12306 bans IPs that submit GET requests too quickly, so pause
            # 0.3 seconds before each request.
            time.sleep(0.3)
            # Submit the GET request.
            r = urllib.urlopen(url_template % (date, station[i], station[j]))
            try:
                body = r.read()
            finally:
                # Close the response so connections do not pile up over the
                # many requests of a full run.
                r.close()
            # Extract the tuples from the JSON text and store each of them.
            for train in train_pattern.findall(body):
                cur.execute(insert_sql, train)
                conn.commit()  # the insert only takes effect after commit()


if __name__ == '__main__':
    # Load the station telecodes (one per line) into a list.  The with-block
    # closes the file, and blank lines are skipped so that no empty station
    # code ends up in a query.
    with open('stationcode.txt', 'r') as f:
        station = [line.strip() for line in f if line.strip()]
    # Connect to the database.  `conn` and `cur` stay module-level names
    # because get() reads them as globals.
    conn = MySQLdb.connect(
        host='localhost', user='root', passwd='******', db='12306', port=3306)
    cur = conn.cursor()

    try:
        get(station, 0, len(station))
    finally:
        # Release the cursor and the connection even if get() raises.
        cur.close()
        conn.close()
    print("done")