建材网(聚焦建筑材料、装饰材料的垂直 B2B 电商平台,如中国建材网、建材在线等)的商品详情数据(如材质规格、工程案例、供应商资质、批量报价等)对工程采购、供应链比价、建材选型等场景具有重要价值。由于这类平台多为垂直领域站点,没有统一的公开 API,开发者需通过页面解析来获取商品详情(item_get)。本文以典型建材网为例,系统讲解接口逻辑、技术实现、建材场景适配及反爬应对,帮助读者构建稳定的建材商品详情获取系统。
一、接口基础认知(核心功能与场景)
二、对接前置准备(环境与 URL 结构)
三、接口调用流程(基于页面解析与动态接口)
四、代码实现示例(Python)
import json
import random
import re
import time
from typing import Dict, List, Optional

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
class JcwItemApi:
    """Scraping client that assembles product details from a building-materials site.

    A product is assembled from two sources: the static detail page
    (``/product/<id>.html``), parsed with BeautifulSoup, and a JSON endpoint
    (``/api/product/detail``) that supplies ladder prices, stock, test
    reports and project cases.

    NOTE(review): the CSS selectors and JSON field names below come from the
    example site this code was written against; confirm them for the real
    target site.
    """

    def __init__(
        self,
        base_domain: str = "https://www.example-jc.com",
        proxy_pool: Optional[List[str]] = None,
        cookie: str = "",
        area: str = "110000",
    ):
        """Store the connection settings.

        :param base_domain: scheme + host of the target site; a trailing
            slash is stripped so that URL joins do not produce ``//``.
        :param proxy_pool: optional list of proxy URLs to choose from at random.
        :param cookie: raw ``Cookie`` header value for a logged-in session
            (some sites only show quotes to logged-in users).
        :param area: region code sent to the API (default 110000, Beijing).
        """
        self.base_domain = base_domain.rstrip("/")
        # Double braces keep "{product_id}" as a placeholder for str.format().
        self.detail_url = f"{self.base_domain}/product/{{product_id}}.html"
        self.api_url = f"{self.base_domain}/api/product/detail"
        self.ua = UserAgent()
        self.proxy_pool = proxy_pool
        self.cookie = cookie
        self.area = area

    def _get_headers(self) -> Dict[str, str]:
        """Build request headers with a random User-Agent.

        The Cookie header is added only when a cookie was configured. The
        region code is not part of the headers; it is sent as a query
        parameter by ``_fetch_api_data``.
        """
        headers = {
            "User-Agent": self.ua.random,
            "Referer": f"{self.base_domain}/",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "X-Requested-With": "XMLHttpRequest",
        }
        if self.cookie:
            headers["Cookie"] = self.cookie
        return headers

    def _get_proxy(self) -> Optional[Dict[str, str]]:
        """Return a ``requests``-style proxies mapping, or None without a pool."""
        if not self.proxy_pool:
            return None
        proxy = random.choice(self.proxy_pool)
        return {"http": proxy, "https": proxy}

    def _clean_price(self, price_str: str) -> float:
        """Extract the first numeric amount from a price string.

        Thousands separators are dropped, then the first number is taken, so
        digits that belong to a unit suffix (e.g. ``/m3``) are not glued onto
        the amount. Returns 0.0 when the string is empty or has no number.
        """
        if not price_str:
            return 0.0
        normalized = price_str.replace(",", "").replace(",", "")
        match = re.search(r"\d+(?:\.\d+)?|\.\d+", normalized)
        return float(match.group()) if match else 0.0

    @staticmethod
    def _select_text(node, selector: str, default=None):
        """Return the stripped text of the first match of ``selector`` under ``node``.

        ``default`` is returned when nothing matches or the text is empty.
        """
        found = node.select_one(selector)
        if found is None:
            return default
        return found.text.strip() or default

    def _parse_param_table(self, soup) -> Dict:
        """Flatten every parameter table into a ``{header: value}`` dict.

        Rows missing a ``th`` or ``td`` (or with empty text) are skipped; when
        the same header occurs more than once, the last value wins.
        """
        params = {}
        # A page may contain several parameter tables.
        for table in soup.select("table.param-table, div.param-table table"):
            for row in table.select("tr"):
                key = self._select_text(row, "th")
                value = self._select_text(row, "td")
                if key and value:
                    params[key] = value
        return params

    def _parse_static_data(self, html: str) -> Dict:
        """Parse the static detail page into a structured dict.

        Missing text nodes become empty strings and missing lists become
        empty lists, so the returned structure always has the same keys.
        """
        soup = BeautifulSoup(html, "lxml")
        text = self._select_text

        def image_sources(selector: str) -> List[str]:
            # Keep only <img> tags that actually carry a src attribute.
            return [img.get("src") for img in soup.select(selector) if img.get("src")]

        # Spec options of a multi-spec product.
        specs = []
        spec_container = soup.select_one("div.spec-options")
        if spec_container:
            spec_values = [
                {
                    "name": option.text.strip(),
                    "spec_id": option.get("data-spec") or "",
                    "selected": "selected" in option.get("class", []),
                }
                for option in spec_container.select(".spec-item")
            ]
            if spec_values:
                specs.append({
                    "name": text(spec_container, ".spec-name", "规格"),
                    "values": spec_values,
                })

        return {
            "title": text(soup, "h1.pro-title", ""),
            "images": {
                "main": image_sources("div.main-img img"),
                "detail": image_sources("div.detail-img img"),
                "cases": image_sources("div.case-img img"),
            },
            "price": {
                "single_str": text(soup, "div.price-box .current-price", ""),
                "tax_str": text(soup, "div.price-box .tax-tag", ""),
            },
            "supplier": {
                "name": text(soup, "div.supplier-info .name", ""),
                "contact": text(soup, "div.supplier-info .contact", ""),
                "qualifications": [q.text.strip() for q in soup.select("div.qualification-tags span")],
            },
            "application": {
                "scenes": text(soup, "div.application-scene", ""),
                "standards": [s.text.strip() for s in soup.select("div.standards span")],
            },
            "transport": {
                "method": text(soup, "div.transport-method", ""),
                "delivery": text(soup, "div.delivery-time", ""),
            },
            "params": self._parse_param_table(soup),
            "specs": specs,
        }

    def _fetch_api_data(
        self,
        product_id: str,
        spec_id: str = "",
        headers: Optional[Dict[str, str]] = None,
        proxy: Optional[Dict[str, str]] = None,
        timeout: int = 10,
    ) -> Dict:
        """Fetch ladder prices, stock, test reports and cases from the JSON API.

        This is best-effort: on any failure the error is printed and the
        empty default structure is returned, so one failing API call does
        not fail the whole product.

        :param product_id: product ID, sent as ``pid``.
        :param spec_id: optional spec ID, sent as ``specId`` when non-empty.
        :param headers: request headers; fresh ones are generated when None.
        :param proxy: ``requests`` proxies mapping, or None for a direct call.
        :param timeout: request timeout in seconds.
        :return: dict with keys ``ladderPrices``, ``stock``, ``tests``, ``cases``.
        """
        # The stock default has the same shape as a successful result, so
        # callers can read stock["total"] without checking for failure first.
        api_data = {
            "ladderPrices": [],
            "stock": {"total": 0, "areaStock": {}},
            "tests": [],
            "cases": [],
        }
        try:
            params = {
                "pid": product_id,
                "area": self.area,
                "t": int(time.time() * 1000),  # millisecond timestamp sent with every call
            }
            if spec_id:
                params["specId"] = spec_id
            response = requests.get(
                self.api_url,
                params=params,
                headers=headers if headers is not None else self._get_headers(),
                proxies=proxy,
                timeout=timeout,
            )
            data = response.json()
            # A payload without a "success" field is treated as successful.
            if not isinstance(data, dict) or not data.get("success", True):
                return api_data
            stock = data.get("stock") or {}
            api_data["ladderPrices"] = data.get("ladderPrices") or []
            api_data["stock"] = {
                "total": stock.get("total", 0),
                "areaStock": stock.get("areaStock") or {},
            }
            api_data["tests"] = data.get("tests") or []
            api_data["cases"] = data.get("cases") or []
        except Exception as e:  # deliberate best-effort: never propagate
            print(f"API数据获取失败: {str(e)}")
        return api_data

    def _merge_multi_specs(
        self,
        static_specs: List[Dict],
        product_id: str,
        headers: Dict[str, str],
        proxy: Dict[str, str],
        timeout: int = 10,
    ) -> List[Dict]:
        """Attach per-spec ladder prices and stock to each spec option.

        Options without a ``spec_id`` are passed through unchanged; every
        other option triggers one API call.
        """
        merged_specs = []
        for spec_group in static_specs:
            merged_values = []
            for spec in spec_group["values"]:
                spec_id = spec["spec_id"]
                if not spec_id:
                    merged_values.append(spec)
                    continue
                spec_data = self._fetch_api_data(product_id, spec_id, headers, proxy, timeout)
                merged_values.append({
                    **spec,
                    "ladderPrices": spec_data["ladderPrices"],
                    "stock": spec_data["stock"],
                })
            merged_specs.append({
                "name": spec_group["name"],
                "values": merged_values,
            })
        return merged_specs

    def item_get(self, product_id: str, timeout: int = 10) -> Dict:
        """Fetch and assemble the full detail record of one product.

        :param product_id: product ID (e.g. ``"123456"``).
        :param timeout: timeout in seconds for each HTTP request.
        :return: ``{"success": True, "data": {...}}`` on success, otherwise
            ``{"success": False, "error_msg": ...}``, with a ``code`` key on
            HTTP and unexpected errors. This method does not raise.
        """
        try:
            # 1. Request the detail page.
            url = self.detail_url.format(product_id=product_id)
            headers = self._get_headers()
            proxy = self._get_proxy()
            # Random delay to reduce the chance of tripping anti-scraping rules.
            time.sleep(random.uniform(2, 4))
            response = requests.get(
                url=url,
                headers=headers,
                proxies=proxy,
                timeout=timeout,
            )
            response.raise_for_status()

            # 2. Parse the static page; a missing title means no such product.
            static_data = self._parse_static_data(response.text)
            if not static_data["title"]:
                return {"success": False, "error_msg": "商品不存在或已下架"}

            # 3. Core API data for the default spec.
            api_data = self._fetch_api_data(product_id, "", headers, proxy, timeout)

            # 4. Per-spec data, only when some group offers more than one option.
            merged_specs = static_data["specs"]
            if any(len(group["values"]) > 1 for group in merged_specs):
                merged_specs = self._merge_multi_specs(
                    merged_specs, product_id, headers, proxy, timeout
                )

            # 5. Combine everything into the final result.
            return {
                "success": True,
                "data": {
                    "product_id": product_id,
                    **static_data,
                    **api_data,
                    "specs": merged_specs,
                    "url": url,
                    "update_time": time.strftime("%Y-%m-%d %H:%M:%S"),
                    "area": self.area,
                },
            }
        except requests.exceptions.HTTPError as e:
            # Use the real status code; matching "403" inside str(e) also
            # hits URLs or product IDs that merely contain those digits.
            status = e.response.status_code if e.response is not None else None
            if status == 403:
                return {"success": False, "error_msg": "触发反爬,建议更换代理或Cookie", "code": 403}
            if status == 401:
                return {"success": False, "error_msg": "需登录查看,请更新Cookie", "code": 401}
            return {"success": False, "error_msg": f"HTTP错误: {str(e)}", "code": status}
        except Exception as e:
            return {"success": False, "error_msg": f"获取失败: {str(e)}", "code": -1}
# Usage example
if __name__ == "__main__":
    # Proxy pool (replace with working proxies)
    PROXIES = [
        "http://123.45.67.89:8888",
        "http://98.76.54.32:8080"
    ]
    # Logged-in session cookie (some sites require login to see full quotes)
    COOKIE = "userid=xxx; sessionId=xxx; area=110000"  # Beijing region
    # Create the API client (using an example site)
    api = JcwItemApi(
        base_domain="https://www.example-jc.com",
        proxy_pool=PROXIES,
        cookie=COOKIE,
        area="110000"  # Beijing region
    )
    # Fetch one product (example product_id)
    product_id = "123456"  # product ID of a C30 concrete item
    result = api.item_get(product_id)
    if result["success"]:
        data = result["data"]
        print(f"商品标题: {data['title']}")
        print(f"基础价格: {data['price']['single_str']} | {data['price']['tax_str']}")
        if data['ladderPrices']:
            print("批量阶梯价:")
            for ladder in data['ladderPrices']:
                print(f"  采购{ladder['quantity']}{ladder['unit']}及以上: ¥{ladder['price']}/{ladder['unit']}")
        # The stock dict may be empty when the API call failed, so read it
        # with defaults instead of indexing directly.
        stock_info = data['stock'] or {}
        total_stock = stock_info.get('total', 0)
        area_stock = stock_info.get('areaStock') or {}
        print(f"库存信息: 总库存{total_stock}吨")
        if area_stock:
            print("区域库存:")
            for region, stock in area_stock.items():
                print(f"  {region}: {stock}吨")
        print(f"核心参数: 强度等级={data['params'].get('强度等级')} | 坍落度={data['params'].get('坍落度')} | 执行标准={data['params'].get('执行标准')}")
        print(f"适用场景: {data['application']['scenes']}")
        print(f"供应商: {data['supplier']['name']} | 资质: {', '.join(data['supplier']['qualifications'])}")
        if data['cases']:
            print("工程案例:")
            # Show at most the first two cases.
            for case in data['cases'][:2]:
                print(f"  {case['name']}: 用量{case['quantity']}吨 ({case['time']})")
        if data['tests']:
            print(f"检测报告: {len(data['tests'])}份(如{data['tests'][0]['name']})")
        print(f"详情页: {data['url']}")
    else:
        print(f"获取失败: {result['error_msg']}(错误码: {result.get('code')})")