Amazon(亚马逊)作为全球最大的电商平台之一,其商品评论数据蕴含着丰富的用户反馈和市场洞察。通过亚马逊商品评论接口(item_review),开发者可以获取商品的用户评价、评分、评论时间等关键信息,为竞品分析、产品优化、市场策略制定提供数据支持。
一、Amazon item_review 接口核心特性分析
1. 接口定位与核心价值
2. 接口权限与调用限制
3. 核心参数解析
必选参数
可选参数
二、签名生成与返回数据结构
1. 签名生成逻辑
2. 返回数据结构解析
三、Python 实现方案
import requests
import time
import hmac
import hashlib
import base64
import urllib.parse
import logging
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime
from collections import defaultdict
from typing import Dict, List, Optional, Tuple
import re
import xml.etree.ElementTree as ET
from textblob import TextBlob # 用于情感分析
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud
# Make sure the NLTK stop-word corpus is available; it is only downloaded
# when the lookup fails (typically on the first run).
try:
    stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    nltk.download('punkt')

# Logging configuration.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)

# Use CJK-capable fonts so the Chinese chart labels render, and keep the
# minus sign displayable with those fonts.
plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"]
plt.rcParams["axes.unicode_minus"] = False
class AmazonItemReview:
    """Client for an Amazon ``item_review``-style XML endpoint plus review analytics.

    The class signs and sends review requests, parses the XML response into
    plain dictionaries, and offers helpers for rating / sentiment / keyword
    analysis, chart generation and Excel export.
    """

    # Locale code -> API host name.
    ENDPOINTS = {
        'us': 'webservices.amazon.com',
        'ca': 'webservices.amazon.ca',
        'uk': 'webservices.amazon.co.uk',
        'de': 'webservices.amazon.de',
        'fr': 'webservices.amazon.fr',
        'jp': 'webservices.amazon.co.jp',
        'it': 'webservices.amazon.it',
        'es': 'webservices.amazon.es',
        'in': 'webservices.amazon.in'
    }

    def __init__(self, aws_access_key: str, aws_secret_key: str, associate_tag: str, locale: str = 'us'):
        """
        Initialise the API client.

        :param aws_access_key: AWS access key id sent with every request
        :param aws_secret_key: AWS secret key used to sign requests
        :param associate_tag: associate tag sent with every request
        :param locale: locale code selecting the endpoint (default ``'us'``);
            unknown codes fall back to the US endpoint
        """
        self.aws_access_key = aws_access_key
        self.aws_secret_key = aws_secret_key
        self.associate_tag = associate_tag
        self.locale = locale

        locale_key = str(locale).lower()
        if locale_key not in self.ENDPOINTS:
            logging.warning("Unknown locale %r, falling back to the US endpoint", locale)
        self.endpoint = self.ENDPOINTS.get(locale_key, 'webservices.amazon.com')
        self.base_url = f'https://{self.endpoint}/onca/xml'

        # Rate limiting: at most one request per ``request_interval`` seconds.
        self.last_request_time = 0
        self.request_interval = 1

        # English stop words used by the keyword extraction helpers.
        self.stop_words = set(stopwords.words('english'))

    @staticmethod
    def _rfc3986(value) -> str:
        """Percent-encode ``value`` leaving only RFC 3986 unreserved characters."""
        return urllib.parse.quote(str(value), safe='-_.~')

    def _generate_signature(self, params: Dict) -> str:
        """
        Compute the HMAC-SHA256 request signature for ``params``.

        The canonical query string is built from the parameters sorted by
        name, with names and values percent-encoded per RFC 3986 (space is
        ``%20``, not ``+``).

        :param params: request parameters, without the ``Signature`` entry
        :return: the Base64 signature, NOT URL-encoded -- the HTTP layer
            encodes it once when it builds the query string; encoding it here
            as well would double-encode ``+``, ``/`` and ``=``
        """
        canonical_query = '&'.join(
            f"{self._rfc3986(key)}={self._rfc3986(value)}"
            for key, value in sorted(params.items())
        )
        string_to_sign = f"GET\n{self.endpoint}\n/onca/xml\n{canonical_query}"
        digest = hmac.new(
            self.aws_secret_key.encode('utf-8'),
            string_to_sign.encode('utf-8'),
            hashlib.sha256
        ).digest()
        return base64.b64encode(digest).decode('ascii')

    def _check_rate_limit(self) -> None:
        """Sleep as needed so consecutive requests are ``request_interval`` seconds apart."""
        # A monotonic clock keeps the spacing correct across wall-clock jumps.
        elapsed = time.monotonic() - self.last_request_time
        if elapsed < self.request_interval:
            time.sleep(self.request_interval - elapsed)
        self.last_request_time = time.monotonic()

    @staticmethod
    def _to_int(text, default: int = 0) -> int:
        """Convert element text to ``int``; return ``default`` when missing or malformed."""
        try:
            return int(float(text))
        except (TypeError, ValueError, OverflowError):
            return default

    @staticmethod
    def _to_float(text, default: float = 0) -> float:
        """Convert element text to ``float``; return ``default`` when missing or malformed."""
        try:
            return float(text)
        except (TypeError, ValueError):
            return default

    @staticmethod
    def _strip_namespaces(root) -> None:
        """Remove ``{namespace}`` prefixes from every tag below ``root`` (in place).

        NOTE(review): the service is assumed to possibly declare a default XML
        namespace; without this step the plain-tag lookups would match nothing.
        Documents without a namespace are left unchanged.
        """
        for element in root.iter():
            if isinstance(element.tag, str) and '}' in element.tag:
                element.tag = element.tag.split('}', 1)[1]

    def _parse_response(self, content, page: int) -> Optional[Dict]:
        """
        Turn the XML payload of one review page into plain dictionaries.

        :param content: raw XML (bytes or str)
        :param page: page number that was requested (echoed in ``pagination``)
        :return: ``{'item': ..., 'reviews': [...], 'pagination': ...}`` or
            ``None`` when the payload contains an ``Error`` element
        :raises xml.etree.ElementTree.ParseError: if ``content`` is not valid XML
        """
        root = ET.fromstring(content)
        self._strip_namespaces(root)

        if root.find('.//Error') is not None:
            error_code = root.findtext('.//ErrorCode')
            error_msg = root.findtext('.//ErrorMessage')
            logging.error(f"API错误: {error_code} - {error_msg}")
            return None

        # ``findtext`` is used throughout: testing an Element for truth checks
        # its child count, so leaf elements carrying text would look "missing".
        item_info = {
            'asin': root.findtext('.//ItemId'),
            'title': root.findtext('.//Title'),
            'total_reviews': self._to_int(root.findtext('.//TotalReviews')),
            'average_rating': self._to_float(root.findtext('.//AverageRating'))
        }

        reviews = []
        for node in root.findall('.//Review'):
            entry = {
                'review_id': node.findtext('ReviewID'),
                # A missing or malformed rating is recorded as 0.
                'rating': self._to_int(node.findtext('Rating')),
                'title': node.findtext('Title'),
                'content': node.findtext('Content'),
                'reviewer_name': node.findtext('ReviewerName'),
                'review_date': node.findtext('ReviewDate'),
                'helpful_votes': self._to_int(node.findtext('HelpfulVotes')),
                'total_votes': self._to_int(node.findtext('TotalVotes')),
                'reviewer_location': node.findtext('ReviewerLocation')
            }
            # Optional seller reply attached to the review.
            reply = node.find('Response')
            if reply is not None:
                entry['response'] = {
                    'content': reply.findtext('Content'),
                    'date': reply.findtext('Date')
                }
            reviews.append(entry)

        pagination = {
            'total_pages': max(1, self._to_int(root.findtext('.//TotalReviewPages'), 1)),
            'current_page': page
        }
        return {
            'item': item_info,
            'reviews': reviews,
            'pagination': pagination
        }

    def get_reviews(self, item_id: str, page: int = 1, sort: str = 'Recent',
                    filter_by_star: str = 'AllStars') -> Optional[Dict]:
        """
        Fetch one page of reviews for a product.

        :param item_id: product id (ASIN)
        :param page: page number
        :param sort: sort order, ``'Recent'`` or ``'Helpful'``
        :param filter_by_star: star filter
        :return: dict with ``item``, ``reviews`` and ``pagination`` keys, or
            ``None`` on a request failure, an API error or unparsable XML
        """
        # Throttle first so the signed timestamp is as fresh as possible.
        self._check_rate_limit()

        params = {
            'AWSAccessKeyId': self.aws_access_key,
            'AssociateTag': self.associate_tag,
            'ItemId': item_id,
            'Operation': 'ItemReviews',
            'ResponseGroup': 'Reviews',
            'Service': 'AWSECommerceService',
            'Timestamp': time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime()),
            'Version': '2013-08-01',
            'ReviewPage': page,
            'Sort': sort,
            'FilterByStar': filter_by_star
        }
        params['Signature'] = self._generate_signature(params)

        try:
            response = requests.get(self.base_url, params=params, timeout=15)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            logging.error(f"请求异常: {str(e)}")
            return None

        try:
            return self._parse_response(response.content, page)
        except ET.ParseError:
            logging.error(f"XML解析失败: {response.content[:200]}...")
            return None

    def get_multiple_pages(self, item_id: str, max_pages: int = 5, sort: str = 'Recent',
                           filter_by_star: str = 'AllStars') -> Tuple[List[Dict], Optional[Dict]]:
        """
        Fetch up to ``max_pages`` pages of reviews.

        Fetching stops early when a page fails or the last page is reached.

        :param item_id: product id (ASIN)
        :param max_pages: maximum number of pages to request
        :param sort: sort order
        :param filter_by_star: star filter
        :return: ``(reviews, item_info)``; ``item_info`` is ``None`` when not
            even the first page could be fetched
        """
        all_reviews = []
        item_info = None
        for page in range(1, max_pages + 1):
            logging.info(f"获取第 {page} 页评论")
            result = self.get_reviews(item_id, page, sort, filter_by_star)
            if not result:
                break
            # Product information is taken from the first successful page.
            if not item_info:
                item_info = result['item']
            all_reviews.extend(result['reviews'])
            if page >= result['pagination']['total_pages']:
                break
        logging.info(f"共获取 {len(all_reviews)} 条评论")
        return all_reviews, item_info

    def analyze_reviews(self, reviews: List[Dict]) -> Dict:
        """
        Compute summary statistics for a list of review dictionaries.

        :param reviews: review dicts as produced by :meth:`get_reviews`
        :return: dict with rating distribution, sentiment, keywords, content
            length statistics, helpfulness statistics and seller response
            rate; ``{"error": ...}`` when ``reviews`` is empty
        """
        if not reviews:
            return {"error": "没有评论数据可分析"}

        # 1. Rating distribution.
        rating_counts = defaultdict(int)
        for review in reviews:
            rating_counts[review['rating']] += 1

        # 2. Sentiment and 3. keywords.
        sentiment_results = self._analyze_sentiment(reviews)
        keywords = self._extract_keywords(reviews)

        # 4. Content length statistics (reviews without text are ignored).
        content_lengths = [len(review['content']) for review in reviews if review['content']]
        length_stats = {}
        if content_lengths:
            length_stats = {
                'avg': round(sum(content_lengths) / len(content_lengths), 1),
                'min': min(content_lengths),
                'max': max(content_lengths),
                'median': np.median(content_lengths)
            }

        # 5. Helpfulness: only reviews that received at least one vote count.
        voted = [
            (r['review_id'], r['helpful_votes'], r['total_votes'])
            for r in reviews if r['total_votes'] > 0
        ]
        helpful_stats = {}
        if voted:
            ratios = [helpful / total for _, helpful, total in voted]
            helpful_stats = {
                'avg_ratio': round(sum(ratios) / len(ratios), 2),
                'top3_reviews': sorted(voted, key=lambda x: x[1] / x[2], reverse=True)[:3]
            }

        # 6. Share of reviews that carry a seller response.
        response_count = sum(1 for review in reviews if review.get('response'))
        response_rate = response_count / len(reviews)

        return {
            'total_reviews': len(reviews),
            'rating_analysis': {
                'distribution': dict(rating_counts),
                'avg_rating': sum(k * v for k, v in rating_counts.items()) / len(reviews)
            },
            'sentiment_analysis': sentiment_results,
            'keywords': keywords,
            'content_length': length_stats,
            'helpful_analysis': helpful_stats,
            'response_rate': response_rate
        }

    def _analyze_sentiment(self, reviews: List[Dict]) -> Dict:
        """
        Classify each review as positive / negative / neutral with TextBlob.

        Polarity lies in [-1, 1]; values above 0.1 count as positive, below
        -0.1 as negative, anything else as neutral.

        :param reviews: review dicts with ``content`` and ``rating`` keys
        :return: ratios per class, mean polarity, and mean polarity per rating
        """
        sentiment_scores = []
        rating_sentiment = defaultdict(list)
        positive_count = negative_count = neutral_count = 0

        # One TextBlob pass per review feeds both the overall and the
        # per-rating statistics.
        for review in reviews:
            polarity = TextBlob(review['content'] or '').sentiment.polarity
            sentiment_scores.append(polarity)
            rating_sentiment[review['rating']].append(polarity)
            if polarity > 0.1:
                positive_count += 1
            elif polarity < -0.1:
                negative_count += 1
            else:
                neutral_count += 1

        total = len(sentiment_scores)
        rating_sentiment_avg = {
            rating: sum(scores) / len(scores) if scores else 0
            for rating, scores in rating_sentiment.items()
        }
        return {
            'total_analyzed': total,
            'positive_ratio': positive_count / total if total else 0,
            'negative_ratio': negative_count / total if total else 0,
            'neutral_ratio': neutral_count / total if total else 0,
            'avg_polarity': sum(sentiment_scores) / total if total else 0,
            'rating_sentiment_avg': rating_sentiment_avg
        }

    def _tokenize(self, text: str) -> List[str]:
        """Lower-case ``text``, drop punctuation, stop words and words of 1-2 characters."""
        cleaned = re.sub(r'[^\w\s]', ' ', text.lower())
        return [word for word in cleaned.split()
                if word not in self.stop_words and len(word) > 2]

    @staticmethod
    def _word_frequencies(words: List[str]) -> List[Tuple[str, int]]:
        """Return ``(word, count)`` pairs, most frequent first (ties keep first-seen order)."""
        freq = defaultdict(int)
        for word in words:
            freq[word] += 1
        return sorted(freq.items(), key=lambda x: x[1], reverse=True)

    def _extract_keywords(self, reviews: List[Dict], top_n: int = 20) -> Dict:
        """
        Extract the most frequent words overall and per star rating.

        :param reviews: review dicts with ``content`` and ``rating`` keys
        :param top_n: number of overall keywords to return
        :return: ``{'overall': [(word, count), ...], 'by_rating': {rating: [...]}}``;
            ``by_rating`` holds up to 10 keywords for each rating 1-5 that has text
        """
        all_content = ' '.join(review['content'] for review in reviews if review['content'])
        overall = self._word_frequencies(self._tokenize(all_content))

        rating_keywords = {}
        for rating in [1, 2, 3, 4, 5]:
            rating_content = ' '.join(
                review['content'] for review in reviews
                if review['content'] and review['rating'] == rating
            )
            if rating_content:
                rating_keywords[rating] = self._word_frequencies(
                    self._tokenize(rating_content)
                )[:10]

        return {
            'overall': overall[:top_n],
            'by_rating': rating_keywords
        }

    def get_negative_feedback(self, reviews: List[Dict], top_n: int = 5) -> List[Dict]:
        """
        Summarise the main complaints found in negative reviews.

        A review is negative when it has text and either a rating of at most
        2 stars or a negative TextBlob polarity.

        :param reviews: review dicts as produced by :meth:`get_reviews`
        :param top_n: number of keywords to report
        :return: list of ``{'keyword', 'count', 'examples'}`` dicts with up to
            two example reviews per keyword
        """
        negative_reviews = []
        for review in reviews:
            content = review['content'] or ''
            if not content:
                continue
            # Sentiment analysis is only needed when the rating alone does
            # not already mark the review as negative.
            if review['rating'] <= 2 or TextBlob(content).sentiment.polarity < 0:
                negative_reviews.append(review)
        if not negative_reviews:
            return []

        negative_content = ' '.join(review['content'] for review in negative_reviews)
        top_keywords = self._word_frequencies(self._tokenize(negative_content))[:top_n]

        # Examples are searched in the first 50 negative reviews only. Matching
        # is done on whole tokens so a keyword is not "found" inside another word.
        candidates = [
            (review, set(self._tokenize(review['content'])))
            for review in negative_reviews[:50]
        ]
        result = []
        for keyword, count in top_keywords:
            examples = []
            for review, tokens in candidates:
                if len(examples) >= 2:
                    break
                if keyword in tokens:
                    examples.append({
                        'review_id': review['review_id'],
                        'content': review['content'][:100] + '...',
                        'rating': review['rating']
                    })
            result.append({
                'keyword': keyword,
                'count': count,
                'examples': examples
            })
        return result

    def visualize_analysis(self, analysis: Dict, item_info: Dict, output_dir: str = ".") -> None:
        """
        Render the analysis as PNG charts inside ``output_dir``.

        Writes ``rating_distribution.png``, ``sentiment_distribution.png`` and
        ``keyword_cloud.png`` for whichever sections are present in ``analysis``.

        :param analysis: result of :meth:`analyze_reviews`
        :param item_info: product information (its ``title`` labels the charts)
        :param output_dir: target directory, created when missing
        """
        import os  # local import: not part of the file-level import block

        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        raw_title = (item_info or {}).get('title') or ''
        item_title = raw_title[:30] + '...' if len(raw_title) > 30 else raw_title

        # 1. Rating distribution bar chart.
        if 'rating_analysis' in analysis and analysis['rating_analysis']['distribution']:
            distribution = analysis['rating_analysis']['distribution']
            ratings = sorted(distribution.keys())
            counts = [distribution[r] for r in ratings]
            plt.figure(figsize=(10, 6))
            plt.bar(ratings, counts, color='skyblue')
            plt.title(f'{item_title} - 评分分布 (平均: {analysis["rating_analysis"]["avg_rating"]:.1f})')
            plt.xlabel('星级')
            plt.ylabel('评论数量')
            plt.xticks(ratings)
            # Place each label above its own bar (x = the rating itself), which
            # stays correct when some star levels are absent.
            for rating, count in zip(ratings, counts):
                plt.text(rating, count + 0.5, str(count), ha='center')
            plt.tight_layout()
            plt.savefig(f"{output_dir}/rating_distribution.png")
            plt.close()
            logging.info(f"评分分布图表已保存至 {output_dir}/rating_distribution.png")

        # 2. Sentiment pie chart (skipped when there is nothing to draw).
        if 'sentiment_analysis' in analysis:
            sentiment = analysis['sentiment_analysis']
            labels = ['正面', '负面', '中性']
            sizes = [
                sentiment['positive_ratio'],
                sentiment['negative_ratio'],
                sentiment['neutral_ratio']
            ]
            colors = ['#4CAF50', '#F44336', '#9E9E9E']
            if sum(sizes) > 0:
                plt.figure(figsize=(8, 8))
                plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)
                plt.title(f'{item_title} - 评论情感分布')
                plt.axis('equal')
                plt.tight_layout()
                plt.savefig(f"{output_dir}/sentiment_distribution.png")
                plt.close()
                logging.info(f"情感分布图表已保存至 {output_dir}/sentiment_distribution.png")

        # 3. Keyword cloud (best effort: a failure is logged, not raised).
        if 'keywords' in analysis and analysis['keywords']['overall']:
            try:
                # Feed the real counts so word size reflects frequency.
                frequencies = dict(analysis['keywords']['overall'])
                wc = WordCloud(
                    background_color="white",
                    width=800,
                    height=600,
                    max_words=50
                ).generate_from_frequencies(frequencies)
                plt.figure(figsize=(10, 7))
                plt.imshow(wc, interpolation="bilinear")
                plt.axis("off")
                plt.title(f"{item_title} - 评论关键词云图")
                plt.tight_layout(pad=0)
                plt.savefig(f"{output_dir}/keyword_cloud.png")
                plt.close()
                logging.info(f"关键词云图已保存至 {output_dir}/keyword_cloud.png")
            except Exception as e:
                logging.warning(f"生成关键词云图失败: {e}")

    def export_to_excel(self, reviews: List[Dict], analysis: Dict, item_info: Dict, filename: str) -> None:
        """
        Write product info, raw reviews and analysis tables to an Excel workbook.

        Failures are logged rather than raised.

        :param reviews: review dicts to export
        :param analysis: result of :meth:`analyze_reviews`
        :param item_info: product information (sheet skipped when empty)
        :param filename: path of the workbook to create
        """
        if not reviews and not analysis:
            logging.warning("没有数据可导出")
            return
        try:
            with pd.ExcelWriter(filename) as writer:
                # Product information.
                if item_info:
                    pd.DataFrame([item_info]).to_excel(writer, sheet_name='商品信息', index=False)

                # Raw reviews.
                if reviews:
                    pd.DataFrame(reviews).to_excel(writer, sheet_name='评论数据', index=False)

                if analysis:
                    # Rating distribution.
                    if 'rating_analysis' in analysis:
                        total = analysis['total_reviews']
                        df_rating = pd.DataFrame([
                            {"评分": k, "数量": v, "比例": v / total}
                            for k, v in analysis['rating_analysis']['distribution'].items()
                        ])
                        df_rating.to_excel(writer, sheet_name='评分分析', index=False)

                    # Sentiment ratios.
                    if 'sentiment_analysis' in analysis:
                        sentiment = analysis['sentiment_analysis']
                        df_sentiment = pd.DataFrame([
                            {"类型": "正面", "比例": sentiment['positive_ratio']},
                            {"类型": "负面", "比例": sentiment['negative_ratio']},
                            {"类型": "中性", "比例": sentiment['neutral_ratio']}
                        ])
                        df_sentiment.to_excel(writer, sheet_name='情感分析', index=False)

                    # Keywords.
                    if 'keywords' in analysis and analysis['keywords']['overall']:
                        df_keywords = pd.DataFrame([
                            {"关键词": kw, "出现次数": count}
                            for kw, count in analysis['keywords']['overall']
                        ])
                        df_keywords.to_excel(writer, sheet_name='关键词分析', index=False)
            logging.info(f"数据已导出至 {filename}")
        except Exception as e:
            logging.error(f"导出Excel失败: {e}")
# Example usage
if __name__ == "__main__":
    import os  # local import: only the demo script needs it

    # Credentials are read from the environment when available so real keys
    # never have to be written into the source; the placeholders below are
    # used otherwise and must be replaced with values from the Amazon
    # developer platform.
    AWS_ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY", "your_aws_access_key")
    AWS_SECRET_KEY = os.environ.get("AWS_SECRET_KEY", "your_aws_secret_key")
    ASSOCIATE_TAG = os.environ.get("ASSOCIATE_TAG", "your_associate_tag")
    ITEM_ID = "B07VGRJDFY"  # sample ASIN; any Amazon product ASIN can be used
    LOCALE = "us"  # locale code

    # Initialise the API client.
    amazon_reviews = AmazonItemReview(
        aws_access_key=AWS_ACCESS_KEY,
        aws_secret_key=AWS_SECRET_KEY,
        associate_tag=ASSOCIATE_TAG,
        locale=LOCALE
    )

    # 1. Fetch review data (at most 5 pages).
    print("=== 获取评论数据 ===")
    reviews, item_info = amazon_reviews.get_multiple_pages(
        item_id=ITEM_ID,
        max_pages=5,
        sort='Recent'
    )
    if item_info:
        print(f"商品: {item_info['title']}")
        print(f"ASIN: {item_info['asin']}")
        print(f"总评论数: {item_info['total_reviews']}")
        print(f"平均评分: {item_info['average_rating']}")
        print(f"获取评论数: {len(reviews)}")

    # 2. Analyse the reviews.
    print("\n=== 评论数据分析 ===")
    if reviews:
        analysis = amazon_reviews.analyze_reviews(reviews)
        print(f"评分分布: {analysis['rating_analysis']['distribution']}")
        print(f"情感分析: 正面{analysis['sentiment_analysis']['positive_ratio']:.1%}, "
              f"负面{analysis['sentiment_analysis']['negative_ratio']:.1%}, "
              f"中性{analysis['sentiment_analysis']['neutral_ratio']:.1%}")
        print(f"商家回复率: {analysis['response_rate']:.1%}")
        print("\n热门关键词:")
        for kw, count in analysis["keywords"]["overall"][:10]:
            print(f" {kw}: {count}次")

        # 3. Main complaints found in negative reviews.
        print("\n=== 负面反馈主要问题 ===")
        negative_feedback = amazon_reviews.get_negative_feedback(reviews)
        for i, feedback in enumerate(negative_feedback[:5], 1):
            print(f"{i}. 关键词: {feedback['keyword']} (出现{feedback['count']}次)")
            if feedback["examples"]:
                print(f" 示例: {feedback['examples'][0]['content']}")

        # 4. Render the charts.
        amazon_reviews.visualize_analysis(analysis, item_info)

        # 5. Export everything to Excel.
        amazon_reviews.export_to_excel(reviews, analysis, item_info, "亚马逊商品评论分析.xlsx")
    else:
        print("未获取到评论数据,无法进行分析")