diff --git a/resumes/management/commands/import_accounting_resumes.py b/resumes/management/commands/import_accounting_resumes.py index 5f810dd..11b9990 100644 --- a/resumes/management/commands/import_accounting_resumes.py +++ b/resumes/management/commands/import_accounting_resumes.py @@ -1,9 +1,10 @@ from datetime import datetime, timedelta - from django.core.management.base import BaseCommand +from pandas._libs.tslibs.timestamps import Timestamp import pandas as pd from resumes.models import ResumeBasic import re +import traceback class Command(BaseCommand): @@ -20,24 +21,24 @@ class Command(BaseCommand): default_source = options['source'] df = pd.read_excel(filepath) - # 中文列名映射(依据新版表格) rename_map = { - '简历ID': 'resume_id', '姓名': 'name', '性别': 'gender', '年龄': 'age', + '姓名': 'name', '性别': 'gender', '年龄': 'age', '手机': 'phone', '婚姻状况': 'marital_status', '身高': 'height', '体重': 'weight', - '学历': 'education', '毕业学校': 'school', '专业': 'major', '工作经验': 'work_years', - '现居住地': 'current_location', '期望职位': 'expected_position', '期望月薪': 'expected_salary', - '工作地点': 'job_location', '到岗时间': 'available_time', '更新时间': 'update_time' + '学历': 'education', '毕业学校': 'school', '工作经验': 'work_years', + '现居住地': 'current_location', '工作地点': 'job_location', '到岗时间': 'available_time', + '更新时间': 'update_time', '最高学历': 'education', '婚姻状态': 'marital_status', + '民族': 'ethnicity', '工作职能': 'job_function', '意向岗位': 'intended_position', + '从事行业': 'industry', '期望薪资': 'expected_salary', '工作性质': 'job_property', + '求职状态': 'job_status', } df.rename(columns={k: v for k, v in rename_map.items() if k in df.columns}, inplace=True) - # 默认字段填充 df['source_id'] = default_source df['crawl_keywords'] = default_keyword - # 时间格式清洗 def parse_update_time(val): if pd.isna(val): - return None + return datetime(2019, 12, 12) val = str(val) now = datetime.now() if "刚刚" in val: @@ -49,52 +50,90 @@ class Command(BaseCommand): days = int(re.search(r'\d+', val).group()) return now - timedelta(days=days) try: - return pd.to_datetime(val) + dt = pd.to_datetime(val) + return dt.to_pydatetime() except Exception: + return datetime(2019, 12, 12) + + df['update_time'] = df['update_time'].apply(parse_update_time) + + def val(v, field=None): + if v is None or pd.isna(v): + if field == 'update_time': + return datetime(2019, 12, 12) return None + if field == 'update_time': + if isinstance(v, Timestamp): + return v.to_pydatetime() + if isinstance(v, str): + try: + return pd.to_datetime(v).to_pydatetime() + except Exception: + return datetime(2019, 12, 12) + if isinstance(v, datetime): + return v + return datetime(2019, 12, 12) + if isinstance(v, Timestamp): + return v.to_pydatetime() + return v - if 'update_time' in df.columns: - df['update_time'] = df['update_time'].apply(parse_update_time) + success_count = 0 + fail_count = 0 + errors = [] - # 清洗身高/体重(复合字段提取) - def extract_height_weight(text): - text = str(text) if text and not pd.isna(text) else '' - h = re.search(r'(\d{2,3})\s*cm', text) - w = re.search(r'(\d{2,3})\s*kg', text) - return { - 'height': int(h.group(1)) if h else None, - 'weight': int(w.group(1)) if w else None - } + for i, row in df.iterrows(): + try: + resume_id = val(row.get('resume_id')) + defaults = { + 'name': val(row.get('name')), + 'gender': val(row.get('gender')), + 'age': val(row.get('age')), + 'phone': val(row.get('phone')), + 'marital_status': val(row.get('marital_status')), + 'height': val(row.get('height')), + 'weight': val(row.get('weight')), + 'education': val(row.get('education')), + 'school': val(row.get('school')), + 'work_years': val(row.get('work_years')), + 'current_location': val(row.get('current_location')), + 'job_location': val(row.get('job_location')), + 'available_time': val(row.get('available_time')), + 'update_time': val(row['update_time'], field='update_time'), + 'ethnicity': val(row.get('ethnicity')), + 'job_function': val(row.get('job_function')), + 'intended_position': val(row.get('intended_position')), + 'industry': val(row.get('industry')), + 'expected_salary': val(row.get('expected_salary')), + 'job_property': val(row.get('job_property')), + 'job_status': val(row.get('job_status')), + 'source_id': val(row.get('source_id')), + 'crawl_keywords': val(row.get('crawl_keywords')), + } - for idx, row in df.iterrows(): - text = ' '.join([str(v) for k, v in row.items() if k not in ['height', 'weight']]) - parsed = extract_height_weight(text) - for key in ['height', 'weight']: - val = row.get(key) - try: - if pd.isna(val) or str(val).strip().lower() in ['nan', 'none', 'null', '']: - df.at[idx, key] = parsed[key] - except: - df.at[idx, key] = parsed[key] + # 安全方式:get_or_create + 逐字段 set + obj, _ = ResumeBasic.objects.get_or_create(resume_id=resume_id) - if 'age' in df.columns: - df['age'] = df['age'].apply(lambda x: int(re.search(r'\d+', str(x)).group()) if pd.notna(x) and re.search(r'\d+', str(x)) else None) + for k, v in defaults.items(): + try: + setattr(obj, k, v) + except Exception as field_error: + print(f"[字段设置错误] {k} = {v!r} ({type(v)}) → {field_error}") + raise - valid_fields = [f.name for f in ResumeBasic._meta.fields] - df = df[[col for col in df.columns if col in valid_fields]] + obj.save() + success_count += 1 - # 清除所有 NaN -> None - for col in df.columns: - df[col] = df[col].apply(lambda x: None if pd.isna(x) or str(x).strip().lower() in ['nan', 'none', 'null', ''] else x) + except Exception as e: + fail_count += 1 + errors.append((i + 2, str(e))) + print(f"\n❌ 第 {i + 2} 行出错:{e}") + print(f"resume_id: {repr(resume_id)} ({type(resume_id).__name__})") + for k, v in defaults.items(): + print(f"{k:<20} | {repr(v):<30} | {type(v).__name__}") + traceback.print_exc() - records = df.to_dict(orient='records') - existing_ids = set(ResumeBasic.objects.filter( - resume_id__in=[r["resume_id"] for r in records if "resume_id" in r] - ).values_list("resume_id", flat=True)) - - new_records = [r for r in records if r.get("resume_id") not in existing_ids] - - ResumeBasic.objects.bulk_create([ResumeBasic(**r) for r in new_records]) - self.stdout.write(self.style.SUCCESS( - f"✅ 成功导入 {len(new_records)} 条简历记录(关键词:{default_keyword},来源:{default_source})" - )) + self.stdout.write(self.style.SUCCESS(f"导入完成!总数:{len(df)},成功:{success_count},失败:{fail_count}")) + if errors: + self.stdout.write(self.style.WARNING("失败记录如下:")) + for line_no, msg in errors: + self.stdout.write(f" 第 {line_no} 行出错:{msg}")