导入遵化网站完善

2025-04-15 13:29:46 +08:00 · 2025-04-15 13:29:46 +08:00 · 8d3719206f
commit 8d3719206f
parent 286cbe907b
1 changed file with 88 additions and 49 deletions
--- a/resumes/management/commands/import_accounting_resumes.py
+++ b/resumes/management/commands/import_accounting_resumes.py
@@ -1,9 +1,10 @@
 from datetime import datetime, timedelta
-
 from django.core.management.base import BaseCommand
+from pandas._libs.tslibs.timestamps import Timestamp
 import pandas as pd
 from resumes.models import ResumeBasic
 import re
+import traceback


 class Command(BaseCommand):
@@ -20,24 +21,24 @@ class Command(BaseCommand):
        default_source = options['source']
        df = pd.read_excel(filepath)

-        # 中文列名映射（依据新版表格）
        rename_map = {
-            '简历ID': 'resume_id', '姓名': 'name', '性别': 'gender', '年龄': 'age',
+            '姓名': 'name', '性别': 'gender', '年龄': 'age',
            '手机': 'phone', '婚姻状况': 'marital_status', '身高': 'height', '体重': 'weight',
-            '学历': 'education', '毕业学校': 'school', '专业': 'major', '工作经验': 'work_years',
-            '现居住地': 'current_location', '期望职位': 'expected_position', '期望月薪': 'expected_salary',
-            '工作地点': 'job_location', '到岗时间': 'available_time', '更新时间': 'update_time'
+            '学历': 'education', '毕业学校': 'school', '工作经验': 'work_years',
+            '现居住地': 'current_location', '工作地点': 'job_location', '到岗时间': 'available_time',
+            '更新时间': 'update_time', '最高学历': 'education', '婚姻状态': 'marital_status',
+            '民族': 'ethnicity', '工作职能': 'job_function', '意向岗位': 'intended_position',
+            '从事行业': 'industry', '期望薪资': 'expected_salary', '工作性质': 'job_property',
+            '求职状态': 'job_status',
        }
        df.rename(columns={k: v for k, v in rename_map.items() if k in df.columns}, inplace=True)

-        # 默认字段填充
        df['source_id'] = default_source
        df['crawl_keywords'] = default_keyword

-        # 时间格式清洗
        def parse_update_time(val):
            if pd.isna(val):
-                return None
+                return datetime(2019, 12, 12)
            val = str(val)
            now = datetime.now()
            if "刚刚" in val:
@@ -49,52 +50,90 @@ class Command(BaseCommand):
                days = int(re.search(r'\d+', val).group())
                return now - timedelta(days=days)
            try:
-                return pd.to_datetime(val)
+                dt = pd.to_datetime(val)
+                return dt.to_pydatetime()
            except Exception:
+                return datetime(2019, 12, 12)
+
+        df['update_time'] = df['update_time'].apply(parse_update_time)
+
+        def val(v, field=None):
+            if v is None or pd.isna(v):
+                if field == 'update_time':
+                    return datetime(2019, 12, 12)
                return None
+            if field == 'update_time':
+                if isinstance(v, Timestamp):
+                    return v.to_pydatetime()
+                if isinstance(v, str):
+                    try:
+                        return pd.to_datetime(v).to_pydatetime()
+                    except Exception:
+                        return datetime(2019, 12, 12)
+                if isinstance(v, datetime):
+                    return v
+                return datetime(2019, 12, 12)
+            if isinstance(v, Timestamp):
+                return v.to_pydatetime()
+            return v

-        if 'update_time' in df.columns:
-            df['update_time'] = df['update_time'].apply(parse_update_time)
+        success_count = 0
+        fail_count = 0
+        errors = []

-        # 清洗身高/体重（复合字段提取）
-        def extract_height_weight(text):
-            text = str(text) if text and not pd.isna(text) else ''
-            h = re.search(r'(\d{2,3})\s*cm', text)
-            w = re.search(r'(\d{2,3})\s*kg', text)
-            return {
-                'height': int(h.group(1)) if h else None,
-                'weight': int(w.group(1)) if w else None
-            }
+        for i, row in df.iterrows():
+            try:
+                resume_id = val(row.get('resume_id'))
+                defaults = {
+                    'name': val(row.get('name')),
+                    'gender': val(row.get('gender')),
+                    'age': val(row.get('age')),
+                    'phone': val(row.get('phone')),
+                    'marital_status': val(row.get('marital_status')),
+                    'height': val(row.get('height')),
+                    'weight': val(row.get('weight')),
+                    'education': val(row.get('education')),
+                    'school': val(row.get('school')),
+                    'work_years': val(row.get('work_years')),
+                    'current_location': val(row.get('current_location')),
+                    'job_location': val(row.get('job_location')),
+                    'available_time': val(row.get('available_time')),
+                    'update_time': val(row['update_time'], field='update_time'),
+                    'ethnicity': val(row.get('ethnicity')),
+                    'job_function': val(row.get('job_function')),
+                    'intended_position': val(row.get('intended_position')),
+                    'industry': val(row.get('industry')),
+                    'expected_salary': val(row.get('expected_salary')),
+                    'job_property': val(row.get('job_property')),
+                    'job_status': val(row.get('job_status')),
+                    'source_id': val(row.get('source_id')),
+                    'crawl_keywords': val(row.get('crawl_keywords')),
+                }

-        for idx, row in df.iterrows():
-            text = ' '.join([str(v) for k, v in row.items() if k not in ['height', 'weight']])
-            parsed = extract_height_weight(text)
-            for key in ['height', 'weight']:
-                val = row.get(key)
-                try:
-                    if pd.isna(val) or str(val).strip().lower() in ['nan', 'none', 'null', '']:
-                        df.at[idx, key] = parsed[key]
-                except:
-                    df.at[idx, key] = parsed[key]
+                # 安全方式：get_or_create + 逐字段 set
+                obj, _ = ResumeBasic.objects.get_or_create(resume_id=resume_id)

-        if 'age' in df.columns:
-            df['age'] = df['age'].apply(lambda x: int(re.search(r'\d+', str(x)).group()) if pd.notna(x) and re.search(r'\d+', str(x)) else None)
+                for k, v in defaults.items():
+                    try:
+                        setattr(obj, k, v)
+                    except Exception as field_error:
+                        print(f"[字段设置错误] {k} = {v!r} ({type(v)}) → {field_error}")
+                        raise

-        valid_fields = [f.name for f in ResumeBasic._meta.fields]
-        df = df[[col for col in df.columns if col in valid_fields]]
+                obj.save()
+                success_count += 1

-        # 清除所有 NaN -> None
-        for col in df.columns:
-            df[col] = df[col].apply(lambda x: None if pd.isna(x) or str(x).strip().lower() in ['nan', 'none', 'null', ''] else x)
+            except Exception as e:
+                fail_count += 1
+                errors.append((i + 2, str(e)))
+                print(f"\n❌ 第 {i + 2} 行出错：{e}")
+                print(f"resume_id: {repr(resume_id)} ({type(resume_id).__name__})")
+                for k, v in defaults.items():
+                    print(f"{k:<20} | {repr(v):<30} | {type(v).__name__}")
+                traceback.print_exc()

-        records = df.to_dict(orient='records')
-        existing_ids = set(ResumeBasic.objects.filter(
-            resume_id__in=[r["resume_id"] for r in records if "resume_id" in r]
-        ).values_list("resume_id", flat=True))
-
-        new_records = [r for r in records if r.get("resume_id") not in existing_ids]
-
-        ResumeBasic.objects.bulk_create([ResumeBasic(**r) for r in new_records])
-        self.stdout.write(self.style.SUCCESS(
-            f"✅ 成功导入 {len(new_records)} 条简历记录（关键词：{default_keyword}，来源：{default_source}）"
-        ))
+        self.stdout.write(self.style.SUCCESS(f"导入完成！总数：{len(df)}，成功：{success_count}，失败：{fail_count}"))
+        if errors:
+            self.stdout.write(self.style.WARNING("失败记录如下："))
+            for line_no, msg in errors:
+                self.stdout.write(f"  第 {line_no} 行出错：{msg}")