Question

我有一个简短的 Python 脚本,用于从 XML 文档中提取相关数据并导出到 Excel 文档。大部分功能已经可以正常工作。每个 XML 文档是一组数据,对应 Excel 中的一行。

我无法解决的问题是:如何(如果可能的话)提取同一类型的多个值。目前我只能得到其中一个。

例如,下面是我处理“Alternative Title”(替代标题)的代码:

 elif child.tag.endswith("titleInfo"):
                if child.attrib.get("type") == "alternative":
                    alternative_title = child.find("mods:title", namespaces).text
                    record_data["Alternative Title"] = alternative_title

下面是 XML 中的相关片段(我需要处理的其他 XML 文档可能包含 0 个、1 个或多个替代标题):

 <mods:titleInfo type="alternative">
  <mods:title>Ziemer diary</mods:title>
 </mods:titleInfo>
 <mods:titleInfo type="alternative">
  <mods:title>F.G. Ziemer Tagebuch</mods:title>
 </mods:titleInfo>

运行这段代码时,只记录了第二个值。

There are several places in these xml files that will need the same solution.

感谢帮助!

完整代码:

# MODS/METS XML to XLSX Crosswalk

import xml.etree.ElementTree as ET
import pandas as pd


def _set_text(record, key, parent, path, namespaces):
    """Store the text of the first *path* match under *parent* as record[key].

    Leaves *record* untouched when nothing matches, so an optional element
    that is absent from a record does not abort the whole export.
    """
    found = parent.find(path, namespaces)
    if found is not None:
        record[key] = found.text


def parse_xml(xml_file, multi_value_separator="; "):
    """Extract one flat record per MODS description from a METS/MODS file.

    Args:
        xml_file: Path or file object accepted by ``ET.parse``.
        multi_value_separator: String placed between repeated values that
            share one column (currently the alternative titles), so that
            every value fits into a single spreadsheet cell.

    Returns:
        A list of dicts, one per ``mets:dmdSec`` that contains a
        ``mods:mods`` element. Keys are column names; a key is only present
        when the corresponding element exists in that record.
    """
    # Create an ElementTree object and parse the XML file
    tree = ET.parse(xml_file)
    root = tree.getroot()

    # Define namespaces
    namespaces = {
        "mods": "http://www.loc.gov/mods/v3",
        "mets": "http://www.loc.gov/METS/"
    }

    # Extract the necessary data from the XML structure
    data = []
    for dmdSec in root.iterfind(".//mets:dmdSec", namespaces):
        mods = dmdSec.find(".//mods:mods", namespaces)
        if mods is None:
            # This section carries no MODS record, so there is nothing to extract.
            continue

        record_data = {}
        # Repeated <titleInfo type="alternative"> elements are collected here
        # instead of overwriting each other.
        alternative_titles = []

        title = mods.find(".//mods:title", namespaces)
        if title is not None:
            record_data["Title"] = title.text

        for child in mods:
            if child.tag.endswith("title"):
                record_data["Title"] = child.text
            elif child.tag.endswith("abstract"):
                record_data["Abstract"] = child.text
            elif child.tag.endswith("identifier"):
                record_data["Identifier"] = child.text
            elif child.tag.endswith("location"):
                _set_text(record_data, "Path", child, "mods:url", namespaces)
            elif child.tag.endswith("typeOfResource"):
                record_data["Type of Resource"] = child.text
            elif child.tag.endswith("genre"):
                record_data["Genre"] = child.text
            elif child.tag.endswith("subject"):
                # TODO: geographic / temporal / topic subjects are not exported yet.
                hier_geo = child.find("mods:hierarchicalGeographic", namespaces)
                if hier_geo is not None:
                    _set_text(record_data, "Country", hier_geo, ".//mods:country", namespaces)
                    _set_text(record_data, "State", hier_geo, ".//mods:state", namespaces)
                    _set_text(record_data, "County", hier_geo, ".//mods:county", namespaces)
                    _set_text(record_data, "City", hier_geo, ".//mods:city", namespaces)
            elif child.tag.endswith("originInfo"):
                for info in child:
                    if info.tag.endswith("place"):
                        _set_text(record_data, "Place", info, "mods:placeTerm", namespaces)
                    elif info.tag.endswith("publisher"):
                        record_data["Publisher"] = info.text
                    elif info.tag.endswith("dateIssued"):
                        record_data["Date Issued"] = info.text
                    elif info.tag.endswith("dateCaptured"):
                        record_data["Date Captured"] = info.text
            elif child.tag.endswith("titleInfo"):
                if child.attrib.get("type") == "alternative":
                    alternative = child.find("mods:title", namespaces)
                    if alternative is not None and alternative.text:
                        alternative_titles.append(alternative.text)
                        # Re-join on every hit so the cell always holds all
                        # alternative titles seen so far, in document order.
                        record_data["Alternative Title"] = multi_value_separator.join(
                            alternative_titles
                        )
            elif child.tag.endswith("name"):
                if child.attrib.get("type") == "corporate":
                    _set_text(record_data, "Corporate Name", child, "mods:namePart", namespaces)
                elif child.attrib.get("type") == "personal":
                    _set_text(record_data, "Author", child, "mods:namePart", namespaces)

            # Additional fields from the provided XML structure
            elif child.tag.endswith("language"):
                _set_text(record_data, "Language", child, "mods:languageTerm", namespaces)
            elif child.tag.endswith("physicalDescription"):
                _set_text(record_data, "Form", child, ".//mods:form", namespaces)
                _set_text(record_data, "Dimensions", child, ".//mods:extent", namespaces)
            elif child.tag.endswith("accessCondition"):
                record_data["Access Condition"] = child.text

        data.append(record_data)

    return data


def xml_to_excel(xml_file, excel_file):
    """Convert the records parsed from *xml_file* into a spreadsheet.

    The rows returned by ``parse_xml`` are loaded into a DataFrame, the
    "Title" column (when present) is moved to the front, and the result is
    written to *excel_file* without the index column.
    """
    frame = pd.DataFrame(parse_xml(xml_file))

    # Put "Title" first and keep every other column in its existing order
    if "Title" in frame.columns:
        remaining = [name for name in frame.columns.tolist() if name != "Title"]
        frame = frame[["Title"] + remaining]

    frame.to_excel(excel_file, index=False)

    print(f"Data exported to {excel_file} successfully!")


# File Management
# Hard-coded input and output locations for this run; edit both before executing.
xml_file = "/Users/spaz/Documents/David/Data Scientist/Rescarta Stuff/metadata copy.xml"  # Replace with your XML file path
excel_file = "/Users/spaz/Desktop/Outputs/output.xlsx"  # Replace with desired output Excel file path

# The conversion runs as soon as this file is executed.
xml_to_excel(xml_file, excel_file)

Answer 1

正如你已经发现的,字典中的值被第二个值覆盖了。可参考这个回答:http://stackoverflow.com/a/10664876/20851944

You can search with xpath, if you use xml.etree.ElementTree , e.g.:

www.un.org/chinese/ga/president

# The attribute value inside an XPath predicate must be quoted;
# ElementTree rejects a bare word there as an invalid predicate.
for alternative_title in root.findall(".//*[@type='alternative']/mods:title", namespaces):
    print(alternative_title.text)

输出:

Ziemer diary
F.G. Ziemer Tagebuch

或者使用 defaultdict 收集所有值:

from collections import defaultdict

# Missing keys start out as an empty list, so repeated elements accumulate
# under one column name instead of overwriting each other.
record_data = defaultdict(list)
for element in root.iter():
    if element.tag.endswith("titleInfo") and element.attrib.get("type") == "alternative":
        record_data["Alternative Title"].append(element.find("mods:title", namespaces).text)

for column, values in record_data.items():
    print(column, values)
# Alternative Title ['Ziemer diary', 'F.G. Ziemer Tagebuch']

然后可以使用 pandas 将结果写入 Excel:

import pandas as pd

# Build one column per key of record_data; each list entry becomes a row.
df = pd.DataFrame.from_dict(record_data)
print(df)
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_excel.html
# if you have pip install openpyxl, e.g.
# df.to_excel("output.xlsx")

输出:

      Alternative Title
0          Ziemer diary
1  F.G. Ziemer Tagebuch

友情链接