Extrahieren der Metadaten aus den Dateinamen#
# load libraries
from pathlib import Path
import re
import pandas as pd
Informationen aus Dateinamen erxtrahieren#
Mit Hilfe von Regularly Expressions (RegEx) werden folgende Informationen aus den Dateinamen extrahiert:
Vorname des/r Autor:in
Nachname des/r Autor:in
Titel
Jahr der Publikation
ggf. Band
# Set path to corpus directory
corpus_path = Path("../data/corpus-of-german-fiction-txt/")
# create patterns to match filenames
# e.g. Theodor_Fontane_-_Cécile_(1887).txt
pattern_with_volume = r'^(.+?)_-_(.+?)_-_(.+?)_\((\d{4})\)$'
# e.g. Agnes_Günther_-_Die_Heilige_und_ihr_Narr_-_Band_2_(1913).txt
pattern_no_volume = r'^(.+?)_-_(.+?)_\((\d{4})\)$'
def process_author_name(author_name:str) -> tuple[str,str]:
name_parts = author_name.split("_")
if len(name_parts) == 0:
print(f"Warning: Could not parse author name: {name_parts}")
firstname = ""
lastname = ""
elif len(name_parts) == 1:
# Only one name part - treat as lastname
firstname = ""
lastname = name_parts[0]
elif len(name_parts) == 2:
# Standard case: Firstname Lastname
firstname = name_parts[0]
lastname = name_parts[1]
else:
# If second last entry is lowercase it belongs to lastname ("von" or "zu")
# otherwise only the last entry is lastname and all others are firstname
if name_parts[-2].islower():
lastname = " ".join(name_parts[-2:])
firstname = " ".join(name_parts[:-2])
else:
lastname = name_parts[-1]
firstname = " ".join(name_parts[:-1])
return firstname, lastname
def extract_metadata(dir_path: Path) -> pd.DataFrame:
metadata = []
for fp in corpus_path.iterdir():
entry = {}
stem = fp.stem
match = re.match(pattern_with_volume, stem)
has_volume = True
if not match:
match = re.match(pattern_no_volume, stem)
has_volume = False
if not match:
print(f"Warning: Could not parse filename: {fp.name}")
continue
author_name = match.group(1)
title = match.group(2).replace('_', ' ')
if has_volume:
volume = match.group(3).replace('_', ' ')
year = match.group(4)
else:
volume = "" # No volume information
year = match.group(3)
firstname, lastname = process_author_name(author_name)
entry["lastname"] = lastname
entry["firstname"] = firstname
entry["DC.title"] = title
entry["year"] = int(year)
entry["DC.date"] = int(year)
entry["volume"] = volume
entry["DC.identifier"]= stem
metadata.append(entry)
df = pd.DataFrame(metadata)
return df
df = extract_metadata(corpus_path)
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
Cell In[5], line 1
----> 1 df = extract_metadata(corpus_path)
Cell In[4], line 29, in extract_metadata(dir_path)
27 def extract_metadata(dir_path: Path) -> pd.DataFrame:
28 metadata = []
---> 29 for fp in corpus_path.iterdir():
30 entry = {}
31 stem = fp.stem
File /opt/hostedtoolcache/Python/3.11.14/x64/lib/python3.11/pathlib.py:931, in Path.iterdir(self)
927 def iterdir(self):
928 """Iterate over the files in this directory. Does not yield any
929 result for the special paths '.' and '..'.
930 """
--> 931 for name in os.listdir(self):
932 yield self._make_child_relpath(name)
FileNotFoundError: [Errno 2] No such file or directory: '../data/corpus-of-german-fiction-txt'
# Write as CSV file
df.to_csv("../metadata/metadata_corpus-german_language_fiction.csv", index=False)
Filtern nach Jahr der Publikation (1750 - 1950)#
time_filtered = df[(df.year >= 1750) & (df.year <= 1950)]
time_filtered.year.value_counts(ascending=True).plot(kind="bar", figsize=(16,8))
<Axes: xlabel='year'>
time_filtered.shape
(2696, 5)
time_filtered.lastname.value_counts().plot(kind="bar", figsize=(16,8))
<Axes: xlabel='lastname'>