pyCEURmake API Documentation

`ceur_ws`

`Conference`

Bases: JSONAble

Represents a conference

Source code in ceurws/ceur_ws.py

class Conference(JSONAble):
    """
    Conference entity of ceur-ws
    """

    @staticmethod
    def getSamples() -> list[dict]:
        """
        provide example records showing the fields of a conference

        Returns:
            list[dict]: a list holding a single sample conference record
        """
        sdm_2019 = {
            "id": "Vol-2436",
            "fullTitle": "SIAM International Conference on Data Mining",
            "homepage": "https://www.siam.org/Conferences/CM/Main/sdm19",
            "acronym": "SDM 2019",
        }
        return [sdm_2019]

`getSamples()` `staticmethod`

get sample records of the entity

Source code in ceurws/ceur_ws.py

@staticmethod
def getSamples() -> list[dict]:
    """
    provide example records showing the fields of the entity

    Returns:
        list[dict]: a list holding a single sample conference record
    """
    return [
        {
            "id": "Vol-2436",
            "fullTitle": "SIAM International Conference on Data Mining",
            "homepage": "https://www.siam.org/Conferences/CM/Main/sdm19",
            "acronym": "SDM 2019",
        }
    ]

`ConferenceManager`

Bases: EntityManager

Contains multiple ceurws conferences

Source code in ceurws/ceur_ws.py

class ConferenceManager(EntityManager):
    """
    Contains multiple ceurws conferences
    """

    def __init__(self):
        """
        constructor

        Sets up the EntityManager base for Conference records: list, table and
        plural name are "conferences" and "id" is passed as the primary key.
        """
        super().__init__(
            listName="conferences",
            clazz=Conference,
            tableName="conferences",
            # Conference.__class__ is the metaclass, so its __name__ would not be
            # "Conference" - the entity name has to come from the class itself
            entityName=Conference.__name__,
            primaryKey="id",
            entityPluralName="conferences",
            config=CEURWS.CONFIG,
            name=self.__class__.__name__,
        )

`Editor`

Bases: JSONAble

Represents a volume editor

Source code in ceurws/ceur_ws.py

class Editor(JSONAble):
    """
    Editor of a ceur-ws volume
    """

    @staticmethod
    def getSamples() -> list[dict]:
        """
        provide example records showing the fields of an editor

        Returns:
            list[dict]: two sample editor records; only the second one has
                "submitted" set to True
        """
        john = {
            "id": "Vol-2436/John Doe",
            "name": "John Doe",
            "homepage": "http://www.example.org/john",
            "country": "Germany",
            "affiliation": "Leibniz University Hannover & L3S Research Center",
            "submitted": False,
        }
        jane = {
            "id": "Vol-2436/Jane Doe",
            "name": "Jane Doe",
            "homepage": "http://www.example.org/jane",
            "country": "Germany",
            "affiliation": "Technical University Dortmund",
            "submitted": True,
        }
        return [john, jane]

`getSamples()` `staticmethod`

get sample records of the entity

Source code in ceurws/ceur_ws.py

@staticmethod
def getSamples() -> list[dict]:
    """
    provide example records showing the fields of the entity

    Returns:
        list[dict]: two sample editor records; only the second one has
            "submitted" set to True
    """
    john = {
        "id": "Vol-2436/John Doe",
        "name": "John Doe",
        "homepage": "http://www.example.org/john",
        "country": "Germany",
        "affiliation": "Leibniz University Hannover & L3S Research Center",
        "submitted": False,
    }
    jane = {
        "id": "Vol-2436/Jane Doe",
        "name": "Jane Doe",
        "homepage": "http://www.example.org/jane",
        "country": "Germany",
        "affiliation": "Technical University Dortmund",
        "submitted": True,
    }
    return [john, jane]

`EditorManager`

Bases: EntityManager

Contains multiple ceurws editors

Source code in ceurws/ceur_ws.py

class EditorManager(EntityManager):
    """
    Contains multiple ceurws editors
    """

    def __init__(self):
        """
        constructor

        Sets up the EntityManager base for Editor records: list, table and
        plural name are "editors" and "id" is passed as the primary key.
        """
        super().__init__(
            listName="editors",
            clazz=Editor,
            tableName="editors",
            # this manager handles Editor records; the previous expression was
            # derived from Session (and from its metaclass at that)
            entityName=Editor.__name__,
            primaryKey="id",
            entityPluralName="editors",
            config=CEURWS.CONFIG,
            name=self.__class__.__name__,
        )

`Paper`

Bases: JSONAble

Represents a paper

Source code in ceurws/ceur_ws.py

class Paper(JSONAble):
    """
    Represents a paper
    """

    def __init__(
        self,
        id: str | None = None,
        title: str | None = None,
        type: str | None = None,
        position: int | None = None,
        pagesFrom: int | None = None,
        pagesTo: int | None = None,
        authors: dict | None = None,
    ):
        """
        constructor

        All parameters are optional and stored unchanged as attributes of the
        same name.

        Args:
            id: identifier of the paper (see getSamples for the id scheme used there)
            title: title of the paper
            type: kind of paper, e.g. "summary", "invited" or "article" in the samples
            position: position of the paper
            pagesFrom: first page
            pagesTo: last page
            authors: authors of the paper
        """
        super().__init__()
        self.id = id
        self.title = title
        self.type = type
        self.position = position
        self.pagesFrom = pagesFrom
        self.pagesTo = pagesTo
        self.authors = authors

    @staticmethod
    def getSamples() -> list[dict]:
        """
        get sample records of the entity

        Returns:
            list[dict]: three sample paper records (a summary, an invited paper and an article)
        """
        samples = [
            {  # id is constructed with volume and position
                # → <volNumber>/s<position>/<type>_<position_relative_to_type>
                "id": "Vol-2436/s1/summary",
                "type": "summary",
                "position": 0,
                "title": "1st Workshop on Evaluation and Experimental Design in Data Mining and "
                "Machine Learning (EDML 2019)",
                "pdf": "http://ceur-ws.org/Vol-2436/summary.pdf",
                "pagesFrom": 1,
                "pagesTo": 3,
                "authors": [
                    "Eirini Ntoutsi",
                    "Erich Schubert",
                    "Arthur Zimek",
                    "Albrecht Zimmermann",
                ],
            },
            {
                "id": "Vol-2436/s1/invited_1",
                "type": "invited",
                "position": 1,
                "title": "Evaluation of Unsupervised Learning Results: Making the Seemingly Impossible Possible",
                "pdf": "http://ceur-ws.org/Vol-2436/invited_1.pdf",
                "pagesFrom": 4,
                "pagesTo": 4,
                "authors": ["Ricardo J. G. B. Campello"],
            },
            {
                "id": "Vol-2436/s1/article_1",
                "type": "article",
                "position": 2,
                "title": "EvalNE: A Framework for Evaluating Network Embeddings on Link Prediction",
                "pdf": "http://ceur-ws.org/Vol-2436/article_2.pdf",
                "pagesFrom": 5,
                "pagesTo": 13,
                "authors": [
                    "Alexandru Mara",
                    "Jefrey Lijffijt",
                    "Tijl De Bie",
                ],
            },
        ]
        return samples

    def __str__(self):
        """
        return my string representation

        The title is used when it is set. Because title defaults to None and
        __str__ must return a string, the id is used as a fallback and an
        empty string when neither is available.

        Returns:
            str: my text representation
        """
        for candidate in (getattr(self, "title", None), getattr(self, "id", None)):
            if candidate is not None:
                return str(candidate)
        return ""

`__str__()`

return my string representation

Returns:

Name	Type	Description
`str`		my text representation

Source code in ceurws/ceur_ws.py

def __str__(self):
    """
    return my string representation

    The title is used when it is set. Because title defaults to None and
    __str__ must return a string, the id is used as a fallback and an
    empty string when neither is available.

    Returns:
        str: my text representation
    """
    for candidate in (getattr(self, "title", None), getattr(self, "id", None)):
        if candidate is not None:
            return str(candidate)
    return ""

`getSamples()` `staticmethod`

get sample records of the entity

Source code in ceurws/ceur_ws.py

@staticmethod
def getSamples() -> list[dict]:
    """
    provide example records showing the fields of the entity

    Returns:
        list[dict]: three sample paper records (a summary, an invited paper and an article)
    """
    # id is constructed with volume and position
    # → <volNumber>/s<position>/<type>_<position_relative_to_type>
    workshop_summary = {
        "id": "Vol-2436/s1/summary",
        "type": "summary",
        "position": 0,
        "title": "1st Workshop on Evaluation and Experimental Design in Data Mining and "
        "Machine Learning (EDML 2019)",
        "pdf": "http://ceur-ws.org/Vol-2436/summary.pdf",
        "pagesFrom": 1,
        "pagesTo": 3,
        "authors": [
            "Eirini Ntoutsi",
            "Erich Schubert",
            "Arthur Zimek",
            "Albrecht Zimmermann",
        ],
    }
    invited_talk = {
        "id": "Vol-2436/s1/invited_1",
        "type": "invited",
        "position": 1,
        "title": "Evaluation of Unsupervised Learning Results: Making the Seemingly Impossible Possible",
        "pdf": "http://ceur-ws.org/Vol-2436/invited_1.pdf",
        "pagesFrom": 4,
        "pagesTo": 4,
        "authors": ["Ricardo J. G. B. Campello"],
    }
    regular_article = {
        "id": "Vol-2436/s1/article_1",
        "type": "article",
        "position": 2,
        "title": "EvalNE: A Framework for Evaluating Network Embeddings on Link Prediction",
        "pdf": "http://ceur-ws.org/Vol-2436/article_2.pdf",
        "pagesFrom": 5,
        "pagesTo": 13,
        "authors": [
            "Alexandru Mara",
            "Jefrey Lijffijt",
            "Tijl De Bie",
        ],
    }
    return [workshop_summary, invited_talk, regular_article]

`PaperManager`

Bases: EntityManager

Contains multiple ceurws papers

Source code in ceurws/ceur_ws.py

class PaperManager(EntityManager):
    """
    Contains multiple ceurws papers
    """

    def __init__(self):
        """
        constructor

        Sets up the EntityManager base for Paper records: list, table and
        plural name are "papers" and "id" is passed as the primary key.
        Invalid list types are handled with "," as the list separator.
        """
        super().__init__(
            listName="papers",
            clazz=Paper,
            tableName="papers",
            # Paper.__class__ is the metaclass, so its __name__ would not be
            # "Paper" - the entity name has to come from the class itself
            entityName=Paper.__name__,
            primaryKey="id",
            entityPluralName="papers",
            config=CEURWS.CONFIG,
            handleInvalidListTypes=True,
            listSeparator=",",
            name=self.__class__.__name__,
        )

`Session`

Bases: JSONAble

Represents a session in ceur-ws

Source code in ceurws/ceur_ws.py

class Session(JSONAble):
    """
    Represents a session in ceur-ws
    """

    def __init__(
        self,
        id: str | None = None,
        title: str | None = None,
        position: int | None = None,
        papers: dict[str, "Paper"] | None = None,
    ):
        """
        constructor

        All parameters default to None (as for Paper and Volume) so that a
        session can also be created without arguments.

        Args:
            id: identifier of the session
            title: title of the session
            position: position of the session
            papers: papers of the session keyed by paper id
        """
        super().__init__()
        self.id = id
        self.title = title
        self.position = position
        # stored directly, i.e. without going through the papers setter
        self._papers = papers

    @staticmethod
    def getSamples() -> list[dict]:
        """
        get sample records of the entity

        Returns:
            list[dict]: a list holding a single sample session record
        """
        samples = [
            {
                "id": "Vol-2436/s1",  # id is constructed with volume and position → <volNumber>/s<position>
                "title": "Information Technologies and Intelligent Decision Making Systems II",
                "position": 1,
                "papers": {  # 1:n relation / command chain
                    "VOL-2436/s1/p1": Paper,
                    "VOL-2436/s1/p2": Paper,
                },
            }
        ]
        return samples

    @property
    def papers(self):  # dict: str→Paper
        """
        papers of this session

        Returns:
            whatever is currently stored for the papers of this session
        """
        # ToDo: loading and caching of papers is not implemented yet. A property
        # getter cannot receive extra arguments, so the former "cached" flag
        # could never be set and its branches never did anything.
        return self._papers

    @papers.setter
    def papers(self, paper: Paper):
        """
        add the given paper or replace the stored papers

        If a dict of papers is already stored and the given value has a
        non-empty id it is added under that id; in every other case the given
        value replaces what was stored before.
        """
        # ToDo: Adjust to proper 1:n handling
        # getattr keeps values without an id (e.g. a whole dict of papers)
        # from raising AttributeError; they replace the stored value instead
        paperId = getattr(paper, "id", None)
        if isinstance(getattr(self, "_papers", None), dict) and paperId:
            self._papers[paperId] = paper
        else:
            self._papers = paper

`__init__(id, title, position, papers)`

constructor

Source code in ceurws/ceur_ws.py

def __init__(self, id: str | None, title: str | None, position: int | None, papers: dict[str, "Paper"] | None):
    """
    constructor
    """
    super().__init__()
    self.id = id
    self.title = title
    self.position = position
    self._papers = papers

`getSamples()` `staticmethod`

get sample records of the entity

Source code in ceurws/ceur_ws.py

@staticmethod
def getSamples() -> list[dict]:
    """
    provide example records showing the fields of the entity

    Returns:
        list[dict]: a list holding a single sample session record
    """
    paper_refs = {  # 1:n relation / command chain
        "VOL-2436/s1/p1": Paper,
        "VOL-2436/s1/p2": Paper,
    }
    session_record = {
        "id": "Vol-2436/s1",  # id is constructed with volume and position → <volNumber>/s<position>
        "title": "Information Technologies and Intelligent Decision Making Systems II",
        "position": 1,
        "papers": paper_refs,
    }
    return [session_record]

`SessionManager`

Bases: EntityManager

Contains multiple ceurws sessions

Source code in ceurws/ceur_ws.py

class SessionManager(EntityManager):
    """
    Contains multiple ceurws sessions
    """

    def __init__(self):
        """
        constructor

        Sets up the EntityManager base for Session records: list, table and
        plural name are "sessions" and "id" is passed as the primary key.
        """
        super().__init__(
            listName="sessions",
            clazz=Session,
            tableName="sessions",
            # Session.__class__ is the metaclass, so its __name__ would not be
            # "Session" - the entity name has to come from the class itself
            entityName=Session.__name__,
            primaryKey="id",
            # ToDo: check if just the title is a sufficient key or if an ID must be added
            entityPluralName="sessions",
            config=CEURWS.CONFIG,
            name=self.__class__.__name__,
        )

`Volume`

Bases: JSONAble

Represents a volume in ceur-ws

Source code in ceurws/ceur_ws.py

class Volume(JSONAble):
    """
    Represents a volume in ceur-ws
    """

    def __init__(
        self,
        number: int | None = None,
        url: str | None = None,
        title: str | None = None,
        fullTitle: str | None = None,
        acronym: str | None = None,
        lang: str | None = None,
        location: str | None = None,
        country: str | None = None,
        countryWikidataId: str | None = None,
        region: str | None = None,
        city: str | None = None,
        cityWikidataId: str | None = None,
        ordinal: int | None = None,
        date: datetime.datetime | None = None,
        dateFrom: datetime.datetime | None = None,
        dateTo: datetime.datetime | None = None,
        pubYear: str | None = None,
        pubDate: datetime.datetime | None = None,
        submitDate: datetime.datetime | None = None,
        valid: bool = True,
        conference: Optional["Conference"] = None,
        editors: list["Editor"] | None = None,
        sessions: list["Session"] | None = None,
        virtualEvent: bool = False,
        submittedBy: str | None = None,
    ):
        """
        constructor

        Every parameter is stored as the attribute of the same name.

        Args:
            number: volume number, e.g. 2436 for Vol-2436
            url: url of the volume page
            city, cityWikidataId, country, countryWikidataId: location details;
                overwritten by extractAndSetLocation when a location is found
            dateFrom, dateTo: overwritten by resolveLoctime when dates can be extracted
            editors: editors of the volume, searched by getSubmittingEditor
            sessions: sessions of the volume; assigned via the sessions property setter
            virtualEvent: True if the event is virtual; also set by extractAndSetLocation
        """
        self.number = number
        self.url = url
        self.title = title
        self.fullTitle = fullTitle
        self.acronym = acronym
        self.lang = lang
        self.location = location
        self.country = country
        self.countryWikidataId = countryWikidataId
        self.region = region
        self.city = city
        self.cityWikidataId = cityWikidataId
        self.ordinal = ordinal
        self.date = date
        self.dateFrom = dateFrom
        self.dateTo = dateTo
        self.pubYear = pubYear
        self.pubDate = pubDate
        self.submitDate = submitDate
        self.valid = valid
        self.conference = conference
        self.editors = editors
        self.sessions = sessions
        self.virtualEvent = virtualEvent
        self.submittedBy = submittedBy

    def getSamples(self):
        """
        get sample records of the entity

        Returns:
            list[dict]: a list holding a single sample volume record
        """
        samples = [
            {
                "number": 2436,
                "url": "http://ceur-ws.org/Vol-2436/",
                "title": "Evaluation and Experimental Design in Data Mining and Machine Learning",
                "fullTitle": "1st Workshop on Evaluation and Experimental Design in Data Mining and Machine Learning",
                "acronym": "EDML 2019",
                "lang": "en",
                "location": "Calgary, Alberta, Canada",
                "country": "Canada",
                "region": "Alberta",
                "city": "Calgary",
                "ordinal": 1,
                "date": datetime.datetime(year=2019, month=5, day=4),
                "dateFrom": "",
                "dateTo": "",
                "pubYear": 2019,
                "pubDate": "2019-09-08",
                "submitDate": "2019-07-28",
                "valid": True,
                "conference": Conference,
                "editors": [Editor],
                "sessions": [Session],
                "virtualEvent": False,
            }
        ]
        return samples

    def getVolumeNumber(self):
        """
        get number of the volume

        Returns:
            the number attribute, or the text "Volume has no number" if the
            attribute does not exist
        """
        number = getattr(self, "number", "Volume has no number")
        return number

    def getVolumeUrl(self) -> str | None:
        """
        get the url of the volume page

        Returns:
            str: the url derived from the volume number, None if there is no number
        """
        number = self.number
        if number is None:
            return None
        url = self.getVolumeUrlOf(number)
        return url

    @staticmethod
    def getVolumeUrlOf(
        number: str | int,
    ) -> str | None:
        """
        get the volume url of the given volume number

        Args:
            number: volume number

        Returns:
            str: http://ceur-ws.org/Vol-<number>/ or None if number is None
        """
        url = None
        if number is not None:
            url = f"http://ceur-ws.org/Vol-{number}/"
        return url

    def isVirtualEvent(self) -> bool:
        """
        Returns True if the event is a virtual event
        """
        return getattr(self, "virtualEvent", False)

    def normalize(self):
        """
        Tries to normalize the properties e.g. breaking loctime into designated location and time properties
        Example: 'Vienna, Austria, July 25th, 2022'

        Not implemented yet - currently does nothing.
        """
        pass

    def get_loctime(self) -> str | None:
        """
        get the loctime

        If no loctime attribute is set it is derived from the tdtitle
        attribute (when available) by dropping everything up to the first
        comma and stripping dots from both ends; the result is stored as
        loctime.

        Returns:
            str: the loctime, None if it is not available or not a string
        """
        loctime = getattr(self, "loctime", None)
        if loctime is None:
            td_title = getattr(self, "tdtitle", None)
            if td_title:
                title_parts = td_title.split(",")
                # the first part is not part of the location/time information
                del title_parts[0]
                loctime = ",".join(title_parts)
                loctime = loctime.strip(".")
                self.loctime = loctime
        elif not isinstance(loctime, str):
            loctime = None
        return loctime

    def resolveLoctime(self):
        """
        Resolve the loctime property by breaking it down to city, region, country, dateFrom, and dateTo

        Does nothing if no loctime is available.
        """
        loctime = self.get_loctime()
        if loctime is None:
            return None
        dateFrom, dateTo = self.extractDates(loctime)
        if dateFrom is not None:
            self.dateFrom = dateFrom
        if dateTo is not None:
            self.dateTo = dateTo
        self.extractAndSetLocation(locationStr=loctime)

    def extractAndSetLocation(self, locationStr: str):
        """
        Extracts the location from the given string and sets the found city and country
        (with their wikidata ids) on this volume. Also sets virtualEvent to True if the
        string contains "virtual" or "online".

        ToDo: Once the EventReferenceParser from cc is updated to support city country combinations switch to it

        Args:
            locationStr: string to extract the locations from
        """
        # the location parser is created once and cached on the class
        parser = self.__class__.__dict__.get("locationparser")
        if parser is None:
            parser = LocationContext.fromCache()
            self.__class__.locationparser = parser
        # drop comma separated parts containing digits (dates) and blank out month names
        locationStr = self.removePartsMatching(locationStr, pattern=r"\d")
        for month in calendar.month_name:
            if month == "":
                continue
            locationStr = locationStr.replace(month, " ")
        locations = parser.locateLocation(locationStr, verbose=True)
        locations = self.rankLocations(locationStr, locations)
        city = None
        cityWikidataId = None
        country = None
        countryWikidataId = None
        if locations is not None and len(locations) > 0:
            bestMatch = locations[0]
            if isinstance(bestMatch, City):
                city = bestMatch.name
                cityWikidataId = bestMatch.wikidataid
                cityCountry = getattr(bestMatch, "country", None)
                if cityCountry is not None:
                    country = cityCountry.name
                    countryWikidataId = cityCountry.wikidataid
            elif isinstance(bestMatch, Country):
                # both values are needed: the country is only stored below
                # when its wikidata id is known
                country = bestMatch.name
                countryWikidataId = bestMatch.wikidataid
        lowerLocation = locationStr.lower()
        if any(keyword in lowerLocation for keyword in ("virtual", "online")):
            self.virtualEvent = True
        if city is not None:
            self.city = city
            self.cityWikidataId = cityWikidataId
        if countryWikidataId is not None:
            self.country = country
            self.countryWikidataId = countryWikidataId

    def extractDates(
        self, dateStr: str, durationThreshold: int = 11
    ) -> tuple[datetime.date | None, datetime.date | None]:
        """
        Extracts the start and end time from the given string
        optimized for the format of the loctime property

        The string is split at commas and brackets; the last part has to be a
        four digit year and the part(s) in front of it the day/month
        information, either a single date or a range of two dates.

        Args:
            dateStr: string to extract the dates from
            durationThreshold: number of days allowed between two extracted dates

        Returns:
            tuple: (dateFrom, dateTo), or (None, None) if no plausible dates could be extracted
        """
        dateFrom = None
        dateTo = None
        if dateStr is None:
            return None, None
        # normalize certain foreign language month names that occur regularly
        if "novembro" in dateStr.lower():
            dateStr = dateStr.lower().replace("novembro", "november")
        loctimeParts = re.split("[,)(]", dateStr)
        year = loctimeParts[-1].strip()
        if len(loctimeParts) < 2 or not re.fullmatch(r"\d{4}", year):
            # corner case: no trailing year, or a bare year with no date part in front of it
            return None, None
        rawDate = loctimeParts[-2].strip()
        monthNames = [cn.lower() for cn in calendar.month_name]
        if len(loctimeParts) >= 3 and loctimeParts[-3].lower().strip() in monthNames:
            # the month is a part of its own, e.g. "..., July, 25, 2022"
            rawDate = f"{loctimeParts[-3]} {rawDate}"
        dateParts: list = re.split("[-–‐&]| to | and ", rawDate)
        try:
            if len(dateParts) == 1:
                dateFrom = dateutil.parser.parse(f"{dateParts[0]} {year}")
                dateTo = dateFrom
            elif len(dateParts) == 2:
                # longest part first: it is the one expected to carry the month
                dateParts.sort(key=lambda r: len(r), reverse=True)
                dateOne = dateutil.parser.parse(f"{dateParts[0]} {year}")
                if len(dateParts[-1].strip()) <= 4:
                    # the short part is only a day - borrow the month from the long part
                    dayMonthParts = dateParts[0].split(" ")
                    dayMonthParts.sort(key=lambda r: len(r), reverse=True)
                    endDate = dayMonthParts[0] + dateParts[1]
                    dateTwo = dateutil.parser.parse(f"{endDate} {year}")
                else:
                    dateTwo = dateutil.parser.parse(f"{dateParts[1]} {year}")
                dates = [dateOne, dateTwo]
                dates.sort()
                dateFrom = dates[0]
                dateTo = dates[1]
        except Exception:
            # deliberately best effort: unparsable input is reported below
            pass
        if dateTo is None or dateFrom is None:
            print(self.number, dateStr, "→ Dates could not be extracted")
            return None, None
        delta = dateTo - dateFrom
        if delta < datetime.timedelta():
            print("Error this should not be possible")
        elif delta > datetime.timedelta(days=durationThreshold):
            print(
                self.number,
                f"Event with a duration of more than {durationThreshold} days seems suspicious",
            )
        else:
            return dateFrom.date(), dateTo.date()
        return None, None

    @staticmethod
    def removePartsMatching(value: str, pattern: str, separator=","):
        """
        Removes parts from the given value matching the pattern

        Args:
            value: string to split into parts
            pattern: regular expression; parts in which it is found are dropped
            separator: separator used for splitting and re-joining the parts

        Returns:
            str: the remaining parts joined by the separator
        """
        parts = value.split(separator)
        resParts = [part for part in parts if re.search(pattern, part) is None]
        resValue = separator.join(resParts)
        return resValue

    @staticmethod
    def rankLocations(locationStr: str, locations: list[Location]):
        """
        rank the given locations to find the best match to the given location string

        A location scores one point for itself and one for each of its parent
        locations (region/country) whose name occurs in the location string.

        Args:
            locationStr: location string
            locations: list of locations objects (None is treated as an empty list)

        Returns:
            list: the locations sorted by descending score; locations with the
                same score keep their given order
        """
        rankedLocations = []
        for location in locations or []:
            locationsToCheck = []
            if isinstance(location, City):
                locationsToCheck = [
                    location,
                    location.region,
                    location.country,
                ]
            elif isinstance(location, Region):
                locationsToCheck = [location, location.country]
            elif isinstance(location, Country):
                locationsToCheck = [location]
            score = 0
            for ltc in locationsToCheck:
                # a city/region may lack its parent location or a name
                name = getattr(ltc, "name", None)
                if name and name in locationStr:
                    score += 1
            rankedLocations.append((score, location))
        rankedLocations.sort(key=lambda scoreTuple: scoreTuple[0], reverse=True)
        return [location for score, location in rankedLocations]

    def __str__(self):
        """
        return my string representation

        Returns:
            str: "Vol-" followed by the volume number
        """
        text = f"Vol-{self.number}"
        return text

    @property
    def sessions(self):
        """
        sessions of this volume
        """
        return self._sessions

    @sessions.setter
    def sessions(self, session):
        """
        add the given session or replace the stored sessions

        If a list is already stored the given value is appended to it,
        otherwise the given value is stored as is.
        """
        # ToDo: Adjust to proper 1:n handling
        if hasattr(self, "_sessions") and isinstance(self._sessions, list):
            self._sessions.append(session)
        else:
            self._sessions = session

    @property
    def papers(self):
        """
        papers of this volume

        Not implemented yet - always None.
        """
        return

    def extractValuesFromVolumePage(self, timeout: float = 3) -> tuple[dict | None, BeautifulSoup | None]:
        """
        extract values from the given volume page

        Resets desc and h1 to "?", parses the volume page with a VolumeParser
        and applies the parsed values to this volume via fromDict.

        Args:
            timeout: timeout handed to the VolumeParser

        Returns:
            tuple: the parse result and the soup as returned by the parser,
                (None, None) if this volume has no url
        """
        self.desc = "?"
        self.h1 = "?"
        if self.url is None:
            return None, None
        volumeParser = VolumeParser(timeout=timeout)
        parseDict, soup = volumeParser.parse_volume(self.getVolumeNumber())
        self.fromDict(parseDict)
        return parseDict, soup

    def getSubmittingEditor(self):
        """
        Returns the Editor that submitted the volume

        Returns:
            Editor: the first editor whose submitted flag is set, None if
                there is no such editor (or no editors at all)
        """
        # editors defaults to None in the constructor
        editors = getattr(self, "editors", None) or []
        for editor in editors:
            if isinstance(editor, Editor) and getattr(editor, "submitted", False):
                return editor
        return None

`papers` `property`

papers of this volume

`sessions` `property` `writable`

sessions of this volume

`__init__(number=None, url=None, title=None, fullTitle=None, acronym=None, lang=None, location=None, country=None, countryWikidataId=None, region=None, city=None, cityWikidataId=None, ordinal=None, date=None, dateFrom=None, dateTo=None, pubYear=None, pubDate=None, submitDate=None, valid=True, conference=None, editors=None, sessions=None, virtualEvent=False, submittedBy=None)`

constructor

Source code in ceurws/ceur_ws.py

def __init__(
    self,
    number: int | None = None,
    url: str | None = None,
    title: str | None = None,
    fullTitle: str | None = None,
    acronym: str | None = None,
    lang: str | None = None,
    location: str | None = None,
    country: str | None = None,
    countryWikidataId: str | None = None,
    region: str | None = None,
    city: str | None = None,
    cityWikidataId: str | None = None,
    ordinal: int | None = None,
    date: datetime.datetime | None = None,
    dateFrom: datetime.datetime | None = None,
    dateTo: datetime.datetime | None = None,
    pubYear: str | None = None,
    pubDate: datetime.datetime | None = None,
    submitDate: datetime.datetime | None = None,
    valid: bool = True,
    conference: Optional["Conference"] = None,
    editors: list["Editor"] | None = None,
    sessions: list["Session"] | None = None,
    virtualEvent: bool = False,
    submittedBy: str | None = None,
):
    """
    constructor
    """
    self.number = number
    self.url = url
    self.title = title
    self.fullTitle = fullTitle
    self.acronym = acronym
    self.lang = lang
    self.location = location
    self.country = country
    self.countryWikidataId = countryWikidataId
    self.region = region
    self.city = city
    self.cityWikidataId = cityWikidataId
    self.ordinal = ordinal
    self.date = date
    self.dateFrom = dateFrom
    self.dateTo = dateTo
    self.pubYear = pubYear
    self.pubDate = pubDate
    self.submitDate = submitDate
    self.valid = valid
    self.conference = conference
    self.editors = editors
    self.sessions = sessions
    self.virtualEvent = virtualEvent
    self.submittedBy = submittedBy

`extractAndSetLocation(locationStr)`

Extracts the location from the given string and sets the found city and country on the volume. ToDo: Once the EventReferenceParser from cc is updated to support city country combinations, switch to it. Args: locationStr: string to extract the locations from.

Source code in ceurws/ceur_ws.py

def extractAndSetLocation(self, locationStr: str):
    """
    Extracts the location from the given string and sets the found city and country
    (with their wikidata ids) on this volume. Also sets virtualEvent to True if the
    string contains "virtual" or "online".

    ToDo: Once the EventReferenceParser from cc is updated to support city country combinations switch to it

    Args:
        locationStr: string to extract the locations from
    """
    # the location parser is created once and cached on the class
    parser = self.__class__.__dict__.get("locationparser")
    if parser is None:
        parser = LocationContext.fromCache()
        self.__class__.locationparser = parser
    # drop comma separated parts containing digits (dates) and blank out month names
    locationStr = self.removePartsMatching(locationStr, pattern=r"\d")
    for month in calendar.month_name:
        if month == "":
            continue
        locationStr = locationStr.replace(month, " ")
    locations = parser.locateLocation(locationStr, verbose=True)
    locations = self.rankLocations(locationStr, locations)
    city = None
    cityWikidataId = None
    country = None
    countryWikidataId = None
    if locations is not None and len(locations) > 0:
        bestMatch = locations[0]
        if isinstance(bestMatch, City):
            city = bestMatch.name
            cityWikidataId = bestMatch.wikidataid
            cityCountry = getattr(bestMatch, "country", None)
            if cityCountry is not None:
                country = cityCountry.name
                countryWikidataId = cityCountry.wikidataid
        elif isinstance(bestMatch, Country):
            # both values are needed: the country is only stored below
            # when its wikidata id is known
            country = bestMatch.name
            countryWikidataId = bestMatch.wikidataid
    lowerLocation = locationStr.lower()
    if any(keyword in lowerLocation for keyword in ("virtual", "online")):
        self.virtualEvent = True
    if city is not None:
        self.city = city
        self.cityWikidataId = cityWikidataId
    if countryWikidataId is not None:
        self.country = country
        self.countryWikidataId = countryWikidataId

`extractDates(dateStr, durationThreshold=11)`

Extracts the start and end time from the given string, optimized for the format of the loctime property. Args: dateStr: string to extract the dates from; durationThreshold: number of days allowed between two extracted dates.

Source code in ceurws/ceur_ws.py

def extractDates(
    self, dateStr: str, durationThreshold: int = 11
) -> tuple[datetime.date | None, datetime.date | None]:
    """ "
    Extracts the start and end time from the given string
    optimized for the format of the loctime property
    Args:
        dateStr: string to extract the dates from
        durationThreshold: number of days allowed between two extracted dates
    """
    dateFrom = None
    dateTo = None
    if dateStr is None:
        return None, None
    # normalize certain foreign language month names that occur regularly
    if "novembro" in dateStr.lower():
        dateStr = dateStr.lower().replace("novembro", "november")
    loctimeParts = re.split("[,)(]", dateStr)
    if re.fullmatch(r"\d{4}", loctimeParts[-1].strip()):
        year = loctimeParts[-1].strip()
        rawDate = loctimeParts[-2].strip()
        if len(loctimeParts) >= 3 and loctimeParts[-3].lower().strip() in [
            cn.lower() for cn in calendar.month_name
        ]:
            rawDate = f"{loctimeParts[-3]} {rawDate}"
        dateParts: list = re.split("[-–‐&]| to | and ", rawDate)
        try:
            if len(dateParts) == 1:
                dateFrom = dateutil.parser.parse(f"{dateParts[0]} {year}")
                dateTo = dateFrom
            elif len(dateParts) == 2:
                dateParts.sort(key=lambda r: len(r), reverse=True)
                dateOne = dateutil.parser.parse(f"{dateParts[0]} {year}")
                if len(dateParts[-1].strip()) <= 4:
                    dayMonthParts = dateParts[0].split(" ")
                    dayMonthParts.sort(key=lambda r: len(r), reverse=True)
                    endDate = dayMonthParts[0] + dateParts[1]
                    dateTwo = dateutil.parser.parse(f"{endDate} {year}")
                else:
                    dateTwo = dateutil.parser.parse(f"{dateParts[1]} {year}")
                dates = [dateOne, dateTwo]
                dates.sort()
                dateFrom = dates[0]
                dateTo = dates[1]
        except Exception:
            pass
        if dateTo is not None and dateFrom is not None:
            delta = dateTo - dateFrom
            if delta < datetime.timedelta():
                print("Error this should not be possible")
            elif delta > datetime.timedelta(days=durationThreshold):
                print(
                    self.number,
                    f"Event with a duration of more than {durationThreshold} days seems suspicious",
                )
            else:
                return dateFrom.date(), dateTo.date()
        else:
            print(self.number, dateStr, "→ Dates could not be extracted")
        return None, None
    else:
        # corner case
        return None, None

`extractValuesFromVolumePage(timeout=3)`

extract values from the given volume page

Source code in ceurws/ceur_ws.py

def extractValuesFromVolumePage(self, timeout: float = 3) -> tuple[dict | None, BeautifulSoup | None]:
    """
    Parse my volume page and take over the parsed values

    Args:
        timeout: timeout handed to the VolumeParser

    Returns:
        the parsed record and the soup of the volume page,
        or (None, None) if my url is not set
    """
    # placeholders for description and headline
    self.desc = "?"
    self.h1 = "?"
    if self.url is None:
        return None, None
    page_parser = VolumeParser(timeout=timeout)
    parse_result = page_parser.parse_volume(self.getVolumeNumber())
    record, soup = parse_result
    self.fromDict(record)
    return record, soup

`getSubmittingEditor()`

Returns the Editor that submitted the volume

Source code in ceurws/ceur_ws.py

def getSubmittingEditor(self):
    """
    Find the first of my editors that is flagged as submitter

    Returns:
        the first Editor with a truthy "submitted" attribute,
        None if there are no editors or none of them submitted
    """
    if not hasattr(self, "editors"):
        return None
    submitting_editors = (
        candidate
        for candidate in self.editors
        if isinstance(candidate, Editor) and getattr(candidate, "submitted", False)
    )
    return next(submitting_editors, None)

`getVolumeNumber()`

get number of the volume

Source code in ceurws/ceur_ws.py

def getVolumeNumber(self):
    """
    Look up my volume number

    Returns:
        my "number" attribute or the text "Volume has no number" if it is not set
    """
    return getattr(self, "number", "Volume has no number")

`getVolumeUrl()`

get the url of the volume page

Source code in ceurws/ceur_ws.py

def getVolumeUrl(self) -> str | None:
    """
    get the url of the volume page
    """
    number = self.number
    if number is None:
        return None
    url = self.getVolumeUrlOf(number)
    return url

`getVolumeUrlOf(number)` `staticmethod`

get the volume url of the given volume number Args: number: volume number

Source code in ceurws/ceur_ws.py

@staticmethod
def getVolumeUrlOf(
    number: str | int,
) -> str | None:
    """
    get the volume url of the given volume number
    Args:
        number: volume number
    """
    url = None
    if number is not None:
        url = f"http://ceur-ws.org/Vol-{number}/"
    return url

`get_loctime()`

get the loctime

Source code in ceurws/ceur_ws.py

def get_loctime(self) -> str | None:
    """
    get the loctime
    """
    loctime = getattr(self, "loctime", None)
    if loctime is None:
        td_title = getattr(self, "tdtitle", None)
        if td_title:
            title_parts = td_title.split(",")
            del title_parts[0]
            loctime = ",".join(title_parts)
            loctime = loctime.strip(".")
            self.loctime = loctime
        else:
            pass
    elif not isinstance(loctime, str):
        loctime = None
    return loctime

`isVirtualEvent()`

Returns True if the event is a virtual event

Source code in ceurws/ceur_ws.py

def isVirtualEvent(self) -> bool:
    """
    Check whether the event was flagged as virtual

    Returns:
        my virtualEvent attribute, False if it has not been set
    """
    is_virtual = getattr(self, "virtualEvent", False)
    return is_virtual

`normalize()`

Tries to normalize the properties e.g. breaking loctime into designated location and time properties Example: 'Vienna, Austria, July 25th, 2022'

Source code in ceurws/ceur_ws.py

def normalize(self):
    """
    Placeholder for normalizing the properties,
    e.g. breaking loctime into designated location and time properties
    Example: 'Vienna, Austria, July 25th, 2022'
    """
    # intentionally does nothing
    return None

`rankLocations(locationStr, locations)` `staticmethod`

rank the given locations to find the best match to the given location string Args: locationStr: location string locations: list of locations objects

Source code in ceurws/ceur_ws.py

@staticmethod
def rankLocations(locationStr: str, locations: list[Location]):
    """
    rank the given locations to find the best match to the given location string
    Args:
        locationStr: location string
        locations: list of locations objects
    """
    rankedLocations = []
    for location in locations:
        locationsToCheck = []
        if isinstance(location, City):
            locationsToCheck = [
                location,
                location.region,
                location.country,
            ]
        elif isinstance(location, Region):
            locationsToCheck = [location, location.country]
        elif isinstance(location, Country):
            locationsToCheck = [location]
        score = 0
        for ltc in locationsToCheck:
            if ltc.name in locationStr:
                score += 1
        rankedLocations.append((score, location))
    rankedLocations.sort(key=lambda scoreTuple: scoreTuple[0], reverse=True)
    return [location for score, location in rankedLocations]

`removePartsMatching(value, pattern, separator=',')` `staticmethod`

Removes parts from the given value matching the pattern

Source code in ceurws/ceur_ws.py

@staticmethod
def removePartsMatching(value: str, pattern: str, separator=","):
    """
    Removes parts from the given value matching the pattern
    """
    parts = value.split(separator)
    resParts = []
    for part in parts:
        if re.search(pattern, part) is None:
            resParts.append(part)
    resValue = separator.join(resParts)
    return resValue

`resolveLoctime()`

Resolve the loctime property by breaking it down to city, region, country, dateFrom, and dateTo

Source code in ceurws/ceur_ws.py

def resolveLoctime(self):
    """
    Break my loctime down into dateFrom, dateTo and the location properties

    Does nothing if no loctime is available. dateFrom and dateTo are only set
    when the corresponding date could be extracted.
    """
    loctime = self.get_loctime()
    if loctime is None:
        return None
    start_date, end_date = self.extractDates(loctime)
    for attr_name, date_value in (("dateFrom", start_date), ("dateTo", end_date)):
        if date_value is not None:
            setattr(self, attr_name, date_value)
    self.extractAndSetLocation(locationStr=loctime)

`VolumeManager`

Bases: EntityManager

Contains multiple ceurws volumes

Source code in ceurws/ceur_ws.py

class VolumeManager(EntityManager):
    """
    Contains multiple ceurws volumes
    """

    def __init__(self, tableName: str = "volumes"):
        """
        constructor

        Args:
            tableName: name of the table the volumes are stored in
        """
        super().__init__(
            listName="volumes",
            clazz=Volume,
            tableName=tableName,
            # Volume.__class__ is the metaclass, so its __name__ would be "type"
            entityName=Volume.__name__,
            primaryKey="number",
            entityPluralName="volumes",
            config=CEURWS.CONFIG,
            handleInvalidListTypes=True,
            name=self.__class__.__name__,
        )
        self.volumes: list[Volume] = []

    def load(self):
        """
        load the volumeManager

        Parses the index.html and stores the result if the cache file needs
        a download, otherwise restores the volumes from the cache file.
        """
        if Download.needsDownload(CEURWS.CACHE_FILE):
            self.loadFromIndexHtml()
            self.store()
        else:
            self.loadFromBackup()

    def loadFromBackup(self):
        """
        load from the SQLITE Cache file
        """
        self.fromStore(cacheFile=CEURWS.CACHE_FILE)

    def update(self, parser_config: ParserConfig):
        """
        update me by checking for recently added volumes

        Args:
            parser_config: parser configuration
        """
        self.set_down_to_volume(parser_config)
        self.update_or_recreate(parser_config)

    def set_down_to_volume(self, parser_config: ParserConfig):
        """
        limit the parsing to volumes beyond the last volume in my list

        Leaves the parser configuration untouched if I have no volumes.

        Args:
            parser_config: parser configuration to set down_to_volume on
        """
        if self.volumes:
            # NOTE(review): assumes the last list entry is the most recent volume — confirm ordering
            max_vol = self.volumes[-1]
            parser_config.down_to_volume = max_vol.number + 1

    def recreate(self, parser_config: ParserConfig):
        """
        recreate me by a full parse of all volume files

        Args:
            parser_config: parser configuration
        """

        self.update_or_recreate(parser_config)

    def update_or_recreate(self, parser_config: ParserConfig):
        """
        recreate or update me by parsing the index.html file

        Args:
            parser_config: parser configuration
        """
        progress_bar = parser_config.progress_bar
        loctime_parser = LoctimeParser()
        pm = PaperManager()
        # unless everything down to volume 1 is parsed the stored papers are loaded first
        if parser_config.down_to_volume != 1:
            pm.fromStore(cacheFile=CEURWS.CACHE_FILE)
        paper_list = pm.getList()

        # first reload me from the main index
        self.loadFromIndexHtml(parser_config)
        invalid = 0
        for volume in self.volumes:
            # stop at the first volume below the configured limit
            if volume.number and volume.number < parser_config.down_to_volume:
                break
            _volume_record, soup = volume.extractValuesFromVolumePage()
            if soup:
                ptp = PaperTocParser(number=str(volume.number), soup=soup, debug=self.debug)
                paper_records = ptp.parsePapers()
                for paper_record in paper_records:
                    paper = Paper()
                    paper.fromDict(paper_record)
                    paper_list.append(paper)
            if not volume.valid:
                invalid += 1
            else:
                loctime = volume.get_loctime()
                if loctime:
                    # expose the parsed loctime parts as loc_<key> attributes
                    loc_time_dict = loctime_parser.parse(loctime)
                    for key, value in loc_time_dict.items():
                        attr = f"loc_{key}"
                        setattr(volume, attr, value)
                    volume.resolveLoctime()
            # update progress bar
            if progress_bar:
                if volume.valid:
                    description = volume.acronym[:20] if volume.acronym else "?"
                    progress_bar.set_description(f"{description}")
                progress_bar.update()
        print(f"storing recreated volume table for {len(self.volumes)} volumes ({invalid} invalid)")
        self.store(replace=True)
        print(f"storing {len(paper_list)} papers")
        pm.store(replace=True)

    def loadFromIndexHtml(self, parser_config: ParserConfig | None = None, vol_limit: int | None = None):
        """
        load my content from the index.html file

        The parsed volumes are appended to my volumes list.

        Args:
            parser_config(ParserConfig): the parser Configuration to use;
                without it the download of the index.html is forced
            vol_limit: limit handed to the index parser
        """
        force = parser_config.force_download if parser_config else True
        htmlText = self.getIndexHtml(force)
        indexParser = IndexHtmlParser(htmlText, parser_config)
        volumeRecords = indexParser.parse(vol_limit)
        for volumeRecord in volumeRecords.values():
            volume = Volume()
            volume.fromDict(volumeRecord)
            # make sure the placeholder attributes exist
            for attr in ["desc", "h1"]:
                if not hasattr(volume, attr):
                    setattr(volume, attr, "?")
            self.volumes.append(volume)

    def getIndexHtml(self, force: bool = False):
        """
        get the index html

        Args:
            force: if True download the page even if a cached copy exists

        Returns:
            the content of the index.html page
        """
        cacheHtml = CEURWS.CACHE_HTML
        if cacheHtml.is_file() and not force:
            with open(cacheHtml, encoding="utf-8") as file:
                html_page = file.read()
        else:
            req = Request(CEURWS.URL, headers={"User-Agent": "pyCEURMake"})
            # close the HTTP response deterministically instead of leaking the connection
            with urlopen(req) as response:
                html_page = response.read().decode()
            CEURWS.CACHE_DIR.mkdir(parents=True, exist_ok=True)
            with open(cacheHtml, mode="w", encoding="utf-8") as htmlFile:
                print(html_page, file=htmlFile)
        return html_page

`getIndexHtml(force=False)`

get the index html

Source code in ceurws/ceur_ws.py

def getIndexHtml(self, force: bool = False):
    """
    get the index html

    Reads the cached copy if it exists and no download is forced, otherwise
    downloads the page and writes it to the cache.

    Args:
        force: if True download the page even if a cached copy exists

    Returns:
        the content of the index.html page
    """
    cacheHtml = CEURWS.CACHE_HTML
    if cacheHtml.is_file() and not force:
        with open(cacheHtml, encoding="utf-8") as file:
            html_page = file.read()
    else:
        req = Request(CEURWS.URL, headers={"User-Agent": "pyCEURMake"})
        # close the HTTP response deterministically instead of leaking the connection
        with urlopen(req) as response:
            html_page = response.read().decode()
        CEURWS.CACHE_DIR.mkdir(parents=True, exist_ok=True)
        with open(cacheHtml, mode="w", encoding="utf-8") as htmlFile:
            print(html_page, file=htmlFile)
    return html_page

`load()`

load the volumeManager

Source code in ceurws/ceur_ws.py

def load(self):
    """
    Fill me from the cache file or, if that needs a download, from the index.html

    When the index.html is parsed the result is stored afterwards.
    """
    if not Download.needsDownload(CEURWS.CACHE_FILE):
        self.loadFromBackup()
        return
    self.loadFromIndexHtml()
    self.store()

`loadFromBackup()`

load from the SQLITE Cache file

Source code in ceurws/ceur_ws.py

def loadFromBackup(self):
    """
    Restore my content from the SQLite cache file
    """
    cache_file = CEURWS.CACHE_FILE
    self.fromStore(cacheFile=cache_file)

`loadFromIndexHtml(parser_config=None, vol_limit=None)`

load my content from the index.html file

Parameters:

Name	Type	Description	Default
`parser_config`	`ParserConfig | None`	the parser configuration to use	`None`

Source code in ceurws/ceur_ws.py

def loadFromIndexHtml(self, parser_config: ParserConfig | None = None, vol_limit: int | None = None):
    """
    Parse the index.html file and append the volumes found to my volumes list

    Args:
        parser_config(ParserConfig): the parser configuration to use;
            without it the download of the index.html is forced
        vol_limit: limit handed to the index parser
    """
    force_download = True
    if parser_config:
        force_download = parser_config.force_download
    index_html = self.getIndexHtml(force_download)
    records = IndexHtmlParser(index_html, parser_config).parse(vol_limit)
    for record in records.values():
        volume = Volume()
        volume.fromDict(record)
        # make sure the placeholder attributes exist
        for placeholder_attr in ("desc", "h1"):
            if not hasattr(volume, placeholder_attr):
                setattr(volume, placeholder_attr, "?")
        self.volumes.append(volume)

`recreate(parser_config)`

recreate me by a full parse of all volume files

Parameters:

Name	Type	Description	Default
`parser_config`	`ParserConfig`	parser configuration	required

Source code in ceurws/ceur_ws.py

def recreate(self, parser_config: ParserConfig):
    """
    Rebuild me with a full parse of all volume files

    Args:
        parser_config: parser configuration
    """
    # the limit in the parser configuration is used as given
    self.update_or_recreate(parser_config)

`update(parser_config)`

update me by checking for recently added volumes

Source code in ceurws/ceur_ws.py

def update(self, parser_config: ParserConfig):
    """
    Bring me up to date by checking for recently added volumes

    Args:
        parser_config: parser configuration
    """
    # first restrict the parse range, then run the shared parse
    self.set_down_to_volume(parser_config)
    self.update_or_recreate(parser_config)

`update_or_recreate(parser_config)`

recreate or update me by parsing the index.html file

Parameters:

Name	Type	Description	Default
`parser_config`	`ParserConfig`	parser configuration	required

Source code in ceurws/ceur_ws.py

def update_or_recreate(self, parser_config: ParserConfig):
    """
    Update or rebuild my volumes and the papers by parsing the index.html file
    and the volume pages

    Args:
        parser_config: parser configuration
    """
    progress = parser_config.progress_bar
    loctime_parser = LoctimeParser()
    paper_manager = PaperManager()
    # unless everything down to volume 1 is parsed the stored papers are loaded first
    if parser_config.down_to_volume != 1:
        paper_manager.fromStore(cacheFile=CEURWS.CACHE_FILE)
    papers = paper_manager.getList()

    # reload the volumes from the main index before visiting the volume pages
    self.loadFromIndexHtml(parser_config)
    invalid_count = 0
    for volume in self.volumes:
        # stop at the first volume below the configured limit
        if volume.number and volume.number < parser_config.down_to_volume:
            break
        _record, soup = volume.extractValuesFromVolumePage()
        if soup:
            toc_parser = PaperTocParser(number=str(volume.number), soup=soup, debug=self.debug)
            for paper_record in toc_parser.parsePapers():
                paper = Paper()
                paper.fromDict(paper_record)
                papers.append(paper)
        if volume.valid:
            loctime = volume.get_loctime()
            if loctime:
                # expose the parsed loctime parts as loc_<key> attributes
                for key, value in loctime_parser.parse(loctime).items():
                    setattr(volume, f"loc_{key}", value)
                volume.resolveLoctime()
        else:
            invalid_count += 1
        if progress:
            if volume.valid:
                description = volume.acronym[:20] if volume.acronym else "?"
                progress.set_description(f"{description}")
            progress.update()
    print(f"storing recreated volume table for {len(self.volumes)} volumes ({invalid_count} invalid)")
    self.store(replace=True)
    print(f"storing {len(papers)} papers")
    paper_manager.store(replace=True)

`ceur_ws_web_cmd`

Created on 2024-02-22

@author: wf

`CeurWsCmd`

Bases: WebserverCmd

command line handling for CEUR-WS Volume browser

Source code in ceurws/ceur_ws_web_cmd.py

class CeurWsCmd(WebserverCmd):
    """
    command line handling for CEUR-WS Volume browser
    """

    def __init__(self):
        """
        constructor
        """
        config = CeurWsWebServer.get_config()
        WebserverCmd.__init__(self, config, CeurWsWebServer, DEBUG)

    def getArgParser(self, description: str, version_msg) -> ArgumentParser:
        """
        override the default argparser call

        Args:
            description: description handed to the default argument parser
            version_msg: version message handed to the default argument parser

        Returns:
            the default argument parser extended by the CEUR-WS specific options
        """
        parser = super().getArgParser(description, version_msg)
        parser.add_argument(
            "-dbu",
            "--dblp_update",
            action="store_true",
            help="update dblp cache",
        )
        parser.add_argument(
            "-nq",
            "--namedqueries",
            action="store_true",
            help="generate named queries [default: %(default)s]",
        )
        parser.add_argument(
            "-den",
            "--dblp_endpoint_name",
            help="name of dblp endpoint to use %(default)s",
            default="qlever-dblp",
        )
        parser.add_argument(
            "-f",
            "--force",
            action="store_true",
            help="force update [default: %(default)s]",
        )
        parser.add_argument(
            "--list",
            action="store_true",
            help="list all volumes [default: %(default)s]",
        )
        parser.add_argument(
            "-rc",
            "--recreate",
            action="store_true",
            help="recreate caches e.g. volume table",
        )
        parser.add_argument(
            "-uv",
            "--update",
            action="store_true",
            help="update volumes by parsing index.html adding recently published volumes",
        )
        parser.add_argument(
            "-wen",
            "--wikidata_endpoint_name",
            help="name of wikidata endpoint to use %(default)s",
            default="wikidata",
        )
        parser.add_argument(
            "-wdu",
            "--wikidata_update",
            action="store_true",
            help="update tables from wikidata",
        )
        return parser

    def handle_args(self) -> bool:
        """
        handle the command line arguments

        Runs the actions selected on the command line and then delegates to
        the default argument handling.

        Returns:
            the result of the default argument handling
        """
        args = self.args
        if args.namedqueries:
            nq = NamedQueries()
            yaml = nq.toYaml()
            print(yaml)
        if args.list:
            manager = VolumeManager()
            manager.loadFromBackup()
            for volume in manager.getList():
                print(volume)
        if args.recreate or args.update:
            manager = VolumeManager()
            manager.load()
            progress_bar = tqdm(total=len(manager.volumes))
            # make sure the progress bar is closed even if parsing fails
            try:
                parser_config = ParserConfig(progress_bar, debug=args.debug)
                if args.recreate:
                    manager.recreate(parser_config)
                else:
                    manager.update(parser_config)
            finally:
                progress_bar.close()
        if args.wikidata_update:
            wdsync = WikidataSync.from_args(args)
            wdsync.update(withStore=True)
        if args.dblp_update:
            wdsync = WikidataSync.from_args(args)
            endpoint = wdsync.dblpEndpoint
            print(f"updating dblp cache from SPARQL endpoint {endpoint.sparql.url}")
            dblp_managers = endpoint.dblp_managers
            # the context manager closes the progress bar after the loop
            with tqdm(total=len(dblp_managers)) as pbar:
                for cache_name, dblp_manager in dblp_managers.items():
                    # refresh the cache data
                    dblp_manager.load(force_query=args.force)
                    pbar.set_description(f"{cache_name} updated ...")
                    pbar.update(1)
            # show the state of all caches as a table
            table_data = [asdict(endpoint.cache_manager.get_cache_by_name(cache_name)) for cache_name in dblp_managers]
            table = tabulate(table_data, headers="keys", tablefmt="grid")
            print(table)
        handled = super().handle_args()
        return handled

`__init__()`

constructor

Source code in ceurws/ceur_ws_web_cmd.py

def __init__(self):
    """
    Set up the command line handler with the configuration of the CEUR-WS web server
    """
    server_config = CeurWsWebServer.get_config()
    WebserverCmd.__init__(self, server_config, CeurWsWebServer, DEBUG)

`getArgParser(description, version_msg)`

override the default argparser call

Source code in ceurws/ceur_ws_web_cmd.py

def getArgParser(self, description: str, version_msg) -> ArgumentParser:
    """
    Extend the default argument parser by the CEUR-WS specific options

    Args:
        description: description handed to the default argument parser
        version_msg: version message handed to the default argument parser

    Returns:
        the extended argument parser
    """
    parser = super().getArgParser(description, version_msg)
    # option flags and their settings in the order they are registered
    option_specs = [
        (("-dbu", "--dblp_update"), {"action": "store_true", "help": "update dblp cache"}),
        (
            ("-nq", "--namedqueries"),
            {"action": "store_true", "help": "generate named queries [default: %(default)s]"},
        ),
        (
            ("-den", "--dblp_endpoint_name"),
            {"help": "name of dblp endpoint to use %(default)s", "default": "qlever-dblp"},
        ),
        (("-f", "--force"), {"action": "store_true", "help": "force update [default: %(default)s]"}),
        (("--list",), {"action": "store_true", "help": "list all volumes [default: %(default)s]"}),
        (("-rc", "--recreate"), {"action": "store_true", "help": "recreate caches e.g. volume table"}),
        (
            ("-uv", "--update"),
            {
                "action": "store_true",
                "help": "update volumes by parsing index.html adding recently published volumes",
            },
        ),
        (
            ("-wen", "--wikidata_endpoint_name"),
            {"help": "name of wikidata endpoint to use %(default)s", "default": "wikidata"},
        ),
        (("-wdu", "--wikidata_update"), {"action": "store_true", "help": "update tables from wikidata"}),
    ]
    for flags, settings in option_specs:
        parser.add_argument(*flags, **settings)
    return parser

`handle_args()`

handle the command line arguments

Source code in ceurws/ceur_ws_web_cmd.py

def handle_args(self) -> bool:
    """
    handle the command line arguments

    Runs the actions selected on the command line and then delegates to
    the default argument handling.

    Returns:
        the result of the default argument handling
    """
    args = self.args
    if args.namedqueries:
        nq = NamedQueries()
        yaml = nq.toYaml()
        print(yaml)
    if args.list:
        manager = VolumeManager()
        manager.loadFromBackup()
        for volume in manager.getList():
            print(volume)
    if args.recreate or args.update:
        manager = VolumeManager()
        manager.load()
        progress_bar = tqdm(total=len(manager.volumes))
        # make sure the progress bar is closed even if parsing fails
        try:
            parser_config = ParserConfig(progress_bar, debug=args.debug)
            if args.recreate:
                manager.recreate(parser_config)
            else:
                manager.update(parser_config)
        finally:
            progress_bar.close()
    if args.wikidata_update:
        wdsync = WikidataSync.from_args(args)
        wdsync.update(withStore=True)
    if args.dblp_update:
        wdsync = WikidataSync.from_args(args)
        endpoint = wdsync.dblpEndpoint
        print(f"updating dblp cache from SPARQL endpoint {endpoint.sparql.url}")
        dblp_managers = endpoint.dblp_managers
        # the context manager closes the progress bar after the loop
        with tqdm(total=len(dblp_managers)) as pbar:
            for cache_name, dblp_manager in dblp_managers.items():
                # refresh the cache data
                dblp_manager.load(force_query=args.force)
                pbar.set_description(f"{cache_name} updated ...")
                pbar.update(1)
        # show the state of all caches as a table
        table_data = [asdict(endpoint.cache_manager.get_cache_by_name(cache_name)) for cache_name in dblp_managers]
        table = tabulate(table_data, headers="keys", tablefmt="grid")
        print(table)
    handled = super().handle_args()
    return handled

`main(argv=None)`

main call

Source code in ceurws/ceur_ws_web_cmd.py

def main(argv: list | None = None):
    """
    Run the CEUR-WS command line

    Args:
        argv: command line arguments handed to cmd_main

    Returns:
        the exit code of the command
    """
    return CeurWsCmd().cmd_main(argv)

`config`

`CEURWS`

CEUR-WS

Source code in ceurws/config.py

class CEURWS:
    """
    CEUR-WS settings: base url and cache locations
    """

    @staticmethod
    def get_home_path() -> Path:
        """
        Determine the directory the cache directory is created in

        Returns:
            the GITHUB_WORKSPACE directory if that environment variable is set,
            otherwise the home directory of the current user
        """
        user_home = Path.home()
        workspace = os.environ.get("GITHUB_WORKSPACE")
        return user_home if workspace is None else Path(workspace)

    URL = "http://ceur-ws.org"
    # evaluated once when the class body is executed
    home = get_home_path()
    CACHE_DIR = home / ".ceurws"
    CACHE_FILE = CACHE_DIR / "ceurws.db"
    CACHE_HTML = CACHE_DIR / "index.html"
    CONFIG = StorageConfig(cacheFile=str(CACHE_FILE))

`get_home_path()` `staticmethod`

Get home path

Source code in ceurws/config.py

@staticmethod
def get_home_path() -> Path:
    """
    Get home path
    """
    home = Path.home()
    if "GITHUB_WORKSPACE" in os.environ:
        home = Path(os.environ["GITHUB_WORKSPACE"])
    return home

`dblp`

Created on 2024-03-09

@author: wf

`DblpAuthorIdentifier` `dataclass`

represents an author id available in dblp and the corresponding property in wikidata

Source code in ceurws/dblp.py

@dataclass
class DblpAuthorIdentifier:
    """
    represents an author id available in dblp
    and the corresponding property in wikidata
    """

    name: str  # the name should be usable as SPARQL variable
    dblp_property: str
    wikidata_property: str | None

    @classmethod
    def all(cls) -> list["DblpAuthorIdentifier"]:
        """
        returns all available identifiers
        """
        res = [
            DblpAuthorIdentifier("dblp", "datacite:dblp", "P2456"),
            DblpAuthorIdentifier("wikidata", "datacite:wikidata", None),
            DblpAuthorIdentifier("orcid", "datacite:orcid", "P496"),
            DblpAuthorIdentifier("googleScholar", "datacite:google-scholar", "P1960"),
            DblpAuthorIdentifier("acm", "datacite:acm", "P864"),
            DblpAuthorIdentifier("twitter", "datacite:twitter", "P2002"),
            DblpAuthorIdentifier("github", "datacite:github", "P2037"),
            DblpAuthorIdentifier("viaf", "datacite:viaf", "P214"),
            DblpAuthorIdentifier("scigraph", "datacite:scigraph", "P10861"),
            DblpAuthorIdentifier("zbmath", "datacite:zbmath", "P1556"),
            DblpAuthorIdentifier("researchGate", "datacite:research-gate", "P6023"),
            DblpAuthorIdentifier("mathGenealogy", "datacite:math-genealogy", "P549"),
            DblpAuthorIdentifier("loc", "datacite:loc", "P244"),
            DblpAuthorIdentifier("linkedin", "datacite:linkedin", "P6634"),
            DblpAuthorIdentifier("lattes", "datacite:lattes", "P1007"),
            DblpAuthorIdentifier("isni", "datacite:isni", "P213"),
            DblpAuthorIdentifier("ieee", "datacite:ieee", "P6479"),
            DblpAuthorIdentifier("gepris", "datacite:gepris", "P4872"),
            DblpAuthorIdentifier("gnd", "datacite:gnd", "P227"),
        ]
        return res

    @classmethod
    def getAllAsMap(cls) -> dict[str, "DblpAuthorIdentifier"]:
        """
        return all all available identifiers as map
        """
        res = dict()
        for identifier in cls.all():
            res[identifier.name] = identifier
        return res

    @classmethod
    def getWikidataIdQueryPart(cls, id_name: str, value: str, var: str):
        """
        Generates for the given identifier the wikidata query
        Args:
            id_name: name of the identifier
            value: the identifier value
            var: name of the variable which should have the id
        """
        if not var.startswith("?"):
            var = "?" + var
        query = None
        dblp_author_ids = cls.getAllAsMap().get(id_name)
        if dblp_author_ids is None:
            # unknown identifier
            return ""
        wd_prop = dblp_author_ids.wikidata_property
        values: str | list[str]
        if id_name == "wikidata":
            values = value
            if isinstance(value, str):
                values = [value]
            value_urls = " ".join([f"wd:{value}" for value in values])
            query = f"""{{ SELECT * WHERE {{ VALUES ?person {{ {value_urls} }} }} }}# {id_name}"""
        elif id_name in cls.getAllAsMap():
            if isinstance(value, list):
                values = " ".join([f'"{value}"' for value in value])
                query = f"""{{OPTIONAL{{
                            VALUES ?{id_name} {{ {values} }}
                            {var} wdt:{wd_prop} ?{id_name}.}} 
                            }}  # {id_name}"""
            else:
                query = f"""{{ {var} wdt:{wd_prop} "{value}". }}  # {id_name}"""
        else:
            pass
        return query

`all()` `classmethod`

returns all available identifiers

Source code in ceurws/dblp.py

@classmethod
def all(cls) -> list["DblpAuthorIdentifier"]:
    """
    returns all available identifiers

    Every entry of the table below is handed positionally to the
    DblpAuthorIdentifier constructor, in the listed order.
    """
    # (name, datacite scheme, wikidata property) - presumably the constructor's
    # parameter order; wikidata itself has no wikidata property
    specs = (
        ("dblp", "datacite:dblp", "P2456"),
        ("wikidata", "datacite:wikidata", None),
        ("orcid", "datacite:orcid", "P496"),
        ("googleScholar", "datacite:google-scholar", "P1960"),
        ("acm", "datacite:acm", "P864"),
        ("twitter", "datacite:twitter", "P2002"),
        ("github", "datacite:github", "P2037"),
        ("viaf", "datacite:viaf", "P214"),
        ("scigraph", "datacite:scigraph", "P10861"),
        ("zbmath", "datacite:zbmath", "P1556"),
        ("researchGate", "datacite:research-gate", "P6023"),
        ("mathGenealogy", "datacite:math-genealogy", "P549"),
        ("loc", "datacite:loc", "P244"),
        ("linkedin", "datacite:linkedin", "P6634"),
        ("lattes", "datacite:lattes", "P1007"),
        ("isni", "datacite:isni", "P213"),
        ("ieee", "datacite:ieee", "P6479"),
        ("gepris", "datacite:gepris", "P4872"),
        ("gnd", "datacite:gnd", "P227"),
    )
    return [DblpAuthorIdentifier(*spec) for spec in specs]

`getAllAsMap()` `classmethod`

return all available identifiers as a map

Source code in ceurws/dblp.py

@classmethod
def getAllAsMap(cls) -> dict[str, "DblpAuthorIdentifier"]:
    """
    return all available identifiers as map

    Returns:
        dict with the name of each identifier provided by all() as key
        and the identifier itself as value
    """
    return {entry.name: entry for entry in cls.all()}

`getWikidataIdQueryPart(id_name, value, var)` `classmethod`

Generates for the given identifier the wikidata query Args: id_name: name of the identifier value: the identifier value var: name of the variable which should have the id

Source code in ceurws/dblp.py

@classmethod
def getWikidataIdQueryPart(cls, id_name: str, value: str, var: str):
    """
    Generates for the given identifier the wikidata query
    Args:
        id_name: name of the identifier
        value: the identifier value
        var: name of the variable which should have the id
    """
    if not var.startswith("?"):
        var = "?" + var
    query = None
    dblp_author_ids = cls.getAllAsMap().get(id_name)
    if dblp_author_ids is None:
        # unknown identifier
        return ""
    wd_prop = dblp_author_ids.wikidata_property
    values: str | list[str]
    if id_name == "wikidata":
        values = value
        if isinstance(value, str):
            values = [value]
        value_urls = " ".join([f"wd:{value}" for value in values])
        query = f"""{{ SELECT * WHERE {{ VALUES ?person {{ {value_urls} }} }} }}# {id_name}"""
    elif id_name in cls.getAllAsMap():
        if isinstance(value, list):
            values = " ".join([f'"{value}"' for value in value])
            query = f"""{{OPTIONAL{{
                        VALUES ?{id_name} {{ {values} }}
                        {var} wdt:{wd_prop} ?{id_name}.}} 
                        }}  # {id_name}"""
        else:
            query = f"""{{ {var} wdt:{wd_prop} "{value}". }}  # {id_name}"""
    else:
        pass
    return query

`DblpAuthors`

Bases: DblpManager

Manage all authors of DBLP indexed volumes.

Source code in ceurws/dblp.py

class DblpAuthors(DblpManager):
    """
    Manage all authors of DBLP indexed volumes.
    """

    def __init__(self, endpoint: "DblpEndpoint"):
        """
        constructor

        Args:
            endpoint: the endpoint the author records are loaded from
        """
        super().__init__(endpoint, "dblp/authors", "CEUR-WS Paper Authors")
        # None marks "not loaded yet" - see load()
        self.authors: list[DblpScholar] | None = None
        # defined up front so that a lookup before load() finds an empty map
        # instead of raising an AttributeError
        self.authorsById: dict[str, DblpScholar] = {}

    def load(self, force_query: bool = False):
        """
        load my authors

        The authors are loaded only once, further calls are no-ops.

        Args:
            force_query: handed to the endpoint load on the first call
        """
        if self.authors is not None:
            return
        super().load(force_query=force_query)
        self.authors = [DblpScholar(**record) for record in self.lod]
        self.authorsById = {author.dblp_author_id: author for author in self.authors}

`load(force_query=False)`

load my authors

Source code in ceurws/dblp.py

def load(self, force_query: bool = False):
    """
    load my authors

    Nothing happens if the authors have been loaded before.

    Args:
        force_query: handed to the load of the base class
    """
    if self.authors is not None:
        return
    super().load(force_query=force_query)
    self.authors = []
    # one scholar per record of the loaded list of dicts
    self.authors.extend(DblpScholar(**record) for record in self.lod)
    self.authorsById = {scholar.dblp_author_id: scholar for scholar in self.authors}

`DblpEditors`

Bases: DblpManager

Manage all editors of DBLP indexed volumes.

Source code in ceurws/dblp.py

class DblpEditors(DblpManager):
    """
    Manage all editors of DBLP indexed volumes.
    """

    def __init__(self, endpoint: "DblpEndpoint"):
        """
        constructor

        Args:
            endpoint: the endpoint the editor records are loaded from
        """
        super().__init__(endpoint, "dblp/editors", "CEUR-WS all Editors")
        # None marks "not loaded yet" - see load()
        self.editors: list[DblpScholar] | None = None
        # defined up front so that a lookup before load() finds an empty map
        # instead of raising an AttributeError
        self.editorsById: dict[str, DblpScholar] = {}

    def load(self, force_query: bool = False):
        """
        load my editors

        The editors are loaded only once, further calls are no-ops.

        Args:
            force_query: handed to the endpoint load on the first call
        """
        if self.editors is not None:
            return
        super().load(force_query=force_query)
        self.editors = [DblpScholar(**record) for record in self.lod]
        self.editorsById = {editor.dblp_author_id: editor for editor in self.editors}

`load(force_query=False)`

load my editors

Source code in ceurws/dblp.py

def load(self, force_query: bool = False):
    """
    load my editors

    Nothing happens if the editors have been loaded before.

    Args:
        force_query: handed to the load of the base class
    """
    if self.editors is not None:
        return
    super().load(force_query=force_query)
    self.editors = []
    # one scholar per record of the loaded list of dicts
    self.editors.extend(DblpScholar(**record) for record in self.lod)
    self.editorsById = {scholar.dblp_author_id: scholar for scholar in self.editors}

`DblpEndpoint`

provides queries and a dblp endpoint to execute them

Source code in ceurws/dblp.py

class DblpEndpoint:
    """
    provides queries and a dblp endpoint to execute them
    """

    DBLP_REC_PREFIX = "https://dblp.org/rec/"
    DBLP_EVENT_PREFIX = "https://dblp.org/db/"

    def __init__(self, endpoint, debug: bool = False):
        """
        constructor

        Args:
            endpoint: handed to SPARQL() to create the query access
            debug: if True get_lod prints what it loads and how long it took
        """
        self.debug = debug
        self.sparql = SPARQL(endpoint)
        # the named queries are expected next to this module
        path = os.path.dirname(__file__)
        qYamlFile = f"{path}/resources/queries/dblp.yaml"
        # NOTE(review): self.qm is only set if the yaml file exists -
        # get_lod reads self.qm and fails with an AttributeError otherwise
        if os.path.isfile(qYamlFile):
            self.qm = QueryManager(lang="sparql", queriesPath=qYamlFile)
        # there is one cache manager for all our json caches
        self.cache_manager = CacheManager("ceurws")
        # each manager gets this endpoint and loads via get_lod
        self.dblp_authors = DblpAuthors(endpoint=self)
        self.dblp_editors = DblpEditors(endpoint=self)
        self.dblp_papers = DblpPapers(endpoint=self)
        self.dblp_volumes = DblpVolumes(endpoint=self)
        # lookup of the managers, iterated by load_all
        self.dblp_managers = {
            "dblp/authors": self.dblp_authors,
            "dblp/editors": self.dblp_editors,
            "dblp/papers": self.dblp_papers,
            "dblp/volumes": self.dblp_volumes,
        }
        # optional progress bar - updated by get_lod if set
        self.progress_bar = None

    def load_all(self, force_query: bool = False):
        """
        load all managers

        Args:
            force_query: handed to the load of every registered manager
        """
        for manager in self.dblp_managers.values():
            manager.load(force_query=force_query)

    def get_lod(self, cache_name: str, query_name: str, force_query: bool = False) -> list:
        """
        Provide the list of dictionaries for the given cache - the SPARQL
        query is run if the cache is not stored or a query is forced.

        Args:
            cache_name (str): name of the cache the records are loaded from or stored to.
            query_name (str): name of the query to run if the cache is not used.
            force_query (bool): if True the query is run even if a cache is stored. Defaults to False.

        Returns:
            list: the records loaded from the cache or returned by the SPARQL query.
        """
        started = time.time()
        is_cached = self.cache_manager.get_cache_by_name(cache_name).is_stored
        if is_cached and not force_query:
            if self.debug:
                print(f"loading {cache_name} from cache")
            records = self.cache_manager.load(cache_name)
        else:
            named_query = self.qm.queriesByName[query_name]
            if self.debug:
                print(f"loading {cache_name} from SPARQL query {query_name}")
            records = self.sparql.queryAsListOfDicts(named_query.query)
            # keep the fresh result for the next call
            self.cache_manager.store(cache_name, records)
        elapsed = time.time() - started

        if self.debug:
            print(f"loaded {len(records)} records for {cache_name} in {elapsed:.2f} seconds")
        if self.progress_bar:
            # the progress step is derived from the elapsed seconds
            self.progress_bar.update(elapsed * 100 / 36)
        return records

    def get_ceur_volume_papers(self, volume_number: int) -> list[DblpPaper]:
        """
        Get the papers of the given CEUR-WS volume from the dblp paper cache

        Args:
            volume_number: number of the volume

        Returns:
            the papers created from the records cached as "dblp/Vol-{volume_number}/papers"
        """
        records = self.cache_manager.load(f"dblp/Vol-{volume_number}/papers")
        return [DblpPaper(**record) for record in records]

    def get_ceur_proceeding(self, volume_number: int) -> DblpProceeding:
        """
        get the ceur proceeding with the given volume number from the dblp metadata cache

        Args:
            volume_number: number of the volume

        Returns:
            the result of loading the cache "dblp/Vol-{volume_number}/metadata" with cls=DblpProceeding
        """
        return self.cache_manager.load(f"dblp/Vol-{volume_number}/metadata", cls=DblpProceeding)

    def getDblpIdByVolumeNumber(self, number) -> list[str]:
        """
        Get the dblp entity ids of the proceedings with the given volume number

        Args:
            number: volume number

        Returns:
            the proceeding ids with the DBLP_REC_PREFIX cut off -
            an empty list if there is no match or the endpoint is unavailable
        """
        sparql_query = f"""PREFIX dblp: <https://dblp.org/rdf/schema#>
            SELECT *
            WHERE {{ 
                ?proceeding dblp:publishedIn "CEUR Workshop Proceedings";
                            dblp:publishedInSeriesVolume "{number}".
                }}
        """
        try:
            records = self.sparql.queryAsListOfDicts(sparql_query)
        except HTTPError:
            print("dblp sparql endpoint unavailable")
            return []
        if not records:
            return []
        # the ids are the proceeding IRIs without the leading record prefix
        return [record.get("proceeding")[len(self.DBLP_REC_PREFIX) :] for record in records]

    def getDblpUrlByDblpId(self, entityId: str | None = None) -> str | None:
        """
        Get the dblp url for given entity id
        Args:
            entityId: volume url
        """
        if entityId is None or entityId == "":
            return None
        entityUrl = self.DBLP_REC_PREFIX + entityId
        query = f"""PREFIX dblp: <https://dblp.org/rdf/schema#>
                SELECT *
                WHERE {{ 
                    <{entityUrl}> dblp:listedOnTocPage ?url .
                    }}
            """
        qres = self.sparql.queryAsListOfDicts(query)
        qIds = []
        if qres is not None and qres != []:
            qIds = [record.get("url")[len(self.DBLP_EVENT_PREFIX) :] for record in qres]
        qId = qIds[0] if qIds is not None and len(qIds) > 0 else None
        return qId

    def convertEntityIdToUrlId(self, entityId: str | None) -> str | None:
        """
        Convert the given entityId to the id used in the url
        Note: use with care this conversion does not always work
        Args:
            entityId: id of the entity
        Example:
            conf/aaai/2022 → conf/aaai/aaai2022

        Returns
            str - id used in the url
            None - if the given entityId can not be converted
        """
        return self.getDblpUrlByDblpId(entityId)

    def toDblpUrl(self, entityId: str, withPostfix: bool = False) -> str | None:
        """
        Convert the given id to the corresponding dblp url
        Args:
            entityId: dblp event id
            withPostfix: If True add the postfix ".html"

        Returns:
            dblp url of None if the url can not be generated for the given input
        """
        urlId = self.convertEntityIdToUrlId(entityId)
        if urlId is None:
            return None
        postfix = ".html"
        url = self.DBLP_EVENT_PREFIX + urlId
        if withPostfix:
            url += postfix
        return url

    def getEditorsOfVolume(self, number: int | str | None) -> list[dict]:
        """
        Get the editors for the given volume number

        Args:
            number: number of the volume, if None query for the editors of all ceur-ws volumes

        Returns:
            list of dictionaries where a dict represents one editor containing all identifiers of the editor;
            values containing "|" are replaced by the list of their parts
        """
        # without a number the volume is left unbound so that all volumes match
        number_var = "?volumeNumber" if number is None else f'"{number}"'
        dblp_identifiers = DblpAuthorIdentifier.all()
        optional_clauses: list[str] = []
        id_vars: list[str] = []
        # one OPTIONAL clause and one select variable per known identifier
        for identifier in dblp_identifiers:
            id_var = f"?{identifier.name}"
            optional_clauses.append(
                f"""OPTIONAL{{
                ?editor datacite:hasIdentifier {id_var}_blank.
                {id_var}_blank datacite:usesIdentifierScheme {identifier.dblp_property};
                litre:hasLiteralValue {id_var}Var.}}"""
            )
            id_vars.append(id_var)
        # multiple values of an identifier are concatenated with "|" per editor
        id_selects = "\n".join(
            [f"(group_concat(DISTINCT {id_var}Var;separator='|') as {id_var})" for id_var in id_vars]
        )
        id_queries = "\n".join(optional_clauses)
        query = f"""PREFIX datacite: <http://purl.org/spar/datacite/>
                    PREFIX dblp: <https://dblp.org/rdf/schema#>
                    PREFIX litre: <http://purl.org/spar/literal/>
                    SELECT DISTINCT (group_concat(DISTINCT ?nameVar;separator='|') as ?name) 
                                    (group_concat(DISTINCT ?homepageVar;separator='|') as ?homepage)
                                    (group_concat(DISTINCT ?affiliationVar;separator='|') as ?affiliation)
                                    {id_selects}
                    WHERE{{
                        ?proceeding dblp:publishedIn "CEUR Workshop Proceedings";
                                    dblp:publishedInSeriesVolume {number_var};
                                    dblp:editedBy ?editor.
                        ?editor dblp:primaryCreatorName ?nameVar.
                        OPTIONAL{{?editor dblp:primaryHomepage ?homepageVar.}}
                        OPTIONAL{{?editor dblp:primaryAffiliation ?affiliationVar.}}
                        {id_queries}
                    }}
                    GROUP BY ?editor
                """
        qres = self.sparql.queryAsListOfDicts(query)
        # split concatenated values in place - only existing keys are reassigned
        # so the dict may be modified while iterating its items
        for record in qres:
            for key, value in record.items():
                if "|" in value:
                    # the split delimiter includes the surrounding quotes
                    record[key] = value.split(
                        '"|"'
                    )  # issue in qlever see https://github.com/ad-freiburg/qlever/discussions/806
        return qres

`__init__(endpoint, debug=False)`

constructor

Source code in ceurws/dblp.py

def __init__(self, endpoint, debug: bool = False):
    """
    constructor

    Args:
        endpoint: handed to SPARQL() to create the query access
        debug: if True get_lod prints what it loads and how long it took
    """
    self.debug = debug
    self.sparql = SPARQL(endpoint)
    # the named queries are expected next to this module
    path = os.path.dirname(__file__)
    qYamlFile = f"{path}/resources/queries/dblp.yaml"
    # NOTE(review): self.qm is only set if the yaml file exists -
    # get_lod reads self.qm and fails with an AttributeError otherwise
    if os.path.isfile(qYamlFile):
        self.qm = QueryManager(lang="sparql", queriesPath=qYamlFile)
    # there is one cache manager for all our json caches
    self.cache_manager = CacheManager("ceurws")
    # each manager gets this endpoint and loads via get_lod
    self.dblp_authors = DblpAuthors(endpoint=self)
    self.dblp_editors = DblpEditors(endpoint=self)
    self.dblp_papers = DblpPapers(endpoint=self)
    self.dblp_volumes = DblpVolumes(endpoint=self)
    # lookup of the managers, iterated by load_all
    self.dblp_managers = {
        "dblp/authors": self.dblp_authors,
        "dblp/editors": self.dblp_editors,
        "dblp/papers": self.dblp_papers,
        "dblp/volumes": self.dblp_volumes,
    }
    # optional progress bar - updated by get_lod if set
    self.progress_bar = None

`convertEntityIdToUrlId(entityId)`

Convert the given entityId to the id used in the url Note: use with care this conversion does not always work Args: entityId: id of the entity Example: conf/aaai/2022 → conf/aaai/aaai2022

Returns str - id used in the url None - if the given entityId can not be converted

Source code in ceurws/dblp.py

def convertEntityIdToUrlId(self, entityId: str | None) -> str | None:
    """
    Convert the given entityId to the id used in the url
    Note: use with care this conversion does not always work
    Args:
        entityId: id of the entity
    Example:
        conf/aaai/2022 → conf/aaai/aaai2022

    Returns
        str - id used in the url
        None - if the given entityId can not be converted
    """
    return self.getDblpUrlByDblpId(entityId)

`getDblpIdByVolumeNumber(number)`

Get the dblp entity id by given volume number Args: number: volume number

Source code in ceurws/dblp.py

def getDblpIdByVolumeNumber(self, number) -> list[str]:
    """
    Get the dblp entity ids of the proceedings with the given volume number

    Args:
        number: volume number

    Returns:
        the proceeding ids with the DBLP_REC_PREFIX cut off -
        an empty list if there is no match or the endpoint is unavailable
    """
    sparql_query = f"""PREFIX dblp: <https://dblp.org/rdf/schema#>
        SELECT *
        WHERE {{ 
            ?proceeding dblp:publishedIn "CEUR Workshop Proceedings";
                        dblp:publishedInSeriesVolume "{number}".
            }}
    """
    try:
        records = self.sparql.queryAsListOfDicts(sparql_query)
    except HTTPError:
        print("dblp sparql endpoint unavailable")
        return []
    if not records:
        return []
    # the ids are the proceeding IRIs without the leading record prefix
    return [record.get("proceeding")[len(self.DBLP_REC_PREFIX) :] for record in records]

`getDblpUrlByDblpId(entityId=None)`

Get the dblp url for given entity id Args: entityId: volume url

Source code in ceurws/dblp.py

def getDblpUrlByDblpId(self, entityId: str | None = None) -> str | None:
    """
    Get the dblp url for given entity id
    Args:
        entityId: volume url
    """
    if entityId is None or entityId == "":
        return None
    entityUrl = self.DBLP_REC_PREFIX + entityId
    query = f"""PREFIX dblp: <https://dblp.org/rdf/schema#>
            SELECT *
            WHERE {{ 
                <{entityUrl}> dblp:listedOnTocPage ?url .
                }}
        """
    qres = self.sparql.queryAsListOfDicts(query)
    qIds = []
    if qres is not None and qres != []:
        qIds = [record.get("url")[len(self.DBLP_EVENT_PREFIX) :] for record in qres]
    qId = qIds[0] if qIds is not None and len(qIds) > 0 else None
    return qId

`getEditorsOfVolume(number)`

Get the editors for the given volume number Args: number: number of the volume if none query for all ceur-ws editors

Returns:

Type	Description
`list[dict]`	list of dictionaries where a dict represents one editor containing all identifiers of the editor

Source code in ceurws/dblp.py

def getEditorsOfVolume(self, number: int | str | None) -> list[dict]:
    """
    Get the editors for the given volume number

    Args:
        number: number of the volume, if None query for the editors of all ceur-ws volumes

    Returns:
        list of dictionaries where a dict represents one editor containing all identifiers of the editor;
        values containing "|" are replaced by the list of their parts
    """
    # without a number the volume is left unbound so that all volumes match
    number_var = "?volumeNumber" if number is None else f'"{number}"'
    dblp_identifiers = DblpAuthorIdentifier.all()
    optional_clauses: list[str] = []
    id_vars: list[str] = []
    # one OPTIONAL clause and one select variable per known identifier
    for identifier in dblp_identifiers:
        id_var = f"?{identifier.name}"
        optional_clauses.append(
            f"""OPTIONAL{{
            ?editor datacite:hasIdentifier {id_var}_blank.
            {id_var}_blank datacite:usesIdentifierScheme {identifier.dblp_property};
            litre:hasLiteralValue {id_var}Var.}}"""
        )
        id_vars.append(id_var)
    # multiple values of an identifier are concatenated with "|" per editor
    id_selects = "\n".join(
        [f"(group_concat(DISTINCT {id_var}Var;separator='|') as {id_var})" for id_var in id_vars]
    )
    id_queries = "\n".join(optional_clauses)
    query = f"""PREFIX datacite: <http://purl.org/spar/datacite/>
                PREFIX dblp: <https://dblp.org/rdf/schema#>
                PREFIX litre: <http://purl.org/spar/literal/>
                SELECT DISTINCT (group_concat(DISTINCT ?nameVar;separator='|') as ?name) 
                                (group_concat(DISTINCT ?homepageVar;separator='|') as ?homepage)
                                (group_concat(DISTINCT ?affiliationVar;separator='|') as ?affiliation)
                                {id_selects}
                WHERE{{
                    ?proceeding dblp:publishedIn "CEUR Workshop Proceedings";
                                dblp:publishedInSeriesVolume {number_var};
                                dblp:editedBy ?editor.
                    ?editor dblp:primaryCreatorName ?nameVar.
                    OPTIONAL{{?editor dblp:primaryHomepage ?homepageVar.}}
                    OPTIONAL{{?editor dblp:primaryAffiliation ?affiliationVar.}}
                    {id_queries}
                }}
                GROUP BY ?editor
            """
    qres = self.sparql.queryAsListOfDicts(query)
    # split concatenated values in place - only existing keys are reassigned
    # so the dict may be modified while iterating its items
    for record in qres:
        for key, value in record.items():
            if "|" in value:
                # the split delimiter includes the surrounding quotes
                record[key] = value.split(
                    '"|"'
                )  # issue in qlever see https://github.com/ad-freiburg/qlever/discussions/806
    return qres

`get_ceur_proceeding(volume_number)`

get ceur proceeding by volume number from dblp Args: volume_number: number of the volume

Source code in ceurws/dblp.py

def get_ceur_proceeding(self, volume_number: int) -> DblpProceeding:
    """
    get the ceur proceeding with the given volume number from the dblp metadata cache

    Args:
        volume_number: number of the volume

    Returns:
        the result of loading the cache "dblp/Vol-{volume_number}/metadata" with cls=DblpProceeding
    """
    return self.cache_manager.load(f"dblp/Vol-{volume_number}/metadata", cls=DblpProceeding)

`get_ceur_volume_papers(volume_number)`

Get all papers published in CEUR-WS from dblp

Source code in ceurws/dblp.py

def get_ceur_volume_papers(self, volume_number: int) -> list[DblpPaper]:
    """
    Get the papers of the given CEUR-WS volume from the dblp paper cache

    Args:
        volume_number: number of the volume

    Returns:
        the papers created from the records cached as "dblp/Vol-{volume_number}/papers"
    """
    records = self.cache_manager.load(f"dblp/Vol-{volume_number}/papers")
    return [DblpPaper(**record) for record in records]

`get_lod(cache_name, query_name, force_query=False)`

Get the list of dictionaries for the given cache and query names, optionally forcing a query.

Parameters:

Name	Type	Description	Default
`cache_name`	`str`	The name of the cache to load or store the LOD.	required
`query_name`	`str`	The name of the query to execute if the data is not cached or forced to query.	required
`force_query`	`bool`	If True, forces the query execution even if the data is cached. Defaults to False.	`False`

Returns:

Type	Description
`list`	List[Dict]: The list of dictionaries loaded either from cache or by executing the SPARQL query.

Source code in ceurws/dblp.py

def get_lod(self, cache_name: str, query_name: str, force_query: bool = False) -> list:
    """
    Provide the list of dictionaries for the given cache - the SPARQL
    query is run if the cache is not stored or a query is forced.

    Args:
        cache_name (str): name of the cache the records are loaded from or stored to.
        query_name (str): name of the query to run if the cache is not used.
        force_query (bool): if True the query is run even if a cache is stored. Defaults to False.

    Returns:
        list: the records loaded from the cache or returned by the SPARQL query.
    """
    started = time.time()
    is_cached = self.cache_manager.get_cache_by_name(cache_name).is_stored
    if is_cached and not force_query:
        if self.debug:
            print(f"loading {cache_name} from cache")
        records = self.cache_manager.load(cache_name)
    else:
        named_query = self.qm.queriesByName[query_name]
        if self.debug:
            print(f"loading {cache_name} from SPARQL query {query_name}")
        records = self.sparql.queryAsListOfDicts(named_query.query)
        # keep the fresh result for the next call
        self.cache_manager.store(cache_name, records)
    elapsed = time.time() - started

    if self.debug:
        print(f"loaded {len(records)} records for {cache_name} in {elapsed:.2f} seconds")
    if self.progress_bar:
        # the progress step is derived from the elapsed seconds
        self.progress_bar.update(elapsed * 100 / 36)
    return records

`load_all(force_query=False)`

load all managers

Source code in ceurws/dblp.py

def load_all(self, force_query: bool = False):
    """
    load all managers

    Args:
        force_query: handed to the load of every registered manager
    """
    for manager in self.dblp_managers.values():
        manager.load(force_query=force_query)

`toDblpUrl(entityId, withPostfix=False)`

Convert the given id to the corresponding dblp url Args: entityId: dblp event id withPostfix: If True add the postfix ".html"

Returns:

Type	Description
`str \| None`	dblp url or None if the url can not be generated for the given input

Source code in ceurws/dblp.py

def toDblpUrl(self, entityId: str, withPostfix: bool = False) -> str | None:
    """
    Convert the given id to the corresponding dblp url
    Args:
        entityId: dblp event id
        withPostfix: If True add the postfix ".html"

    Returns:
        dblp url of None if the url can not be generated for the given input
    """
    urlId = self.convertEntityIdToUrlId(entityId)
    if urlId is None:
        return None
    postfix = ".html"
    url = self.DBLP_EVENT_PREFIX + urlId
    if withPostfix:
        url += postfix
    return url

`DblpManager`

Manage DBLP entities.

Attributes:

Name	Type	Description
`endpoint`	`DblpEndpoint`	The endpoint for DBLP queries.
`cache_name`	`str`	The name of the cache to use.
`query_name`	`str`	The name of the query to execute.

Source code in ceurws/dblp.py

class DblpManager:
    """
    Manage DBLP entities.

    Attributes:
        endpoint (DblpEndpoint): The endpoint for DBLP queries.
        cache_name (str): The name of the cache to use.
        query_name (str): The name of the query to execute.
    """

    def __init__(self, endpoint: "DblpEndpoint", cache_name: str, query_name: str):
        """
        Remember the endpoint, cache name and query name for later loading.

        Args:
            endpoint (DblpEndpoint): The endpoint for DBLP queries.
            cache_name (str): The name of the cache to use.
            query_name (str): The name of the query to execute.
        """
        self.endpoint, self.cache_name, self.query_name = endpoint, cache_name, query_name

    def load(self, force_query: bool = False):
        """
        Fetch the list of dictionaries via the endpoint and keep it as self.lod.

        Args:
            force_query (bool): If True, forces a new query to the endpoint. Defaults to False.
        """
        records = self.endpoint.get_lod(self.cache_name, self.query_name, force_query=force_query)
        self.lod = records

`__init__(endpoint, cache_name, query_name)`

Initializes the DBLP Manager with the given endpoint, cache name, and query name.

Parameters:

Name	Type	Description	Default
`endpoint`	`DblpEndpoint`	The endpoint for DBLP queries.	required
`cache_name`	`str`	The name of the cache to use.	required
`query_name`	`str`	The name of the query to execute.	required

Source code in ceurws/dblp.py

def __init__(self, endpoint: "DblpEndpoint", cache_name: str, query_name: str):
    """
    Initializes the DBLP Manager with the given endpoint, cache name, and query name.

    Args:
        endpoint (DblpEndpoint): The endpoint for DBLP queries.
        cache_name (str): The name of the cache to use.
        query_name (str): The name of the query to execute.
    """
    self.endpoint = endpoint
    self.cache_name = cache_name
    self.query_name = query_name

`load(force_query=False)`

Loads a list of dictionaries from the DBLP endpoint.

Parameters:

Name	Type	Description	Default
`force_query`	`bool`	If True, forces a new query to the endpoint. Defaults to False.	`False`

Source code in ceurws/dblp.py

def load(self, force_query: bool = False):
    """
    Fetch the list of dictionaries via the endpoint and keep it as self.lod.

    Args:
        force_query (bool): If True, forces a new query to the endpoint. Defaults to False.
    """
    records = self.endpoint.get_lod(self.cache_name, self.query_name, force_query=force_query)
    self.lod = records

`DblpPapers`

Bases: DblpManager

manage all CEUR-WS papers indexed by dblp

Source code in ceurws/dblp.py

class DblpPapers(DblpManager):
    """
    manage all CEUR-WS papers indexed by dblp
    """

    def __init__(self, endpoint: "DblpEndpoint"):
        """
        constructor

        Args:
            endpoint: the endpoint the paper records are loaded from
        """
        super().__init__(endpoint, "dblp/papers", "CEUR-WS all Papers")
        # None marks "not loaded yet" - see load()
        self.papers: list[DblpPaper] | None = None
        self.papers_by_volume: dict[str, dict] = {}
        self.papersById: dict[str, DblpPaper] = {}
        self.papersByProceeding: dict[str, list[DblpPaper]] = {}

    def load(self, force_query: bool = False):
        """
        load my papers

        Loads the paper records and the authors, creates the papers, fills the
        lookups by volume, proceeding and publication id and stores the papers
        of each volume in a cache "dblp/Vol-{volume_number}/papers".
        The papers are loaded only once, further calls are no-ops.

        Args:
            force_query: handed to the load of the paper records and of the authors
        """
        if self.papers is not None:
            return
        super().load(force_query=force_query)
        dblp_authors = self.endpoint.dblp_authors
        dblp_authors.load(force_query=force_query)
        self.papers = []
        for d in self.lod:
            # the pdf id is the pdf url without the ceur-ws.org prefix and the .pdf extension
            pdf_id = d.get("pdf_url", None)
            if pdf_id and isinstance(pdf_id, str):
                pdf_id = pdf_id.replace("http://ceur-ws.org/", "")
                pdf_id = pdf_id.replace("https://ceur-ws.org/", "")
                pdf_id = pdf_id.replace(".pdf", "")
            authors = []
            # get the authors string
            authors_str = d.get("author", "")
            # >;<  qlever quirk until 2023-12
            delim = ">;<" if ">;<" in authors_str else ";"
            for dblp_author_id in authors_str.split(delim):
                # ids without a loaded author are skipped
                author = dblp_authors.authorsById.get(dblp_author_id, None)
                if author:
                    authors.append(author)
            paper = DblpPaper(
                dblp_publication_id=d.get("paper"),
                volume_number=int(d.get("volume_number")),
                dblp_proceeding_id=d.get("proceeding"),
                title=d.get("title"),
                pdf_id=pdf_id,
                authors=authors,
            )  # type: ignore
            self.papers.append(paper)
        self.papers_by_volume = LOD.getLookup(self.papers, "volume_number", withDuplicates=True)
        # group explicitly: itertools.groupby only merges adjacent items, so papers of
        # one proceeding that are not consecutive would replace the earlier group
        by_proceeding: dict[str, list[DblpPaper]] = {}
        for paper in self.papers:
            by_proceeding.setdefault(paper.dblp_proceeding_id, []).append(paper)
        self.papersByProceeding = by_proceeding
        self.papersById = {p.dblp_publication_id: p for p in self.papers}
        # papers per volume
        for volume_number, vol_papers in sorted(self.papers_by_volume.items()):
            vol_paper_lod = [dataclasses.asdict(paper) for paper in vol_papers]
            cache_name = f"dblp/Vol-{volume_number}/papers"
            if self.endpoint.progress_bar:
                self.endpoint.progress_bar.update(30 / 3650)
            self.endpoint.cache_manager.store(
                cache_name,
                vol_paper_lod,
            )

`load(force_query=False)`

load my papers

Source code in ceurws/dblp.py

def load(self, force_query: bool = False):
    """
    load my papers

    Delegates to the parent load, resolves the author ids of each record
    against the loaded dblp authors and builds the lookups
    ``papers_by_volume``, ``papersByProceeding`` and ``papersById``.
    Finally the papers of each volume are stored via the endpoint's cache manager.
    Nothing is done when ``self.papers`` has already been set.

    Args:
        force_query(bool): passed on to the parent load and to the authors load
    """
    if self.papers is None:
        super().load(force_query=force_query)
        dblp_authors = self.endpoint.dblp_authors
        dblp_authors.load(force_query=force_query)
        self.papers = []
        for d in self.lod:
            pdf_id = d.get("pdf_url", None)
            if pdf_id and isinstance(pdf_id, str):
                # reduce the pdf url to its path below the CEUR-WS host (both schemes)
                pdf_id = pdf_id.replace("http://ceur-ws.org/", "")
                pdf_id = pdf_id.replace("https://ceur-ws.org/", "")
                pdf_id = pdf_id.replace(".pdf", "")
            # get the authors string - a missing or None value yields no authors
            authors_str = d.get("author") or ""
            # >;<  qlever quirk until 2023-12
            delim = ">;<" if ">;<" in authors_str else ";"
            authors = []
            for dblp_author_id in authors_str.split(delim):
                author = dblp_authors.authorsById.get(dblp_author_id, None)
                if author:
                    authors.append(author)
            paper = DblpPaper(
                dblp_publication_id=d.get("paper"),
                volume_number=int(d.get("volume_number")),
                dblp_proceeding_id=d.get("proceeding"),
                title=d.get("title"),
                pdf_id=pdf_id,
                authors=authors,
            )  # type: ignore
            self.papers.append(paper)
        self.papers_by_volume = LOD.getLookup(self.papers, "volume_number", withDuplicates=True)
        # group explicitly instead of using itertools.groupby: groupby only merges
        # adjacent items, so with an unsorted result papers of the same proceeding
        # would overwrite each other and get lost
        papers_by_proceeding: dict = {}
        for paper in self.papers:
            papers_by_proceeding.setdefault(paper.dblp_proceeding_id, []).append(paper)
        self.papersByProceeding = papers_by_proceeding
        self.papersById = {p.dblp_publication_id: p for p in self.papers}
        # papers per volume
        for volume_number, vol_papers in sorted(self.papers_by_volume.items()):
            vol_paper_lod = [dataclasses.asdict(paper) for paper in vol_papers]
            cache_name = f"dblp/Vol-{volume_number}/papers"
            if self.endpoint.progress_bar:
                self.endpoint.progress_bar.update(30 / 3650)
            self.endpoint.cache_manager.store(
                cache_name,
                vol_paper_lod,
            )

`DblpVolumes`

Bases: DblpManager

Manage all DBLP indexed volumes.

Source code in ceurws/dblp.py

class DblpVolumes(DblpManager):
    """
    Manage all DBLP indexed volumes.
    """

    def __init__(self, endpoint: "DblpEndpoint"):
        """
        Constructor

        Args:
            endpoint(DblpEndpoint): the endpoint providing the editors, papers,
                cache manager and progress bar used by load
        """
        super().__init__(endpoint, "dblp/volumes", "CEUR-WS all Volumes")
        # set by load - None means not loaded yet
        self.volumes = None

    def load(self, force_query: bool = False):
        """
        load my volumes

        Delegates to the parent load, makes sure editors and papers are loaded,
        creates a DblpProceeding per record and stores each volume via the
        endpoint's cache manager. The work is done only once - later calls
        return the volumes loaded before.

        Args:
            force_query(bool): passed on to the parent, editors and papers load

        Returns:
            list: the loaded volumes
        """
        if self.volumes is None:
            super().load(force_query=force_query)
            volumes = []
            dblp_editors = self.endpoint.dblp_editors
            dblp_editors.load(force_query=force_query)
            dblp_papers = self.endpoint.dblp_papers
            dblp_papers.load(force_query=force_query)
            for d in self.lod:
                vol_editors = []
                # a missing or None value yields no editors
                editor_str = d.get("editor") or ""
                # >;<  qlever quirk until 2023-12
                delim = ">;<" if ">;<" in editor_str else ";"
                for dblp_author_id in editor_str.split(delim):
                    editor = dblp_editors.editorsById.get(dblp_author_id, None)
                    if editor:
                        vol_editors.append(editor)
                volume = DblpProceeding(
                    dblp_publication_id=d.get("proceeding"),
                    volume_number=int(d.get("volume_number")),
                    dblp_event_id=d.get("dblp_event_id"),
                    title=d.get("title"),
                    editors=vol_editors,
                    papers=dblp_papers.papersByProceeding.get(d.get("proceeding")),
                )  # type: ignore
                volumes.append(volume)
            volume_by_number, _errors = LOD.getLookup(volumes, "volume_number")
            for number, volume in sorted(volume_by_number.items()):
                cache_name = f"dblp/Vol-{number}/metadata"
                if self.endpoint.progress_bar:
                    # fractional step - int(30 / 3650) is always 0 and would
                    # never advance the progress bar
                    self.endpoint.progress_bar.update(30 / 3650)
                self.endpoint.cache_manager.store(cache_name, volume)
            # remember the result so that it is returned and not loaded again
            self.volumes = volumes
        return self.volumes

`load(force_query=False)`

load my volumes

Source code in ceurws/dblp.py

def load(self, force_query: bool = False):
    """
    load my volumes

    Delegates to the parent load, makes sure editors and papers are loaded,
    creates a DblpProceeding per record and stores each volume via the
    endpoint's cache manager. The work is done only once - later calls
    return the volumes loaded before.

    Args:
        force_query(bool): passed on to the parent, editors and papers load

    Returns:
        list: the loaded volumes
    """
    if self.volumes is None:
        super().load(force_query=force_query)
        volumes = []
        dblp_editors = self.endpoint.dblp_editors
        dblp_editors.load(force_query=force_query)
        dblp_papers = self.endpoint.dblp_papers
        dblp_papers.load(force_query=force_query)
        for d in self.lod:
            vol_editors = []
            # a missing or None value yields no editors
            editor_str = d.get("editor") or ""
            # >;<  qlever quirk until 2023-12
            delim = ">;<" if ">;<" in editor_str else ";"
            for dblp_author_id in editor_str.split(delim):
                editor = dblp_editors.editorsById.get(dblp_author_id, None)
                if editor:
                    vol_editors.append(editor)
            volume = DblpProceeding(
                dblp_publication_id=d.get("proceeding"),
                volume_number=int(d.get("volume_number")),
                dblp_event_id=d.get("dblp_event_id"),
                title=d.get("title"),
                editors=vol_editors,
                papers=dblp_papers.papersByProceeding.get(d.get("proceeding")),
            )  # type: ignore
            volumes.append(volume)
        volume_by_number, _errors = LOD.getLookup(volumes, "volume_number")
        for number, volume in sorted(volume_by_number.items()):
            cache_name = f"dblp/Vol-{number}/metadata"
            if self.endpoint.progress_bar:
                # fractional step - int(30 / 3650) is always 0 and would
                # never advance the progress bar
                self.endpoint.progress_bar.update(30 / 3650)
            self.endpoint.cache_manager.store(cache_name, volume)
        # remember the result so that it is returned and not loaded again
        self.volumes = volumes
    return self.volumes

`indexparser`

Created on 11.08.2022

@author: wf

`IndexHtmlParser`

Bases: Textparser

CEUR-WS Index.html parser

Source code in ceurws/indexparser.py

class IndexHtmlParser(Textparser):
    """
    CEUR-WS Index.html parser
    """

    def __init__(self, htmlText: str, config: ParserConfig | None = None):
        """
        Constructor

        Args:
            htmlText(str): the HTML text of the index page
            config(ParserConfig): the parser configuration - a default
                ParserConfig is created if None is given
        """
        if config is None:
            config = ParserConfig()
        self.config = config
        Textparser.__init__(self, debug=config.debug)
        self.htmlText = htmlText
        # soup (in memory is slow)
        # soup = BeautifulSoup(html_page, 'html.parser'
        self.lines = htmlText.split("\n")
        # trStart, trEnd = makeHTMLTags("tr")
        # self.tr = trStart + SkipTo(trEnd).setResultsName("tr") + trEnd.suppress()
        self.linkPattern = re.compile(r""".*href=[\'"]?([^\'" >]+).*""", re.I)
        self.volPattern = re.compile("http://ceur-ws.org/Vol-([0-9]+)")
        self.volLinkPattern = re.compile(
            r""".*<a\s+href=[\'"]http://ceur-ws.org/Vol-([0-9]+)[/]?[\'"]>([^<]*)</a>.*""",
            re.I | re.DOTALL,
        )
        # Pre-compile patterns used in find and findVolume
        self.thColspanPattern = re.compile(r"^.*<th\s*colspan", re.I)
        self.trStartPattern = re.compile(r"^\s*<tr>", re.I)
        self.trEndPattern = re.compile(r"^\s*</tr>", re.I)
        # Pre-compile patterns used in setVolumeTitle
        self.editedByPattern = re.compile("Edited by:")
        self.tdBgColorPattern = re.compile("<td bgcolor", re.I)

    def find(self, startLine: int, compiledPattern, step: int = 1) -> int | None:
        """
        find the next line with the given compiled regular expression pattern

        The pattern is applied with match, so it has to fit from the start of the line.

        Args:
            startLine(int): 1-based number of the line to start the search at
            compiledPattern(re.Pattern): the compiled regular expression pattern to search for
            step(int): the steps to take e.g. +1 for forward -1 for backwards

        Return:
            int: the 1-based line number of the line or None if nothing was found
        """
        lineNo = startLine
        while 0 < lineNo < len(self.lines) + 1:
            line = self.lines[lineNo - 1]
            if compiledPattern.match(line):
                return lineNo
            lineNo += step
        return None

    def findVolume(
        self,
        volCount: int,
        startLine: int,
        expectedTr: int = 3,
        progress: int = 10,
    ) -> tuple[int | None, int | None]:
        """
        find Volume lines from the given startLine

        Args:
            volCount(int): the volumeCount before the startLine
            startLine(int): index of the line to search
            expectedTr(int): number of <tr> tags expected
            progress(int): how often to show the progress

        Returns:
            tuple: the start line and the end line of the volume html.
                Both are None if no volume start or not enough <tr> tags were
                found; the end line alone is None if the closing </tr> is missing
        """
        trStartLine = self.find(startLine, self.thColspanPattern)
        if trStartLine is not None:
            lineNo = trStartLine + 1
            trCount = 1
            while lineNo < len(self.lines):
                trLine = self.find(lineNo, self.trStartPattern)
                if trLine is None:
                    break
                else:
                    lineNo = trLine + 1
                    trCount += 1
                    if trCount == expectedTr:
                        trEndLine = self.find(lineNo + 1, self.trEndPattern)
                        if volCount % progress == 0 and self.config.verbose:
                            # None does not support a width format spec - show a
                            # placeholder instead of raising a TypeError
                            endInfo = "?" if trEndLine is None else trEndLine
                            print(f"volume count {volCount+1:4}: lines {trStartLine:6}-{endInfo:6}")
                        return trStartLine, trEndLine
        return None, None

    def setVolumeNumber(self, volume, href):
        """
        set the volume number

        Args:
            volume(dict): the volumeRecord to modify
            href(str): the link to extract the volume number from - may be None
        """
        if href is None:
            return
        volNumber = self.getMatch(self.volPattern, href, 1)
        if volNumber is not None:
            volume["number"] = int(volNumber)

    def setVolumeName(self, volume, line):
        """
        set the volume name

        Args:
            volume(dict): the volumeRecord to modify
            line(str): the line to check for a volume link
        """
        volName = self.getMatch(self.volLinkPattern, line, 2)
        if volName is not None:
            valid = True
            # link texts that are urls themselves are ignored
            if not volName.startswith("http:"):
                invalidKeys = ["deleted upon editor request", "Not used"]
                for invalidKey in invalidKeys:
                    if invalidKey in volName:
                        href = self.getMatch(self.linkPattern, line, 1)
                        self.setVolumeNumber(volume, href)
                        valid = False
                volume["valid"] = valid
                if valid:
                    volName = html.unescape(volName)
                    volName = Textparser.sanitize(volName)
                    volume["volname"] = volName

    def setVolumeTitle(self, volume: dict, lineIndex: int):
        """
        set the volume title

        Args:
            volume(dict): the volumeRecord to modify
            lineIndex: where to start setting the volumeTitle
        """
        editedByLine = self.find(lineIndex, self.editedByPattern)
        if editedByLine is not None:
            tdLine = self.find(editedByLine, self.tdBgColorPattern, step=-1)
            if tdLine is not None:
                tdIndex = tdLine - 1
                titleParts = []
                while tdIndex < len(self.lines):
                    line = self.lines[tdIndex]
                    if line.startswith("Edited by:"):
                        break
                    for tag in [
                        '<TD bgcolor="#FFFFFF">&nbsp;</TD><TD bgcolor="#FFFFFF">',
                        '<TD bgcolor="#FFFFFF">',
                        '<td bgcolor="#FFFFFF">',
                        "<BR>",
                        "<br>",
                    ]:
                        line = line.replace(tag, "")
                    line = line.replace("\r", " ")
                    titleParts.append(line)
                    tdIndex += 1
                # join with a blank between all lines - appending the delimiter
                # after each line glued the first two lines together
                volume["tdtitle"] = html.unescape(" ".join(titleParts)).strip()

    def setSeeAlsoVolumes(self, volume: dict, firstLine: int, lastLine: int):
        """
        Extract and set the volume numbers from the see also list
        Example result {"seealso": ["Vol-3067"]}

        Args:
            volume: the volumeRecord to modify
            firstLine: index of the first line to search
            lastLine: index after the last line to search (exclusive)
        """
        volumes = []
        see_also = ""
        for line in range(firstLine, lastLine):
            see_also += self.lines[line]
        see_also_section = re.search(r"see also:(.*?)</font>", see_also, re.DOTALL | re.IGNORECASE)

        if see_also_section:
            # Extract the volumes using regex from the see also section
            volumes = re.findall(
                r'<a href="#(Vol-\d+)">',
                see_also_section.group(1),
                re.IGNORECASE,
            )
        volume["seealso"] = volumes

    def getInfo(self, volume: dict, info: str, pattern, line: str):
        """
        get the info for the given patterns trying to match the pattern on
        the given line

        Args:
            volume(dict): the result dict
            info(str): the name of the dict key to fill
            pattern(regexp): the regular expression to check
            line(str): the line to check
        """
        infoValue = self.getMatch(pattern, line, 1)
        if infoValue is not None:
            for delim in ["<BR>", "<br>"]:
                infoValue = infoValue.replace(delim, "")
            infoValue = infoValue.strip()
            if info in ["editors", "submittedBy"]:
                infoValue = html.unescape(infoValue)
            if info == "pubDate":
                try:
                    # on success the datetime replaces the string value
                    infoValue = datetime.datetime.strptime(infoValue, "%d-%b-%Y")
                    published = infoValue.strftime("%Y-%m-%d")
                    volume["published"] = published
                    volume["year"] = infoValue.year
                except ValueError as ve:
                    msg = f"pubDate: {infoValue} of {volume} parsing failed with {ve}"
                    self.log(msg)
            if info in ["urn", "url", "archive"]:
                href = self.getMatch(self.linkPattern, infoValue, 1)
                if href is not None:
                    infoValue = href
                    if info == "url":
                        self.setVolumeNumber(volume, href)
                    if info == "urn":
                        infoValue = href.replace("https://nbn-resolving.org/", "")
            volume[info] = infoValue

    def parseVolume(self, volCount: int, fromLine: int, toLine: int, verbose: bool):
        """
        parse a volume from the given line range

        Args:
            volCount(int): the running count of the volume (used for logging)
            fromLine(int): the start of the line range
            toLine(int): the end of the line range (exclusive)
            verbose(bool): if True print each line handled

        Returns:
            dict: the volume record
        """
        lineCount = toLine - fromLine
        volume = {
            "fromLine": fromLine,
            "toLine": toLine,
            "valid": None,
            "url": None,
            "acronym": None,
            "title": None,
            "loctime": None,
        }
        self.setVolumeTitle(volume, fromLine)
        self.setSeeAlsoVolumes(volume, fromLine, toLine)

        infoPattern = {}
        infoMappings = [
            ("URN", "urn"),
            ("ONLINE", "url"),
            ("ARCHIVE", "archive"),
            ("Edited by", "editors"),
            ("Submitted by", "submittedBy"),
            ("Published on CEUR-WS", "pubDate"),
        ]
        for prefix, info in infoMappings:
            infoPattern[info] = re.compile(rf"^\s*{prefix}:(.*)")
        for lineIndex in range(fromLine, toLine):
            line = self.lines[lineIndex]
            for info, pattern in infoPattern.items():
                self.getInfo(volume, info, pattern, line)
            self.setVolumeName(volume, line)
            if verbose:
                print(line)
        volumeNumber = volume.get("number", "?")
        acronym = volume.get("acronym", "?")
        self.log(f"{volumeNumber:4}-{volCount:4}:{fromLine}+{lineCount} {acronym}")
        return volume

    def parse(self, vol_limit: int | None = None):
        """
        parse my html code for Volume info

        Args:
            vol_limit(int): if set stop after this number of volumes has been handled

        Returns:
            dict: the volume records by volume number
        """
        # Compile the regex pattern right before its usage
        mainTablePattern = re.compile(r'\s*<TABLE id="MAINTABLE"', re.I)
        lineNo = self.find(1, mainTablePattern)
        volCount = 0
        volumes = {}
        while self.lines and lineNo and lineNo < len(self.lines):
            if vol_limit and volCount >= vol_limit:
                break
            expectedTr = 3
            volStartLine, volEndLine = self.findVolume(volCount, lineNo, expectedTr=expectedTr)
            if volStartLine is None or volEndLine is None:
                break
            else:
                volCount += 1
                volume = self.parseVolume(
                    volCount,
                    volStartLine,
                    volEndLine,
                    verbose=self.config.verbose,
                )
                # synchronize on <tr><th and not on end since trailing TR might be missing
                lineNo = volStartLine + 1
                if "number" in volume:
                    volume_number = volume["number"]
                    # stop at the first volume below the configured threshold
                    if volume_number < self.config.down_to_volume:
                        break
                    volumes[volume_number] = volume
                    if self.config.progress_bar:
                        self.config.progress_bar.update()
                else:
                    self.log(f"volume not found for volume at {volStartLine}")
        return volumes

`__init__(htmlText, config=None)`

Constructor

Parameters:

Name	Type	Description	Default
`htmlText(str)`		the HTML text of the index page	required

Source code in ceurws/indexparser.py

def __init__(self, htmlText: str, config: ParserConfig | None = None):
    """
    Constructor

    Args:
        htmlText(str): the HTML text of the index page
        config(ParserConfig): the parser configuration - a default
            ParserConfig is created if None is given
    """
    self.config = ParserConfig() if config is None else config
    Textparser.__init__(self, debug=self.config.debug)
    self.htmlText = htmlText
    # the page is handled as a list of lines
    # (original note: soup in memory is slow)
    self.lines = htmlText.split("\n")

    # patterns for links and CEUR-WS volume links
    self.linkPattern = re.compile(r""".*href=[\'"]?([^\'" >]+).*""", re.I)
    self.volPattern = re.compile("http://ceur-ws.org/Vol-([0-9]+)")
    self.volLinkPattern = re.compile(
        r""".*<a\s+href=[\'"]http://ceur-ws.org/Vol-([0-9]+)[/]?[\'"]>([^<]*)</a>.*""",
        re.I | re.DOTALL,
    )

    # table structure patterns (see find and findVolume)
    self.thColspanPattern = re.compile(r"^.*<th\s*colspan", re.I)
    self.trStartPattern = re.compile(r"^\s*<tr>", re.I)
    self.trEndPattern = re.compile(r"^\s*</tr>", re.I)

    # title section patterns (see setVolumeTitle)
    self.editedByPattern = re.compile("Edited by:")
    self.tdBgColorPattern = re.compile("<td bgcolor", re.I)

`find(startLine, compiledPattern, step=1)`

find the next line with the given compiled regular expression pattern

Parameters:

Name	Description	Default
`startLine(int)`	index of the line to start search	required
`compiledPattern(re.Pattern)`	the compiled regular expression pattern to search for	required
`step(int)`	the steps to take e.g. +1 for forward -1 for backwards	`1`

Return

int: the line number of the line or None if nothing was found

Source code in ceurws/indexparser.py

def find(self, startLine: int, compiledPattern, step: int = 1) -> int | None:
    """
    find the next line with the given compiled regular expression pattern

    Args:
        startLine(int): index of the line to start search
        compiledPattern(re.Pattern): the compiled regular expression pattern to search for
        step(int): the steps to take e.g. +1 for forward -1 for backwards

    Return:
        int: the line number of the line or None if nothing was found
    """
    lineNo = startLine
    while 0 < lineNo < len(self.lines) + 1:
        line = self.lines[lineNo - 1]
        if compiledPattern.match(line):
            return lineNo
        lineNo += step
    return None

`findVolume(volCount, startLine, expectedTr=3, progress=10)`

find Volume lines from the given startLine

Parameters:

Name	Description	Default
`volCount(int)`	the volumeCount before the startLine	required
`startLine(int)`	index of the line to search	required
`expectedTr(int)`	number of `<tr>` tags expected	`3`
`progress(int)`	how often to show the progress	`10`

Returns:

Type	Description
`tuple[int \| None, int \| None]`	the start line and the end line of the volume html; a value is None if it could not be found

Source code in ceurws/indexparser.py

def findVolume(
    self,
    volCount: int,
    startLine: int,
    expectedTr: int = 3,
    progress: int = 10,
) -> tuple[int | None, int | None]:
    """
    find Volume lines from the given startLine

    Args:
        volCount(int): the volumeCount before the startLine
        startLine(int): index of the line to search
        expectedTr(int): number of <tr> tags expected
        progress(int): how often to show the progress

    Returns:
        endLine of the volume html or None
    """
    trStartLine = self.find(startLine, self.thColspanPattern)
    if trStartLine is not None:
        lineNo = trStartLine + 1
        trCount = 1
        while lineNo < len(self.lines):
            trLine = self.find(lineNo, self.trStartPattern)
            if trLine is None:
                break
            else:
                lineNo = trLine + 1
                trCount += 1
                if trCount == expectedTr:
                    trEndLine = self.find(lineNo + 1, self.trEndPattern)
                    if volCount % progress == 0 and self.config.verbose:
                        print(f"volume count {volCount+1:4}: lines {trStartLine:6}-{trEndLine:6}")
                    return trStartLine, trEndLine
    return None, None

`getInfo(volume, info, pattern, line)`

get the info for the given patterns trying to match the pattern on the given line

Parameters:

Name	Description	Default
`volume(dict)`	the result dict	required
`info(str)`	the name of the dict key to fill	required
`pattern(regexp)`	the regular expression to check	required
`line(str)`	the line to check	required

Source code in ceurws/indexparser.py

def getInfo(self, volume: dict, info: str, pattern, line: str):
    """
    Try the given pattern on the given line and - if it fits - store the
    cleaned up value in the volume record.

    Args:
        volume(dict): the result dict
        info(str): the name of the dict key to fill
        pattern(regexp): the regular expression to check
        line(str): the line to check
    """
    infoValue = self.getMatch(pattern, line, 1)
    if infoValue is None:
        # the line does not carry this info - leave the record untouched
        return
    infoValue = infoValue.replace("<BR>", "").replace("<br>", "").strip()
    if info in ("editors", "submittedBy"):
        infoValue = html.unescape(infoValue)
    if info == "pubDate":
        try:
            # on success the datetime replaces the string value
            infoValue = datetime.datetime.strptime(infoValue, "%d-%b-%Y")
            volume["published"] = infoValue.strftime("%Y-%m-%d")
            volume["year"] = infoValue.year
        except ValueError as ve:
            self.log(f"pubDate: {infoValue} of {volume} parsing failed with {ve}")
    if info in ("urn", "url", "archive"):
        href = self.getMatch(self.linkPattern, infoValue, 1)
        if href is not None:
            if info == "url":
                self.setVolumeNumber(volume, href)
            # the urn is stored without the resolver prefix
            infoValue = href.replace("https://nbn-resolving.org/", "") if info == "urn" else href
    volume[info] = infoValue

`parse(vol_limit=None)`

parse my html code for Volume info

Source code in ceurws/indexparser.py

def parse(self, vol_limit: int | None = None):
    """
    parse my html code for Volume info
    """
    # Compile the regex pattern right before its usage
    mainTablePattern = re.compile(r'\s*<TABLE id="MAINTABLE"', re.I)
    lineNo = self.find(1, mainTablePattern)
    volCount = 0
    volumes = {}
    while self.lines and lineNo and lineNo < len(self.lines):
        if vol_limit and volCount >= vol_limit:
            break
        expectedTr = 3
        volStartLine, volEndLine = self.findVolume(volCount, lineNo, expectedTr=expectedTr)
        if volStartLine is None or volEndLine is None:
            break
        else:
            volCount += 1
            volume = self.parseVolume(
                volCount,
                volStartLine,
                volEndLine,
                verbose=self.config.verbose,
            )
            # synchronize on <tr><th and not on end since trailing TR might be missing
            lineNo = volStartLine + 1
            if "number" in volume:
                volume_number = volume["number"]
                if volume_number < self.config.down_to_volume:
                    break
                volumes[volume_number] = volume
                if self.config.progress_bar:
                    self.config.progress_bar.update()
            else:
                self.log(f"volume not found for volume at {volStartLine}")
    return volumes

`parseVolume(volCount, fromLine, toLine, verbose)`

parse a volume from the given line range

Source code in ceurws/indexparser.py

def parseVolume(self, volCount: int, fromLine: int, toLine: int, verbose: bool):
    """
    Create the record of a single volume from the given line range.

    Args:
        volCount(int): the running count of the volume (used for logging)
        fromLine(int): the start of the line range
        toLine(int): the end of the line range (exclusive)
        verbose(bool): if True print each line handled

    Returns:
        dict: the volume record
    """
    volume = {
        "fromLine": fromLine,
        "toLine": toLine,
        "valid": None,
        "url": None,
        "acronym": None,
        "title": None,
        "loctime": None,
    }
    self.setVolumeTitle(volume, fromLine)
    self.setSeeAlsoVolumes(volume, fromLine, toLine)

    # one "<prefix>:" line pattern per record key to fill
    prefixByInfo = {
        "urn": "URN",
        "url": "ONLINE",
        "archive": "ARCHIVE",
        "editors": "Edited by",
        "submittedBy": "Submitted by",
        "pubDate": "Published on CEUR-WS",
    }
    infoPattern = {info: re.compile(rf"^\s*{prefix}:(.*)") for info, prefix in prefixByInfo.items()}
    for line in (self.lines[lineIndex] for lineIndex in range(fromLine, toLine)):
        for info, pattern in infoPattern.items():
            self.getInfo(volume, info, pattern, line)
        self.setVolumeName(volume, line)
        if verbose:
            print(line)
    volumeNumber = volume.get("number", "?")
    acronym = volume.get("acronym", "?")
    lineCount = toLine - fromLine
    self.log(f"{volumeNumber:4}-{volCount:4}:{fromLine}+{lineCount} {acronym}")
    return volume

`setSeeAlsoVolumes(volume, firstLine, lastLine)`

Extract and set the volume numbers from the see also list. Example result: {"seealso": ["Vol-3067"]}

Parameters:

Name	Type	Description	Default
`volume`	`dict`	the volumeRecord to modify	required
`firstLine`	`int`	index of the first line to search for the see also list	required
`lastLine`	`int`	index after the last line to search (exclusive)	required

Source code in ceurws/indexparser.py

def setSeeAlsoVolumes(self, volume: dict, firstLine: int, lastLine: int):
    """
    Collect the volume ids referenced in the "see also" list of a volume
    and store them in the volume record.
    Example result {"seealso": ["Vol-3067"]}

    Args:
        volume: the volumeRecord to modify
        firstLine: index of the first line to search
        lastLine: index after the last line to search (exclusive)
    """
    # the list may span several lines - search the joined text
    text = "".join(self.lines[index] for index in range(firstLine, lastLine))
    section = re.search(r"see also:(.*?)</font>", text, re.DOTALL | re.IGNORECASE)
    if section is None:
        volume["seealso"] = []
    else:
        # only links to volume anchors within the section count
        volume["seealso"] = re.findall(r'<a href="#(Vol-\d+)">', section.group(1), re.IGNORECASE)

`setVolumeName(volume, line)`

set the volume name

Source code in ceurws/indexparser.py

def setVolumeName(self, volume, line):
    """
    Set the volume name from the volume link found in the given line.

    Link texts marking a deleted or unused volume flag the record as invalid
    and only set its volume number.

    Args:
        volume(dict): the volumeRecord to modify
        line(str): the line to check for a volume link
    """
    volName = self.getMatch(self.volLinkPattern, line, 2)
    # no volume link or a link text that is a url itself: nothing to do
    if volName is None or volName.startswith("http:"):
        return
    valid = True
    for invalidKey in ("deleted upon editor request", "Not used"):
        if invalidKey in volName:
            href = self.getMatch(self.linkPattern, line, 1)
            self.setVolumeNumber(volume, href)
            valid = False
    volume["valid"] = valid
    if valid:
        volume["volname"] = Textparser.sanitize(html.unescape(volName))

`setVolumeNumber(volume, href)`

set the volume number

Source code in ceurws/indexparser.py

def setVolumeNumber(self, volume, href):
    """
    Set the volume number from the given link.

    Args:
        volume(dict): the volumeRecord to modify
        href(str): the link to extract the volume number from - may be None
    """
    if href is not None:
        volNumber = self.getMatch(self.volPattern, href, 1)
        # links that are not volume links leave the record untouched
        if volNumber is not None:
            volume["number"] = int(volNumber)

`setVolumeTitle(volume, lineIndex)`

set the volume title

Parameters:

Name	Type	Description	Default
`volume(dict)`		the volumeRecord to modify	required
`lineIndex`	`int`	where to start setting the volumeTitle	required

Source code in ceurws/indexparser.py

def setVolumeTitle(self, volume: dict, lineIndex: int):
    """
    set the volume title

    The title consists of the lines between the table cell preceding the
    "Edited by:" line and the "Edited by:" line itself. The lines are
    stripped of the cell and line break tags and joined with a blank.
    The record is left untouched if no "Edited by:" line or no table cell
    is found.

    Args:
        volume(dict): the volumeRecord to modify
        lineIndex: where to start setting the volumeTitle
    """
    editedByLine = self.find(lineIndex, self.editedByPattern)
    if editedByLine is None:
        return
    # search backwards for the table cell the title starts in
    tdLine = self.find(editedByLine, self.tdBgColorPattern, step=-1)
    if tdLine is None:
        return
    tags = [
        '<TD bgcolor="#FFFFFF">&nbsp;</TD><TD bgcolor="#FFFFFF">',
        '<TD bgcolor="#FFFFFF">',
        '<td bgcolor="#FFFFFF">',
        "<BR>",
        "<br>",
    ]
    titleParts = []
    tdIndex = tdLine - 1
    while tdIndex < len(self.lines):
        line = self.lines[tdIndex]
        if line.startswith("Edited by:"):
            break
        for tag in tags:
            line = line.replace(tag, "")
        titleParts.append(line.replace("\r", " "))
        tdIndex += 1
    # join with a blank between all lines - appending the delimiter after
    # each line glued the first two lines together
    volume["tdtitle"] = html.unescape(" ".join(titleParts)).strip()

`ParserConfig`

parser configuration

Source code in ceurws/indexparser.py

class ParserConfig:
    """
    parser configuration
    """

    def __init__(
        self,
        progress_bar: tqdm | None = None,
        down_to_volume: int = 1,
        force_download: bool = False,
        verbose: bool = False,
        debug: bool = False,
    ):
        """
        Keep the given parser settings as attributes of the same name.

        Args:
            progress_bar : An instance of a Progressbar class to be used for showing progress
                during parsing.
            down_to_volume (int, optional): The volume threshold for parsing.
                The index parser stops at the first volume with a number below this
                value, so only volumes with a number equal to or greater than it
                are returned. Defaults to 1.
            force_download(bool): if True download the file to parse. Defaults to False.
            verbose(bool): if True give verbose feedback. Defaults to False.
            debug (bool, optional): Indicates whether debugging mode is enabled.
                Defaults to False.
        """
        # feedback settings
        self.verbose = verbose
        self.debug = debug
        self.progress_bar = progress_bar
        # parsing scope and download settings
        self.down_to_volume = down_to_volume
        self.force_download = force_download

`__init__(progress_bar=None, down_to_volume=1, force_download=False, verbose=False, debug=False)`

Initializes the ParserConfig with a progress bar, volume threshold, and debug mode setting.

Parameters:

Name	Type	Description	Default
`progress_bar`		An instance of a Progressbar class to be used for showing progress during parsing.	`None`
`down_to_volume`	`int`	The volume threshold for parsing. Parsing stops at the first volume with a number below this value, so only volumes with a number equal to or greater than it are considered. Defaults to 1.	`1`
`force_download(bool)`		if True download the file to parse	`False`
`verbose(bool)`		if True give verbose feedback	`False`
`debug`	`bool`	Indicates whether debugging mode is enabled. If True, additional debug information will be provided during parsing. Defaults to False.	`False`

Source code in ceurws/indexparser.py

def __init__(
    self,
    progress_bar: tqdm | None = None,
    down_to_volume: int = 1,
    force_download: bool = False,
    verbose: bool = False,
    debug: bool = False,
):
    """
    Initializes the ParserConfig with a progress bar, volume threshold, and debug mode setting.

    Args:
        progress_bar : An instance of a Progressbar class to be used for showing progress
            during parsing.
        down_to_volume (int, optional): The volume threshold for parsing.
            Only volumes equal to or less than this value will be considered. Defaults to 1.
        force_download(bool): if True download the file to parse
        verbose(bool): if True give verbose feedback
        debug (bool, optional): Indicates whether debugging mode is enabled.
            If True, additional debug information will be provided during parsing. Defaults to False.
    """
    self.progress_bar = progress_bar
    self.down_to_volume = down_to_volume
    self.force_download = force_download
    self.verbose = verbose
    self.debug = debug

`location`

Created on 2023-07-15

@author: wf

`LocationLookup`

Class for location lookup.

Source code in ceurws/location.py

class LocationLookup:
    """
    Class for location lookup.
    """

    predefinedLocations: dict[str, str | None] = {}

    @classmethod
    def initPredefinedLocations(cls):
        """
        Initialize predefined locations.
        """
        locMap = {
            "Not Known": None,
            "Online": None,
            "Virtual": None,
            "Virtual, USA": None,
            "Virtual Event, USA": None,
            "Amsterdam": "Q727",
            "Amsterdam, Amsterdam": "Q727",
            "Amsterdam Netherlands": "Q727",
            "Amsterdam, Netherlands": "Q727",
            "Amsterdam, The Netherlands": "Q727",
            "Amsterdam The Netherlands": "Q727",
            # ... add more predefined locations ...
        }
        cls.predefinedLocations = locMap

    def __init__(self):
        """
        Constructor for LocationLookup.
        """
        LocationLookup.initPredefinedLocations()
        self.locationContext = LocationContext.fromCache()
        cacheRootDir = LocationContext.getDefaultConfig().cacheRootDir
        cacheDir = f"{cacheRootDir}/.nominatim"
        self.nominatimWrapper = NominatimWrapper(cacheDir=cacheDir)

    def getCityByWikiDataId(self, wikidataID: str):
        """
        Get the city for the given wikidataID.

        Args:
            wikidataID (str): The wikidata ID.

        Returns:
            City: The city with the given wikidataID.
        """
        citiesGen = self.locationContext.cityManager.getLocationsByWikidataId(wikidataID)
        if citiesGen is not None:
            cities = list(citiesGen)
            if len(cities) > 0:
                return cities[0]
        else:
            return None

    def lookupNominatim(self, locationText: str):
        """
        Lookup the location for the given locationText (if any).

        Args:
            locationText (str): The location text to search for.

        Returns:
            City: The location found by Nominatim.
        """
        location = None
        wikidataId = self.nominatimWrapper.lookupWikiDataId(locationText)
        if wikidataId is not None:
            location = self.getCityByWikiDataId(wikidataId)
        return location

    def lookup(self, locationText: str, logFile=sys.stdout):
        """
        Lookup a location based on the given locationText.

        Args:
            locationText (str): The location to lookup.
            logFile (file): The log file to write the output.

        Returns:
            City: The located city based on the locationText.
        """
        if locationText in LocationLookup.predefinedLocations:
            locationId = LocationLookup.predefinedLocations[locationText]
            if locationId is None:
                return None
            else:
                location = self.getCityByWikiDataId(locationId)
                if location is None:
                    print(
                        f"❌❌-predefinedLocation {locationText}→{locationId} wikidataId not resolved",
                        file=logFile,
                    )
                return location
        lg = self.lookupGeograpy(locationText)
        ln = self.lookupNominatim(locationText)
        if ln is not None and lg is not None and ln.wikidataid != lg.wikidataid:
            print(f"❌❌{locationText}→{lg}!={ln}", file=logFile)
            return None
        return lg

    def lookupGeograpy(self, locationText: str):
        """
        Lookup the given location by the given locationText.

        Args:
            locationText (str): The location to lookup.

        Returns:
            City: The located city based on the locationText.
        """
        locations = self.locationContext.locateLocation(locationText)
        if len(locations) > 0:
            return locations[0]
        else:
            return None

`__init__()`

Constructor for LocationLookup.

Source code in ceurws/location.py

def __init__(self):
    """
    Set up the predefined table, the location context and the Nominatim wrapper.
    """
    LocationLookup.initPredefinedLocations()
    self.locationContext = LocationContext.fromCache()
    defaultConfig = LocationContext.getDefaultConfig()
    # the Nominatim cache lives below the location context's cache root
    self.nominatimWrapper = NominatimWrapper(cacheDir=f"{defaultConfig.cacheRootDir}/.nominatim")

`getCityByWikiDataId(wikidataID)`

Get the city for the given wikidataID.

Parameters:

Name	Type	Description	Default
`wikidataID`	`str`	The wikidata ID.	required

Returns:

Name	Type	Description
`City`		The city with the given wikidataID.

Source code in ceurws/location.py

def getCityByWikiDataId(self, wikidataID: str):
    """
    Fetch the first city registered for a Wikidata ID.

    Args:
        wikidataID (str): The wikidata ID.

    Returns:
        City: The first city with the given wikidataID or None if there is none.
    """
    matches = self.locationContext.cityManager.getLocationsByWikidataId(wikidataID)
    if matches is None:
        return None
    found = list(matches)
    return found[0] if len(found) > 0 else None

`initPredefinedLocations()` `classmethod`

Initialize predefined locations.

Source code in ceurws/location.py

@classmethod
def initPredefinedLocations(cls):
    """
    (Re)build the class-level table of predefined location texts.
    """
    # texts that are known not to denote a resolvable place
    unresolvable = (
        "Not Known",
        "Online",
        "Virtual",
        "Virtual, USA",
        "Virtual Event, USA",
    )
    # spelling variants that all map to the same Wikidata item
    amsterdamVariants = (
        "Amsterdam",
        "Amsterdam, Amsterdam",
        "Amsterdam Netherlands",
        "Amsterdam, Netherlands",
        "Amsterdam, The Netherlands",
        "Amsterdam The Netherlands",
    )
    predefined = dict.fromkeys(unresolvable)
    predefined.update(dict.fromkeys(amsterdamVariants, "Q727"))
    # ... add more predefined locations ...
    cls.predefinedLocations = predefined

`lookup(locationText, logFile=sys.stdout)`

Lookup a location based on the given locationText.

Parameters:

Name	Type	Description	Default
`locationText`	`str`	The location to lookup.	required
`logFile`	`file`	The log file to write the output.	`stdout`

Returns:

Name	Type	Description
`City`		The located city based on the locationText.

Source code in ceurws/location.py

def lookup(self, locationText: str, logFile=sys.stdout):
    """
    Resolve a location text to a city.

    Args:
        locationText (str): The location to lookup.
        logFile (file): Stream that receives the problem reports.

    Returns:
        City: The located city or None if the text is predefined as
            unresolvable, cannot be resolved, or the two lookup
            strategies disagree.
    """
    predefined = LocationLookup.predefinedLocations
    if locationText in predefined:
        predefinedId = predefined[locationText]
        if predefinedId is None:
            return None
        city = self.getCityByWikiDataId(predefinedId)
        if city is None:
            print(
                f"❌❌-predefinedLocation {locationText}→{predefinedId} wikidataId not resolved",
                file=logFile,
            )
        return city
    geograpyCity = self.lookupGeograpy(locationText)
    nominatimCity = self.lookupNominatim(locationText)
    # both strategies answered but point to different Wikidata items
    disagree = (
        nominatimCity is not None
        and geograpyCity is not None
        and nominatimCity.wikidataid != geograpyCity.wikidataid
    )
    if disagree:
        print(f"❌❌{locationText}→{geograpyCity}!={nominatimCity}", file=logFile)
        return None
    return geograpyCity

`lookupGeograpy(locationText)`

Lookup the given location by the given locationText.

Parameters:

Name	Type	Description	Default
`locationText`	`str`	The location to lookup.	required

Returns:

Name	Type	Description
`City`		The located city based on the locationText.

Source code in ceurws/location.py

def lookupGeograpy(self, locationText: str):
    """
    Resolve the given text via the location context.

    Args:
        locationText (str): The location to lookup.

    Returns:
        City: The first location reported for the text or None if there is none.
    """
    hits = self.locationContext.locateLocation(locationText)
    return hits[0] if len(hits) > 0 else None

`lookupNominatim(locationText)`

Lookup the location for the given locationText (if any).

Parameters:

Name	Type	Description	Default
`locationText`	`str`	The location text to search for.	required

Returns:

Name	Type	Description
`City`		The location found by Nominatim.

Source code in ceurws/location.py

def lookupNominatim(self, locationText: str):
    """
    Resolve the given text via Nominatim.

    Args:
        locationText (str): The location text to search for.

    Returns:
        City: The city for the Wikidata ID reported by the Nominatim wrapper
            or None if no ID or no city was found.
    """
    wikidataId = self.nominatimWrapper.lookupWikiDataId(locationText)
    if wikidataId is None:
        return None
    return self.getCityByWikiDataId(wikidataId)

`loctime`

Created on 2023-12-22

@author: wf

`LoctimeParser`

A parser class for handling loctime lookups. This class provides methods to load, parse, and update loctime data using a dictionary of dictionaries structure.

Attributes:

Name	Type	Description
`filepath`	`str`	The file path to the loctime YAML configuration.
`lookups`	`dict`	The loaded lookup dictionaries from the YAML file.
`multi_word`	`dict`	A dictionary to handle multi-word keys.
`multi_word_lookups`	`dict`	A version of lookups with keys as concatenated words.
`counters`	`dict`	A dictionary of Counter objects for various categories.
`year_pattern`	`Pattern`	A compiled regex pattern to match 4-digit years.
`total_loctimes`	`int`	The total count of processed loctimes.

Source code in ceurws/loctime.py

class LoctimeParser:
    """
    A parser class for handling loctime lookups. This class provides methods to
    load, parse, and update loctime data using a dictionary of dictionaries structure.

    Attributes:
        filepath (str): The file path to the loctime YAML configuration.
        lookups (dict): The loaded lookup dictionaries from the YAML file.
        multi_word (dict): A dictionary to handle multi-word keys.
        multi_word_lookups (dict): A version of lookups with keys as concatenated words.
        counters (dict): A dictionary of Counter objects for various categories.
        year_pattern (re.Pattern): A compiled regex pattern to match 4-digit years.
        total_loctimes (int): The total count of processed loctimes.
    """

    def __init__(self, filepath: str | None = None):
        """
        Initializes the LoctimeParser object, setting up paths, loading lookups,
        and initializing counters and patterns.

        Args:
            filepath (Path, optional): The path to the loctime YAML file.
                                      Defaults to a predefined path if None is provided.
        Raises:
            FileNotFoundError: Raises an error if the specified YAML file does not exist.
        """
        if filepath is None:
            self.ceurws_path = CEURWS.CACHE_DIR
            self.filepath: Path = self.ceurws_path.joinpath("loctime.yaml")
        else:
            self.file_path = Path(filepath)
        self.lookups = self.load()
        self.setup()
        self.counters: dict[str, Counter] = {"4digit-year": Counter()}
        for reverse_pos in range(1, 8):
            self.counters[str(reverse_pos)] = Counter()
        for key in self.lookups:
            self.counters[key] = Counter()

        # Compile a pattern to match a 4-digit year
        self.year_pattern = re.compile(r"\b\d{4}\b")
        self.total_loctimes = 0

    def setup(self):
        """
        Prepares the parser by initializing multi-word handling and creating
        a modified version of the lookup dictionaries with keys as concatenated words.
        This method sets up the 'multi_word' and 'multi_word_lookups' dictionaries
        to facilitate the parsing process, especially for multi-word keys.
        """
        self.multi_word = {}
        for lookup in self.lookups.values():
            for key in lookup:
                if " " in key:
                    self.multi_word[key] = key.replace(" ", "_")

        # Initialize a dictionary derived from self.lookups with underscored keys
        self.multi_word_lookups = {}
        for category, lookup in self.lookups.items():
            self.multi_word_lookups[category] = {key.replace(" ", "_"): value for key, value in lookup.items()}

    def load(
        self,
    ) -> dict:
        """
        Loads the lookup data from the YAML file specified by the filepath attribute.

        This method attempts to open and read the YAML file, converting its contents
        into a dictionary. If the file is empty or does not exist, it returns an empty dictionary.

        Returns:
            dict: A dictionary representing the loaded data from the YAML file. If the file
                  is empty or non-existent, an empty dictionary is returned.

        Raises:
            FileNotFoundError: If the specified file does not exist.
            yaml.YAMLError: If there is an error parsing the YAML file.
        """
        data_dict = {}
        if os.path.isfile(self.filepath) and os.path.getsize(self.filepath) > 0:
            with open(self.filepath) as yaml_file:
                data_dict = yaml.safe_load(yaml_file)
        return data_dict

    def save(self):
        """
        Saves the current lookup dictionary to a YAML file.
        """
        os.makedirs(os.path.dirname(self.filepath), exist_ok=True)  # Ensure directory exists
        with open(self.filepath, "w", encoding="utf-8") as yaml_file:
            yaml.dump(
                self.lookups,
                yaml_file,
                default_flow_style=False,
                allow_unicode=True,
            )

    def get_parts(self, loctime):
        """
        Splits the loctime string into parts and subparts, considering multi-word entries.

        Args:
            loctime (str): The loctime string to split.

        Returns:
            list: A list of parts and subparts.
        """
        # Replace known multi-word entries with their underscore versions
        for original, underscored in self.multi_word.items():
            loctime = loctime.replace(original, underscored)

        parts = loctime.split(",")  # First, split by comma
        all_parts = []
        for part in parts:
            # Further split each part by whitespace, considering underscore as part of the word
            subparts = part.strip().split()
            all_parts.extend(subparts)  # Add all subparts to the list

        return all_parts

    def parse(self, loctime: str) -> dict:
        """
        Alternative parse of CEUR-WS loctimes using lookups

        Args:
            loctime (str): The loctime string to parse.

        """
        result = {}
        self.total_loctimes += 1
        lt_parts = self.get_parts(loctime)

        # Process each part of loctime
        for index, part in enumerate(lt_parts):
            part = part.strip()
            reverse_pos = len(lt_parts) - index  # Position from end

            found_in_lookup = False
            # Check against each lookup and update corresponding counter
            for (
                lookup_key,
                lookup_dict,
            ) in self.multi_word_lookups.items():
                if part in lookup_dict:
                    self.counters[lookup_key][part] += 1  # Increment the lookup counter
                    found_in_lookup = True
                    # set result dict
                    result[lookup_key] = part
                    break  # Break if found, assuming part can't be in multiple lookups
            if not found_in_lookup:
                # Update counter for each part's position from end
                key = str(reverse_pos)
                if key in self.counters:
                    self.counters[key][part] += 1

            # Special handling for 4-digit years
            if index == len(lt_parts) - 1 and self.year_pattern.match(part):
                self.counters["4digit-year"][part] += 1
        return result

    def update_lookup_counts(self):
        """
        to be called  ffter processing all loctimes
        and updating counters update lookup dicts with new counts
        """
        for category, counter in self.counters.items():
            if category in self.lookups:
                for underscore_key, count in counter.items():
                    # Convert underscore_key back to space-separated key
                    original_key = underscore_key.replace("_", " ")
                    if original_key in self.lookups[category]:
                        # Update the count for the original key
                        self.lookups[category][original_key] += count
                    else:
                        # Initialize count for the original key
                        self.lookups[category][original_key] = count

    def create_pareto_analysis(self, level: int = 3, outof: int = 5):
        """
        Creates a Pareto analysis for each category in the lookups and returns
        the percentage table for the distribution across the specified levels.

        Args:
            level (int): The number of segments to divide the data into within the top "outof" parts.
            outof (int): 1 out of n value e.g. on level 1 we have 1:5 which leads to
                the original pareto 80:20 percent rule, on level 2 we have 80:(20=16:4) percent
                which is equivalent to 80/96 thresholds percent on level 3 we have 80:(20=16:4=(3.2:0.8)
                percent which leads to 80%,96%,99.2% thresholds
        """
        pareto_dict = {}
        for category, counter in self.counters.items():
            # Sort items by count in descending order
            sorted_items = counter.most_common()
            total = sum(counter.values())

            # Calculate segment thresholds based on the diminishing series
            thresholds = []
            threshold = 0.0
            for _ in range(1, level + 1):
                # current range to calculate out of for
                trange = 100 - threshold  # 100/80/96/99.2 ...
                # right side of range
                right_range = trange / outof  # 20/4/0.8 ...
                # left threshold is new threshold
                threshold = 100 - right_range
                thresholds.append(threshold)
            thresholds.append(100)

            segment_counts = {threshold: 0 for threshold in thresholds}  # Initialize count dict for each segment
            segment_cutoff = {threshold: 0 for threshold in thresholds}  # Initialize count dict for each segment
            tindex = 0
            current_threshold = thresholds[tindex]
            total_pc = 0.0
            # Calculate cumulative counts for each segment
            for _, count in sorted_items:
                item_percentage = count / total * 100
                if total_pc + item_percentage > current_threshold + 0.000000000001:
                    segment_cutoff[current_threshold] = count
                    tindex += 1
                    if tindex >= len(thresholds):
                        break
                    current_threshold = thresholds[tindex]
                total_pc += item_percentage
                segment_counts[current_threshold] += count

            pareto_dict[category] = segment_cutoff
        return pareto_dict

`__init__(filepath=None)`

Initializes the LoctimeParser object, setting up paths, loading lookups, and initializing counters and patterns.

Parameters:

Name	Type	Description	Default
`filepath`	`Path`	The path to the loctime YAML file. Defaults to a predefined path if None is provided.	`None`

Raises: FileNotFoundError: Raises an error if the specified YAML file does not exist.

Source code in ceurws/loctime.py

def __init__(self, filepath: str | None = None):
    """
    Initializes the LoctimeParser object, setting up paths, loading lookups,
    and initializing counters and patterns.

    Args:
        filepath (Path, optional): The path to the loctime YAML file.
                                  Defaults to a predefined path if None is provided.
    Raises:
        FileNotFoundError: Raises an error if the specified YAML file does not exist.
    """
    if filepath is None:
        self.ceurws_path = CEURWS.CACHE_DIR
        self.filepath: Path = self.ceurws_path.joinpath("loctime.yaml")
    else:
        self.file_path = Path(filepath)
    self.lookups = self.load()
    self.setup()
    self.counters: dict[str, Counter] = {"4digit-year": Counter()}
    for reverse_pos in range(1, 8):
        self.counters[str(reverse_pos)] = Counter()
    for key in self.lookups:
        self.counters[key] = Counter()

    # Compile a pattern to match a 4-digit year
    self.year_pattern = re.compile(r"\b\d{4}\b")
    self.total_loctimes = 0

`create_pareto_analysis(level=3, outof=5)`

Creates a Pareto analysis for each category in the lookups and returns the percentage table for the distribution across the specified levels.

Parameters:

Name	Type	Description	Default
`level`	`int`	The number of segments to divide the data into within the top "outof" parts.	`3`
`outof`	`int`	1 out of n value e.g. on level 1 we have 1:5 which leads to the original pareto 80:20 percent rule, on level 2 we have 80:(20=16:4) percent which is equivalent to 80/96 thresholds percent on level 3 we have 80:(20=16:4=(3.2:0.8) percent which leads to 80%,96%,99.2% thresholds	`5`

Source code in ceurws/loctime.py

def create_pareto_analysis(self, level: int = 3, outof: int = 5):
    """
    Run a Pareto analysis over every counter.

    Args:
        level (int): The number of thresholds to derive below 100 percent.
        outof (int): 1 out of n value e.g. on level 1 we have 1:5 which leads to
            the original pareto 80:20 percent rule, on level 2 we have 80:(20=16:4) percent
            which is equivalent to 80/96 thresholds percent on level 3 we have 80:(20=16:4=(3.2:0.8)
            percent which leads to 80%,96%,99.2% thresholds

    Returns:
        dict: category -> dict mapping each threshold (plus 100) to the
            count of the first item that pushed the cumulative percentage
            over that threshold; 0 if the threshold was never exceeded.
    """
    analysis = {}
    for category, counter in self.counters.items():
        ranked = counter.most_common()  # highest count first
        grand_total = sum(counter.values())

        # diminishing series of limits: each step keeps 1/outof of the remaining range
        limits = []
        limit = 0.0
        for _ in range(level):
            remaining = 100 - limit  # 100/20/4/0.8 ...
            limit = 100 - remaining / outof
            limits.append(limit)
        limits.append(100)

        # accumulated per segment but not part of the returned result
        bucket_totals = dict.fromkeys(limits, 0)
        cutoffs = dict.fromkeys(limits, 0)
        position = 0
        active_limit = limits[position]
        cumulative_pc = 0.0
        for _, count in ranked:
            share = count / grand_total * 100
            # small epsilon so that float noise does not trigger a cutoff
            if cumulative_pc + share > active_limit + 0.000000000001:
                cutoffs[active_limit] = count
                position += 1
                if position >= len(limits):
                    break
                active_limit = limits[position]
            cumulative_pc += share
            bucket_totals[active_limit] += count

        analysis[category] = cutoffs
    return analysis

`get_parts(loctime)`

Splits the loctime string into parts and subparts, considering multi-word entries.

Parameters:

Name	Type	Description	Default
`loctime`	`str`	The loctime string to split.	required

Returns:

Name	Type	Description
`list`		A list of parts and subparts.

Source code in ceurws/loctime.py

def get_parts(self, loctime):
    """
    Break a loctime string into its parts, keeping known multi-word entries together.

    Args:
        loctime (str): The loctime string to split.

    Returns:
        list: All whitespace separated subparts of all comma separated parts, in order.
    """
    # glue known multi-word entries with underscores so the whitespace split keeps them whole
    text = loctime
    for phrase, glued in self.multi_word.items():
        text = text.replace(phrase, glued)

    # comma split first, whitespace split second
    return [token for chunk in text.split(",") for token in chunk.strip().split()]

`load()`

Loads the lookup data from the YAML file specified by the filepath attribute.

This method attempts to open and read the YAML file, converting its contents into a dictionary. If the file is empty or does not exist, it returns an empty dictionary.

Returns:

Name	Type	Description
`dict`	`dict`	A dictionary representing the loaded data from the YAML file. If the file is empty or non-existent, an empty dictionary is returned.

Raises:

Type	Description
`FileNotFoundError`	If the specified file does not exist.
`YAMLError`	If there is an error parsing the YAML file.

Source code in ceurws/loctime.py

def load(
    self,
) -> dict:
    """
    Loads the lookup data from the YAML file specified by the filepath attribute.

    If the file does not exist, is empty, or holds no YAML content
    an empty dictionary is returned.

    Returns:
        dict: The data loaded from the YAML file or an empty dictionary.

    Raises:
        yaml.YAMLError: If there is an error parsing the YAML file.
    """
    if not (os.path.isfile(self.filepath) and os.path.getsize(self.filepath) > 0):
        return {}
    # explicit encoding: do not depend on the platform's default encoding
    with open(self.filepath, encoding="utf-8") as yaml_file:
        data_dict = yaml.safe_load(yaml_file)
    # safe_load yields None for a file without content (e.g. comments only)
    return data_dict if data_dict is not None else {}

`parse(loctime)`

Alternative parse of CEUR-WS loctimes using lookups

Parameters:

Name	Type	Description	Default
`loctime`	`str`	The loctime string to parse.	required

Source code in ceurws/loctime.py

def parse(self, loctime: str) -> dict:
    """
    Alternative parse of CEUR-WS loctimes using lookups

    Every part of the loctime is counted: parts found in a lookup are
    counted under the lookup's category, all others under their position
    from the end. A trailing part starting with a 4-digit year is
    additionally counted under "4digit-year".

    Args:
        loctime (str): The loctime string to parse.

    Returns:
        dict: lookup category -> part of the loctime found in that lookup.
    """
    not_found = object()  # sentinel: no lookup contains the token
    matches = {}
    self.total_loctimes += 1
    tokens = self.get_parts(loctime)
    token_count = len(tokens)

    for position, raw_token in enumerate(tokens):
        token = raw_token.strip()

        # first lookup category that knows the token wins
        category = next(
            (name for name, table in self.multi_word_lookups.items() if token in table),
            not_found,
        )
        if category is not_found:
            # fall back to counting by position from the end
            slot = str(token_count - position)
            if slot in self.counters:
                self.counters[slot][token] += 1
        else:
            self.counters[category][token] += 1
            matches[category] = token

        # only the last token is checked for a 4-digit year
        is_last = position == token_count - 1
        if is_last and self.year_pattern.match(token):
            self.counters["4digit-year"][token] += 1
    return matches

`save()`

Saves the current lookup dictionary to a YAML file.

Source code in ceurws/loctime.py

def save(self):
    """
    Write the current lookups to the YAML file given by the filepath attribute.
    """
    # make sure the target directory is there before opening the file
    target_dir = os.path.dirname(self.filepath)
    os.makedirs(target_dir, exist_ok=True)
    dump_options = {"default_flow_style": False, "allow_unicode": True}
    with open(self.filepath, "w", encoding="utf-8") as yaml_file:
        yaml.dump(self.lookups, yaml_file, **dump_options)

`setup()`

Prepares the parser by initializing multi-word handling and creating a modified version of the lookup dictionaries with keys as concatenated words. This method sets up the 'multi_word' and 'multi_word_lookups' dictionaries to facilitate the parsing process, especially for multi-word keys.

Source code in ceurws/loctime.py

def setup(self):
    """
    Prepares the parser by initializing multi-word handling and creating
    a modified version of the lookup dictionaries with keys as concatenated words.
    This method sets up the 'multi_word' and 'multi_word_lookups' dictionaries
    to facilitate the parsing process, especially for multi-word keys.
    """
    self.multi_word = {}
    for lookup in self.lookups.values():
        for key in lookup:
            if " " in key:
                self.multi_word[key] = key.replace(" ", "_")

    # Initialize a dictionary derived from self.lookups with underscored keys
    self.multi_word_lookups = {}
    for category, lookup in self.lookups.items():
        self.multi_word_lookups[category] = {key.replace(" ", "_"): value for key, value in lookup.items()}

`update_lookup_counts()`

To be called after processing all loctimes and updating the counters: updates the lookup dicts with the new counts.

Source code in ceurws/loctime.py

def update_lookup_counts(self):
    """
    To be called after processing all loctimes: adds the counts collected
    in the category counters to the lookup dictionaries.
    """
    for category, counter in self.counters.items():
        # counters without a lookup of the same name are left alone
        if category not in self.lookups:
            continue
        table = self.lookups[category]
        for underscored, seen in counter.items():
            # lookups are keyed by the space separated form
            spaced = underscored.replace("_", " ")
            if spaced not in table:
                table[spaced] = seen
            else:
                table[spaced] += seen

`PercentageTable`

A class for creating a table that displays values and their corresponding percentages of a total.

Attributes:

Name	Type	Description
`total`	`float`	The total value used for calculating percentages.
`column_title`	`str`	The title for the first column in the table.
`digits`	`int`	The number of decimal places for rounding percentages.
`rows`	`list`	A list of dictionaries representing rows in the table.

Source code in ceurws/loctime.py

class PercentageTable:
    """
    Table of values shown together with their percentage share of a total.

    Attributes:
        total (float): The total value used for calculating percentages.
        column_title (str): The title for the first column in the table.
        digits (int): The number of decimal places for rounding percentages.
        rows (list): A list of dictionaries representing rows in the table.
    """

    def __init__(self, column_title: str, total: float, digits: int):
        """
        Set up the table and add the leading "Total" row (100 %).

        Args:
            column_title (str): The title for the first column.
            total (float): The total value for calculating percentages.
            digits (int): The precision for percentage values.
        """
        self.column_title = column_title
        self.total = total
        self.digits = digits
        total_row = {column_title: "Total", "#": total, "%": 100.0}
        self.rows = [total_row]

    def add_value(self, row_title: str, value: float):
        """
        Append a row holding the value and its rounded percentage of the total.

        Args:
            row_title (str): The title for the row.
            value (float): The value for the row, which is used to calculate its percentage of the total.
        """
        if self.total:
            percentage = round((value / self.total) * 100, self.digits)
        else:
            # a zero (or otherwise falsy) total cannot be divided by
            percentage = 0
        new_row = {self.column_title: row_title, "#": value, "%": percentage}
        self.rows.append(new_row)

    def generate_table(self, tablefmt="grid") -> str:
        """
        Render the rows with the tabulate library.

        Args:
            tablefmt: The tabulate table format to use.

        Returns:
            str: The string representation of the table with headers and formatted rows,
                or an empty string if there are no rows.
        """
        if self.rows:
            return tabulate(
                self.rows,
                headers="keys",
                tablefmt=tablefmt,
                floatfmt=f".{self.digits}f",
            )
        return ""

`__init__(column_title, total, digits)`

Initializes the PercentageTable with a title for the column, a total value, and specified precision for percentages.

Parameters:

Name	Type	Description	Default
`column_title`	`str`	The title for the first column.	required
`total`	`float`	The total value for calculating percentages.	required
`digits`	`int`	The precision for percentage values.	required

Source code in ceurws/loctime.py

def __init__(self, column_title: str, total: float, digits: int):
    """
    Set up the table and add the leading "Total" row (100 %).

    Args:
        column_title (str): The title for the first column.
        total (float): The total value for calculating percentages.
        digits (int): The precision for percentage values.
    """
    self.column_title = column_title
    self.total = total
    self.digits = digits
    total_row = {column_title: "Total", "#": total, "%": 100.0}
    self.rows = [total_row]

`add_value(row_title, value)`

Adds a row to the table with the given title and value, calculating the percentage of the total.

Parameters:

Name	Type	Description	Default
`row_title`	`str`	The title for the row.	required
`value`	`float`	The value for the row, which is used to calculate its percentage of the total.	required

Source code in ceurws/loctime.py

def add_value(self, row_title: str, value: float):
    """
    Append a row holding the value and its rounded percentage of the total.

    Args:
        row_title (str): The title for the row.
        value (float): The value for the row, which is used to calculate its percentage of the total.
    """
    if self.total:
        percentage = round((value / self.total) * 100, self.digits)
    else:
        # a zero (or otherwise falsy) total cannot be divided by
        percentage = 0
    new_row = {self.column_title: row_title, "#": value, "%": percentage}
    self.rows.append(new_row)

`generate_table(tablefmt='grid')`

Generates a string representation of the table using the tabulate library.

Returns:

Name	Type	Description
`str`	`str`	The string representation of the table with headers and formatted rows.

Source code in ceurws/loctime.py

def generate_table(self, tablefmt="grid") -> str:
    """
    Render the rows with the tabulate library.

    Args:
        tablefmt: The tabulate table format to use.

    Returns:
        str: The string representation of the table with headers and formatted rows,
            or an empty string if there are no rows.
    """
    if self.rows:
        return tabulate(
            self.rows,
            headers="keys",
            tablefmt=tablefmt,
            floatfmt=f".{self.digits}f",
        )
    return ""

`models`

`ceur`

Created on 2024-03-17

CEUR Workshop Proceedings (CEUR-WS.org)

Metamodel @author: wf

`Paper`

Bases: SQLModel

Represents a paper with details such as authors, volume number, and title.

Source code in ceurws/models/ceur.py

class Paper(SQLModel, table=True):  # type: ignore
    """
    Represents a paper with details such as authors, volume number, and title.

    Rows are stored in the ``papers`` table with ``id`` as primary key;
    ``vol_number`` is the only column declared with ``index=True``.
    """

    __tablename__ = "papers"
    authors: str | None = Field(default=None, index=False)
    vol_number: int | None = Field(default=None, index=True)
    pdf_name: str | None = Field(default=None, index=False)
    # NOTE(review): presumably of the form "Vol-<number>/<paper id>" as built
    # by PaperTocParser.parsePapers - confirm
    id: str = Field(primary_key=True)
    title: str | None = Field(default=None, index=False)
    pages: str | None = Field(default=None, index=False)
    # NOTE(review): presumably the parse failure note that
    # PaperTocParser.parsePapers stores under "fail" - confirm
    fail: str | None = Field(default=None, index=False)

`Volume`

Bases: SQLModel

a single CEUR-WS Volume

Source code in ceurws/models/ceur.py

class Volume(SQLModel, table=True):  # type: ignore
    """
    a single CEUR-WS Volume

    Rows are stored in the ``volumes`` table with ``number`` as primary key;
    every other column is optional and defaults to None.
    """

    __tablename__ = "volumes"

    fromLine: int | None = Field(default=None)
    toLine: int | None = Field(default=None)
    valid: int | None = Field(default=None)
    url: str | None = Field(default=None)
    acronym: str | None = Field(default=None)
    title: str | None = Field(default=None)
    seealso: str | None = Field(default=None)
    editors: str | None = Field(default=None)
    submittedBy: str | None = Field(default=None)
    published: str | None = Field(default=None)
    pubDate: datetime | None = Field(default=None)
    # the volume number is the primary key of the table
    number: int = Field(primary_key=True)
    archive: str | None = Field(default=None)
    desc: str | None = Field(alias="description", default=None)  # 'desc' is a SQL keyword, so it's aliased
    h1: str | None = Field(default=None)
    h3: str | None = Field(default=None)
    volname: str | None = Field(default=None)
    homepage: str | None = Field(default=None)
    year: str | None = Field(default=None)
    urn: str | None = Field(default=None)
    # vol_number: Optional[int] = Field(default=None)
    loctime: str | None = Field(default=None)
    # NOTE(review): a string here, while the primary key ``number`` is an int - confirm this is intended
    volume_number: str | None = Field(default=None)
    voltitle: str | None = Field(default=None)
    dateFrom: date | None = Field(default=None)
    dateTo: date | None = Field(default=None)
    city: str | None = Field(default=None)
    cityWikidataId: str | None = Field(default=None)
    country: str | None = Field(default=None)
    countryWikidataId: str | None = Field(default=None)
    urn_check_digit: int | None = Field(default=None)
    # NOTE(review): stored as int - presumably used as a 0/1 flag, like valid and virtualEvent; confirm
    urn_ok: int | None = Field(default=None)
    ceurpubdate: str | None = Field(default=None)
    colocated: str | None = Field(default=None)
    virtualEvent: int | None = Field(default=None)
    tdtitle: str | None = Field(default=None)

`dblp`

Created on 2023 @author: Tim Holzheim

refactored 2024-03-09 by wf

`DblpPaper`

a paper indexed by dblp.org

Source code in ceurws/models/dblp.py

@lod_storable
class DblpPaper:
    """
    a paper indexed by dblp.org

    The four leading fields are required; ``authors`` defaults to an empty
    list and ``pdf_id`` to None. Authors handed in as plain dicts are
    converted to DblpScholar instances after initialization.
    """

    dblp_publication_id: str
    dblp_proceeding_id: str
    volume_number: int
    title: str
    authors: list[DblpScholar] | None = field(default_factory=list)
    pdf_id: str | None = None

    def __post_init__(self):
        """
        replace dict entries of the authors list in place by DblpScholar instances
        """
        # authors may explicitly be None (see the annotation), so guard the
        # iteration the same way DblpProceeding guards its lists
        if self.authors:
            for i, author in enumerate(self.authors):
                if isinstance(author, dict):
                    self.authors[i] = DblpScholar(**author)

`DblpProceeding`

a proceeding indexed by dblp.org

Source code in ceurws/models/dblp.py

@lod_storable
class DblpProceeding:
    """
    a proceeding indexed by dblp.org

    Editors and papers handed in as plain dicts are converted to
    DblpScholar and DblpPaper instances after initialization.
    """

    dblp_publication_id: str
    volume_number: int
    title: str
    dblp_event_id: str | None = None
    papers: list[DblpPaper] | None = field(default_factory=list)
    editors: list[DblpScholar] | None = field(default_factory=list)

    def __post_init__(self):
        """
        replace dict entries of editors and papers in place by their typed counterparts
        """
        for entries, clazz in ((self.editors, DblpScholar), (self.papers, DblpPaper)):
            if not entries:
                # None or empty list: nothing to convert
                continue
            for pos, entry in enumerate(entries):
                if isinstance(entry, dict):
                    entries[pos] = clazz(**entry)

`DblpScholar`

a scholar indexed by dblp.org

example: Tim Berners-Lee https://dblp.org/pid/b/TimBernersLee.html

Source code in ceurws/models/dblp.py

@lod_storable
class DblpScholar:
    """
    a scholar indexed by dblp.org

    example: Tim Berners-Lee
    https://dblp.org/pid/b/TimBernersLee.html

    Only ``dblp_author_id`` is required; the label and the other
    identifiers default to None.
    """

    dblp_author_id: str
    label: str | None = None
    # optional identifiers - judging by the field names these refer to
    # Wikidata, ORCID and GND; TODO confirm the expected formats
    wikidata_id: str | None = None
    orcid_id: str | None = None
    gnd_id: str | None = None

`dblp2`

Created on 2024-03-16

@author: wf

`Authorship`

Bases: SQLModel

Represents the relationship between a scholar and a paper, capturing the authorship details.

Source code in ceurws/models/dblp2.py

class Authorship(SQLModel, table=True):  # type: ignore
    """
    Represents the relationship between a scholar and a paper, capturing the authorship details.

    Both columns are declared as primary key, so together they form a composite key.
    """

    # foreign key to the ``paper`` column of the ``paper`` table
    paper: str = Field(foreign_key="paper.paper", primary_key=True)
    # foreign key to the ``dblp_author_id`` column of the ``scholar`` table
    dblp_author_id: str = Field(foreign_key="scholar.dblp_author_id", primary_key=True)

`Editorship`

Bases: SQLModel

Represents the relationship between a scholar and a proceeding, indicating the scholar's role as an editor.

Source code in ceurws/models/dblp2.py

class Editorship(SQLModel, table=True):  # type: ignore
    """
    Represents the relationship between a scholar and a proceeding, indicating the scholar's role as an editor.

    Both columns are declared as primary key, so together they form a composite key.
    """

    # foreign key to the ``volume_number`` column of the ``proceeding`` table
    volume_number: int = Field(foreign_key="proceeding.volume_number", primary_key=True)
    # foreign key to the ``dblp_author_id`` column of the ``scholar`` table
    dblp_author_id: str = Field(foreign_key="scholar.dblp_author_id", primary_key=True)

`Paper`

Bases: SQLModel

A paper indexed in DBLP with additional details. The paper URL is used as the unique identifier.

Source code in ceurws/models/dblp2.py

class Paper(SQLModel, table=True):  # type: ignore
    """
    A paper indexed in DBLP with additional details. The paper URL is used as the unique identifier.
    """

    # primary key of the table
    paper: str = Field(primary_key=True)
    # foreign key to the ``proceeding`` column of the ``proceeding`` table
    proceeding: str | None = Field(foreign_key="proceeding.proceeding")
    # NOTE(review): declared as str here while Proceeding.volume_number is an int - confirm this is intended
    volume_number: str = Field(index=True)
    title: str
    pdf_url: str | None = None

`Proceeding`

Bases: SQLModel

A proceeding indexed in DBLP with additional details.

Source code in ceurws/models/dblp2.py

class Proceeding(SQLModel, table=True):  # type: ignore
    """
    A proceeding indexed in DBLP with additional details.
    """

    # primary key of the table; referenced by Paper.proceeding
    proceeding: str = Field(primary_key=True)
    # indexed; referenced by Editorship.volume_number
    volume_number: int = Field(index=True)
    title: str
    dblp_event_id: str | None = None

`Scholar`

Bases: SQLModel

Represents a scholar with information fetched from DBLP and possibly other sources.

Source code in ceurws/models/dblp2.py

class Scholar(SQLModel, table=True):  # type: ignore
    """
    Represents a scholar with information fetched from DBLP and possibly other sources.
    """

    # primary key of the table; referenced by Authorship and Editorship
    dblp_author_id: str = Field(primary_key=True)
    label: str | None = None
    # optional identifiers - judging by the field names these refer to
    # Wikidata, ORCID and GND; TODO confirm the expected formats
    wikidata_id: str | None = None
    orcid_id: str | None = None
    gnd_id: str | None = None

`namedqueries`

Created on 2023-03-21

@author: wf

`NamedQueries`

get named queries

Source code in ceurws/namedqueries.py

class NamedQueries:
    """
    get named queries
    """

    def __init__(self, wikiId: str = "cr"):
        """
        constructor

        Args:
            wikiId (str): the id of the wiki to connect to
        """
        self.wikiId = wikiId
        client = WikiClient.ofWikiId(wikiId)
        self.wikiClient = client
        if client.needsLogin():
            client.login()
        self.smw = SMWClient(client.getSite())
        # filled by toQueryManager()
        self.qm: QueryManager | None = None

    def query(self):
        """
        run the ask query for Concept:Query and keep the result in self.q_records
        """
        ask_lines = [
            "",
            "        {{#ask: [[Concept:Query]]",
            "|mainlabel=Query",
            "|?Query id = id",
            "|?Query name=name",
            "|?Query title = title",
            "|?Query tryiturl = tryiturl",
            "|?Query wdqsurl = wdqsurl",
            "|?Query sparql=sparql",
            "|?Query relevance = relevance",
            "|?Query task = task",
            "|limit=200",
            "|sort=Query task,Query id",
            "|order=ascending",
            "}}",
        ]
        # joined with newlines this is the multi-line ask markup
        self.q_records = self.smw.query("\n".join(ask_lines))

    def toQueryManager(self) -> QueryManager:
        """
        convert me to a QueryManager

        Records lacking a name or a sparql text are left out.
        Requires query() to have been run, since it reads self.q_records.
        """
        manager = QueryManager(lang="sparql")
        self.qm = manager
        manager.queriesByName = {}
        for record in self.q_records.values():
            name, sparql = record["name"], record["sparql"]
            if not (name and sparql):
                continue
            manager.queriesByName[name] = Query(name, query=sparql)
        return manager

    def toYaml(self) -> str:
        """
        render the named queries as YAML markup

        Runs the query and builds the QueryManager first if that has not happened yet.

        Returns:
            str: a header comment followed by one entry per query with its
                sparql text as a literal block
        """
        qm = self.qm
        if qm is None:
            self.query()
            qm = self.toQueryManager()
        parts = ["# named queries\n"]
        for named_query in qm.queriesByName.values():
            parts.append(f"'{named_query.name}':\n    sparql: |\n")
            parts.extend(f"      {sparql_line}\n" for sparql_line in named_query.query.split("\n"))
        return "".join(parts)

`__init__(wikiId='cr')`

Source code in ceurws/namedqueries.py

def __init__(self, wikiId: str = "cr"):
    """
    constructor

    Args:
        wikiId (str): the id of the wiki to connect to
    """
    self.wikiId = wikiId
    client = WikiClient.ofWikiId(wikiId)
    self.wikiClient = client
    if client.needsLogin():
        client.login()
    self.smw = SMWClient(client.getSite())
    # filled by toQueryManager()
    self.qm: QueryManager | None = None

`query()`

run query

Source code in ceurws/namedqueries.py

    def query(self):
        """
        run the ask query for Concept:Query and keep the result in self.q_records
        """
        ask_lines = [
            "",
            "        {{#ask: [[Concept:Query]]",
            "|mainlabel=Query",
            "|?Query id = id",
            "|?Query name=name",
            "|?Query title = title",
            "|?Query tryiturl = tryiturl",
            "|?Query wdqsurl = wdqsurl",
            "|?Query sparql=sparql",
            "|?Query relevance = relevance",
            "|?Query task = task",
            "|limit=200",
            "|sort=Query task,Query id",
            "|order=ascending",
            "}}",
        ]
        # joined with newlines this is the multi-line ask markup
        self.q_records = self.smw.query("\n".join(ask_lines))

`toQueryManager()`

convert me to a QueryManager

Source code in ceurws/namedqueries.py

def toQueryManager(self) -> QueryManager:
    """
    convert me to a QueryManager

    Records lacking a name or a sparql text are left out.
    Requires query() to have been run, since it reads self.q_records.
    """
    manager = QueryManager(lang="sparql")
    self.qm = manager
    manager.queriesByName = {}
    for record in self.q_records.values():
        name, sparql = record["name"], record["sparql"]
        if not (name and sparql):
            continue
        manager.queriesByName[name] = Query(name, query=sparql)
    return manager

`papertocparser`

Created on 2023-03-22

@author: wf

`PaperTocParser`

Bases: Textparser

parser for paper table of contents

Source code in ceurws/papertocparser.py

class PaperTocParser(Textparser):
    """
    parser for paper table of contents
    """

    def __init__(self, number: str, soup: BeautifulSoup, debug: bool = False):
        """
        constructor

        Args:
            number (str): the volume number
            soup (BeautifulSoup): the parser input
            debug (bool): if True print out debug info
        """
        Textparser.__init__(self, debug=debug)
        self.number = number
        self.soup = soup
        self.scrape = WebScrape()
        # span classes to pick up for each paper of a CEURTOC style table of contents
        self.scrapeDescr = [
            ScrapeDescription(key="title", tag="span", attribute="class", value="CEURTITLE"),
            ScrapeDescription(
                key="authors",
                tag="span",
                attribute="class",
                value="CEURAUTHOR",
                multi=True,
            ),
            ScrapeDescription(key="pages", tag="span", attribute="class", value="CEURPAGES"),
            # ScrapeDescription(key='submitted_papers', tag='span', attribute='class', value='CEURSUBMITTEDPAPERS'),
            # ScrapeDescription(key='accepted_papers', tag='span', attribute='class', value='CEURACCEPTEDPAPERS'),
        ]

    def parsePapers(self):
        """
        parse the toc to papers

        Two page layouts are handled:

        * an element with class ``CEURTOC``: every ``li`` below it is
          scraped with the configured scrape descriptions
        * otherwise an ``h2`` heading containing "Contents": every ``li``
          of the page that links to a pdf is taken as a paper

        Returns:
            list: one dict per paper - empty if no table of contents is found
        """
        paper_records = []
        toc = self.soup.find(attrs={"class": "CEURTOC"})
        if toc:
            seen_ids = set()
            # find_all instead of the legacy findAll alias - the same call
            # that is used for the fallback layout below
            for index, paper_li in enumerate(toc.find_all("li")):
                paper_record = self.scrape.parseWithScrapeDescription(paper_li, self.scrapeDescr)
                paper_record["vol_number"] = self.number
                href_node = paper_li.find("a", href=True)
                if href_node:
                    paper_record["pdf_name"] = Textparser.sanitize(href_node.attrs["href"])
                if "id" in paper_li.attrs:
                    paper_id = paper_li.attrs["id"]
                    if paper_id in seen_ids:
                        # keep the ids unique within the volume
                        paper_id = f"{paper_id}-duplicate-{index}"
                    seen_ids.add(paper_id)
                    paper_record["id"] = f"Vol-{self.number}/{paper_id}"
                paper_records.append(paper_record)
            return paper_records

        toc = self.soup.find("h2", string=re.compile(".*Contents.*"))
        if not toc:
            if self.debug:
                print(f"no toc for {self.number}")
            return paper_records

        index = 0
        for paper_li in self.soup.find_all("li", recursive=True):
            href_node = paper_li.find("a", href=True)
            if not href_node:
                continue
            href = Textparser.sanitize(href_node.attrs["href"])
            if ".pdf" not in href:
                continue
            index += 1
            paper_record = {
                "vol_number": self.number,
                "title": Textparser.sanitize(href_node.text),
                "pdf_name": href,
                "id": f"Vol-{self.number}/paper-{index}",
            }
            authors = ""
            # authors are after next br tag
            br = paper_li.find("br")
            author_part = br.next_sibling if br else None
            if author_part:
                authors = author_part.text
            else:
                paper_record["fail"] = "authors br not found"
            authors = Textparser.sanitize(authors)
            paper_record["authors"] = [author.strip() for author in authors.split(",")]
            paper_records.append(paper_record)
        return paper_records

`__init__(number, soup, debug=False)`

constructor

Parameters:

Name	Description	Default
`number(str)`	the volume number	required
`soup(BeautifulSoup)`	the parser input	required
`debug(bool)`	if True print out debug info	`False`

Source code in ceurws/papertocparser.py

def __init__(self, number: str, soup: BeautifulSoup, debug: bool = False):
    """
    constructor

    Args:
        number (str): the volume number
        soup (BeautifulSoup): the parser input
        debug (bool): if True print out debug info
    """
    Textparser.__init__(self, debug=debug)
    self.number = number
    self.soup = soup
    self.scrape = WebScrape()
    # span classes to pick up for each paper of a CEURTOC style table of contents
    title_descr = ScrapeDescription(key="title", tag="span", attribute="class", value="CEURTITLE")
    authors_descr = ScrapeDescription(
        key="authors",
        tag="span",
        attribute="class",
        value="CEURAUTHOR",
        multi=True,
    )
    pages_descr = ScrapeDescription(key="pages", tag="span", attribute="class", value="CEURPAGES")
    # not scraped (yet): CEURSUBMITTEDPAPERS and CEURACCEPTEDPAPERS spans
    self.scrapeDescr = [title_descr, authors_descr, pages_descr]

`parsePapers()`

parse the toc to papers

Source code in ceurws/papertocparser.py

def parsePapers(self):
    """
    parse the toc to papers

    Two page layouts are handled:

    * an element with class ``CEURTOC``: every ``li`` below it is
      scraped with the configured scrape descriptions
    * otherwise an ``h2`` heading containing "Contents": every ``li``
      of the page that links to a pdf is taken as a paper

    Returns:
        list: one dict per paper - empty if no table of contents is found
    """
    paper_records = []
    toc = self.soup.find(attrs={"class": "CEURTOC"})
    if toc:
        seen_ids = set()
        # find_all instead of the legacy findAll alias - the same call
        # that is used for the fallback layout below
        for index, paper_li in enumerate(toc.find_all("li")):
            paper_record = self.scrape.parseWithScrapeDescription(paper_li, self.scrapeDescr)
            paper_record["vol_number"] = self.number
            href_node = paper_li.find("a", href=True)
            if href_node:
                paper_record["pdf_name"] = Textparser.sanitize(href_node.attrs["href"])
            if "id" in paper_li.attrs:
                paper_id = paper_li.attrs["id"]
                if paper_id in seen_ids:
                    # keep the ids unique within the volume
                    paper_id = f"{paper_id}-duplicate-{index}"
                seen_ids.add(paper_id)
                paper_record["id"] = f"Vol-{self.number}/{paper_id}"
            paper_records.append(paper_record)
        return paper_records

    toc = self.soup.find("h2", string=re.compile(".*Contents.*"))
    if not toc:
        if self.debug:
            print(f"no toc for {self.number}")
        return paper_records

    index = 0
    for paper_li in self.soup.find_all("li", recursive=True):
        href_node = paper_li.find("a", href=True)
        if not href_node:
            continue
        href = Textparser.sanitize(href_node.attrs["href"])
        if ".pdf" not in href:
            continue
        index += 1
        paper_record = {
            "vol_number": self.number,
            "title": Textparser.sanitize(href_node.text),
            "pdf_name": href,
            "id": f"Vol-{self.number}/paper-{index}",
        }
        authors = ""
        # authors are after next br tag
        br = paper_li.find("br")
        author_part = br.next_sibling if br else None
        if author_part:
            authors = author_part.text
        else:
            paper_record["fail"] = "authors br not found"
        authors = Textparser.sanitize(authors)
        paper_record["authors"] = [author.strip() for author in authors.split(",")]
        paper_records.append(paper_record)
    return paper_records

`services`

`entity_fishing`

`CeurEntityFishing`

EntityFishing component for spaCy pipeline. modified version of https://github.com/Lucaterre/spacyfishing/blob/main/spacyfishing/entity_fishing_linker.py

Source code in ceurws/services/entity_fishing.py

@Language.factory(
    name=ENTITY_FISHING_PIPELINE,
    default_config={
        "api_ef_base": f"{ENTITY_FISHING_ENDPOINT}/service",
        "language": "en",
        "extra_info": False,
        "filter_statements": [],
        "verbose": False,
    },
)
class CeurEntityFishing:
    """
    EntityFishing component for spaCy pipeline.
    modified version of https://github.com/Lucaterre/spacyfishing/blob/main/spacyfishing/entity_fishing_linker.py
    """

    def __init__(
        self,
        nlp: Language,
        name: str,
        api_ef_base: str,
        language: str,
        extra_info: bool,
        filter_statements: list,
        verbose: bool,
    ):
        """
        Store the settings of the linker and register the spaCy extensions it writes to.

        Note:
            The default values of the settings are listed in the default config of the factory.

        Parameters:
            nlp (Language): pipeline the component is created for (not used here).
            name (str): name of the component instance (not used here).
            api_ef_base (str): url of the entity-fishing API used;
                a trailing "/" is appended when it is missing.
            language (str): language of the resources to be disambiguated
                (matches the language model for the NER task).
            extra_info (bool): attach extra information to spans:
                normalised term, description, other knowledge base ids.
            filter_statements (list): property ids used to filter the other
                knowledge base ids, eg. ['P214', 'P244'].
            verbose (bool): display logging messages.

        Attributes:
            api_ef_base (str): cf. `api_ef_base` in parameters section, always ending with "/".
            language (dict): cf. `language` in parameters section,
                wrapped as `{"lang": language}` for the queries.
            wikidata_url_base (str): wikidata base url (to concatenate QID identifiers).
            flag_extra (bool): cf. `extra_info` in parameters section.
            filter_statements (list): cf. `filter_statements` in parameters section.
            verbose (bool): cf. `verbose` in parameters section.
        """
        self.api_ef_base = api_ef_base if api_ef_base.endswith("/") else api_ef_base + "/"
        self.language = {"lang": language}
        self.wikidata_url_base = "https://www.wikidata.org/wiki/"

        self.flag_extra = extra_info
        self.filter_statements = filter_statements
        self.verbose = verbose

        # Doc extensions: hold the raw response of the Entity-Fishing API and the request metadata
        for doc_attribute in ("annotations", "metadata"):
            Doc.set_extension(doc_attribute, default={}, force=True)

        # Span extensions: the first four are the default linking attributes,
        # the last four are only filled when extra_info is set to True
        span_attributes = (
            "kb_qid",
            "wikipedia_page_ref",
            "url_wikidata",
            "nerd_score",
            "normal_term",
            "description",
            "src_description",
            "other_ids",
        )
        for span_attribute in span_attributes:
            Span.set_extension(span_attribute, default=None, force=True)

    @staticmethod
    def generic_client_batch(
        method: str,
        url_batch: list[str],
        verbose: bool,
        params: dict | None = None,
        files_batch: list[dict] | None = None,
    ) -> list[requests.Response]:
        """
        Send one HTTP request per url concurrently and return the responses in request order.

        The i-th url is sent together with the i-th entry of files_batch. The responses are
        returned in the order of url_batch (not in completion order), so callers can pair
        them with their inputs. A request that raises instead of returning a response is left
        out of the result, in which case the returned list is shorter than url_batch.

        :param method: "POST" sends POST requests, any other value sends GET requests
        :type method: str
        :param url_batch: a list of urls to send requests to
        :type url_batch: list[str]
        :param verbose: if True, failed requests and known error status codes are logged as warnings
        :type verbose: bool
        :param params: query parameters added to every request, defaults to no parameters
        :type params: dict
        :param files_batch: one dictionary per url, passed as `files` of the request;
            defaults to an empty dictionary per url
        :type files_batch: list[dict]
        :return: A list of responses.
        """
        request_params = {} if params is None else params
        if files_batch is None:
            files_batch = [{} for _ in url_batch]

        def client_log(msg: str) -> None:
            if verbose:
                logging.warning(msg)

        def load_url(type_url, type_files):
            send = requests.post if method == "POST" else requests.get
            return send(
                url=type_url, headers={"Accept": "application/json"}, files=type_files, params=request_params
            )

        with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
            futures = [
                executor.submit(load_url, type_url, type_files)
                for type_url, type_files in zip(url_batch, files_batch, strict=False)
            ]

        # Leaving the executor block waits for every request. The futures are read in
        # submission order because callers match the responses with their inputs by position.
        response_batch = []
        for idx, future in enumerate(futures):
            try:
                response_batch.append(future.result())
            except Exception as ex:
                # best effort: a failed request is reported but does not abort the batch
                client_log(f"Request {idx}. No response received: {ex}")

        # Manage response status code :
        # cf. https://nerd.readthedocs.io/en/latest/restAPI.html#response-status-codes
        status_messages = {
            400: "Wrong request, missing parameters, missing header, text too short (<= 5 characters). (400)",
            500: "Entity-Fishing API service seems broken. (500)",
            404: "Property was not found in request body. (404)",
            406: "Language is not supported by Entity-Fishing. (406)",
        }
        for idx, response in enumerate(response_batch):
            status_message = status_messages.get(response.status_code)
            if status_message is not None:
                client_log(f"Request {idx}. {status_message}")

        return response_batch

    @staticmethod
    def process_response(response: requests.models.Response) -> tuple[dict, dict]:
        """
        Split a response of the `requests` library into its JSON payload and its metadata.

        :param response: The response object returned by the requests library
        :type response: requests.models.Response
        :return: A tuple of two dictionaries: the decoded JSON body (an empty dictionary when
            the body is not valid JSON or is not a JSON object) and the metadata of the
            response (status code, reason, ok flag and encoding).
        """
        try:
            res_json = response.json()
        except ValueError:
            # ValueError is the common base of the JSON decode errors
            # (stdlib json and simplejson based ones alike)
            res_json = {}
        if not isinstance(res_json, dict):
            # callers index the payload by key, so anything but a JSON object counts as empty
            res_json = {}

        metadata = {
            "status_code": response.status_code,
            "reason": response.reason,
            "ok": response.ok,
            "encoding": response.encoding,
        }

        return res_json, metadata

    @staticmethod
    def prepare_data(text: str, terms: str, entities: list, language: dict, full: bool = False) -> dict:
        """
        Build the payload of a disambiguation query.

        :param text: The text to be analyzed
        :type text: str
        :param terms: the terms to be searched for, sent as "shortText"
        :type terms: str
        :param entities: entities of the text; `text`, `start_char` and `end_char` are read from each
        :type entities: list
        :param language: the language of the text
        :type language: dict
        :param full: sent as the string "true" or "false" in the "full" field, defaults to False
        :type full: bool (optional)
        :return: A dictionary with the single key "query" holding the JSON encoded query.
        """
        entity_records = [
            {
                "rawName": ent.text,
                "offsetStart": ent.start_char,
                "offsetEnd": ent.end_char,
            }
            for ent in entities
        ]
        full_flag = "false"
        if full:
            full_flag = "true"
        query = {
            "text": text,
            "shortText": terms,
            "language": language,
            "entities": entity_records,
            "mentions": [],
            "customisation": "generic",
            "full": full_flag,
        }
        # ensure_ascii=False keeps non-ASCII characters unescaped in the JSON text
        return {"query": json.dumps(query, ensure_ascii=False)}

    def updated_entities(self, doc: Doc, response: list) -> None:
        """
        Copy the linking information of every returned entity onto the matching span of the doc.

        For each entity the span is looked up by its character offsets. Every piece of
        information missing from the entity is skipped on its own. The extra information is
        only looked up when the entity has a "wikipediaExternalRef" and `flag_extra` is set.
        An entity without "offsetStart"/"offsetEnd" raises KeyError to the caller.

        :param doc: the document to be processed
        :type doc: Doc
        :param response: the entities returned by the NERD API
        :type response: list
        """
        for ef_entity in response:
            try:
                target = doc.char_span(start_idx=ef_entity["offsetStart"], end_idx=ef_entity["offsetEnd"])
                try:
                    target._.kb_qid = str(ef_entity["wikidataId"])
                    target._.url_wikidata = self.wikidata_url_base + target._.kb_qid
                except KeyError:
                    pass
                try:
                    target._.wikipedia_page_ref = str(ef_entity["wikipediaExternalRef"])
                    # if flag_extra : search other info on entity
                    # => attach extra entity info to span
                    if self.flag_extra:
                        self.look_extra_informations_on_entity(target, ef_entity)
                except KeyError:
                    pass
                try:
                    target._.nerd_score = ef_entity["confidence_score"]
                except KeyError:
                    pass
            except AttributeError:
                # presumably char_span found no span for the offsets and returned None
                # - TODO confirm; the entity is skipped
                continue

    # ~ Entity-fishing call service methods ~:
    def concept_look_up_batch(self, wiki_id_batch: list[str] | str) -> list[requests.Response]:
        """
        Look up a batch of wikipedia ids with the "kb/concept/" service of the API.

        One GET request is sent per id, with the configured language as query parameters.

        :param wiki_id_batch: a list of wikipedia ids; a single id given as plain string is
            treated as a batch of one id (instead of one lookup per character)
        :type wiki_id_batch: list[str] | str
        :return: A list of requests.Response objects.
        """
        if isinstance(wiki_id_batch, str):
            wiki_id_batch = [wiki_id_batch]
        # formatting instead of "+" also accepts ids that are not strings, e.g. numeric page ids
        url_concept_lookup_batch = [f"{self.api_ef_base}kb/concept/{wiki_id}" for wiki_id in wiki_id_batch]
        return self.generic_client_batch(
            method="GET", url_batch=url_concept_lookup_batch, params=self.language, verbose=self.verbose
        )

    def disambiguate_text_batch(self, files_batch: list[dict]) -> list[requests.Response]:
        """
        POST every entry of the batch to the "disambiguate" service of the API.

        :param files_batch: a list of dictionaries, one per request, handed to
            `generic_client_batch` as its `files_batch`
        :type files_batch: list[dict]
        :return: A list of responses.
        """
        endpoint = self.api_ef_base + "disambiguate"
        return self.generic_client_batch(
            method="POST",
            # every entry goes to the same endpoint
            url_batch=[endpoint for _ in files_batch],
            files_batch=files_batch,
            verbose=self.verbose,
        )

    def look_extra_informations_on_entity(self, span: Span, res_desc: dict) -> None:
        """
        Copy the extra information of an entity description onto the span.

        Sets `normal_term`, `description`, `src_description` and `other_ids`; each of them is
        left untouched when the description lacks the information. A statement that misses
        one of "propertyName", "propertyId" or "value" is skipped without dropping the others.

        :param span: The Span object that the extension is being applied to
        :type span: Span
        :param res_desc: the description of the entity
        :type res_desc: dict
        """
        # normalised term name
        with contextlib.suppress(KeyError):
            span._.normal_term = res_desc["preferredTerm"]
        # description and source description (first definition only)
        with contextlib.suppress(KeyError, IndexError):
            first_definition = res_desc["definitions"][0]
            span._.description = first_definition["definition"]
            span._.src_description = first_definition["source"]
        # others identifiers attach to QID
        # in Wikidata KB with filter properties or not
        if "statements" not in res_desc:
            return
        ids = []
        for content in res_desc["statements"]:
            try:
                new_id = {k: content[k] for k in ("propertyName", "propertyId", "value")}
            except KeyError:
                # incomplete statement: skip it and keep the ids collected so far
                continue
            # an empty filter keeps every statement
            if not self.filter_statements or new_id["propertyId"] in self.filter_statements:
                ids.append(new_id)
        span._.other_ids = ids

    def main_disambiguation_process_batch(
        self, text_batch: list[str], terms_batch: list[str], entities_batch: list[list]
    ) -> list[tuple[dict, dict, list]]:
        """
        Disambiguate a batch of queries and return the processed response of each request.

        The i-th query is built from the i-th text, terms and entities.

        :param text_batch: a list of strings, each string is a text to be disambiguated
        :type text_batch: list[str]
        :param terms_batch: a list of strings, each string holds the terms of one query
        :type terms_batch: list[str]
        :param entities_batch: a list of lists of entities, one list per query
        :type entities_batch: list[list]
        :return: A list of tuples, each tuple containing the response, metadata, and entities_enhanced
            (the "entities" of the response, or an empty list when the response has none).
        """
        payloads = []
        for text, terms, entities in zip(text_batch, terms_batch, entities_batch, strict=False):
            payload = self.prepare_data(
                text=text, terms=terms, entities=entities, language=self.language, full=self.flag_extra
            )
            payloads.append(payload)

        response_tuples = []
        for response in self.disambiguate_text_batch(files_batch=payloads):
            res, metadata = self.process_response(response=response)
            entities_enhanced = []
            with contextlib.suppress(KeyError):
                entities_enhanced = res["entities"]
            response_tuples.append((res, metadata, entities_enhanced))
        return response_tuples

    def process_single_doc_after_call(self, doc: Doc, result_from_ef_text) -> Doc:
        """
        Attach the Entity-Fishing results of one document to the doc and to its spans.

        - The raw response and the metadata of the text disambiguation are stored on the doc.
        - The named entities of the doc that are missing from the response are collected.
        - If there are any, they are passed to the Entity-Fishing service again, this time
          without the text of the document but with the texts of all named entities as terms.
          The raw response and metadata of that call are stored on the doc as well; nothing is
          stored when that call yields no result.
        - The entities of both calls are merged and their information is attached to the spans.

        :param doc: The document to be processed
        :type doc: Doc
        :param result_from_ef_text: response, metadata and entities of the text disambiguation,
            as returned per request by `main_disambiguation_process_batch`
        :return: The doc that was passed in, enriched in place.
        """
        response, metadata, entities_from_text = result_from_ef_text[:3]

        # 1a. Attach raw response (with text method in Entity-Fishing service) to doc
        if len(response) > 0:
            doc._.annotations["disambiguation_text_service"] = response

        doc._.metadata["disambiguation_text_service"] = metadata

        # 2 .Because some named entities have not been disambiguated,
        # create a list with these unrelated entities ("nil clustering").
        # Pass them back in Entity-fishing without the text but with all
        # the named entities surrounding these entities, to create a context
        # of neighboring terms.
        # nil_clustering = named entities in doc - actual disambiguated entities by EF
        nil_clustering = []
        if len(response) > 0:
            with contextlib.suppress(KeyError):
                # built once as a set instead of once per named entity
                disambiguated = {
                    (ent_ef["rawName"], ent_ef["offsetStart"], ent_ef["offsetEnd"])
                    for ent_ef in response["entities"]
                }
                nil_clustering = [
                    doc.char_span(start_idx=ent.start_char, end_idx=ent.end_char)
                    for ent in doc.ents
                    if (ent.text, ent.start_char, ent.end_char) not in disambiguated
                ]

        entities_from_terms = []
        if nil_clustering:
            # prepare query for Entity-Fishing terms disambiguation
            terms = " ".join(ent.text for ent in doc.ents)
            terms_results = self.main_disambiguation_process_batch(
                text_batch=[""], terms_batch=[terms], entities_batch=[nil_clustering]
            )
            # the client leaves out requests that failed, so there may be no result at all
            if terms_results:
                terms_response, terms_metadata, entities_from_terms = terms_results[0][:3]

                # 2b. Attach raw response (with terms method in Entity-Fishing service) to doc
                if len(terms_response) > 0:
                    doc._.annotations["disambiguation_terms_service"] = terms_response
                doc._.metadata["disambiguation_terms_service"] = terms_metadata

        # 3. Merge two list of entities (first and second pass in EF service)
        # and attach information from Entity-Fishing to spans
        merged = entities_from_text + [
            entity_term for entity_term in entities_from_terms if entity_term not in entities_from_text
        ]

        if merged:
            with contextlib.suppress(KeyError):
                self.updated_entities(doc, merged)
        return doc

    def __call__(self, doc: Doc) -> Doc:
        """
        Disambiguate and link the named entities of a spaCy Doc with Entity-Fishing.

        When the request yields no result the doc is returned unchanged instead of
        failing with an IndexError.

        :param doc: the document whose named entities are linked
        :type doc: Doc
        :return: The doc, with the linking information attached when a result was obtained.
        """
        # 1. Disambiguate and linking named entities in Doc object with Entity-Fishing
        results = self.main_disambiguation_process_batch(
            text_batch=[doc.text], terms_batch=[""], entities_batch=[doc.ents]
        )
        if not results:
            # the client leaves out requests that failed, so there is nothing to attach
            if self.verbose:
                logging.warning("No result from Entity-Fishing, the document is left unchanged.")
            return doc
        return self.process_single_doc_after_call(doc, results[0])

    def pipe(self, stream: Iterable, batch_size: int = 128) -> Iterable[Doc]:
        """
        Disambiguate the named entities of a stream of documents batch by batch.

        Every document of the stream is yielded exactly once. When a batch returns fewer
        results than documents (the client leaves out failed requests), the results can no
        longer be matched by position: the documents of that batch are then queried one by
        one, and a document whose own request yields no result is yielded unchanged.

        :param stream: a generator that yields Doc objects
        :type stream: iterator
        :param batch_size: The number of documents to process at a time, defaults to 128 (optional)
        :type batch_size: int
        :return: A generator of the processed Doc objects.
        """
        for docs in util.minibatch(stream, size=batch_size):
            # 1. Disambiguate and linking named entities in Doc object with Entity-Fishing
            results = self.main_disambiguation_process_batch(
                text_batch=[doc.text for doc in docs],
                terms_batch=["" for _ in docs],
                entities_batch=[doc.ents for doc in docs],
            )

            if len(results) == len(docs):
                for doc, result_from_ef_text in zip(docs, results, strict=False):
                    yield self.process_single_doc_after_call(doc, result_from_ef_text)
                continue

            # incomplete batch: zipping would drop documents and pair results with wrong docs
            for doc in docs:
                single = self.main_disambiguation_process_batch(
                    text_batch=[doc.text], terms_batch=[""], entities_batch=[doc.ents]
                )
                yield self.process_single_doc_after_call(doc, single[0]) if single else doc

`__call__(doc)`

The function takes a spaCy Doc object, and returns a Doc object with the entities disambiguated and linked

:param doc: Doc :type doc: Doc :return: A Doc object with the entities linked to the corresponding Wikipedia page.

Source code in ceurws/services/entity_fishing.py

def __call__(self, doc: Doc) -> Doc:
    """
    Disambiguate and link the named entities of a spaCy Doc with Entity-Fishing.

    When the request yields no result the doc is returned unchanged instead of
    failing with an IndexError.

    :param doc: the document whose named entities are linked
    :type doc: Doc
    :return: The doc, with the linking information attached when a result was obtained.
    """
    # 1. Disambiguate and linking named entities in Doc object with Entity-Fishing
    results = self.main_disambiguation_process_batch(
        text_batch=[doc.text], terms_batch=[""], entities_batch=[doc.ents]
    )
    if not results:
        # the client leaves out requests that failed, so there is nothing to attach
        if self.verbose:
            logging.warning("No result from Entity-Fishing, the document is left unchanged.")
        return doc
    return self.process_single_doc_after_call(doc, results[0])

`__init__(nlp, name, api_ef_base, language, extra_info, filter_statements, verbose)`

EntityFishing main class component.

Note

Show default config for default attributes values.

Parameters:

Name	Type	Description	Default
`api_ef_base`	`str`	describes url of the entity-fishing API used.	required
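`language`	`str`	matches the language of the resources to be disambiguated (matches the language model for the NER task).	required
`extra_info`	`bool`	attach extra information to spans: normalised term, description, other knowledge base ids.	required
`filter_statements`	`list`	filter the other KB ids that rely on the QID, e.g. ['P214', 'P244'].	required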
`verbose`	`bool`	display logging messages.	required

Attributes:

Name	Type	Description
`api_ef_base`	`str`	cf. `api_ef_base` in parameters section.
`language`	`dict`	cf. `language` in parameters section; prepares the language argument for the query.
`wikidata_url_base`	`str`	wikidata base url (to concatenate QID identifiers).
`flag_extra`	`bool`	cf. `extra_info` in parameters section.
`filter_statements`	`list`	cf. `filter_statements` in parameters section.
`verbose`	`bool`	cf. `verbose` in parameters section.

Source code in ceurws/services/entity_fishing.py

def __init__(
    self,
    nlp: Language,
    name: str,
    api_ef_base: str,
    language: str,
    extra_info: bool,
    filter_statements: list,
    verbose: bool,
):
    """
    Store the settings of the linker and register the spaCy extensions it writes to.

    Note:
        The default values of the settings are listed in the default config of the factory.

    Parameters:
        nlp (Language): pipeline the component is created for (not used here).
        name (str): name of the component instance (not used here).
        api_ef_base (str): url of the entity-fishing API used;
            a trailing "/" is appended when it is missing.
        language (str): language of the resources to be disambiguated
            (matches the language model for the NER task).
        extra_info (bool): attach extra information to spans:
            normalised term, description, other knowledge base ids.
        filter_statements (list): property ids used to filter the other
            knowledge base ids, eg. ['P214', 'P244'].
        verbose (bool): display logging messages.

    Attributes:
        api_ef_base (str): cf. `api_ef_base` in parameters section, always ending with "/".
        language (dict): cf. `language` in parameters section,
            wrapped as `{"lang": language}` for the queries.
        wikidata_url_base (str): wikidata base url (to concatenate QID identifiers).
        flag_extra (bool): cf. `extra_info` in parameters section.
        filter_statements (list): cf. `filter_statements` in parameters section.
        verbose (bool): cf. `verbose` in parameters section.
    """
    self.api_ef_base = api_ef_base if api_ef_base.endswith("/") else api_ef_base + "/"
    self.language = {"lang": language}
    self.wikidata_url_base = "https://www.wikidata.org/wiki/"

    self.flag_extra = extra_info
    self.filter_statements = filter_statements
    self.verbose = verbose

    # Doc extensions: hold the raw response of the Entity-Fishing API and the request metadata
    for doc_attribute in ("annotations", "metadata"):
        Doc.set_extension(doc_attribute, default={}, force=True)

    # Span extensions: the first four are the default linking attributes,
    # the last four are only filled when extra_info is set to True
    span_attributes = (
        "kb_qid",
        "wikipedia_page_ref",
        "url_wikidata",
        "nerd_score",
        "normal_term",
        "description",
        "src_description",
        "other_ids",
    )
    for span_attribute in span_attributes:
        Span.set_extension(span_attribute, default=None, force=True)

`concept_look_up_batch(wiki_id_batch)`

This function takes a list of wikipedia ids and returns a list of responses from the API

:param wiki_id_batch: a list of wikipedia ids :type wiki_id_batch: list[str] :return: A list of requests.Response objects.

Source code in ceurws/services/entity_fishing.py

def concept_look_up_batch(self, wiki_id_batch: list[str] | str) -> list[requests.Response]:
    """
    Look up a batch of wikipedia ids with the "kb/concept/" service of the API.

    One GET request is sent per id, with the configured language as query parameters.

    :param wiki_id_batch: a list of wikipedia ids; a single id given as plain string is
        treated as a batch of one id (instead of one lookup per character)
    :type wiki_id_batch: list[str] | str
    :return: A list of requests.Response objects.
    """
    if isinstance(wiki_id_batch, str):
        wiki_id_batch = [wiki_id_batch]
    # formatting instead of "+" also accepts ids that are not strings, e.g. numeric page ids
    url_concept_lookup_batch = [f"{self.api_ef_base}kb/concept/{wiki_id}" for wiki_id in wiki_id_batch]
    return self.generic_client_batch(
        method="GET", url_batch=url_concept_lookup_batch, params=self.language, verbose=self.verbose
    )

`disambiguate_text_batch(files_batch)`

The function disambiguate_text_batch takes a list of dictionaries as input, where each dictionary contains the text to be disambiguated and the corresponding language. The function returns a list of responses, where each response contains the disambiguated text

:param files_batch: a list of dictionaries, each dictionary containing the following keys: :type files_batch: list[dict] :return: A list of responses.

Source code in ceurws/services/entity_fishing.py

def disambiguate_text_batch(self, files_batch: list[dict]) -> list[requests.Response]:
    """
    POST every entry of the batch to the "disambiguate" service of the API.

    :param files_batch: a list of dictionaries, one per request, handed to
        `generic_client_batch` as its `files_batch`
    :type files_batch: list[dict]
    :return: A list of responses.
    """
    endpoint = self.api_ef_base + "disambiguate"
    return self.generic_client_batch(
        method="POST",
        # every entry goes to the same endpoint
        url_batch=[endpoint for _ in files_batch],
        files_batch=files_batch,
        verbose=self.verbose,
    )

`generic_client_batch(method, url_batch, verbose, params=None, files_batch=None)` `staticmethod`

It takes a list of urls and a list of files, and it sends a request to each url with the corresponding file

:param method: str, :type method: str :param url_batch: a list of urls to send requests to :type url_batch: list[str] :param verbose: if True, the client will print out the status of each request :type verbose: bool :param params: dict = None, :type params: dict :param files_batch: a list of dictionaries, each dictionary containing the file to be annotated :type files_batch: list[dict] :return: A list of responses.

Source code in ceurws/services/entity_fishing.py

@staticmethod
def generic_client_batch(
    method: str,
    url_batch: list[str],
    verbose: bool,
    params: dict | None = None,
    files_batch: list[dict] | None = None,
) -> list[requests.Response]:
    """
    Send one HTTP request per url concurrently and return the responses in request order.

    The i-th url is sent together with the i-th entry of files_batch. The responses are
    returned in the order of url_batch (not in completion order), so callers can pair
    them with their inputs. A request that raises instead of returning a response is left
    out of the result, in which case the returned list is shorter than url_batch.

    :param method: "POST" sends POST requests, any other value sends GET requests
    :type method: str
    :param url_batch: a list of urls to send requests to
    :type url_batch: list[str]
    :param verbose: if True, failed requests and known error status codes are logged as warnings
    :type verbose: bool
    :param params: query parameters added to every request, defaults to no parameters
    :type params: dict
    :param files_batch: one dictionary per url, passed as `files` of the request;
        defaults to an empty dictionary per url
    :type files_batch: list[dict]
    :return: A list of responses.
    """
    request_params = {} if params is None else params
    if files_batch is None:
        files_batch = [{} for _ in url_batch]

    def client_log(msg: str) -> None:
        if verbose:
            logging.warning(msg)

    def load_url(type_url, type_files):
        send = requests.post if method == "POST" else requests.get
        return send(
            url=type_url, headers={"Accept": "application/json"}, files=type_files, params=request_params
        )

    with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
        futures = [
            executor.submit(load_url, type_url, type_files)
            for type_url, type_files in zip(url_batch, files_batch, strict=False)
        ]

    # Leaving the executor block waits for every request. The futures are read in
    # submission order because callers match the responses with their inputs by position.
    response_batch = []
    for idx, future in enumerate(futures):
        try:
            response_batch.append(future.result())
        except Exception as ex:
            # best effort: a failed request is reported but does not abort the batch
            client_log(f"Request {idx}. No response received: {ex}")

    # Manage response status code :
    # cf. https://nerd.readthedocs.io/en/latest/restAPI.html#response-status-codes
    status_messages = {
        400: "Wrong request, missing parameters, missing header, text too short (<= 5 characters). (400)",
        500: "Entity-Fishing API service seems broken. (500)",
        404: "Property was not found in request body. (404)",
        406: "Language is not supported by Entity-Fishing. (406)",
    }
    for idx, response in enumerate(response_batch):
        status_message = status_messages.get(response.status_code)
        if status_message is not None:
            client_log(f"Request {idx}. {status_message}")

    return response_batch

`look_extra_informations_on_entity(span, res_desc)`

It takes a span and a dictionary of information about the entity, and adds the information to the span

:param span: The Span object that the extension is being applied to :type span: Span :param res_desc: the result of the query to Wikidata :type res_desc: dict

Source code in ceurws/services/entity_fishing.py

def look_extra_informations_on_entity(self, span: Span, res_desc: dict) -> None:
    """
    Copy the extra information of an entity description onto the span.

    Sets `normal_term`, `description`, `src_description` and `other_ids`; each of them is
    left untouched when the description lacks the information. A statement that misses
    one of "propertyName", "propertyId" or "value" is skipped without dropping the others.

    :param span: The Span object that the extension is being applied to
    :type span: Span
    :param res_desc: the description of the entity
    :type res_desc: dict
    """
    # normalised term name
    with contextlib.suppress(KeyError):
        span._.normal_term = res_desc["preferredTerm"]
    # description and source description (first definition only)
    with contextlib.suppress(KeyError, IndexError):
        first_definition = res_desc["definitions"][0]
        span._.description = first_definition["definition"]
        span._.src_description = first_definition["source"]
    # others identifiers attach to QID
    # in Wikidata KB with filter properties or not
    if "statements" not in res_desc:
        return
    ids = []
    for content in res_desc["statements"]:
        try:
            new_id = {k: content[k] for k in ("propertyName", "propertyId", "value")}
        except KeyError:
            # incomplete statement: skip it and keep the ids collected so far
            continue
        # an empty filter keeps every statement
        if not self.filter_statements or new_id["propertyId"] in self.filter_statements:
            ids.append(new_id)
    span._.other_ids = ids

`main_disambiguation_process_batch(text_batch, terms_batch, entities_batch)`

It takes a batch of text, terms and entities, and returns a batch of disambiguated entities

:param text_batch: a list of strings, each string is a text to be disambiguated :type text_batch: list[str] :param terms_batch: a list of strings, each string is a list of terms separated by a space :type terms_batch: list[str] :param entities_batch: a list of lists of entities, where each entity is a dictionary with the following keys: :type entities_batch: list[list] :return: A list of tuples, each tuple containing the response, metadata, and entities_enhanced.

Source code in ceurws/services/entity_fishing.py

def main_disambiguation_process_batch(
    self, text_batch: list[str], terms_batch: list[str], entities_batch: list[list]
) -> list[tuple[dict, dict, list]]:
    """
    It takes a batch of text, terms and entities, and returns a batch of disambiguated entities

    :param text_batch: a list of strings, each string is a text to be disambiguated
    :type text_batch: list[str]
    :param terms_batch: a list of strings, each string is a list of terms separated by a space
    :type terms_batch: list[str]
    :param entities_batch: a list of lists of entities, where each entity is a dictionary with the
    following keys:
    :type entities_batch: list[list]
    :return: A list of tuples, each tuple containing the response, metadata, and entities_enhanced.
    """
    data_to_post_batch = [
        self.prepare_data(text=text, terms=terms, entities=entities, language=self.language, full=self.flag_extra)
        for text, terms, entities in zip(text_batch, terms_batch, entities_batch, strict=False)
    ]
    reqs = self.disambiguate_text_batch(files_batch=data_to_post_batch)

    response_tuples = []
    for req in reqs:
        res, metadata = self.process_response(response=req)
        try:
            entities_enhanced = res["entities"]
        except KeyError:
            entities_enhanced = []
        response_tuples.append((res, metadata, entities_enhanced))
    return response_tuples

`pipe(stream, batch_size=128)`

For each batch of documents, we disambiguate the named entities in the documents, and then yield the results

:param stream: a generator that yields Doc objects :type stream: iterator :param batch_size: The number of documents to process at a time, defaults to 128 (optional) :type batch_size: int

Source code in ceurws/services/entity_fishing.py

def pipe(self, stream: Iterable, batch_size: int = 128) -> Doc:
    """
    Process a stream of documents batch by batch.

    The named entities of all documents of a batch are disambiguated with a single
    call to ``main_disambiguation_process_batch``; afterwards every document of the
    batch is post-processed and yielded.

    :param stream: a generator that yields Doc objects
    :type stream: iterator
    :param batch_size: The number of documents to process at a time, defaults to 128 (optional)
    :type batch_size: int
    """
    for batch in util.minibatch(stream, size=batch_size):
        texts = [doc.text for doc in batch]
        mentions = [doc.ents for doc in batch]

        # 1. Disambiguate and link the named entities of the whole batch;
        #    the terms of this first pass are left empty
        service_results = self.main_disambiguation_process_batch(
            text_batch=texts, terms_batch=[""] * len(texts), entities_batch=mentions
        )

        # 2. Pair every document with its result and hand it back to the caller
        yield from map(self.process_single_doc_after_call, batch, service_results)

`prepare_data(text, terms, entities, language, full=False)` `staticmethod`

The function takes in a text, a list of entities, a language dictionary and a boolean value. It then returns a dictionary with a key called "query" and a value that is a JSON object

:param text: The text to be analyzed :type text: str :param terms: the terms to be searched for :type terms: str :param entities: list of entities in the text :type entities: list :param language: the language of the text :type language: dict :param full: if True, the response will contain the full text of the article, defaults to False :type full: bool (optional) :return: A dictionary with a key of "query" and a value of a json object.

Source code in ceurws/services/entity_fishing.py

@staticmethod
def prepare_data(text: str, terms: str, entities: list, language: dict, full: bool = False) -> dict:
    """
    > The function takes in a text, a list of entities, a language dictionary and a boolean value.
    It then returns a dictionary with a key called "query" and a value that is a JSON object

    :param text: The text to be analyzed
    :type text: str
    :param terms: the terms to be searched for
    :type terms: str
    :param entities: list of entities in the text
    :type entities: list
    :param language: the language of the text
    :type language: dict
    :param full: if True, the response will contain the full text of the article, defaults to False
    :type full: bool (optional)
    :return: A dictionary with a key of "query" and a value of a json object.
    """
    return {
        "query": json.dumps(
            {
                "text": text,
                "shortText": terms,
                "language": language,
                "entities": [
                    {
                        "rawName": ent.text,
                        "offsetStart": ent.start_char,
                        "offsetEnd": ent.end_char,
                    }
                    for ent in entities
                ],
                "mentions": [],
                "customisation": "generic",
                "full": "true" if full else "false",
            },
            ensure_ascii=False,
        )
    }

`process_response(response)` `staticmethod`

It takes a response object from the requests library and returns a tuple of two dictionaries. The first dictionary is the JSON response from the API, and the second dictionary contains metadata about the response

:param response: The response object returned by the requests library :type response: requests.models.Response :return: A tuple of two dictionaries.

Source code in ceurws/services/entity_fishing.py

@staticmethod
def process_response(response: requests.models.Response) -> tuple[dict, dict]:
    """
    It takes a response object from the `requests` library and returns a tuple of two dictionaries.
    The first dictionary is the JSON response from the API, and the second dictionary contains
    metadata about the response

    :param response: The response object returned by the requests library
    :type response: requests.models.Response
    :return: A tuple of two dictionaries.
    """
    try:
        res_json = response.json()
    except json.decoder.JSONDecodeError:
        res_json = {}

    metadata = {
        "status_code": response.status_code,
        "reason": response.reason,
        "ok": response.ok,
        "encoding": response.encoding,
    }

    return res_json, metadata

`process_single_doc_after_call(doc, result_from_ef_text)`

The function takes a document and a list of entities from the Entity-Fishing service.
It then checks if there are any entities in the document that were not disambiguated by the Entity-Fishing service.
If there are, it passes the text of these entities to the Entity-Fishing service again, but this time without the text of the document.
It then merges the results of the two calls to the Entity-Fishing service and attaches the information from the Entity-Fishing service to the entities in the document

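:param doc: The document to be processed :type doc: Doc :param result_from_ef_text: a tuple of three elements: the raw response of the service, the metadata of the response and the list of disambiguated entities :return: The same document, with the annotations, metadata and entity information from Entity-Fishing attached.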

Source code in ceurws/services/entity_fishing.py

def process_single_doc_after_call(self, doc: Doc, result_from_ef_text) -> Doc:
    """
    Attach the outcome of an Entity-Fishing call to a document.

    - The raw response and its metadata are stored on the document under the key
      "disambiguation_text_service".
    - Entities of the document that are not part of the response ("nil clustering")
      are sent to the service a second time, without the text of the document but
      with the texts of all entities of the document as terms. The outcome of that
      call is stored under the key "disambiguation_terms_service".
    - The entities of both calls are merged and passed to ``updated_entities``.

    :param doc: The document to be processed
    :type doc: Doc
    :param result_from_ef_text: three elements: the raw response, the response
        metadata and the list of disambiguated entities
    :return: The same document, updated in place.
    """
    response_text = result_from_ef_text[0]
    metadata_text = result_from_ef_text[1]
    entities_from_text = result_from_ef_text[2]
    has_response = len(response_text) > 0

    # 1a. Attach raw response (with text method in Entity-Fishing service) to doc
    if has_response:
        doc._.annotations["disambiguation_text_service"] = response_text
    doc._.metadata["disambiguation_text_service"] = metadata_text

    # 2. Because some named entities have not been disambiguated,
    # create a list with these unrelated entities ("nil clustering").
    # Pass them back in Entity-fishing without the text but with all
    # the named entities surrounding these entities, to create a context
    # of neighboring terms.
    # nil_clustering = named entities in doc - actual disambiguated entities by EF
    nil_clustering = []
    if has_response:
        # a response without "entities" (or with incomplete entities) leaves the list empty
        with contextlib.suppress(KeyError):
            # built once, so that each membership test below is a hash lookup
            # instead of a rebuild and scan of the whole response
            disambiguated = {
                (ent_ef["rawName"], ent_ef["offsetStart"], ent_ef["offsetEnd"])
                for ent_ef in response_text["entities"]
            }
            nil_clustering = [
                doc.char_span(start_idx=ent.start_char, end_idx=ent.end_char)
                for ent in doc.ents
                if (ent.text, ent.start_char, ent.end_char) not in disambiguated
            ]

    entities_from_terms = []
    if len(nil_clustering) != 0:
        # prepare query for Entity-Fishing terms disambiguation
        terms = " ".join(ent.text for ent in doc.ents)
        result_from_ef_terms = self.main_disambiguation_process_batch(
            text_batch=[""], terms_batch=[terms], entities_batch=[nil_clustering]
        )[0]

        entities_from_terms = result_from_ef_terms[2]

        # 2b. Attach raw response (with terms method in Entity-Fishing service) to doc
        if len(result_from_ef_terms[0]) > 0:
            doc._.annotations["disambiguation_terms_service"] = result_from_ef_terms[0]
        doc._.metadata["disambiguation_terms_service"] = result_from_ef_terms[1]

    # 3. Merge two list of entities (first and second pass in EF service)
    # and attach information from Entity-Fishing to spans
    result = entities_from_text
    if len(entities_from_terms) > 0:
        result = entities_from_text + [
            entity_term for entity_term in entities_from_terms if entity_term not in entities_from_text
        ]

    if len(result) > 0:
        with contextlib.suppress(KeyError):
            self.updated_entities(doc, result)
    return doc

`updated_entities(doc, response)`

The function updated_entities takes a Doc object and a list of entities as input. It then iterates over the list of entities and updates the Doc object with the information contained in the list of entities

:param doc: the document to be processed :type doc: Doc :param response: the response from the NERD API :type response: list

Source code in ceurws/services/entity_fishing.py

def updated_entities(self, doc: Doc, response: list) -> None:
    """
    Write the information of the disambiguated entities onto the spans of a document.

    For every entity the span at its character offsets is looked up and, as far as
    the entity provides them, the Wikidata id and URL, the Wikipedia page reference
    and the confidence score are stored on the span.

    :param doc: the document to be processed
    :type doc: Doc
    :param response: the entities returned by the NERD API
    :type response: list
    """
    quiet = contextlib.suppress
    for ef_entity in response:
        # an AttributeError (e.g. no span could be created for the offsets, so
        # ``char_span`` returned None) skips the remainder of this entity only
        with quiet(AttributeError):
            begin, end = ef_entity["offsetStart"], ef_entity["offsetEnd"]
            target = doc.char_span(start_idx=begin, end_idx=end)

            # Wikidata id and the URL derived from it
            with quiet(KeyError):
                target._.kb_qid = str(ef_entity["wikidataId"])
                target._.url_wikidata = self.wikidata_url_base + target._.kb_qid

            # Wikipedia reference; the extra information is only looked up
            # for entities that carry such a reference
            with quiet(KeyError):
                target._.wikipedia_page_ref = str(ef_entity["wikipediaExternalRef"])
                if self.flag_extra:
                    self.look_extra_informations_on_entity(target, ef_entity)

            # confidence of the disambiguation
            with quiet(KeyError):
                target._.nerd_score = ef_entity["confidence_score"]

`opentapioca`

@author: https://github.com/UB-Mannheim/spacyopentapioca/blob/main/spacyopentapioca/entity_linker.py

`EntityLinker`

Sends raw data to the OpenTapioca API. Attaches entities to the document. Based on: https://github.com/UB-Mannheim/spacyopentapioca/blob/main/spacyopentapioca/entity_linker.py

Source code in ceurws/services/opentapioca.py

@Language.factory(OPENTAPIOCA_PIPELINE, default_config={"url": f"{OPENTAPIOCA_ENDPOINT}/api/annotate"})
class EntityLinker:
    """
    Sends raw data to the OpenTapioca API. Attaches entities to the document.
    Based on: https://github.com/UB-Mannheim/spacyopentapioca/blob/main/spacyopentapioca/entity_linker.py
    """

    def __init__(self, nlp, name, url):
        """
        Passes url. Registers OpenTapioca extensions for Doc and Span.

        :param nlp: not used by this component
        :param name: not used by this component
        :param url: the URL the document text is posted to
        """
        self.url = url
        Doc.set_extension("annotations", default=None, force=True)
        Doc.set_extension("metadata", default=None, force=True)
        Span.set_extension("annotations", default=None, force=True)
        Span.set_extension("description", default=None, force=True)
        Span.set_extension("aliases", default=None, force=True)
        Span.set_extension("rank", default=None, force=True)
        Span.set_extension("score", default=None, force=True)
        Span.set_extension("types", default=None, force=True)
        Span.set_extension("label", default=None, force=True)
        Span.set_extension("extra_aliases", default=None, force=True)
        Span.set_extension("nb_sitelinks", default=None, force=True)
        Span.set_extension("nb_statements", default=None, force=True)

    def process_single_doc_after_call(self, doc: Doc, r) -> Doc:
        """
        Attach the annotations of an OpenTapioca response to a document.

        :param doc: the document whose text was sent to the API
        :param r: the HTTP response returned by ``make_request``
        :return: the same document with annotations, metadata, entities and the span
            group "all_entities_opentapioca" filled in
        :raises: whatever ``r.raise_for_status()`` raises for an unsuccessful response
        """
        r.raise_for_status()
        data = r.json()

        # Attaches raw data to doc
        doc._.annotations = data.get("annotations")
        doc._.metadata = {"status_code": r.status_code, "reason": r.reason, "ok": r.ok, "encoding": r.encoding}

        # Attaches indexes, label and QID to spans
        # Processes annotations: if 'best_qid'==None, then no annotation
        ents = []
        # a response without annotations is treated like an empty list of annotations
        for ent in data.get("annotations") or []:
            start, end = ent["start"], ent["end"]
            if ent.get("best_qid"):
                ent_kb_id = ent["best_qid"]
                try:  # to identify the type of entities
                    t = ent["tags"][0]["types"]
                    types = {
                        "PERSON": t["Q5"] + t["P496"],
                        "ORG": t["Q43229"] + t["P2427"],
                        "LOC": t["Q618123"] + t["P1566"],
                    }
                    m = max(types.values())
                    # several types sharing the best score are concatenated
                    etype = "".join([k for k, v in types.items() if v == m])
                except Exception as e:
                    log.error(e, extra=ent)
                    etype = ""
                span = doc.char_span(start, end, etype, ent_kb_id)
            else:
                etype, ent_kb_id = "", ""
                span = doc.char_span(start, end, etype)
            if not span:
                # the offsets do not fall on token boundaries: widen them to whole tokens
                span = doc.char_span(start, end, etype, ent_kb_id, alignment_mode="expand")
                if span is None:
                    # still no span (e.g. offsets outside of the text): skip this
                    # entity instead of failing on the attribute access below
                    log.warning(
                        "The OpenTapioca-entity at %s cannot be aligned with the spaCy tokens. SKIPPED!",
                        (start, end),
                    )
                    continue
                log.warning(
                    'The OpenTapioca-entity "%s" %s does not fit the span "%s" %s in spaCy. EXPANDED!',
                    ent["tags"][0]["label"][0],
                    (start, end),
                    span.text,
                    (span.start_char, span.end_char),
                )
            # the details are taken from the first tag of the annotation
            best_tag = ent["tags"][0]
            span._.annotations = ent
            span._.description = best_tag["desc"]
            span._.aliases = best_tag["aliases"]
            span._.rank = best_tag["rank"]
            span._.score = best_tag["score"]
            span._.types = best_tag["types"]
            span._.label = best_tag["label"]
            span._.extra_aliases = best_tag["extra_aliases"]
            span._.nb_sitelinks = best_tag["nb_sitelinks"]
            span._.nb_statements = best_tag["nb_statements"]
            ents.append(span)

        # Attach processed entities to doc.ents
        try:
            # this works with non-overlapping spans
            doc.ents = list(doc.ents) + ents
        except Exception:
            # filter the overlapping spans, keep the (first) longest one
            doc.ents = spacy.util.filter_spans(ents)
        # Attach all entities found by OpenTapioca to spans
        doc.spans["all_entities_opentapioca"] = ents
        return doc

    def make_request(self, doc: Doc):
        """
        Post the text of a document to the OpenTapioca API.

        :param doc: the document whose text is sent
        :return: the response of ``requests.post``
        """
        return requests.post(url=self.url, data={"query": doc.text}, headers={"User-Agent": "spaCyOpenTapioca"})

    def __call__(self, doc):
        """Requests the OpenTapioca API. Attaches entities to spans and doc."""

        # Post request to the OpenTapioca API
        r = self.make_request(doc)

        return self.process_single_doc_after_call(doc, r)

    def pipe(self, stream, batch_size=128):
        """
        It takes a stream of documents, and for each batch of documents, it makes a request to the API
        for each document in the batch, and then yields the processed results of each document.
        The requests of a batch are sent concurrently, the documents are yielded in the order of
        the stream.

        :param stream: the stream of documents to be processed
        :param batch_size: The number of documents whose requests are submitted together, defaults to
        128 (optional)
        """
        for docs in util.minibatch(stream, size=batch_size):
            docs = list(docs)
            with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
                futures = [executor.submit(self.make_request, doc) for doc in docs]
                # wait for the futures in submission order (not completion order) so
                # that the output order matches the input order
                for doc, future in zip(docs, futures, strict=True):
                    yield self.process_single_doc_after_call(doc, future.result())

`__call__(doc)`

Requests the OpenTapioca API. Attaches entities to spans and doc.

Source code in ceurws/services/opentapioca.py

def __call__(self, doc):
    """
    Annotate a single document.

    The document is posted to the OpenTapioca API and the response is used to
    attach the entities to the spans and to the doc.
    """
    return self.process_single_doc_after_call(doc, self.make_request(doc))

`__init__(nlp, name, url)`

Passes url. Registers OpenTapioca extensions for Doc and Span.

Source code in ceurws/services/opentapioca.py

def __init__(self, nlp, name, url):
    """
    Store the URL and register the OpenTapioca extensions for Doc and Span.

    Every extension is registered with ``default=None`` and ``force=True``.
    """
    self.url = url

    # extensions on the document
    for doc_attribute in ("annotations", "metadata"):
        Doc.set_extension(doc_attribute, default=None, force=True)

    # extensions on the spans
    span_attributes = (
        "annotations",
        "description",
        "aliases",
        "rank",
        "score",
        "types",
        "label",
        "extra_aliases",
        "nb_sitelinks",
        "nb_statements",
    )
    for span_attribute in span_attributes:
        Span.set_extension(span_attribute, default=None, force=True)

`pipe(stream, batch_size=128)`

It takes a stream of documents, and for each batch of documents, it makes a request to the API for each document in the batch, and then yields the processed results of each document

:param stream: the stream of documents to be processed :param batch_size: The number of documents to send to the API in a single request, defaults to 128 (optional)

Source code in ceurws/services/opentapioca.py

def pipe(self, stream, batch_size=128):
    """
    It takes a stream of documents, and for each batch of documents, it makes a request to the API
    for each document in the batch, and then yields the processed results of each document.
    The requests of a batch are sent concurrently, the documents are yielded in the order of
    the stream.

    :param stream: the stream of documents to be processed
    :param batch_size: The number of documents whose requests are submitted together, defaults to
    128 (optional)
    """
    for docs in util.minibatch(stream, size=batch_size):
        docs = list(docs)
        with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
            futures = [executor.submit(self.make_request, doc) for doc in docs]
            # wait for the futures in submission order (not completion order) so
            # that the output order matches the input order
            for doc, future in zip(docs, futures, strict=True):
                yield self.process_single_doc_after_call(doc, future.result())

`sql_cache`

Created on 2024-03-16 @author: wf

`Cached`

Manage cached entities.

Source code in ceurws/sql_cache.py

class Cached:
    """
    Manage cached entities.
    """

    def __init__(
        self, clazz: type[Any], sparql: SPARQL, sql_db: SqlDB, query_name: str, max_errors: int = 0, debug: bool = False
    ):
        """
        Initializes the Manager with class reference, SPARQL endpoint URL, SQL database connection string,
        query name, and an optional debug flag.
        Args:
            clazz (type[Any]): The class reference for the type of objects managed by this manager.
            sparql (SPARQL): a SPARQL endpoint.
            sql_db (SqlDB): SQL database object
            query_name (str): The name of the query to be executed.
            max_errors (int, optional): Number of validation errors tolerated by to_entities()
                when no other limit is passed to it. Defaults to 0.
            debug (bool, optional): Flag to enable debug mode. Defaults to False.
        """
        self.clazz = clazz
        self.sparql = sparql
        self.sql_db = sql_db
        self.query_name = query_name
        self.max_errors = max_errors
        self.debug = debug
        self.entities: list[object] = []
        self.errors: list[Exception] = []
        # records as dictionaries; replaced by get_lod() and fetch_from_local().
        # Starting with an empty list lets to_entities() run before any fetch.
        self.lod: list[dict] = []
        # Ensure the table for the class exists
        clazz.metadata.create_all(self.sql_db.engine)

    def fetch_or_query(self, qm, force_query=False):
        """
        Fetches data from the local cache if available.
        If the data is not in the cache or if force_query is True,
        it queries via SPARQL and caches the results.

        Args:
            qm (QueryManager): The query manager object used for making SPARQL queries.
            force_query (bool, optional): A flag to force querying via SPARQL even
                if the data exists in the local cache. Defaults to False.
        """
        if not force_query and self.check_local_cache():
            self.fetch_from_local()
        else:
            # NOTE(review): store() only adds rows; with force_query=True and an already
            # filled table this presumably collides with the stored rows - confirm.
            self.get_lod(qm)
            self.store()

    def check_local_cache(self) -> bool:
        """
        Checks if there is data in the local cache (SQL database).

        Returns:
            bool: True if  there is at least one record in the local SQL cache table
        """
        with self.sql_db.get_session() as session:
            result = session.exec(select(self.clazz)).first()
            return result is not None

    def fetch_from_local(self):
        """
        Fetches data from the local SQL database.

        Fills self.entities with all stored records and self.lod with the same
        records as dictionaries.
        """
        profiler = Profiler(f"fetch {self.query_name} from local", profile=self.debug)
        with self.sql_db.get_session() as session:
            self.entities = session.exec(select(self.clazz)).all()
            # model_dump() belongs to the same API generation as the model_validate()
            # call in to_entities(); dict() is its deprecated predecessor
            self.lod = [entity.model_dump() for entity in self.entities]
            if self.debug:
                print(f"Loaded {len(self.entities)} records from local cache")
        profiler.time()

    def get_lod(self, qm: QueryManager) -> list[dict]:
        """
        Fetches data using the SPARQL query specified by my query_name.

        Args:
            qm (QueryManager): The query manager object used for making SPARQL queries.
        Returns:
            list[dict]: A list of dictionaries representing the data fetched.
        """
        profiler = Profiler(f"fetch {self.query_name} from SPARQL endpoint {self.sparql.url}", profile=self.debug)
        query = qm.queriesByName[self.query_name]
        self.lod = self.sparql.queryAsListOfDicts(query.query)
        profiler.time()
        if self.debug:
            print(f"Found {len(self.lod)} records for {self.query_name}")
        return self.lod

    def to_entities(self, max_errors: int | None = None) -> list[Any]:
        """
        Converts records fetched from the LOD into entity instances, applying validation.
        Args:
            max_errors (int, optional): Maximum allowed validation errors.
                None (the default) selects the max_errors given to the constructor.
        Returns:
            list[Any]: A list of entity instances that have passed validation.
        Raises:
            Exception: if more than max_errors records failed validation.
        """
        self.entities = []
        self.errors = []
        error_records = []
        if max_errors is None:
            max_errors = self.max_errors
        for record in self.lod:
            try:
                entity = self.clazz.model_validate(record)
                self.entities.append(entity)
            except Exception as e:
                # keep the failure and the offending record for the report below
                self.errors.append(e)
                error_records.append(record)
        error_count = len(self.errors)
        if error_count > max_errors:
            msg = f"found {error_count} errors > maximum allowed {max_errors} errors"
            if self.debug:
                print(msg)
                for i, (error, error_record) in enumerate(zip(self.errors, error_records, strict=True)):
                    print(f"{i}:{error} for \n{error_record}")
            raise Exception(msg)
        return self.entities

    def store(self, max_errors: int | None = None) -> list[Any]:
        """
        Stores the fetched data into the local SQL database.

        Args:
            max_errors (int, optional): Maximum allowed validation errors.
                Passed on to to_entities().
        Returns:
            list[Any]: A list of entity instances that were stored in the database.

        """
        profiler = Profiler(f"store {self.query_name}", profile=self.debug)
        self.to_entities(max_errors=max_errors)
        with self.sql_db.get_session() as session:
            session.add_all(self.entities)
            session.commit()
            if self.debug:
                print(f"Stored {len(self.entities)} records in local cache")
        profiler.time()
        return self.entities

`__init__(clazz, sparql, sql_db, query_name, max_errors=0, debug=False)`

Initializes the Manager with class reference, SPARQL endpoint URL, SQL database connection string, query name, and an optional debug flag. Args: clazz (type[Any]): The class reference for the type of objects managed by this manager. sparql (SPARQL): a SPARQL endpoint. sql_db (SqlDB): SQL database object query_name (str): The name of the query to be executed. debug (bool, optional): Flag to enable debug mode. Defaults to False.

Source code in ceurws/sql_cache.py

def __init__(
    self, clazz: type[Any], sparql: SPARQL, sql_db: SqlDB, query_name: str, max_errors: int = 0, debug: bool = False
):
    """
    Initializes the Manager with class reference, SPARQL endpoint URL, SQL database connection string,
    query name, and an optional debug flag.
    Args:
        clazz (type[Any]): The class reference for the type of objects managed by this manager.
        sparql (SPARQL): a SPARQL endpoint.
        sql_db (SqlDB): SQL database object
        query_name (str): The name of the query to be executed.
        max_errors (int, optional): Number of tolerated validation errors. Defaults to 0.
        debug (bool, optional): Flag to enable debug mode. Defaults to False.
    """
    self.clazz = clazz
    self.sparql = sparql
    self.sql_db = sql_db
    self.query_name = query_name
    self.max_errors = max_errors
    self.debug = debug
    self.entities: list[object] = []
    self.errors: list[Exception] = []
    # records as dictionaries; defined here so that the attribute exists
    # before the first fetch
    self.lod: list[dict] = []
    # Ensure the table for the class exists
    clazz.metadata.create_all(self.sql_db.engine)

`check_local_cache()`

Checks if there is data in the local cache (SQL database).

Returns:

Name	Type	Description
`bool`	`bool`	True if there is at least one record in the local SQL cache table

Source code in ceurws/sql_cache.py

def check_local_cache(self) -> bool:
    """
    Tell whether the local cache (SQL database) holds any data.

    Returns:
        bool: True if the table of the cached class has at least one record
    """
    with self.sql_db.get_session() as session:
        # only the first row is requested; its existence answers the question
        first_record = session.exec(select(self.clazz)).first()
        has_records = first_record is not None
        return has_records

`fetch_from_local()`

Fetches data from the local SQL database.

Source code in ceurws/sql_cache.py

def fetch_from_local(self):
    """
    Fetches data from the local SQL database.

    Fills self.entities with all stored records and self.lod with the same
    records as dictionaries.
    """
    profiler = Profiler(f"fetch {self.query_name} from local", profile=self.debug)
    with self.sql_db.get_session() as session:
        self.entities = session.exec(select(self.clazz)).all()
        # model_dump() replaces the deprecated dict(); the records are created
        # with the matching model_validate() API elsewhere in this class
        self.lod = [entity.model_dump() for entity in self.entities]
        if self.debug:
            print(f"Loaded {len(self.entities)} records from local cache")
    profiler.time()

`fetch_or_query(qm, force_query=False)`

Fetches data from the local cache if available. If the data is not in the cache or if force_query is True, it queries via SPARQL and caches the results.

Parameters:

Name	Type	Description	Default
`qm`	`QueryManager`	The query manager object used for making SPARQL queries.	required
`force_query`	`bool`	A flag to force querying via SPARQL even if the data exists in the local cache. Defaults to False.	`False`

Source code in ceurws/sql_cache.py

def fetch_or_query(self, qm, force_query=False):
    """
    Load the data, preferring the local cache over the SPARQL endpoint.

    The local cache is used when it holds data and no fresh query is forced;
    otherwise the data is queried via SPARQL and stored in the cache.

    Args:
        qm (QueryManager): The query manager object used for making SPARQL queries.
        force_query (bool, optional): A flag to force querying via SPARQL even
            if the data exists in the local cache. Defaults to False.
    """
    # the cache is not even inspected when a query is forced
    use_cache = False if force_query else self.check_local_cache()
    if use_cache:
        self.fetch_from_local()
        return
    self.get_lod(qm)
    self.store()

`get_lod(qm)`

Fetches data using the SPARQL query specified by my query_name.

Parameters:

Name	Type	Description	Default
`qm`	`QueryManager`	The query manager object used for making SPARQL queries.	required

Returns: list[dict]: A list of dictionaries representing the data fetched.

Source code in ceurws/sql_cache.py

def get_lod(self, qm: QueryManager) -> list[dict]:
    """
    Run the SPARQL query named by my query_name and keep the result in self.lod.

    Args:
        qm (QueryManager): The query manager object the query is looked up in.
    Returns:
        list[dict]: A list of dictionaries representing the data fetched.
    """
    title = f"fetch {self.query_name} from SPARQL endpoint {self.sparql.url}"
    profiler = Profiler(title, profile=self.debug)
    named_query = qm.queriesByName[self.query_name]
    records = self.sparql.queryAsListOfDicts(named_query.query)
    self.lod = records
    profiler.time()
    if self.debug:
        print(f"Found {len(self.lod)} records for {self.query_name}")
    return self.lod

`store(max_errors=None)`

Stores the fetched data into the local SQL database.

Parameters:

Name	Type	Description	Default
`max_errors`	`int`	Maximum allowed validation errors. Defaults to 0.	`None`

Returns: list[Any]: A list of entity instances that were stored in the database.

Source code in ceurws/sql_cache.py

def store(self, max_errors: int | None = None) -> list[Any]:
    """
    Validate the fetched records and write them to the local SQL database.

    Args:
        max_errors (int, optional): Maximum allowed validation errors;
            handed over unchanged to to_entities().
    Returns:
        list[Any]: A list of entity instances that were stored in the database.

    """
    title = f"store {self.query_name}"
    profiler = Profiler(title, profile=self.debug)
    # fills self.entities with the records that passed validation
    self.to_entities(max_errors=max_errors)
    with self.sql_db.get_session() as db_session:
        db_session.add_all(self.entities)
        db_session.commit()
        if self.debug:
            print(f"Stored {len(self.entities)} records in local cache")
    profiler.time()
    return self.entities

`to_entities(max_errors=None)`

Converts records fetched from the LOD into entity instances, applying validation. Args: max_errors (int, optional): Maximum allowed validation errors. Defaults to 0. Returns: list[Any]: A list of entity instances that have passed validation.

Source code in ceurws/sql_cache.py

def to_entities(self, max_errors: int | None = None) -> list[Any]:
    """
    Converts records fetched from the LOD into entity instances, applying validation.
    Args:
        max_errors (int, optional): Maximum allowed validation errors. Defaults to 0.
    Returns:
        list[Any]: A list of entity instances that have passed validation.
    """
    self.entities = []
    self.errors = []
    error_records = []
    if max_errors is None:
        max_errors = self.max_errors
    for record in self.lod:
        try:
            entity = self.clazz.model_validate(record)
            self.entities.append(entity)
        except Exception as e:
            self.errors.append(e)
            error_records.append(record)
    error_count = len(self.errors)
    if error_count > max_errors:
        msg = f"found {error_count} errors > maximum allowed {max_errors} errors"
        if self.debug:
            print(msg)
            for i, error in enumerate(self.errors):
                print(f"{i}:{error} for \n{error_records[i]}")
        raise Exception(msg)
    return self.entities

`SqlDB`

general SQL database

Source code in ceurws/sql_cache.py

class SqlDB:
    """
    general SQL database

    Wraps a SQLAlchemy engine for a local SQLite file and hands out sessions on it.
    """

    def __init__(self, sqlite_file_path: str, debug: bool = False):
        """
        Create the engine for the given SQLite file.

        Args:
            sqlite_file_path(str): path of the SQLite database file
            debug(bool): if True the engine is created with echo switched on
        """
        # keep the flag on the instance - the former "debug = debug" had no effect
        self.debug = debug
        sqlite_url = f"sqlite:///{sqlite_file_path}"
        # sqlite3 option: do not restrict the connection to the thread that created it
        connect_args = {"check_same_thread": False}
        self.engine = create_engine(sqlite_url, echo=debug, connect_args=connect_args)

    def get_session(self) -> Session:
        """
        Provide a session for database operations.

        Returns:
            Session: A SQLAlchemy Session object bound to the engine for database operations.
        """
        return Session(bind=self.engine)

`get_session()`

Provide a session for database operations.

Returns:

Name	Type	Description
`Session`	`Session`	A SQLAlchemy Session object bound to the engine for database operations.

Source code in ceurws/sql_cache.py

def get_session(self) -> Session:
    """
    Open a new session on my engine.

    Returns:
        Session: a SQLAlchemy Session bound to self.engine.
    """
    session = Session(bind=self.engine)
    return session

`textparser`

Created on 2022-08-15

@author: wf

`Textparser`

general text parser

Source code in ceurws/textparser.py

class Textparser:
    """
    general text parser

    Collects small helpers for cleaning text, regular expression
    matching and debug logging.
    """

    def __init__(self, debug: bool):
        """
        Constructor

        Args:
            debug(bool): if True switch debugging on
        """
        self.debug = debug

    @classmethod
    def sanitize(cls, text, replaceList=None) -> str:
        """
        sanitize given text

        Leading/trailing whitespace, dots and commas are stripped, line
        breaks are turned into blanks, the given fragments are removed and
        whitespace runs are collapsed to a single blank.

        Args:
            text: text to sanitize (None is passed through unchanged)
            replaceList: list of strings to remove from the given text

        Returns:
            str: sanitized string
        """
        if text is None:
            return text
        fragments = [] if replaceList is None else replaceList
        cleaned = text.strip("\n\t\r., ").replace("\n", " ").replace("\r", "")
        for fragment in fragments:
            cleaned = cleaned.replace(fragment, "")
        # split() without arguments drops every whitespace run
        words = cleaned.split()
        return " ".join(words)

    def log(self, msg: str):
        """
        log the given message if debug is on

        Args:
            msg(str): the message to log
        """
        if not self.debug:
            return
        print(msg)

    def hasValue(self, d, key):
        """
        check that the given attribute in the given dict is available and not none

        Args:
            d(dict): the dict
            key(str): the key

        Returns:
            True: if a not None Value is available
        """
        if key not in d:
            return False
        return d[key] is not None

    def getMatch(self, pattern, text, groupNo: int = 1):
        """
        match the given compiled regular expression at the start of the given text
        and return the given group of the match

        Args:
            pattern: the compiled regular expression to check
            text(str): the text to check
            groupNo(int): the number of the regular expression group to return

        Returns:
            str: the matching result or None if no match was found
        """
        found = pattern.match(text)
        return found.group(groupNo) if found else None

`init(debug)`

Constructor

Parameters:

Name	Type	Description	Default
`debug(bool)`		if TRUE switch debugging on	required

Source code in ceurws/textparser.py

def __init__(self, debug: bool):
    """
    Constructor - remembers the debug flag.

    Args:
        debug(bool): if True switch debugging on
    """
    self.debug = debug

`getMatch(pattern, text, groupNo=1)`

get the match for the given regular expression for the given text returning the given group number

Parameters:

Name	Description	Default
`pattern`	the compiled regular expression to check	required
`text(str)`	the text to check	required
`groupNo(int)`	the number of the regular expression group to return	`1`

Returns:

Name	Type	Description
`str`		the matching result or None if no match was found

Source code in ceurws/textparser.py

def getMatch(self, pattern, text, groupNo: int = 1):
    """
    match the given compiled regular expression at the start of the given text
    and return the given group of the match

    Args:
        pattern: the compiled regular expression to check
        text(str): the text to check
        groupNo(int): the number of the regular expression group to return

    Returns:
        str: the matching result or None if no match was found
    """
    found = pattern.match(text)
    return found.group(groupNo) if found else None

`hasValue(d, key)`

check that the given attribute in the given dict is available and not none

Parameters:

Name	Type	Description	Default
`d(dict)`		the dict	required
`key(str)`		the key	required

Returns:

Name	Type	Description
`True`		if a not None Value is available

Source code in ceurws/textparser.py

def hasValue(self, d, key):
    """
    check that the given key is present in the given dict with a value other than None

    Args:
        d(dict): the dict
        key(str): the key

    Returns:
        True: if a not None Value is available
    """
    if key not in d:
        return False
    return d[key] is not None

`log(msg)`

log the given message if debug is on

Parameters:

Name	Type	Description	Default
`msg(str)`		the message to log	required

Source code in ceurws/textparser.py

def log(self, msg: str):
    """
    print the given message - but only while debugging is switched on

    Args:
        msg(str): the message to log
    """
    if not self.debug:
        return
    print(msg)

`sanitize(text, replaceList=None)` `classmethod`

sanitize given text

Parameters:

Name	Type	Description	Default
`text`		text to sanitize	required
`replaceList`		list of strings to remove from the given text	`None`

Returns:

Name	Type	Description
`str`	`str`	sanitized string

Source code in ceurws/textparser.py

@classmethod
def sanitize(cls, text, replaceList=None) -> str:
    """
    sanitize given text

    Leading/trailing whitespace, dots and commas are stripped, line
    breaks are turned into blanks, the given fragments are removed and
    whitespace runs are collapsed to a single blank.

    Args:
        text: text to sanitize (None is passed through unchanged)
        replaceList: list of strings to remove from the given text

    Returns:
        str: sanitized string
    """
    if text is None:
        return text
    fragments = [] if replaceList is None else replaceList
    cleaned = text.strip("\n\t\r., ").replace("\n", " ").replace("\r", "")
    for fragment in fragments:
        cleaned = cleaned.replace(fragment, "")
    # split() without arguments drops every whitespace run
    words = cleaned.split()
    return " ".join(words)

`urn`

Created on 2023-12-28

@author: wf / ChatGPT-4 as instructed

Class URN is designed to verify and calculate check digits for URNs (Uniform Resource Names) as used in the DNB URN service. The class provides methods for both verifying a full URN's check digit (check_urn_checksum) and calculating the check digit for a given URN (calc_urn_checksum). It's adapted from PHP and JavaScript sources, following the guidelines and methods outlined by the DNB (German National Library) URN service.

`URN`

URN check digit calculator for DNB URN service:

see https://www.dnb.de/DE/Professionell/Services/URN-Service/urn-service_node.html

and https://d-nb.info/1045320641/34 http://nbn-resolving.de/nbnpruefziffer.php

Source code in ceurws/urn.py

class URN:
    """
    URN check digit calculator for DNB URN service:

    see https://www.dnb.de/DE/Professionell/Services/URN-Service/urn-service_node.html

    and
        https://d-nb.info/1045320641/34
        http://nbn-resolving.de/nbnpruefziffer.php

    """

    @classmethod
    def check_urn_checksum(cls, urn: str, debug: bool = False) -> bool:
        """
        check that the last character of the given URN is its valid check digit

        Args:
            urn(str): the full URN including the trailing check digit
            debug(bool): if True show the internal values while calculating

        Returns:
            bool: True if the calculated check digit equals the last character,
                False otherwise - also for URNs that are too short or contain
                characters no check digit can be calculated for
        """
        # a check digit needs at least one preceding character
        if len(urn) < 2:
            return False
        urn_prefix, urn_check_digit_str = urn[:-1], urn[-1]
        try:
            check_digit = cls.calc_urn_checksum(urn_prefix, debug)
        except ValueError:
            return False
        return str(check_digit) == urn_check_digit_str

    @classmethod
    def calc_urn_checksum(cls, test_urn: str, debug: bool = False) -> int:
        """
        converted from PHP and JavaScript code
        see https://github.com/bohnelang/URN-Pruefziffer

        Args:
            test_urn(str): the URN without its check digit
            debug(bool): if True show the internal values while calculating

        Returns:
            int: the check digit (0-9)

        Raises:
            ValueError: if the URN is empty, contains a character outside the
                code table or ends with a character that has no code
        """
        # Code string provided in the original PHP function:
        # two digits per character, indexed by (ord(char) - 45), "#" = no code
        code = "3947450102030405060708094117############1814191516212223242542262713282931123233113435363738########43"

        if not test_urn:
            raise ValueError("cannot calculate a check digit for an empty URN")

        _sum = 0
        pos = 1
        v2 = 0

        for i, char in enumerate(test_urn.upper()):
            # offset relative to the character '-' (45 in ASCII)
            x = ord(char) - 45
            # negative offsets would silently wrap around, too large ones overflow the table
            if x < 0 or x * 2 + 1 >= len(code):
                raise ValueError(f"invalid character {char!r} in URN {test_urn!r}")
            first, second = code[x * 2], code[x * 2 + 1]
            v1 = 0 if first == "#" else int(first)
            v2 = 0 if second == "#" else int(second)

            if v1 != 0:
                # two-digit code: the leading digit takes a position of its own
                _sum += pos * v1
                pos += 1
            _sum += v2 * pos
            pos += 1

            if debug:
                print(f"i: {i:2} pos: {pos:2} x: {x:2} v1: {v1:2} v2: {v2:2} sum: {_sum:4}")

        # the sum is divided by the last digit of the converted URN
        if v2 == 0:
            raise ValueError(f"URN {test_urn!r} ends with a character that has no check digit code")
        check_digit = (_sum // v2) % 10  # Using integer division for floor behavior

        return check_digit

`calc_urn_checksum(test_urn, debug=False)` `classmethod`

converted from PHP and JavaScript code, see https://github.com/bohnelang/URN-Pruefziffer

Source code in ceurws/urn.py

@classmethod
def calc_urn_checksum(cls, test_urn: str, debug: bool = False) -> int:
    """
    converted from PHP and JavaScript code
    see https://github.com/bohnelang/URN-Pruefziffer

    Args:
        test_urn(str): the URN without its check digit
        debug(bool): if True show the internal values while calculating

    Returns:
        int: the check digit (0-9)

    Raises:
        ValueError: if the URN is empty, contains a character outside the
            code table or ends with a character that has no code
    """
    # Code string provided in the original PHP function:
    # two digits per character, indexed by (ord(char) - 45), "#" = no code
    code = "3947450102030405060708094117############1814191516212223242542262713282931123233113435363738########43"

    if not test_urn:
        raise ValueError("cannot calculate a check digit for an empty URN")

    _sum = 0
    pos = 1
    v2 = 0

    for i, char in enumerate(test_urn.upper()):
        # offset relative to the character '-' (45 in ASCII)
        x = ord(char) - 45
        # negative offsets would silently wrap around, too large ones overflow the table
        if x < 0 or x * 2 + 1 >= len(code):
            raise ValueError(f"invalid character {char!r} in URN {test_urn!r}")
        first, second = code[x * 2], code[x * 2 + 1]
        v1 = 0 if first == "#" else int(first)
        v2 = 0 if second == "#" else int(second)

        if v1 != 0:
            # two-digit code: the leading digit takes a position of its own
            _sum += pos * v1
            pos += 1
        _sum += v2 * pos
        pos += 1

        if debug:
            print(f"i: {i:2} pos: {pos:2} x: {x:2} v1: {v1:2} v2: {v2:2} sum: {_sum:4}")

    # the sum is divided by the last digit of the converted URN
    if v2 == 0:
        raise ValueError(f"URN {test_urn!r} ends with a character that has no check digit code")
    check_digit = (_sum // v2) % 10  # Using integer division for floor behavior

    return check_digit

`utils`

`download`

Created on 2021-08-21

this is a redundant copy see e.g. https://github.com/WolfgangFahl/ConferenceCorpus/blob/main/corpus/utils/download.py

@author: wf

`Download`

Utility functions for downloading data

Source code in ceurws/utils/download.py

class Download:
    """
    Utility functions for downloading data
    """

    @staticmethod
    def getURLContent(url: str):
        """
        read the content of the given url

        Args:
            url(str): the url to read from

        Returns:
            str: the response body decoded with the default codec of bytes.decode
        """
        with urllib.request.urlopen(url) as urlResponse:
            content = urlResponse.read().decode()
            return content

    @staticmethod
    def getFileContent(path: str):
        """
        read the text content of the given file

        Args:
            path(str): the path of the file to read

        Returns:
            str: the content of the file
        """
        with open(path) as file:
            content = file.read()
            return content

    @staticmethod
    def needsDownload(filePath: Path, force: bool = False) -> bool:
        """
        check if a download of the given filePath is necessary, that is the file
        does not exist, has a size of zero or the download should be forced

        Args:
            filePath(Path): the path of the file to be checked
            force(bool): True if the result should be forced to True

        Return:
            bool: True if a download for this file is needed
        """
        if not filePath.is_file():
            result = True
        else:
            stats = filePath.stat()
            size = stats.st_size
            result = force or size == 0
        return result

    @staticmethod
    def downloadBackupFile(
        url: str,
        fileName: str,
        targetDirectory: Path,
        force: bool = False,
        profile: bool = True,
    ):
        """
        Downloads the gzip file from the given url and extracts it to the file
        with the given fileName in the target directory.

        Args:
            url: url linking to a downloadable gzip file
            fileName: Name of the file that should be extracted from gzip file
            targetDirectory(Path): download the file to this directory
            force (bool): True if the download should be forced
            profile(bool): if True show profiling information

        Returns:
            Path: the path of the extracted file in the target directory

        Raises:
            Exception: if the extracted file does not exist after unzipping
        """
        extractTo = targetDirectory.joinpath(fileName)
        zipped = targetDirectory.joinpath(f"{fileName}.gz")
        # we might want to check whether a new version is available
        if Download.needsDownload(extractTo, force=force):
            # create the target directory itself (not only its parent) -
            # otherwise the download has no directory to write to
            targetDirectory.mkdir(parents=True, exist_ok=True)
            msg = f"Downloading {zipped} from {url} ... this might take a few seconds ..."
            profiler = Profiler(msg=msg, profile=profile)
            urllib.request.urlretrieve(url, zipped)
            profiler.time(extraMsg=f" unzipping {extractTo} from {zipped}")
            with gzip.open(zipped, "rb") as gzipped, open(extractTo, "wb") as unzipped:
                shutil.copyfileobj(gzipped, unzipped)
            if not extractTo.is_file():
                raise Exception(f"could not extract {fileName} from {zipped}")
        return extractTo

`downloadBackupFile(url, fileName, targetDirectory, force=False, profile=True)` `staticmethod`

Downloads the gzip file from the given url and extracts it to the file with the given fileName in the target directory.

Parameters:

Name	Type	Description	Default
`url`	`str`	url linking to a downloadable gzip file	required
`fileName`	`str`	Name of the file that should be extracted from gzip file	required
`targetDirectory(str)`		download the file to this directory	required
`force`	`bool`	True if the download should be forced	`False`
`profile(bool)`		if True show profiling information	required

Returns:

Type	Description
	Name of the extracted file with path to the backup directory

Source code in ceurws/utils/download.py

@staticmethod
def downloadBackupFile(
    url: str,
    fileName: str,
    targetDirectory: Path,
    force: bool = False,
    profile: bool = True,
):
    """
    Downloads the gzip file from the given url and extracts it to the file
    with the given fileName in the target directory.

    Args:
        url: url linking to a downloadable gzip file
        fileName: Name of the file that should be extracted from gzip file
        targetDirectory(Path): download the file to this directory
        force (bool): True if the download should be forced
        profile(bool): if True show profiling information

    Returns:
        Path: the path of the extracted file in the target directory

    Raises:
        Exception: if the extracted file does not exist after unzipping
    """
    extractTo = targetDirectory.joinpath(fileName)
    zipped = targetDirectory.joinpath(f"{fileName}.gz")
    # we might want to check whether a new version is available
    if Download.needsDownload(extractTo, force=force):
        # create the target directory itself (not only its parent) -
        # otherwise the download has no directory to write to
        targetDirectory.mkdir(parents=True, exist_ok=True)
        msg = f"Downloading {zipped} from {url} ... this might take a few seconds ..."
        profiler = Profiler(msg=msg, profile=profile)
        urllib.request.urlretrieve(url, zipped)
        profiler.time(extraMsg=f" unzipping {extractTo} from {zipped}")
        with gzip.open(zipped, "rb") as gzipped, open(extractTo, "wb") as unzipped:
            shutil.copyfileobj(gzipped, unzipped)
        if not extractTo.is_file():
            raise Exception(f"could not extract {fileName} from {zipped}")
    return extractTo

`needsDownload(filePath, force=False)` `staticmethod`

check if a download of the given filePath is necessary, that is, the file does not exist, has a size of zero, or the download should be forced

Parameters:

Name	Type	Description	Default
`filePath(str)`		the path of the file to be checked	required
`force(bool)`		True if the result should be forced to True	required

Return

bool: True if a download for this file needed

Source code in ceurws/utils/download.py

@staticmethod
def needsDownload(filePath: Path, force: bool = False) -> bool:
    """
    decide whether the given file has to be downloaded: it is missing,
    it is empty or the download is forced

    Args:
        filePath(Path): the path of the file to be checked
        force(bool): True if the result should be forced to True

    Return:
        bool: True if a download for this file is needed
    """
    if filePath.is_file():
        size = filePath.stat().st_size
        return force or size == 0
    # nothing usable on disk yet
    return True

`Profiler`

simple profiler

Source code in ceurws/utils/download.py

class Profiler:
    """
    simple profiler
    """

    def __init__(self, msg: str | None = None, profile: bool = True):
        """
        construct me with the given msg and profile active flag

        Args:
            msg(str): the message to show if profiling is active
            profile(bool): True if messages should be shown
        """
        if msg is not None:
            self.msg = msg
        else:
            self.msg = ""
        self.profile = profile
        self.starttime = time.time()
        if profile:
            print(f"Starting {msg} ...")

    def time(self, extraMsg=""):
        """
        time the action and print if profile is active
        """
        elapsed = time.time() - self.starttime
        if self.profile:
            print(f"{self.msg}{extraMsg} took {elapsed:5.1f} s")
        return elapsed

`init(msg=None, profile=True)`

construct me with the given msg and profile active flag

Parameters:

Name	Type	Description	Default
`msg(str)`		the message to show if profiling is active	required
`profile(bool)`		True if messages should be shown	required

Source code in ceurws/utils/download.py

def __init__(self, msg: str | None = None, profile: bool = True):
    """
    construct me with the given msg and profile active flag

    Args:
        msg(str): the message to show if profiling is active
        profile(bool): True if messages should be shown
    """
    if msg is not None:
        self.msg = msg
    else:
        self.msg = ""
    self.profile = profile
    self.starttime = time.time()
    if profile:
        print(f"Starting {msg} ...")

`time(extraMsg='')`

time the action and print if profile is active

Source code in ceurws/utils/download.py

def time(self, extraMsg=""):
    """
    measure the time elapsed since construction and report it if profiling is active

    Args:
        extraMsg(str): text to append to my message in the output

    Returns:
        float: the elapsed seconds
    """
    now = time.time()
    elapsed = now - self.starttime
    if not self.profile:
        return elapsed
    print(f"{self.msg}{extraMsg} took {elapsed:5.1f} s")
    return elapsed

`webscrape`

Created on 2020-08-20

@author: wf

this is a redundant copy of the sources at https://github.com/WolfgangFahl/ConferenceCorpus/blob/main/corpus/datasources/webscrape.py

`ScrapeDescription` `dataclass`

Description of rdfa elements to scrape

Source code in ceurws/utils/webscrape.py

@dataclass
class ScrapeDescription:
    """
    Description of rdfa elements to scrape

    Attributes:
        key: name under which the scraped value is stored
        tag: the tag to search
        attribute: the attribute to expect
        value: the value to expect
        multi: do we expect multiple elements?
    """

    key: str
    tag: str
    attribute: str
    value: str
    multi: bool = False

`WebScrape`

WebScraper with a rudimentary Parser for https://en.wikipedia.org/wiki/RDFa extended for CEUR-WS and WikiCFP specific scraping

https://stackoverflow.com/questions/21876602/what-does-the-html-typeof-attribute-do https://de.wikipedia.org/wiki/RDFa https://stackoverflow.com/questions/20767903/parsing-rdfa-in-html-xhtml https://www.w3.org/MarkUp/2009/rdfa-for-html-authors

Source code in ceurws/utils/webscrape.py

class WebScrape:
    """
    WebScraper
    with a rudimentary Parser for https://en.wikipedia.org/wiki/RDFa
    extended for CEUR-WS and WikiCFP specific scraping

    https://stackoverflow.com/questions/21876602/what-does-the-html-typeof-attribute-do
    https://de.wikipedia.org/wiki/RDFa
    https://stackoverflow.com/questions/20767903/parsing-rdfa-in-html-xhtml
    https://www.w3.org/MarkUp/2009/rdfa-for-html-authors
    """

    def __init__(
        self,
        debug: bool = False,
        showHtml: bool = False,
        timeout: float = 20,
        agent: str = "Mozilla/5.0",
    ):
        """
        Constructor

        Args:
            debug(bool): if True show debugging information
            showHtml(bool): if True show the HTML retrieved
            timeout(float): the default timeout
            agent(str): the agent to mimic
        """
        self.err: Exception | None = None
        self.valid = False
        self.debug = debug
        self.showHtml = showHtml
        self.timeout = timeout
        self.agent = agent
        # set by parseRDFa - defined here so that it exists before any page was parsed
        self.soup = None

    def findLinkForRegexp(self, regex: str):
        """
        find a link for the given regular expression

        Works on the page last parsed by parseRDFa.

        Args:
            regex(str): the regular expression to find a link for

        Return:
            m(object),text(str): the match/text tuple or None,None
        """
        soup = getattr(self, "soup", None)
        if soup is None:
            # no page has been parsed (successfully) yet - so there is no link
            return None, None
        m = None
        text = None
        pattern = re.compile(regex)
        link = soup.find("a", href=pattern)
        if link:
            href = link["href"]
            m = pattern.match(href)
            if hasattr(link, "text"):
                text = link.text
        return m, text

    def fromTag(
        self,
        soup: BeautifulSoup,
        tag: str,
        attr: str | None = None,
        value: str | None = None,
        multi: bool = False,
    ):
        """
        get metadata from a given tag, attribute and value
        e.g. <span class="CEURVOLACRONYM">DL4KG2020</span>

        tag=span, attr=class, value=CEURVOLACRONYM

        Args:
           soup(BeautifulSoup): the parser to work with
           tag(string): the tag to search
           attr(string): the attribute to expect
           value(string): the value to expect
           multi(bool): if True - return multiple values

        Returns:
            the list of texts of all matching tags if multi is True,
            otherwise the text of the first matching tag or None if there is none
        """
        # https://stackoverflow.com/a/16248908/1497139
        # find a list of all tag elements
        if attr is not None and value is not None:
            nodes = soup.find_all(tag, {attr: value})
        else:
            nodes = soup.find_all(tag)
        lines = [node.get_text() for node in nodes]
        if multi:
            return lines
        return lines[0] if lines else None

    def getSoup(self, url: str, showHtml: bool = False, debug: bool = False) -> BeautifulSoup | None:
        """
        get the beautiful Soup parser

        Args:
           url(str): the url to open
           showHtml(bool): if True  the html code should be pretty printed and shown
           debug(bool): if True debug info should be printed
        Return:
            BeautifulSoup: the html parser or None if no html could be retrieved
        """
        html = self.get_html_from_url(url, debug=debug)
        soup = self.get_soup_from_string(html, show_html=showHtml) if html is not None else None
        return soup

    def get_soup_from_string(self, html: str | bytes, show_html: bool = False) -> BeautifulSoup:
        """
        get the beautiful Soup parser for the given html string

        Args:
            html: html content to parse
            show_html: True if the html code should be pretty printed and shown

        Returns:
            BeautifulSoup: the html parser
        """
        soup = BeautifulSoup(html, "html.parser")
        if show_html:
            self.printPrettyHtml(soup)
        return soup

    def printPrettyHtml(self, soup):
        """
        print the prettified html for the given soup

        Args:
            soup(BeautifulSoup): the parsed html to print
        """
        prettyHtml = soup.prettify()
        print(prettyHtml)

    def parseWithScrapeDescription(
        self,
        soup: BeautifulSoup,
        scrapeDescr: list["ScrapeDescription"] | None = None,
    ) -> dict:
        """
        extract the values described by the given scrape descriptions from the given soup

        Args:
            soup: html parser to parse the content from
            scrapeDescr: descriptions of the elements to scrape

        Return:
             a dict with the results, keyed by the key of each scrape description
        """
        scrapeDict = dict()
        if isinstance(scrapeDescr, list):
            for scrapeItem in scrapeDescr:
                value = self.fromTag(
                    soup,
                    scrapeItem.tag,
                    scrapeItem.attribute,
                    scrapeItem.value,
                    multi=scrapeItem.multi,
                )
                scrapeDict[scrapeItem.key] = value
        self.valid = True
        return scrapeDict

    def parseRDFa(self, url):
        """
        rudimentary RDFa parsing

        Args:
            url(str): the url of the page to parse

        Returns:
            list: (subject, name, value) tuples; empty if the page could not
                be retrieved - self.err then holds the error
        """
        triples = []
        try:
            self.soup = self.getSoup(url, self.showHtml)
            if self.soup is None:
                # the page could not be retrieved, get_html_from_url has set self.err
                return triples
            subjectNodes = self.soup.find_all(True, {"typeof": True})
            for subjectNode in subjectNodes:
                subject = subjectNode.attrs["typeof"]
                if self.debug:
                    print(subjectNode)
                for predicateNode in subjectNode.find_all():
                    value = None
                    name = None
                    if "content" in predicateNode.attrs:
                        value = predicateNode.attrs["content"]
                    else:
                        value = predicateNode.get_text()
                    if "property" in predicateNode.attrs:
                        name = predicateNode.attrs["property"]
                    if name is not None and value is not None:
                        triples.append((subject, name, value))
            self.valid = True
        except urllib.error.URLError as err:
            # HTTPError is a subclass of URLError and therefore handled here as well
            self.err = err
        return triples

    def get_html_from_url(self, url: str, debug: bool = False) -> str | bytes | None:
        """
        Get the html response from the given url
        Args:
            url: url to the get the content from
            debug(bool): if True show non available volumes

        Returns:
            str: content of the url as string
            bytes: If the content of the url contains encoding errors
            None: If the url is not reachable
        """
        req = urllib.request.Request(url, headers={"User-Agent": f"{self.agent}"})
        # handle cookies
        opener = build_opener(HTTPCookieProcessor())
        try:
            response = opener.open(req, timeout=self.timeout)
        except HTTPError as herr:
            self.err = herr
            if debug:
                print(f"{url.split('/')[-1]} not available")
            return None
        # make sure the response is closed once its content has been read
        with response:
            html = response.read()
            # responses that declare no charset would make decode(None) fail with a TypeError
            charset = response.headers.get_content_charset() or "utf-8"
        try:
            html = html.decode(charset)
        except (UnicodeDecodeError, LookupError) as ex:
            # keep the raw bytes if the declared charset is wrong or unknown
            print(f"ERROR: Could not properly decode the html code of <{url}>")
            print(ex)
        return html

`init(debug=False, showHtml=False, timeout=20, agent='Mozilla/5.0')`

Constructor

Parameters:

Name	Description	Default
`debug(bool)`	if True show debugging information	required
`showHtml(bool)`	if True show the HTML retrieved	required
`timeout(float)`	the default timeout	required
`agent(str)`	the agent to mimic	required

Source code in ceurws/utils/webscrape.py

def __init__(
    self,
    debug: bool = False,
    showHtml: bool = False,
    timeout: float = 20,
    agent: str = "Mozilla/5.0",
):
    """
    Constructor

    Args:
        debug(bool): if True show debugging information
        showHtml(bool): if True show the HTML retrieved
        timeout(float): the default timeout
        agent(str): the agent to mimic
    """
    self.err: Exception | None = None
    self.valid = False
    self.debug = debug
    self.showHtml = showHtml
    self.timeout = timeout
    self.agent = agent
    # set by parseRDFa - defined here so that it exists before any page was parsed
    self.soup = None

`findLinkForRegexp(regex)`

find a link for the given regular expression

Parameters:

Name	Type	Description	Default
`regex(str)`		the regular expression to find a link for	required

Return

m(object),text(str): the match/text tuple or None,None

Source code in ceurws/utils/webscrape.py

def findLinkForRegexp(self, regex: str):
    """
    find a link for the given regular expression

    Works on the page kept in self.soup.

    Args:
        regex(str): the regular expression to find a link for

    Return:
        m(object),text(str): the match/text tuple or None,None
    """
    soup = getattr(self, "soup", None)
    if soup is None:
        # no page has been parsed (successfully) yet - so there is no link
        return None, None
    m = None
    text = None
    pattern = re.compile(regex)
    link = soup.find("a", href=pattern)
    if link:
        href = link["href"]
        m = pattern.match(href)
        if hasattr(link, "text"):
            text = link.text
    return m, text

`fromTag(soup, tag, attr=None, value=None, multi=False)`

get metadata from a given tag, attribute and value e.g. DL4KG2020

tag=span, attr=class, value=CEURVOLACRONYM

Parameters:

Name	Description	Default
`soup(BeautifulSoup)`	the parser to work with	required
`tag(string)`	the tag to search	required
`attr(string)`	the attribute to expect	required
`value(string)`	the value to expect	required
`multi(bool)`	if True - return multiple values	required

Source code in ceurws/utils/webscrape.py

def fromTag(
    self,
    soup: BeautifulSoup,
    tag: str,
    attr: str | None = None,
    value: str | None = None,
    multi: bool = False,
):
    """
    extract the text of the elements selected by tag, attribute and value
    e.g. <span class="CEURVOLACRONYM">DL4KG2020</span>

    tag=span, attr=class, value=CEURVOLACRONYM

    Args:
       soup(BeautifulSoup): the parser to work with
       tag(string): the tag to search
       attr(string): the attribute to expect
       value(string): the value to expect
       multi(bool): if True - return multiple values

    Return:
        the list of all texts if multi is True, otherwise the first text
        or None if no element was found
    """
    # https://stackoverflow.com/a/16248908/1497139
    # the attribute filter only applies when both attr and value are given
    filtered = attr is not None and value is not None
    nodes = soup.find_all(tag, {attr: value}) if filtered else soup.find_all(tag)
    texts = []
    for node in nodes:
        texts.append(node.get_text())
    if multi:
        return texts
    return texts[0] if texts else None

`getSoup(url, showHtml=False, debug=False)`

get the beautiful Soup parser

Parameters:

Name	Description	Default
`url(str)`	the url to open	required
`showHtml(bool)`	if True the html code should be pretty printed and shown	required
`debug(bool)`	if True debug info should be printed	required

Return: BeautifulSoup: the html parser

Source code in ceurws/utils/webscrape.py

def getSoup(self, url: str, showHtml: bool = False, debug: bool = False) -> BeautifulSoup | None:
    """
    fetch the given url and hand back a parser for its html

    Args:
       url(str): the url to open
       showHtml(bool): if True  the html code should be pretty printed and shown
       debug(bool): if True debug info should be printed
    Return:
        BeautifulSoup: the html parser or None if no html could be retrieved
    """
    html = self.get_html_from_url(url, debug=debug)
    if html is None:
        # nothing was retrieved - there is nothing to parse
        return None
    return self.get_soup_from_string(html, show_html=showHtml)

`get_html_from_url(url, debug=False)`

Get the html response from the given url.

Args:

- `url`: the url to get the content from
- `debug(bool)`: if True show non available volumes

Returns:

Name	Type	Description
`str`	`str \| bytes \| None`	content of the url as string
`bytes`	`str \| bytes \| None`	If the content of the url contains encoding errors
`None`	`str \| bytes \| None`	If the url is not reachable

Source code in ceurws/utils/webscrape.py

def get_html_from_url(self, url: str, debug: bool = False) -> str | bytes | None:
    """
    Get the html response from the given url

    The request is sent with self.agent as User-Agent and self.timeout as
    timeout. Only HTTPError is handled here, any other error raised while
    opening the url is passed on to the caller.

    Args:
        url: url to the get the content from
        debug(bool): if True show non available volumes

    Returns:
        str: content of the url as string
        bytes: If the content of the url contains encoding errors
        None: If the server answered with an HTTP error (kept in self.err)
    """
    req = urllib.request.Request(url, headers={"User-Agent": f"{self.agent}"})
    # handle cookies
    opener = build_opener(HTTPCookieProcessor())
    try:
        response = opener.open(req, timeout=self.timeout)
    except HTTPError as herr:
        self.err = herr
        if debug:
            print(f"{url.split('/')[-1]} not available")
        return None
    # make sure the response is closed even if reading it fails
    with response:
        html = response.read()
        # servers are free to omit the charset - get_content_charset() is
        # None in that case and bytes.decode(None) would raise a TypeError
        charset = response.headers.get_content_charset() or "utf-8"
    try:
        html = html.decode(charset)
    except (UnicodeDecodeError, LookupError) as ex:
        # LookupError: the announced charset is unknown to python;
        # in both cases the raw bytes are returned
        print(f"ERROR: Could not properly decode the html code of <{url}>")
        print(ex)
    return html

`get_soup_from_string(html, show_html=False)`

get the beautiful Soup parser for the given html string

Parameters:

Name	Type	Description	Default
`html`	`str \| bytes`	html content to parse	required
`show_html`	`bool`	True if the html code should be pretty printed and shown	`False`

Returns:

Name	Type	Description
`BeautifulSoup`	`BeautifulSoup`	the html parser

Source code in ceurws/utils/webscrape.py

def get_soup_from_string(self, html: str | bytes, show_html: bool = False) -> BeautifulSoup:
    """
    create the beautiful Soup parser for the given html content

    Args:
        html: html content to parse
        show_html: True if the html code should be pretty printed and shown

    Returns:
        BeautifulSoup: the html parser
    """
    parser = BeautifulSoup(html, "html.parser")
    if not show_html:
        return parser
    # the caller asked to see the html that was parsed
    self.printPrettyHtml(parser)
    return parser

`parseRDFa(url)`

rudimentary RDFa parsing

Source code in ceurws/utils/webscrape.py

def parseRDFa(self, url):
    """
    rudimentary RDFa parsing

    Every element with a "typeof" attribute is taken as a subject. For each
    of its descendants carrying a "property" attribute a triple is created
    whose value is the "content" attribute or - if there is none - the
    text of the element.

    Args:
        url: the url of the page to parse

    Return:
        list: the (subject, name, value) triples found - empty if the page
        could not be retrieved. self.valid is only set when the page was
        parsed, errors are kept in self.err
    """
    triples = []
    try:
        self.soup = self.getSoup(url, self.showHtml)
        if self.soup is None:
            # getSoup has no parser to offer when the html could not be
            # retrieved - there is nothing to parse then
            return triples
        for subjectNode in self.soup.find_all(True, {"typeof": True}):
            subject = subjectNode.attrs["typeof"]
            if self.debug:
                print(subjectNode)
            for predicateNode in subjectNode.find_all():
                attrs = predicateNode.attrs
                if "property" not in attrs:
                    # only elements naming a property contribute a triple
                    continue
                name = attrs["property"]
                value = attrs["content"] if "content" in attrs else predicateNode.get_text()
                if name is not None and value is not None:
                    triples.append((subject, name, value))
        self.valid = True
    except HTTPError as herr:
        self.err = herr
    except urllib.error.URLError as terr:
        self.err = terr
    return triples

`parseWithScrapeDescription(soup, scrapeDescr=None)`

Parse the given soup with the given scrape descriptions.

Args:

- `soup`: html parser to parse the content from
- `scrapeDescr`: descriptions of the entries to extract (key, tag, attribute, value, multi)

Return

a dict with the results

Source code in ceurws/utils/webscrape.py

def parseWithScrapeDescription(
    self,
    soup: BeautifulSoup,
    scrapeDescr: list["ScrapeDescription"] | None = None,
) -> dict:
    """
    collect the values described by the given scrape descriptions

    Args:
        soup: html parser to parse the content from
        scrapeDescr: the descriptions (key, tag, attribute, value, multi)
            of the entries to extract - anything but a list is ignored

    Return:
         a dict with the results
    """
    scrapeItems = scrapeDescr if isinstance(scrapeDescr, list) else []
    # one entry per description, looked up via fromTag
    scrapeDict = {
        item.key: self.fromTag(soup, item.tag, item.attribute, item.value, multi=item.multi)
        for item in scrapeItems
    }
    self.valid = True
    return scrapeDict

`printPrettyHtml(soup)`

print the prettified html for the given soup

Parameters:

Name	Type	Description	Default
`soup(BeautifulSoup)`		the parsed html to print	required

Source code in ceurws/utils/webscrape.py

def printPrettyHtml(self, soup):
    """
    write the prettified html of the given soup to stdout

    Args:
        soup(BeautifulSoup): the parsed html to print
    """
    print(soup.prettify())

`version`

Created on 2022-09-11

@author: wf

`Version` `dataclass`

Version handling for VolumeBrowser

Source code in ceurws/version.py

@dataclass
class Version:
    """
    Version handling for VolumeBrowser

    Holds the descriptive metadata of the application as class attributes.
    None of the attributes is annotated, so the dataclass decorator does
    not turn them into fields - they are plain class level constants.
    """

    # display name and version (taken from the ceurws package)
    name = "CEUR-WS Volume Browser"
    version = ceurws.__version__
    # creation date and date of the last update
    date = "2022-08-14"
    updated = "2024-07-31"
    description = "CEUR-WS Volume browser"

    authors = "Tim Holzheim, Wolfgang Fahl"

    # documentation, discussion and source code locations
    doc_url = "https://wiki.bitplan.com/index.php/pyCEURmake"
    chat_url = "https://github.com/WolfgangFahl/pyCEURmake/discussions"
    cm_url = "https://github.com/WolfgangFahl/pyCEURmake"

    license = """Copyright 2022 contributors. All rights reserved.

  Licensed under the Apache License 2.0
  http://www.apache.org/licenses/LICENSE-2.0

  Distributed on an "AS IS" basis without warranties
  or conditions of any kind, either express or implied."""
    # assembled from the attributes above, so it has to be defined after them
    longDescription = f"""{name} version {version}
{description}

  Created by {authors} on {date} last updated {updated}"""

`view`

Created on 2024-02-23

@author: wf

`View`

generic View

Source code in ceurws/view.py

class View:
    """
    generic View
    """

    noneValue = "-"
    wdPrefix = "http://www.wikidata.org/entity/"

    def getValue(self, obj, attr):
        value = getattr(obj, attr, View.noneValue)
        if value is None:
            value = View.noneValue
        return value

    def getRowValue(self, row, key):
        value = None
        if key in row:
            value = row[key]
        if value is None:
            value = View.noneValue
        return value

    def createLink(self, url: str, text: str):
        """
        create a link from the given url and text

        Args:
            url(str): the url to create a link for
            text(str): the text to add for the link
        """
        link = Link.create(url, text, target="_blank")
        return link

    def createWdLink(self, qid: str, text: str):
        wd_url = f"{View.wdPrefix}/{qid}"
        link = self.createLink(wd_url, text)
        return link

    def get_dict_as_html_table(self, data_dict) -> str:
        # Convert the dictionary to a list of lists for tabulate
        data_list = [[key, value] for key, value in data_dict.items()]

        # Generate the HTML table
        html_table = tabulate(data_list, tablefmt="html", headers=["Key", "Value"])
        return html_table

    def createExternalLink(
        self,
        row: dict,
        key: str,
        text: str,
        formatterUrl: str,
        emptyIfNone: bool = False,
    ) -> str:
        """
        create an ExternalLink for the given row entry with the given key, text and formatterUrl

        Args:
            row(dict): the row to extract the value from
            key(str): the key
            text(str): the text to display for the link
            formatterUrl(str): the prefix for the url to use
            emptyIfNone(bool): if True return empty string if value is Display.noneValue

        Returns:
            str - html link for external id
        """
        value = self.getRowValue(row, key)
        if not value or value == View.noneValue:
            if emptyIfNone:
                return ""
            else:
                return View.noneValue

        if value.startswith(View.wdPrefix):
            value = value.replace(View.wdPrefix, "")
        url = formatterUrl + value
        link = self.createLink(url, text)
        return link

    def createItemLink(self, row: dict, key: str, separator: str | None = None) -> str:
        """
        create an item link
        Args:
            row: row object with the data
            key: key of the value for which the link is created
            separator: If not None split the value on the separator and create multiple links
        """
        value = self.getRowValue(row, key)
        if value == View.noneValue:
            return value
        item = row[key]
        itemLabel = row[f"{key}Label"]
        itemLink = ""
        if separator is not None:
            item_parts = item.split(separator)
            itemLabel_parts = itemLabel.split(separator)
            links = []
            for url, label in zip(item_parts, itemLabel_parts, strict=False):
                link = self.createLink(url, label)
                links.append(link)
            itemLink = "<br>".join(links)
        else:
            itemLink = self.createLink(item, itemLabel)
        return itemLink

`createExternalLink(row, key, text, formatterUrl, emptyIfNone=False)`

create an ExternalLink for the given row entry with the given key, text and formatterUrl

Parameters:

Name	Description	Default
`row(dict)`	the row to extract the value from	required
`key(str)`	the key	required
`text(str)`	the text to display for the link	required
`formatterUrl(str)`	the prefix for the url to use	required
`emptyIfNone(bool)`	if True return empty string if value is View.noneValue	required

Returns:

Type	Description
`str`	str - html link for external id

Source code in ceurws/view.py

def createExternalLink(
    self,
    row: dict,
    key: str,
    text: str,
    formatterUrl: str,
    emptyIfNone: bool = False,
) -> str:
    """
    build the link to an external resource for the value found in the row

    Args:
        row(dict): the row to extract the value from
        key(str): the key
        text(str): the text to display for the link
        formatterUrl(str): the prefix for the url to use
        emptyIfNone(bool): if True return empty string if value is View.noneValue

    Returns:
        str - html link for external id
    """
    value = self.getRowValue(row, key)
    if value and value != View.noneValue:
        # only the id part of a wikidata entity url is appended to the formatterUrl
        if value.startswith(View.wdPrefix):
            value = value.replace(View.wdPrefix, "")
        return self.createLink(formatterUrl + value, text)
    # there is no value to link to
    return "" if emptyIfNone else View.noneValue

`createItemLink(row, key, separator=None)`

Create an item link.

Args:

- `row`: row object with the data
- `key`: key of the value for which the link is created
- `separator`: if not None, split the value on the separator and create multiple links

Source code in ceurws/view.py

def createItemLink(self, row: dict, key: str, separator: str | None = None) -> str:
    """
    link the item found in the row to its label

    The url is taken from row[key], the text from row[f"{key}Label"].

    Args:
        row: row object with the data
        key: key of the value for which the link is created
        separator: If not None split the value on the separator and create multiple links
    """
    value = self.getRowValue(row, key)
    if value == View.noneValue:
        return value
    item, itemLabel = row[key], row[f"{key}Label"]
    if separator is None:
        return self.createLink(item, itemLabel)
    # one link per url/label pair - surplus urls or labels are ignored
    pairs = zip(item.split(separator), itemLabel.split(separator), strict=False)
    return "<br>".join([self.createLink(url, label) for url, label in pairs])

`createLink(url, text)`

create a link from the given url and text

Parameters:

Name	Type	Description	Default
`url(str)`		the url to create a link for	required
`text(str)`		the text to add for the link	required

Source code in ceurws/view.py

def createLink(self, url: str, text: str):
    """
    create a link for the given url and text that opens in a new
    browser tab/window (target="_blank")

    Args:
        url(str): the url to create a link for
        text(str): the text to add for the link
    """
    return Link.create(url, text, target="_blank")

`volume_neo4j`

`Editor` `dataclass`

Represents an editor with their name and ORCID.

Source code in ceurws/volume_neo4j.py

@dataclass
class Editor:
    """
    Represents an editor with their name and ORCID.
    """

    name: str
    orcid: str | None = None
    # set by search_by_name, None until a search succeeded
    likelihood: float | None = None

    @classmethod
    def from_json(cls, json_data):
        """
        Create an Editor instance from JSON data.

        Args:
            json_data (dict): The JSON data representing the editor.

        Returns:
            Editor: The Editor instance created from the JSON data.
        """
        return cls(name=json_data.get("name"), orcid=json_data.get("orcid"))

    def search_by_name(self):
        """
        Search the editor by name using the ORCID API and calculate the likelihood.

        Nothing happens if the editor has no name. The likelihood is only
        updated if the API answers with status code 200.
        """
        if not self.name:
            return
        response = requests.get(
            "https://pub.orcid.org/v3.0/search/",
            # let requests encode the name - a raw name containing
            # characters like & or # would break the query string
            params={"q": self.name},
            headers={"Accept": "application/json"},
            # requests never times out on its own
            timeout=30,
        )
        if response.status_code == 200:
            data = response.json()
            num_results = data.get("num-found", 0)
            self.likelihood = num_results / 10  # Arbitrary calculation, adjust as needed

    def create_node(self, tx, volume_node_id: int) -> int | None:
        """
        Create an Editor node in Neo4j and establish a relationship with a Volume node.

        Args:
            tx: The Neo4j transaction.
            volume_node_id (int): The ID of the volume node.

        Returns:
            int: The ID of the created Editor node.
            None: if the editor could not be created
        """
        query = """
        MATCH (v:Volume)
        WHERE id(v) = $volume_node_id
        CREATE (v)-[:HAS_EDITOR]->(e:Editor {name: $name, orcid: $orcid, likelihood: $likelihood})
        RETURN id(e) as node_id
        """
        parameters = {
            "volume_node_id": volume_node_id,
            "name": self.name,
            "orcid": self.orcid,
            "likelihood": self.likelihood,
        }
        result = tx.run(query, parameters)
        record = result.single()
        if record is not None:
            return record["node_id"]
        else:
            return None

`create_node(tx, volume_node_id)`

Create an Editor node in Neo4j and establish a relationship with a Volume node.

Parameters:

Name	Type	Description	Default
`tx`		The Neo4j transaction.	required
`volume_node_id`	`int`	The ID of the volume node.	required

Returns:

Name	Type	Description
`int`	`int \| None`	The ID of the created Editor node.
`None`	`int \| None`	if the editor could not be created

Source code in ceurws/volume_neo4j.py

def create_node(self, tx, volume_node_id: int) -> int | None:
    """
    Add this editor as a node to Neo4j and link it to its Volume node
    with a HAS_EDITOR relationship.

    Args:
        tx: The Neo4j transaction.
        volume_node_id (int): The ID of the volume node.

    Returns:
        int: The ID of the created Editor node.
        None: if the editor could not be created
    """
    query = """
    MATCH (v:Volume)
    WHERE id(v) = $volume_node_id
    CREATE (v)-[:HAS_EDITOR]->(e:Editor {name: $name, orcid: $orcid, likelihood: $likelihood})
    RETURN id(e) as node_id
    """
    record = tx.run(
        query,
        {
            "volume_node_id": volume_node_id,
            "name": self.name,
            "orcid": self.orcid,
            "likelihood": self.likelihood,
        },
    ).single()
    # no record means that no node was created
    return None if record is None else record["node_id"]

`from_json(json_data)` `classmethod`

Create an Editor instance from JSON data.

Parameters:

Name	Type	Description	Default
`json_data`	`dict`	The JSON data representing the editor.	required

Returns:

Name	Type	Description
`Editor`		The Editor instance created from the JSON data.

Source code in ceurws/volume_neo4j.py

@classmethod
def from_json(cls, json_data):
    """
    Build an Editor from the "name" and "orcid" entries of the JSON data.

    Args:
        json_data (dict): The JSON data representing the editor.

    Returns:
        Editor: The Editor instance created from the JSON data.
    """
    # missing entries end up as None
    name = json_data.get("name")
    orcid = json_data.get("orcid")
    return cls(name=name, orcid=orcid)

`search_by_name()`

Search the editor by name using the ORCID API and calculate the likelihood.

Source code in ceurws/volume_neo4j.py

def search_by_name(self):
    """
    Search the editor by name using the ORCID API and calculate the likelihood.

    Nothing happens if the editor has no name. The likelihood is only
    updated if the API answers with status code 200.
    """
    if not self.name:
        return
    response = requests.get(
        "https://pub.orcid.org/v3.0/search/",
        # let requests encode the name - a raw name containing
        # characters like & or # would break the query string
        params={"q": self.name},
        headers={"Accept": "application/json"},
        # requests never times out on its own
        timeout=30,
    )
    if response.status_code == 200:
        data = response.json()
        num_results = data.get("num-found", 0)
        self.likelihood = num_results / 10  # Arbitrary calculation, adjust as needed

`Location` `dataclass`

Source code in ceurws/volume_neo4j.py

@dataclass
class Location:
    """
    city, country and date as found in a location string
    """

    city: str
    country: str
    date: str

    @staticmethod
    def parse(location_str: str) -> Optional["Location"]:
        """
        Split a location string of the format "City, Country, Date"
        into its three parts.

        Args:
            location_str: The location string to parse.

        Returns:
            A Location object or None if the string could not be parsed.
        """
        match = re.match(r"^(.*), (.*), (.*)$", location_str)
        if match is None:
            return None
        # the three groups are city, country and date - in that order
        return Location(*match.groups())

`parse(location_str)` `staticmethod`

Parse a location string of the format "City, Country, Date"

Parameters:

Name	Type	Description	Default
`location_str`	`str`	The location string to parse.	required

Returns:

Type	Description
`Optional[Location]`	A Location object or None if the string could not be parsed.

Source code in ceurws/volume_neo4j.py

@staticmethod
def parse(location_str: str) -> Optional["Location"]:
    """
    Split a location string of the format "City, Country, Date"
    into its three parts.

    Args:
        location_str: The location string to parse.

    Returns:
        A Location object or None if the string could not be parsed.
    """
    match = re.match(r"^(.*), (.*), (.*)$", location_str)
    if match is None:
        return None
    # the three groups are city, country and date - in that order
    return Location(*match.groups())

`Neo4j`

Neo4j wrapper class

Source code in ceurws/volume_neo4j.py

class Neo4j:
    """
    Neo4j wrapper class
    """

    def __init__(
        self,
        host: str | None = None,
        bolt_port: int = 7687,
        auth=("neo4j", "password"),
        scheme: str = "bolt",
        encrypted: bool = False,
    ):
        """
        constructor

        Errors raised by the driver (ServiceUnavailable, AuthError,
        ConfigurationError) are kept in self.error, self.driver stays None
        in that case.

        Args:
            host(str): the host to connect to - if None the ip address
                of the local hostname is used
            bolt_port(int): the bolt port of the server
            auth: the credentials handed to the driver
            scheme(str): the uri scheme
            encrypted(bool): handed to the driver as is

        Raises:
            ValueError: if no connection to the port can be established
        """
        if host is None:
            # resolved per call: as a default value in the signature the
            # lookup would run once at import time and a hostname that
            # does not resolve would make the import of the module fail
            host = socket.gethostbyname(socket.gethostname())
        self.driver = None
        self.error = None
        self.host = host
        self.bolt_port = bolt_port
        self.encrypted = encrypted
        self.scheme = scheme
        try:
            uri = f"{scheme}://{host}:{bolt_port}"
            if not Neo4j.is_port_available(host, bolt_port):
                raise ValueError(f"port at {uri} not available")
            self.driver = GraphDatabase.driver(uri, auth=auth, encrypted=encrypted)
        except (ServiceUnavailable, AuthError, ConfigurationError) as e:
            self.error = e

    @classmethod
    def is_port_available(cls, host, port: int) -> bool:
        """
        check whether a TCP connection to the given host and port can be
        established within one second

        Args:
            host: the host to connect to
            port(int): the port to connect to

        Returns:
            bool: True if the connection attempt succeeded
        """
        # the context manager closes the socket on every path
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            sock.settimeout(1)  # 1 Second Timeout
            try:
                sock.connect((host, port))
            except OSError:
                return False
        return True

    def close(self):
        """
        close the driver if there is one
        """
        if self.driver is not None:
            self.driver.close()

`__init__(host=socket.gethostbyname(socket.gethostname()), bolt_port=7687, auth=('neo4j', 'password'), scheme='bolt', encrypted=False)`

constructor

Source code in ceurws/volume_neo4j.py

def __init__(
    self,
    host: str | None = None,
    bolt_port: int = 7687,
    auth=("neo4j", "password"),
    scheme: str = "bolt",
    encrypted: bool = False,
):
    """
    constructor

    Errors raised by the driver (ServiceUnavailable, AuthError,
    ConfigurationError) are kept in self.error, self.driver stays None
    in that case.

    Args:
        host(str): the host to connect to - if None the ip address
            of the local hostname is used
        bolt_port(int): the bolt port of the server
        auth: the credentials handed to the driver
        scheme(str): the uri scheme
        encrypted(bool): handed to the driver as is

    Raises:
        ValueError: if no connection to the port can be established
    """
    if host is None:
        # resolved per call: as a default value in the signature the
        # lookup would run once at import time and a hostname that
        # does not resolve would make the import of the module fail
        host = socket.gethostbyname(socket.gethostname())
    self.driver = None
    self.error = None
    self.host = host
    self.bolt_port = bolt_port
    self.encrypted = encrypted
    self.scheme = scheme
    try:
        uri = f"{scheme}://{host}:{bolt_port}"
        if not Neo4j.is_port_available(host, bolt_port):
            raise ValueError(f"port at {uri} not available")
        self.driver = GraphDatabase.driver(uri, auth=auth, encrypted=encrypted)
    except (ServiceUnavailable, AuthError, ConfigurationError) as e:
        self.error = e

`Volume` `dataclass`

Represents a volume with its attributes.

Source code in ceurws/volume_neo4j.py

@dataclass
class Volume:
    """
    Represents a volume with its attributes.
    """

    acronym: str
    title: str
    loctime: str
    editors: list["Editor"] = field(default_factory=list)

    @classmethod
    def from_json(cls, json_data):
        """
        Create a Volume instance from JSON data.

        The "editors" entry is expected to be a comma separated string of
        names, one Editor is created per name.

        Args:
            json_data (dict): The JSON data representing the volume.

        Returns:
            Volume: The Volume instance created from the JSON data.
        """
        editor_names = json_data.get("editors")
        editor_names = editor_names.split(",") if editor_names else []
        editors = [Editor(name=name.strip()) for name in editor_names]
        return cls(
            acronym=json_data.get("acronym"),
            title=json_data.get("title"),
            loctime=json_data.get("loctime"),
            editors=editors,
        )

    def create_node(self, tx) -> int | None:
        """
        Create a Volume node in Neo4j.

        Args:
            tx: The Neo4j transaction.

        Returns:
            int: The ID of the created node.
            None: if the node was not created
        """
        query = """
        CREATE (v:Volume {acronym: $acronym, title: $title, loctime: $loctime})
        RETURN id(v) as node_id
        """
        parameters = {
            "acronym": self.acronym,
            "title": self.title,
            "loctime": self.loctime,
        }
        result = tx.run(query, parameters)
        record = result.single()
        if record is not None:
            return record["node_id"]
        else:
            return None

    @staticmethod
    def load_json_file(source: str) -> list["Volume"]:
        """
        Load volumes from the source JSON file.

        Args:
            source (str): Path to the source JSON file.

        Returns:
            List[Volume]: The list of loaded volumes.
        """
        # JSON is UTF-8 encoded - do not depend on the locale of the platform
        with open(source, encoding="utf-8") as file:
            json_data = json.load(file)

        volumes = [Volume.from_json(volume_data) for volume_data in json_data]
        return volumes

    @classmethod
    def default_source(cls) -> Path:
        """
        get the default source
        """
        default_source = CEURWS.CACHE_DIR / "volumes.json"
        return default_source

    @classmethod
    def parse_args(cls, argv: list | None = None):
        """
        Parse command line arguments.

        Args:
            argv(list): command line arguments

        Returns:
            argparse.Namespace: The parsed command line arguments.
        """

        default_source = cls.default_source()
        parser = argparse.ArgumentParser(description="Volume/Editor/Location Information")
        parser.add_argument("--source", default=str(default_source), help="Source JSON file path")
        # Add progress option
        parser.add_argument(
            "--progress",
            action="store_true",
            help="Display progress information",
        )

        return parser.parse_args(argv)

    @staticmethod
    def main(argv=None):
        """
        Load the volumes from the source file given on the command line and
        create a node for each volume and each of its editors in Neo4j.

        Args:
            argv(list): command line arguments - sys.argv[1:] if None
        """
        if argv is None:
            argv = sys.argv[1:]
        args = Volume.parse_args(argv)
        volumes = Volume.load_json_file(args.source)

        # Connect to Neo4j
        driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "password"))
        try:
            with driver.session() as session:
                for volume in volumes:
                    volume_node_id = volume.create_node(session)
                    for editor in volume.editors:
                        editor.search_by_name()
                        editor.create_node(session, volume_node_id)
        finally:
            # release the connections of the driver even if the import fails
            driver.close()

`create_node(tx)`

Create a Volume node in Neo4j.

Parameters:

Name	Type	Description	Default
`tx`		The Neo4j transaction.	required

Returns:

Name	Type	Description
`int`	`int \| None`	The ID of the created node.
`None`	`int \| None`	if the node was not created

Source code in ceurws/volume_neo4j.py

def create_node(self, tx) -> int | None:
    """
    Add this volume as a Volume node to Neo4j.

    Args:
        tx: The Neo4j transaction.

    Returns:
        int: The ID of the created node.
        None: if the node was not created
    """
    query = """
    CREATE (v:Volume {acronym: $acronym, title: $title, loctime: $loctime})
    RETURN id(v) as node_id
    """
    record = tx.run(
        query,
        {
            "acronym": self.acronym,
            "title": self.title,
            "loctime": self.loctime,
        },
    ).single()
    # no record means that no node was created
    return None if record is None else record["node_id"]

`default_source()` `classmethod`

get the default source

Source code in ceurws/volume_neo4j.py

@classmethod
def default_source(cls) -> Path:
    """
    get the default source: volumes.json in the CEURWS cache directory
    """
    return CEURWS.CACHE_DIR / "volumes.json"

`from_json(json_data)` `classmethod`

Create a Volume instance from JSON data.

Parameters:

Name	Type	Description	Default
`json_data`	`dict`	The JSON data representing the volume.	required

Returns:

Name	Type	Description
`Volume`		The Volume instance created from the JSON data.

Source code in ceurws/volume_neo4j.py

@classmethod
def from_json(cls, json_data):
    """
    Build a Volume from the "acronym", "title", "loctime" and "editors"
    entries of the JSON data.

    Args:
        json_data (dict): The JSON data representing the volume.

    Returns:
        Volume: The Volume instance created from the JSON data.
    """
    editors = []
    raw_editors = json_data.get("editors")
    if raw_editors:
        # the editors are given as a comma separated string of names
        for name in raw_editors.split(","):
            editors.append(Editor(name=name.strip()))
    return cls(
        acronym=json_data.get("acronym"),
        title=json_data.get("title"),
        loctime=json_data.get("loctime"),
        editors=editors,
    )

`load_json_file(source)` `staticmethod`

Load volumes from the source JSON file.

Parameters:

Name	Type	Description	Default
`source`	`str`	Path to the source JSON file.	required

Returns:

Type	Description
`list[Volume]`	List[Volume]: The list of loaded volumes.

Source code in ceurws/volume_neo4j.py

@staticmethod
def load_json_file(source: str) -> list["Volume"]:
    """
    Load volumes from the source JSON file.

    The file is expected to contain a list of volume records, each record
    is converted with Volume.from_json.

    Args:
        source (str): Path to the source JSON file.

    Returns:
        List[Volume]: The list of loaded volumes.
    """
    # JSON is UTF-8 encoded - do not depend on the locale of the platform
    with open(source, encoding="utf-8") as file:
        json_data = json.load(file)

    volumes = [Volume.from_json(volume_data) for volume_data in json_data]
    return volumes

`parse_args(argv=None)` `classmethod`

Parse command line arguments.

Parameters:

Name	Type	Description	Default
`argv(list)`		command line arguments	required

Returns:

Type	Description
	argparse.Namespace: The parsed command line arguments.

Source code in ceurws/volume_neo4j.py

@classmethod
def parse_args(cls, argv: list | None = None):
    """
    Evaluate the command line arguments --source and --progress.

    Args:
        argv(list): command line arguments

    Returns:
        argparse.Namespace: The parsed command line arguments.
    """
    parser = argparse.ArgumentParser(description="Volume/Editor/Location Information")
    add_option = parser.add_argument
    # the source defaults to the default source of the class
    add_option("--source", default=str(cls.default_source()), help="Source JSON file path")
    # Add progress option
    add_option("--progress", action="store_true", help="Display progress information")
    return parser.parse_args(argv)

`volume_view`

Created on 2024-02-23

@author: wf

`VolumeListView`

Bases: View

show a list of volumes as a table

Source code in ceurws/volume_view.py

class VolumeListView(View):
    """
    show a list of volumes as a table
    """

    def __init__(self, solution, parent):
        """
        constructor

        Loads the volume records and builds the UI right away.

        Args:
            solution: the solution
            parent: the parent UI container

        """
        self.solution = solution
        self.parent = parent
        self.wdSync = self.solution.wdSync
        # by default nothing is written to Wikidata and errors are not ignored
        self.dry_run = True
        self.ignore_errors = False
        self.get_volume_lod()
        self.setup_ui()

    def setup_ui(self):
        """
        show my volumes as a list

        Builds a button row (with the dry run / ignore_errors controls and
        the progress bar), a log row and a grid row showing self.lod.
        """
        try:
            with ui.row() as self.button_row:
                self.check_recently_added_volumes_button = (
                    ui.button(
                        icon="cloud_download",
                        on_click=self.on_check_recently_update_volumes_button_click,
                    )
                    .classes("btn btn-primary btn-sm col-1")
                    .tooltip("check for recently added volumes")
                )
                self.wikidataButton = (
                    ui.button(
                        icon="web",
                        on_click=self.onWikidataButtonClick,
                    )
                    .classes("btn btn-primary btn-sm col-1")
                    .tooltip("Export to Wikidata")
                )
                self.dry_run_switch = ui.switch("dry run").bind_value(self, "dry_run")
                self.ignore_errors_check_box = ui.checkbox("ignore_errors", value=self.ignore_errors).bind_value(
                    self, "ignore_errors"
                )
                self.progress_bar = NiceguiProgressbar(total=100, desc="added", unit="volume")
            with ui.row() as self.log_row:
                self.log_view = ui.html()
            with ui.row() as self.grid_row:
                grid_config = GridConfig(key_col="Vol", multiselect=True)
                self.lod_grid = ListOfDictsGrid(lod=self.lod, config=grid_config)
                # Modify the columnDefs for the "Title" column after grid initialization
                for col_def in self.lod_grid.ag_grid.options["columnDefs"]:
                    if col_def["field"] == "Title":  # Identify the "Title" column
                        col_def["maxWidth"] = 400  # width in pixels
                self.lod_grid.sizeColumnsToFit()
        except Exception as ex:
            self.solution.handle_exception(ex)

    def clear_msg(self, msg: str = ""):
        """
        clear the log_view with the given message

        Args:
            msg(str): the message to display
        """
        with self.log_row:
            self.log_view.content = msg

    def add_msg(self, html_markup: str):
        """
        add the given html_markup message to the log_view

        Args:
            html_markup(str): the html formatted message to add
        """
        with self.log_row:
            self.log_view.content += html_markup

    def updateWikidataVolumes(self, selected_rows):
        """
        update wikidata volumes for the selected rows

        Args:
            selected_rows: the selected grid rows; the "#" entry of each row
                is the volume number
        """
        try:
            msg = f"{len(selected_rows)} Volumes selected<br>"
            self.clear_msg(msg)
            # First, sort selected_rows by the volume number in ascending order
            sorted_rows = sorted(selected_rows, key=lambda row: row["#"])
            for row in sorted_rows:
                vol_number = row["#"]
                volume = self.wdSync.volumesByNumber[vol_number]
                self.add_or_update_volume_in_wikidata(volume)
        except Exception as ex:
            self.solution.handle_exception(ex)

    async def onWikidataButtonClick(self, _args):
        """
        handle wikidata sync request
        """
        selected_rows = await self.lod_grid.get_selected_rows()
        await run.io_bound(self.updateWikidataVolumes, selected_rows)

    def check_recently_updated_volumes(self):
        """
        check recently updated volumes

        Reads every recently added volume, stores the volumes after each
        batch of 100 (followed by a one minute pause) and at the end, then
        resets the progress bar and updates the grid.
        """
        try:
            text = "checking CEUR-WS index.html for recently added volumes ..."
            self.clear_msg(text)
            (
                volumesByNumber,
                addedVolumeNumberList,
            ) = self.wdSync.getRecentlyAddedVolumeList()
            self.add_msg(f"<br>found {len(addedVolumeNumberList)} new volumes")
            total = len(addedVolumeNumberList)
            self.progress_bar.total = total
            for i, volumeNumber in enumerate(addedVolumeNumberList):
                if i % 100 == 0 and i != 0:
                    # a batch of 100 is complete: store it and pause
                    self.wdSync.storeVolumes()
                    time.sleep(60)
                volume = volumesByNumber[volumeNumber]
                self.updateRecentlyAddedVolume(volume, i + 1, total)
                url = f"/volume/{volume.number}"
                text = f"{volume}:{volume.acronym}"
                link = self.createLink(url, text)
                self.add_msg(f":{link}")
            self.wdSync.storeVolumes()
            with self.parent:
                self.progress_bar.reset()
            with self.grid_row:
                self.lod_grid.update()
        except Exception as ex:
            self.solution.handle_exception(ex)

    async def on_check_recently_update_volumes_button_click(self, args):
        """
        handle clicking of the refresh button to get recently added volumes
        """
        await run.io_bound(self.check_recently_updated_volumes)

    def updateRecentlyAddedVolume(self, volume, index, total):
        """
        update a recently added Volume

        Args:
            volume(Volume): the volume to update
            index(int): the relative index of the volume currently being added
            total(int): the total number of volumes currently being added
        """
        html_msg = f"<br>reading {index}/{total} from {volume.url}"
        self.add_msg(html_msg)
        volume.extractValuesFromVolumePage()
        self.wdSync.addVolume(volume)
        self.progress_bar.update_value(index)

    def get_volume_lod(self):
        """
        get the list of dict of all volumes

        The result is stored in self.lod, highest volume number first.
        """
        self.lod = []
        volumeList = self.wdSync.vm.getList()
        reverseVolumeList = sorted(volumeList, key=lambda volume: volume.number, reverse=True)
        for volume in reverseVolumeList:
            validMark = "✅" if volume.valid else "❌"
            self.lod.append(
                {
                    "#": volume.number,
                    "Vol": self.createLink(volume.url, f"Vol-{volume.number:04}"),
                    "Acronym": self.getValue(volume, "acronym"),
                    "Title": self.getValue(volume, "title"),
                    "Loctime": self.getValue(volume, "loctime"),
                    "Published": self.getValue(volume, "published"),
                    "SubmittedBy": self.getValue(volume, "submittedBy"),
                    "valid": validMark,
                }
            )

    def add_or_update_volume_in_wikidata(self, volume: Volume):
        """
        add the given volume to wikidata or update it if it already exists

        Args:
            volume(Volume): the CEUR-WS volume to update proceedings and event entries for
        """
        try:
            msg = f"trying to add Volume {volume.number} to wikidata"
            with self.parent:
                ui.notify(msg)
            self.add_msg(msg + "<br>")
            proceedingsWikidataId = self.createProceedingsItemFromVolume(volume)
            if proceedingsWikidataId is not None:
                self.createEventItemAndLinkProceedings(volume, proceedingsWikidataId)
            else:
                msg = f"adding Volume {volume.number} proceedings to wikidata failed"
                # the <br> markup is meant for the html log view only - the
                # notification gets the plain message
                self.add_msg("<br>" + msg)
                with self.parent:
                    ui.notify(msg)
        except Exception as ex:
            self.solution.handle_exception(ex)

    def optional_login(self) -> bool:
        """
        check if we need to login

        Returns:
            bool: True if write is enabled
        """
        write = not self.dry_run
        if write:
            self.wdSync.login()
        return write

    def createProceedingsItemFromVolume(self, volume: Volume):
        """
        Create wikidata item for proceedings of given volume

        Args:
            volume(Volume): the volume to create the proceedings item for

        Returns:
            the Q-identifier of the proceedings item: the last of the items
            already known for the volume's URN, otherwise the qid of the
            creation result. None if there is no such id or an exception
            occurred.
        """
        qId = None
        try:
            write = self.optional_login()
            # check if already in wikidata → use URN
            urn = volume.urn
            wdItems = self.wdSync.getProceedingWdItemsByUrn(urn)
            if len(wdItems) > 0:
                html = f"Volume {volume.number} already in Wikidata see "
                links = []
                for wdItem in wdItems:
                    # the Q-identifier is the last path segment of the item url
                    qId = wdItem.split("/")[-1]
                    links.append(self.createLink(wdItem, qId))
                # the comma separates the links - it must not trail the last one
                html += ",".join(links)
                self.add_msg(html + "<br>")
            else:
                # A proceedings volume for the URN is not known → create wd entry
                wdRecord = self.wdSync.getWikidataProceedingsRecord(volume)
                if self.dry_run:
                    markup = self.get_dict_as_html_table(wdRecord)
                    self.add_msg(markup)
                result = self.wdSync.addProceedingsToWikidata(wdRecord, write=write, ignoreErrors=self.ignore_errors)
                qId = result.qid
                if qId is not None:
                    proc_link = self.createWdLink(
                        qId,
                        f"Proceedings entry for Vol {volume.number} {qId} was created",
                    )
                    self.add_msg(proc_link)
                else:
                    self.add_msg(f"Creating wikidata Proceedings entry for Vol {volume.number} failed")
                    for key, value in result.errors.items():
                        msg = f"{key}:{value}"
                        self.add_msg(msg)
        except Exception as ex:
            self.solution.handle_exception(ex)
        return qId

    def createEventItemAndLinkProceedings(self, volume: Volume, proceedingsWikidataId: str | None = None):
        """
        Create event  wikidata item for given volume and link
        the proceedings with the event

        Args:
            volume(Volume): the volume for which to create the event item
            proceedingsWikidataId: wikidata id of the proceedings
        """
        try:
            write = self.optional_login()
            results = self.wdSync.doCreateEventItemAndLinkProceedings(volume, proceedingsWikidataId, write=write)
            if write:
                self.wdSync.logout()
            for key, result in results.items():
                if result.qid:
                    if key == "dblp":
                        url = f"https://dblp.org/db/{result.qid}.html"
                        link = self.createLink(url, f"dblp {result.qid}")
                    else:
                        link = self.createWdLink(
                            result.qid,
                            f"{key} for Vol {volume.number} {result.qid}",
                        )
                    self.add_msg("<br>" + link)
                if result.msg:
                    self.add_msg("<br>" + result.msg)
                if len(result.errors) > 0:
                    for error in result.errors.values():
                        self.add_msg(f"error {str(error)}")
        except Exception as ex:
            self.solution.handle_exception(ex)

`__init__(solution, parent)`

constructor

Parameters:

Name	Type	Description	Default
`solution`		the solution	required
`parent`		the parent UI container	required

Source code in ceurws/volume_view.py

def __init__(self, solution, parent):
    """
    set up the volume list view

    Remembers the solution and the parent container, takes the Wikidata
    sync helper from the solution, presets the dry run / ignore errors
    options and then loads the volume records and builds the UI.

    Args:
        solution: the solution
        parent: the parent UI container

    """
    self.solution, self.parent = solution, parent
    self.wdSync = solution.wdSync
    # by default nothing is written to Wikidata and errors are not ignored
    self.dry_run, self.ignore_errors = True, False
    # load the records first, then build the user interface
    self.get_volume_lod()
    self.setup_ui()

`add_msg(html_markup)`

add the given html_markup message to the log_view

Parameters:

Name	Type	Description	Default
`html_markup(str)`		the html formatted message to add	required

Source code in ceurws/volume_view.py

def add_msg(self, html_markup: str):
    """
    append the given html markup to what the log_view currently shows

    Args:
        html_markup(str): the html formatted message to add
    """
    with self.log_row:
        shown = self.log_view.content
        self.log_view.content = shown + html_markup

`add_or_update_volume_in_wikidata(volume)`

add the given volume to wikidata or update it if it already exists

Parameters:

Name	Type	Description	Default
`volume(Volume)`		the CEUR-WS volume to update proceedings and event entries for	required

Source code in ceurws/volume_view.py

def add_or_update_volume_in_wikidata(self, volume: Volume):
    """
    add the given volume to wikidata or update it if it already exists

    First the proceedings item is looked up or created; only when that
    yields a wikidata id are the event item and its link to the proceedings
    handled. Progress is reported as a notification and in the log view.
    Any exception is passed to the solution's exception handler.

    Args:
        volume(Volume): the CEUR-WS volume to update proceedings and event entries for
    """
    try:
        msg = f"trying to add Volume {volume.number} to wikidata"
        with self.parent:
            ui.notify(msg)
        self.add_msg(msg + "<br>")
        proceedingsWikidataId = self.createProceedingsItemFromVolume(volume)
        if proceedingsWikidataId is not None:
            self.createEventItemAndLinkProceedings(volume, proceedingsWikidataId)
        else:
            msg = f"adding Volume {volume.number} proceedings to wikidata failed"
            # the <br> markup is meant for the html log view only - the
            # notification gets the plain message
            self.add_msg("<br>" + msg)
            with self.parent:
                ui.notify(msg)
    except Exception as ex:
        self.solution.handle_exception(ex)

`check_recently_updated_volumes()`

check recently updated volumes

Source code in ceurws/volume_view.py

def check_recently_updated_volumes(self):
    """
    look for volumes recently added to CEUR-WS and take them over

    Every volume reported by wdSync.getRecentlyAddedVolumeList is read via
    updateRecentlyAddedVolume and announced in the log view. The volumes
    are stored after each batch of 100 (followed by a one minute pause) and
    once more at the end; then the progress bar is reset and the grid is
    updated. Exceptions are handed to the solution's exception handler.
    """
    try:
        self.clear_msg("checking CEUR-WS index.html for recently added volumes ...")
        volumes_by_number, added_numbers = self.wdSync.getRecentlyAddedVolumeList()
        total = len(added_numbers)
        self.add_msg(f"<br>found {total} new volumes")
        self.progress_bar.total = total
        for done, number in enumerate(added_numbers):
            if done and done % 100 == 0:
                # a batch of 100 is complete: store it and pause for a minute
                self.wdSync.storeVolumes()
                time.sleep(60)
            volume = volumes_by_number[number]
            self.updateRecentlyAddedVolume(volume, done + 1, total)
            link = self.createLink(f"/volume/{volume.number}", f"{volume}:{volume.acronym}")
            self.add_msg(f":{link}")
        self.wdSync.storeVolumes()
        with self.parent:
            self.progress_bar.reset()
        with self.grid_row:
            self.lod_grid.update()
    except Exception as ex:
        self.solution.handle_exception(ex)

`clear_msg(msg='')`

clear the log_view with the given message

Parameters:

Name	Type	Description	Default
`msg(str)`		the message to display	`''`

Source code in ceurws/volume_view.py

def clear_msg(self, msg: str = ""):
    """
    replace whatever the log_view shows by the given message

    Args:
        msg(str): the message to display - the empty default blanks the view
    """
    with self.log_row:
        setattr(self.log_view, "content", msg)

`createEventItemAndLinkProceedings(volume, proceedingsWikidataId=None)`

Create event wikidata item for given volume and link the proceedings with the event

Parameters:

Name	Type	Description	Default
`volume(Volume)`		the volume for which to create the event item	required
`proceedingsWikidataId`	`str \| None`	wikidata id of the proceedings	`None`

Source code in ceurws/volume_view.py

def createEventItemAndLinkProceedings(self, volume: Volume, proceedingsWikidataId: str | None = None):
    """
    Create the event wikidata item for the given volume, link the
    proceedings with that event and report every result in the log view

    Logs in before and out after the call when writing is enabled (see
    optional_login). Exceptions go to the solution's exception handler.

    Args:
        volume(Volume): the volume for which to create the event item
        proceedingsWikidataId: wikidata id of the proceedings
    """
    try:
        write = self.optional_login()
        results = self.wdSync.doCreateEventItemAndLinkProceedings(volume, proceedingsWikidataId, write=write)
        if write:
            self.wdSync.logout()
        for key, result in results.items():
            qid = result.qid
            if qid:
                # dblp results link to dblp.org, everything else to wikidata
                link = (
                    self.createLink(f"https://dblp.org/db/{qid}.html", f"dblp {qid}")
                    if key == "dblp"
                    else self.createWdLink(qid, f"{key} for Vol {volume.number} {qid}")
                )
                self.add_msg("<br>" + link)
            if result.msg:
                self.add_msg("<br>" + result.msg)
            if len(result.errors) > 0:
                for error in result.errors.values():
                    self.add_msg(f"error {error!s}")
    except Exception as ex:
        self.solution.handle_exception(ex)

`createProceedingsItemFromVolume(volume)`

Create wikidata item for proceedings of given volume

Source code in ceurws/volume_view.py

def createProceedingsItemFromVolume(self, volume: Volume):
    """
    Create wikidata item for proceedings of given volume

    If wikidata already has items for the volume's URN they are listed in
    the log view and nothing is created. Otherwise the proceedings record
    is derived from the volume (and shown as a table while dry_run is set)
    and passed to wdSync.addProceedingsToWikidata. Exceptions go to the
    solution's exception handler.

    Args:
        volume(Volume): the volume to create the proceedings item for

    Returns:
        the Q-identifier of the proceedings item: the last of the items
        already known for the URN, otherwise the qid of the creation
        result. None if there is no such id or an exception occurred.
    """
    qId = None
    try:
        write = self.optional_login()
        # check if already in wikidata → use URN
        urn = volume.urn
        wdItems = self.wdSync.getProceedingWdItemsByUrn(urn)
        if len(wdItems) > 0:
            html = f"Volume {volume.number} already in Wikidata see "
            links = []
            for wdItem in wdItems:
                # the Q-identifier is the last path segment of the item url
                qId = wdItem.split("/")[-1]
                links.append(self.createLink(wdItem, qId))
            # the comma separates the links - it must not trail the last one
            html += ",".join(links)
            self.add_msg(html + "<br>")
        else:
            # A proceedings volume for the URN is not known → create wd entry
            wdRecord = self.wdSync.getWikidataProceedingsRecord(volume)
            if self.dry_run:
                markup = self.get_dict_as_html_table(wdRecord)
                self.add_msg(markup)
            result = self.wdSync.addProceedingsToWikidata(wdRecord, write=write, ignoreErrors=self.ignore_errors)
            qId = result.qid
            if qId is not None:
                proc_link = self.createWdLink(
                    qId,
                    f"Proceedings entry for Vol {volume.number} {qId} was created",
                )
                self.add_msg(proc_link)
            else:
                self.add_msg(f"Creating wikidata Proceedings entry for Vol {volume.number} failed")
                for key, value in result.errors.items():
                    msg = f"{key}:{value}"
                    self.add_msg(msg)
    except Exception as ex:
        self.solution.handle_exception(ex)
    return qId

`get_volume_lod()`

get the list of dict of all volumes

Source code in ceurws/volume_view.py

def get_volume_lod(self):
    """
    fill self.lod with one record per volume, highest volume number first

    Each record carries the volume number, a link to the volume page, the
    acronym, title, loctime, published and submittedBy values and a mark
    telling whether the volume is valid.
    """
    rows = []
    self.lod = rows
    volumes = list(self.wdSync.vm.getList())
    volumes.sort(key=lambda volume: volume.number, reverse=True)
    for volume in volumes:
        valid_mark = "❌" if not volume.valid else "✅"
        row = {
            "#": volume.number,
            "Vol": self.createLink(volume.url, f"Vol-{volume.number:04}"),
            "Acronym": self.getValue(volume, "acronym"),
            "Title": self.getValue(volume, "title"),
            "Loctime": self.getValue(volume, "loctime"),
            "Published": self.getValue(volume, "published"),
            "SubmittedBy": self.getValue(volume, "submittedBy"),
            "valid": valid_mark,
        }
        rows.append(row)

`onWikidataButtonClick(_args)` `async`

handle wikidata sync request

Source code in ceurws/volume_view.py

async def onWikidataButtonClick(self, _args):
    """
    handle a click on the wikidata button: fetch the rows selected in the
    grid and pass them to updateWikidataVolumes via run.io_bound
    """
    rows = await self.lod_grid.get_selected_rows()
    sync_job = run.io_bound(self.updateWikidataVolumes, rows)
    await sync_job

`on_check_recently_update_volumes_button_click(args)` `async`

handle clicking of the refresh button to get recently added volumes

Source code in ceurws/volume_view.py

async def on_check_recently_update_volumes_button_click(self, args):
    """
    handle a click on the button that checks for recently added volumes by
    running check_recently_updated_volumes via run.io_bound
    """
    check = self.check_recently_updated_volumes
    await run.io_bound(check)

`optional_login()`

check if we need to login

Returns:

Name	Type	Description
`bool`	`bool`	True if write is enabled

Source code in ceurws/volume_view.py

def optional_login(self) -> bool:
    """
    log in to wikidata unless this is a dry run

    Returns:
        bool: True if write is enabled (dry_run is off), False otherwise
    """
    if self.dry_run:
        return False
    self.wdSync.login()
    return True

`setup_ui()`

show my volumes as a list

Source code in ceurws/volume_view.py

def setup_ui(self):
    """
    build the user interface of the volume list

    Creates three rows: a button row (check for recently added volumes,
    export to Wikidata, the "dry run" switch and the "ignore_errors"
    checkbox bound to self.dry_run / self.ignore_errors, and a progress
    bar), a log row holding the html log view, and a grid row showing
    self.lod in a multi-select grid keyed by the "Vol" column.
    Any exception is passed to the solution's exception handler.
    """
    try:
        # row 1: action buttons, options and the progress bar
        with ui.row() as self.button_row:
            self.check_recently_added_volumes_button = (
                ui.button(
                    icon="cloud_download",
                    on_click=self.on_check_recently_update_volumes_button_click,
                )
                .classes("btn btn-primary btn-sm col-1")
                .tooltip("check for recently added volumes")
            )
            self.wikidataButton = (
                ui.button(
                    icon="web",
                    on_click=self.onWikidataButtonClick,
                )
                .classes("btn btn-primary btn-sm col-1")
                .tooltip("Export to Wikidata")
            )
            # both controls are bound to the attributes of this view
            self.dry_run_switch = ui.switch("dry run").bind_value(self, "dry_run")
            self.ignore_errors_check_box = ui.checkbox("ignore_errors", value=self.ignore_errors).bind_value(
                self, "ignore_errors"
            )
            # no-op statement
            pass
            self.progress_bar = NiceguiProgressbar(total=100, desc="added", unit="volume")
        # row 2: the html log view written to by clear_msg / add_msg
        with ui.row() as self.log_row:
            self.log_view = ui.html()
        # row 3: the grid with the volume records
        with ui.row() as self.grid_row:
            grid_config = GridConfig(key_col="Vol", multiselect=True)
            self.lod_grid = ListOfDictsGrid(lod=self.lod, config=grid_config)
            # Modify the columnDefs for the "Title" column after grid initialization
            for col_def in self.lod_grid.ag_grid.options["columnDefs"]:
                if col_def["field"] == "Title":  # Identify the "Title" column
                    col_def["maxWidth"] = 400  # width in pixels
            self.lod_grid.sizeColumnsToFit()
    except Exception as ex:
        self.solution.handle_exception(ex)

`updateRecentlyAddedVolume(volume, index, total)`

update a recently added Volume

Parameters:

Name	Description	Default
`volume(Volume)`	the volume to update	required
`index(int)`	the relative index of the volume currently being added	required
`total(int)`	the total number of volumes currently being added	required

Source code in ceurws/volume_view.py

def updateRecentlyAddedVolume(self, volume, index, total):
    """
    take over a single recently added volume

    Announces the read in the log view, lets the volume extract its values
    from its volume page, adds it to wdSync and moves the progress bar to
    the given index.

    Args:
        volume(Volume): the volume to update
        index(int): the relative index of the volume currently being added
        total(int): the total number of volumes currently being added
    """
    self.add_msg(f"<br>reading {index}/{total} from {volume.url}")
    volume.extractValuesFromVolumePage()
    self.wdSync.addVolume(volume)
    self.progress_bar.update_value(index)

`updateWikidataVolumes(selected_rows)`

update wikidata volumes for the selected rows

Source code in ceurws/volume_view.py

def updateWikidataVolumes(self, selected_rows):
    """
    add or update the volumes of the selected rows in wikidata

    The volumes are handled in ascending order of their volume number (the
    "#" entry of each row). Exceptions are handed to the solution's
    exception handler.
    """
    try:
        self.clear_msg(f"{len(selected_rows)} Volumes selected<br>")
        # only the volume number of a row is needed - work through them ascending
        for vol_number in sorted(row["#"] for row in selected_rows):
            self.add_or_update_volume_in_wikidata(self.wdSync.volumesByNumber[vol_number])
    except Exception as ex:
        self.solution.handle_exception(ex)

`VolumeView`

Bases: View

displays a single volume

Source code in ceurws/volume_view.py

class VolumeView(View):
    """
    displays a single volume
    """

    def __init__(self, solution, parent):
        """
        constructor

        The UI elements are not created here but by setup_ui on the first
        call of showVolume.

        Args:
            solution: the solution
            parent: the parent UI container

        """
        self.solution = solution
        self.parent = parent
        self.volumeToolBar = None
        self.wdSync = self.solution.wdSync
        self.wdSpan = None

    def setup_ui(self):
        """
        setup my User Interface elements
        """
        with self.parent:
            with ui.row() as self.volumeToolBar:
                self.volumeRefreshButton = (
                    ui.button(
                        icon="refresh",
                        on_click=self.onRefreshButtonClick,
                    )
                    .classes("btn btn-primary btn-sm col-1")
                    .tooltip("Refresh from CEUR-WS Volume page")
                )
                self.wikidataButton = (
                    ui.button(
                        icon="web",
                        on_click=self.onWikidataButtonClick,
                    )
                    .classes("btn btn-primary btn-sm col-1")
                    .tooltip("Export to Wikidata")
                )
            self.header_view = ui.html()
            self.iframe_view = ui.html().classes("w-full").style("height: 80vh;")

    def updateWikidataSpan(self, qId: str, volume: Volume):
        """
        create or update the Wikidata Export span which links to the volume
        and to its Wikidata item

        Args:
            qId(str): wikidata item Q Identifier
            volume(Volume): the Volume
        """
        if self.wdSpan is None:
            self.wdSpan = ui.html()
        # link the volume that was passed in, consistent with the link text
        volume_link = Link.create(url=volume.url, text=f"{volume.number}:{volume.acronym}")
        wd_url = self.wdSync.itemUrl(qId)
        wd_link = Link.create(url=wd_url, text=f"{qId} ")
        self.wdSpan.content = f"{volume_link}{wd_link}"

    def showVolume(self, volume: Volume):
        """
        show the given volume

        Sets up the UI on first use, shows the links available for the
        volume's wikidata proceedings entry (if any) together with the
        volume's header data and embeds the volume page in an iframe.

        Args:
            volume(Volume): the volume to show
        """
        try:
            self.volume = volume
            if self.volumeToolBar is None:
                self.setup_ui()

            wdProc = self.wdSync.getProceedingsForVolume(volume.number)
            # the export button is disabled once proceedings are known
            self.wikidataButton.disabled = wdProc is not None
            links = ""
            if wdProc is not None:
                # wikidata proceedings link
                itemLink = self.createLink(wdProc["item"], "wikidataitem")
                # dblp proceedings link
                dblpLink = self.createExternalLink(
                    wdProc,
                    "dblpEventId",
                    "dblp",
                    DblpEndpoint.DBLP_EVENT_PREFIX,
                    emptyIfNone=True,
                )
                # k10plus proceedings link
                k10PlusLink = self.createExternalLink(
                    wdProc,
                    "ppnId",
                    "k10plus",
                    "https://opac.k10plus.de/DB=2.299/PPNSET?PPN=",
                    emptyIfNone=True,
                )
                # scholia proceedings link
                scholiaLink = self.createExternalLink(
                    wdProc,
                    "item",
                    "scholia",
                    "https://scholia.toolforge.org/venue/",
                    emptyIfNone=True,
                )
                # scholia event link
                scholiaEventLink = self.createExternalLink(
                    wdProc,
                    "event",
                    "event",
                    "https://scholia.toolforge.org/event/",
                    emptyIfNone=True,
                )
                # scholia event series link
                scholiaEventSeriesLink = self.createExternalLink(
                    wdProc,
                    "eventSeries",
                    "series",
                    "https://scholia.toolforge.org/event-series/",
                    emptyIfNone=True,
                )
                # join the non-empty links with a non-breaking space
                delim = ""
                for link in [
                    itemLink,
                    dblpLink,
                    k10PlusLink,
                    scholiaLink,
                    scholiaEventLink,
                    scholiaEventSeriesLink,
                ]:
                    if link:
                        links += delim + link
                        delim = "&nbsp;"

            # the anchor around the acronym has to be closed with </a>
            headerHtml = f"""
    {links}<h3 style='font-size: 24px; font-weight: normal; margin-top: 20px; margin-bottom: 10px;'>{volume.h1}</h3>
    <a href='{volume.url}'>{volume.acronym}</a>
    {volume.title}<br>
    {volume.desc}
    published: {volume.pubDate}
    submitted By: {volume.submittedBy}"""
            iframeHtml = f"""
            <iframe src='{volume.url}' style='width: 100%; height: 80vh; border: none;'></iframe>"""
            self.header_view.content = headerHtml
            self.iframe_view.content = iframeHtml

        except Exception as ex:
            self.solution.handle_exception(ex)

    async def onRefreshButtonClick(self, _args):
        """
        handle a click on the refresh button: re-read the values of the
        current volume from its volume page and show the volume again
        """
        try:
            self.volume.extractValuesFromVolumePage()
            msg = f"updated from {self.volume.url}"
            ui.notify(msg)
            self.showVolume(self.volume)
            # self.wdSync.storeVolumes()
        except Exception as ex:
            self.solution.handle_exception(ex)

    async def onWikidataButtonClick(self, _args):
        """
        handle wikidata sync request

        Exports the current volume's proceedings record to wikidata. On
        success the export span is updated, otherwise the errors of the
        result are pushed to the solution's log view.
        """
        try:
            wdRecord = self.wdSync.getWikidataProceedingsRecord(self.volume)
            result = self.wdSync.addProceedingsToWikidata(wdRecord, write=True, ignoreErrors=False)
            qId = result.qid
            if qId is not None:
                msg = f"wikidata export of {self.volume.number} to {qId} done"
                ui.notify(msg)
                self.updateWikidataSpan(qId=qId, volume=self.volume)
            else:
                # the result reports its failures in the `errors` dict
                # (as read by VolumeListView), there is no `error` attribute
                err_msg = f"error:{result.errors}"
                self.solution.log_view.push(err_msg)
        except Exception as ex:
            self.solution.handle_exception(ex)

`__init__(solution, parent)`

constructor

Parameters:

Name	Type	Description	Default
`solution`		the solution	required
`parent`		the parent UI container	required

Source code in ceurws/volume_view.py

def __init__(self, solution, parent):
    """
    set up the view for a single volume

    Only remembers the solution, the parent container and the solution's
    Wikidata sync helper; the tool bar and the Wikidata span start out as
    None.

    Args:
        solution: the solution
        parent: the parent UI container

    """
    self.solution, self.parent = solution, parent
    # UI elements that do not exist yet
    self.volumeToolBar = None
    self.wdSync = solution.wdSync
    self.wdSpan = None

`onWikidataButtonClick(_args)` `async`

handle wikidata sync request

Source code in ceurws/volume_view.py

async def onWikidataButtonClick(self, _args):
    """
    handle wikidata sync request

    Exports the current volume's proceedings record to wikidata. On
    success a notification is shown and the export span is updated,
    otherwise the errors of the result are pushed to the solution's log
    view. Exceptions are handed to the solution's exception handler.
    """
    try:
        wdRecord = self.wdSync.getWikidataProceedingsRecord(self.volume)
        result = self.wdSync.addProceedingsToWikidata(wdRecord, write=True, ignoreErrors=False)
        qId = result.qid
        if qId is not None:
            msg = f"wikidata export of {self.volume.number} to {qId} done"
            ui.notify(msg)
            self.updateWikidataSpan(qId=qId, volume=self.volume)
        else:
            # the result reports its failures in the `errors` dict,
            # there is no `error` attribute
            err_msg = f"error:{result.errors}"
            self.solution.log_view.push(err_msg)
    except Exception as ex:
        self.solution.handle_exception(ex)

`setup_ui()`

setup my User Interface elements

Source code in ceurws/volume_view.py

def setup_ui(self):
    """
    setup my User Interface elements

    Inside the parent container this creates a tool bar row with a
    refresh button and a Wikidata export button, followed by two HTML
    elements: header_view and iframe_view.
    """
    with self.parent:
        # the row itself is kept as self.volumeToolBar
        with ui.row() as self.volumeToolBar:
            # refresh button, handled by onRefreshButtonClick
            self.volumeRefreshButton = (
                ui.button(
                    icon="refresh",
                    on_click=self.onRefreshButtonClick,
                )
                .classes("btn btn-primary btn-sm col-1")
                .tooltip("Refresh from CEUR-WS Volume page")
            )
            # Wikidata export button, handled by onWikidataButtonClick
            self.wikidataButton = (
                ui.button(
                    icon="web",
                    on_click=self.onWikidataButtonClick,
                )
                .classes("btn btn-primary btn-sm col-1")
                .tooltip("Export to Wikidata")
            )
        # both HTML elements are created inside the parent but outside the tool bar row
        self.header_view = ui.html()
        self.iframe_view = ui.html().classes("w-full").style("height: 80vh;")

`showVolume(volume)`

show the given volume

Parameters:

Name	Type	Description	Default
`volume(Volume)`		the volume to show	required

Source code in ceurws/volume_view.py

def showVolume(self, volume: Volume):
    """
    show the given volume

    Renders a header (external links, h1 title, acronym link, title,
    description, publication date and submitter) and an iframe pointing
    at the volume url. The Wikidata export button is disabled when a
    Wikidata proceedings record already exists for the volume. The UI is
    built on first use. Any exception is forwarded to the solution's
    exception handler.

    Args:
        volume(Volume): the volume to show
    """
    try:
        self.volume = volume
        if self.volumeToolBar is None:
            # build the UI lazily on the first call
            self.setup_ui()

        wdProc = self.wdSync.getProceedingsForVolume(volume.number)
        # exporting again makes no sense once a proceedings record exists
        self.wikidataButton.disabled = wdProc is not None
        links = ""
        if wdProc is not None:
            # wikidata proceedings link
            candidateLinks = [self.createLink(wdProc["item"], "wikidataitem")]
            # (record key, link text, url prefix) in display order:
            # dblp, k10plus, scholia proceedings, scholia event, scholia event series
            externalLinkSpecs = [
                ("dblpEventId", "dblp", DblpEndpoint.DBLP_EVENT_PREFIX),
                ("ppnId", "k10plus", "https://opac.k10plus.de/DB=2.299/PPNSET?PPN="),
                ("item", "scholia", "https://scholia.toolforge.org/venue/"),
                ("event", "event", "https://scholia.toolforge.org/event/"),
                ("eventSeries", "series", "https://scholia.toolforge.org/event-series/"),
            ]
            for key, text, prefix in externalLinkSpecs:
                candidateLinks.append(self.createExternalLink(wdProc, key, text, prefix, emptyIfNone=True))
            # empty links are dropped, the rest is separated by non-breaking spaces
            links = "&nbsp;".join(link for link in candidateLinks if link)

        # the acronym anchor is closed with </a>
        headerHtml = f"""
{links}<h3 style='font-size: 24px; font-weight: normal; margin-top: 20px; margin-bottom: 10px;'>{volume.h1}</h3>
<a href='{volume.url}'>{volume.acronym}</a>
{volume.title}<br>
{volume.desc}
published: {volume.pubDate}
submitted By: {volume.submittedBy}"""
        iframeHtml = f"""
        <iframe src='{volume.url}' style='width: 100%; height: 80vh; border: none;'></iframe>"""
        self.header_view.content = headerHtml
        self.iframe_view.content = iframeHtml

    except Exception as ex:
        self.solution.handle_exception(ex)

`updateWikidataSpan(qId, volume)`

create a Wikidata Export span

Parameters:

Name	Description	Default
`a()`	ancestor	required
`qId(str)`	wikidata item Q Identifier	required
`volume(Volume)`	the Volume	required

Source code in ceurws/volume_view.py

def updateWikidataSpan(self, qId: str, volume: Volume):
    """
    create a Wikidata Export span

    The span is created on first use and then filled with a link to the
    volume followed by a link to the Wikidata item.

    Args:
        qId(str): wikidata item Q Identifier
        volume(Volume): the Volume
    """
    if self.wdSpan is None:
        self.wdSpan = ui.html()
    # url, number and acronym all come from the volume that was passed in
    volume_link = Link.create(url=volume.url, text=f"{volume.number}:{volume.acronym}")
    wd_url = self.wdSync.itemUrl(qId)
    wd_link = Link.create(url=wd_url, text=f"{qId} ")
    self.wdSpan.content = f"{volume_link}{wd_link}"

`volumeparser`

Created on 2022-08-14

@author: wf

`VolumePageCache`

Cache interface for ceur-ws volume pages

Source code in ceurws/volumeparser.py

class VolumePageCache:
    """
    Cache interface for ceur-ws volume pages

    Each volume page is stored as Vol-<number>.html below cache_location.
    Text pages are stored UTF-8 encoded; pages handed in as bytes are
    stored unchanged.
    """

    # directory holding the cached volume pages
    cache_location: Path = CEURWS.CACHE_DIR / "volumes"

    @classmethod
    def is_cached(cls, number: int) -> bool:
        """
        Check if the volume page of the given volume number is cached
        Args:
            number: volume number of the volume page

        Returns:
            True if the corresponding volume page is cached
        """
        return cls._get_volume_cache_path(number).is_file()

    @classmethod
    def cache(cls, number: int, html: str | bytes):
        """
        cache the volume page corresponding to the given number
        Args:
            number: number of the volume to cache
            html: html of the volume page to cache; None is ignored
        """
        if html is None:
            return
        Path(cls.cache_location).mkdir(parents=True, exist_ok=True)
        filename = cls._get_volume_cache_path(number)
        if isinstance(html, bytes):
            with open(filename, mode="wb") as f:
                f.write(html)
        else:
            # explicit encoding: the file content must not depend on the platform locale
            with open(filename, mode="w", encoding="utf-8") as f:
                f.write(html)

    @classmethod
    def _get_volume_cache_path(cls, number: int) -> Path:
        """
        get the name of the volume cache file
        """
        return cls.cache_location / f"Vol-{number}.html"

    @classmethod
    def get(cls, number: int) -> str | bytes | None:
        """
        Get the cached volume page of the given volume number.
        If the volume page is not cached None is returned.
        Args:
            number: volume number to retrieve

        Returns:
            str: cached volume page
            bytes: if the cached volume page can not be decoded as UTF-8
            None: if no volume with the given number is cached
        """
        if not cls.is_cached(number):
            return None
        filepath = cls._get_volume_cache_path(number)
        try:
            return filepath.read_text(encoding="utf-8")
        except UnicodeDecodeError:
            # fall back to the raw content instead of failing
            return filepath.read_bytes()

    @classmethod
    def delete(cls, number: int):
        """
        Delete the cache corresponding to the given volume number
        Args:
            number: volume number
        """
        if cls.is_cached(number):
            # missing_ok: the file may vanish between the check and the removal
            cls._get_volume_cache_path(number).unlink(missing_ok=True)

`cache(number, html)` `classmethod`

Cache the volume page corresponding to the given number.

Parameters:

- `number`: number of the volume to cache
- `html`: HTML of the volume page to cache

Source code in ceurws/volumeparser.py

@classmethod
def cache(cls, number: int, html: str | bytes):
    """
    cache the volume page corresponding to the given number
    Args:
        number: number of the volume to cache
        html: html of the volume page to cache
    """
    if html is None:
        return
    Path(cls.cache_location).mkdir(parents=True, exist_ok=True)
    filename = cls._get_volume_cache_path(number)
    mode = "w"
    if isinstance(html, bytes):
        mode += "b"
    with open(filename, mode=mode) as f:
        f.write(html)

`delete(number)` `classmethod`

Delete the cache corresponding to the given volume number.

Parameters:

- `number`: volume number

Source code in ceurws/volumeparser.py

@classmethod
def delete(cls, number: int):
    """
    Remove the cached page of a volume; nothing happens if it is not cached.

    Args:
        number: volume number
    """
    if not cls.is_cached(number):
        return
    os.remove(cls._get_volume_cache_path(number))

`get(number)` `classmethod`

Get the cached volume page of the given volume number. If the volume page is not cached, None is returned.

Parameters:

- `number`: volume number to retrieve

Returns:

Name	Type	Description
`str`	`str \| bytes \| None`	cached volume page
`bytes`	`str \| bytes \| None`	if the cached volume page contains encoding errors
`None`	`str \| bytes \| None`	if no volume with the given number is cached

Source code in ceurws/volumeparser.py

@classmethod
def get(cls, number: int) -> str | bytes | None:
    """
    Get the cached volume page of the given volume number.
    If the volume page is not cached None is returned.
    Args:
        number: volume number to retrieve

    Returns:
        str: cached volume page
        bytes: if the cached volume page contains encoding errors
        None: if no volume with the given number is cached
    """
    volume_page: str | bytes | None = None
    if cls.is_cached(number):
        filepath = cls._get_volume_cache_path(number)
        try:
            volume_page = filepath.read_text()
        except UnicodeDecodeError as _ex:
            volume_page = filepath.read_bytes()
    return volume_page

`is_cached(number)` `classmethod`

Check if the volume page of the given volume number is cached.

Parameters:

- `number`: volume number of the volume page

Returns:

Type	Description
`bool`	True if the corresponding volume page is cached

Source code in ceurws/volumeparser.py

@classmethod
def is_cached(cls, number: int) -> bool:
    """
    Tell whether a cache file exists for the given volume number.

    Args:
        number: volume number of the volume page

    Returns:
        True if the cache path of the volume is an existing file
    """
    cache_file = cls._get_volume_cache_path(number)
    return cache_file.is_file()

`VolumeParser`

Bases: Textparser

CEUR-WS VolumeParser

Source code in ceurws/volumeparser.py

class VolumeParser(Textparser):
    """
    CEUR-WS VolumeParser
    """

    def __init__(
        self,
        baseurl: str = "http://ceur-ws.org",
        timeout: float = 3,
        showHtml: bool = False,
        debug: bool = False,
    ):
        """
        Set up the parser together with the web scraper it uses.

        Args:
            baseurl(str): the baseurl of the CEUR-WS website
            timeout(float): the number of seconds to wait; also handed to the WebScrape instance
            showHtml(bool): if True show the HTML code
            debug(bool): if True switch debugging on
        """
        Textparser.__init__(self, debug=debug)
        self.showHtml, self.baseurl, self.timeout = showHtml, baseurl, timeout
        self.scrape = WebScrape(timeout=timeout)

    def volumeUrl(self, volnumber: str | int):
        """
        get the url for the given volume number

        Args:
            volnumber(str): the volume number

        Returns:
            str: url - the url of the volume
        """
        # e.g. http://ceur-ws.org/Vol-2635/
        url = f"{self.baseurl}/Vol-{volnumber}"
        return url

    def getSoup(self, url: str) -> BeautifulSoup | None:
        """
        Fetch and parse a web page via the scraper of this parser.

        The showHtml and debug settings of the parser are passed on.

        Args:
            url: url to parse

        Returns:
            parsed webpage as returned by the scraper
        """
        scraper = self.scrape
        soup = scraper.getSoup(url, showHtml=self.showHtml, debug=self.debug)
        return soup

    def get_volume_soup(self, number: int, use_cache: bool = True) -> BeautifulSoup | None:
        """
        Load the page of a volume and turn it into a soup.

        Args:
            number: volume number of the volume to parse
            use_cache: If True use volume page from cache if present otherwise load from web and cache

        Returns:
            BeautifulSoup: soup of the volume page
            None: if the volume page could not be retrieved
        """
        page = self.get_volume_page(number, recache=not use_cache)
        if page is not None:
            return self.scrape.get_soup_from_string(page, show_html=self.showHtml)
        # nothing to parse; only report it in debug mode
        if self.debug:
            print(f"Vol-{number} could not be retrieved")
        return None

    def get_volume_page(self, number: int, recache: bool = False) -> str | bytes | None:
        """
        Get the html content of the given volume number.
        Retrieves the volume page from cache or from ceur-ws.org
        Caches the volume page if not already cached
        Args:
            number: volume number
            recache: If True update the cache with a new fetch from the web. Otherwise, cache is used if present

        Returns:
            html of volume page or None if the volume page is not found
        """
        if not recache and VolumePageCache.is_cached(number):
            volume_page = VolumePageCache.get(number)
        else:
            url = self.volumeUrl(number)
            volume_page = self.scrape.get_html_from_url(url)
            if volume_page:
                VolumePageCache.cache(number, volume_page)
        return volume_page

    def parse_volume(self, number: int, use_cache: bool = True) -> tuple[dict, BeautifulSoup | None]:
        """
        Parse the volume with the given number.

        The page is obtained via get_volume_soup. The parsed dict is
        passed through check_parsed_dict before it is returned.

        Args:
            number: volume number of the volume to parse
            use_cache: If True use volume page from cache if present otherwise load from web and cache

        Returns:
            tuple: the extracted information (an empty dict if no soup is
                available) and the soup it was extracted from (or None)
        """
        soup = self.get_volume_soup(number, use_cache=use_cache)
        if soup:
            parsed_dict = self.parse_soup(number=str(number), soup=soup)
        else:
            parsed_dict = {}
        self.check_parsed_dict(parsed_dict)
        return parsed_dict, soup

    def check_parsed_dict(self, parsed_dict: dict):
        """
        Add urn check results to parsed_dict in place.

        When a non-empty "urn" entry is present, "urn_check_digit" (the
        checksum calculated for the urn without its last character) and
        "urn_ok" (the result of the checksum check of the full urn) are
        added. Otherwise the dict is left untouched.

        Args:
            parsed_dict: the dict to check and extend
        """
        urn = parsed_dict.get("urn")
        if not urn:
            return
        # everything but the last character is the input of the checksum
        parsed_dict["urn_check_digit"] = URN.calc_urn_checksum(urn[:-1])
        parsed_dict["urn_ok"] = URN.check_urn_checksum(urn)

    def parse(self, url: str) -> dict:
        """
        Fetch the page at the given url and extract the volume information.

        Args:
             url: URL to parse the volume information from

        Returns:
            dict: extracted information; empty if no soup could be obtained
        """
        soup = self.getSoup(url)
        if not soup:
            return {}
        return self.parse_soup(soup=soup)

    def parse_soup(self, soup: BeautifulSoup, number: str | None = None) -> dict:
        """
        parse the volume page data from the given soup

        Values are collected in several passes: the span annotations
        handled by parseRDFa, the description meta tag, the first h1 and
        the first h3. Missing "acronym" and "title" entries are finally
        filled from the other collected values.

        Args:
            number(str): the volume number; only used for the result returned when soup is None
            soup(BeautifulSoup): html parser to extract the content from

        Returns:
            dict: parsed content; {"vol_number": number} if soup is None
        """
        if soup is None:
            return {"vol_number": number}
        # first try RDFa annotations
        scrapedDict = self.parseRDFa(soup)
        for key in scrapedDict:
            scrapedDict[key] = Textparser.sanitize(scrapedDict[key])

        # second part
        for descValue in ["description", "descripton"]:
            # descripton is a typo in the Volume index files not here!
            firstDesc = soup.find("meta", {"name": descValue})
            if isinstance(firstDesc, Tag):
                desc = firstDesc["content"]
                desc = Textparser.sanitize(desc, ["CEUR Workshop Proceedings "])
                scrapedDict["desc"] = desc
                # the first spelling that yields a meta tag wins
                break

        # first H1 has title info
        firstH1 = soup.find("h1")
        if firstH1 is not None:
            h1 = firstH1.text
            h1 = Textparser.sanitize(h1, ['<TD bgcolor="#FFFFFF">'])
            scrapedDict["h1"] = h1
            link = firstH1.find("a")
            # only a short link text (less than 20 characters) is taken as acronym
            if link is not None and isinstance(link, Tag) and len(link.text) < 20:
                acronym = link.text.strip()
                if not acronym:
                    # empty link text: use a short h1 as a whole, else its first word
                    acronym = h1 if len(h1) < 28 else h1.split()[0]

                eventHomepage = link.attrs.get("href")
                scrapedDict["acronym"] = acronym
                scrapedDict["homepage"] = eventHomepage

        # first h3 has loctime
        firstH3 = soup.find("h3")
        if firstH3 is not None:
            h3 = firstH3.text
            h3 = Textparser.sanitize(h3)
            scrapedDict["h3"] = h3

        # fallbacks, applied in this order: acronym from desc, title from h1,
        # acronym from h1
        if self.hasValue(scrapedDict, "desc") and not self.hasValue(scrapedDict, "acronym"):
            scrapedDict["acronym"] = scrapedDict["desc"]
        if self.hasValue(scrapedDict, "h1") and not self.hasValue(scrapedDict, "title"):
            scrapedDict["title"] = scrapedDict["h1"]
        if (
            self.hasValue(scrapedDict, "h1")
            and self.hasValue(scrapedDict, "title")
            and not self.hasValue(scrapedDict, "acronym")
        ):
            scrapedDict["acronym"] = scrapedDict["h1"]
        # NOTE(review): editor parsing is currently disabled here
        # editorsRecords = self.parseEditors(soup)
        # scrapedDict["editors"] = editorsRecords
        return scrapedDict

    def parseEditors(self, soup: BeautifulSoup):
        """
        parse all editor information contained in the given soup
        parse all information between <b> Edited by </b> ... <hr>

        Editors are taken from CEURVOLEDITOR spans inside the h3 that
        follows the start marker. If there are no such spans, the editors
        are derived from the h3 content itself (one editor per h3 of a
        run of h3 elements, or one editor per <br>-separated group).
        Affiliation keys are then resolved via parseAffiliationMap.

        Args:
            soup: volume web page

        Returns:
            dict: editor records by editor name, each with "name",
                "homepage", the affiliation key(s) and, when keys are
                available, the resolved "affiliation" list;
                None if soup is None or no editor section is found
        """
        if soup is None:
            return None
        # find start: a <b> element whose text mentions one of the markers
        start_tags = ["edited by", "program committee"]
        start_elements = []
        for e in soup.find_all("b"):
            for tag in start_tags:
                if tag in e.text.lower():
                    start_elements.append(e)
        if not start_elements:
            return None
        edited_by = start_elements[0]
        editor_h3 = edited_by.find_next("h3")
        editor_records: dict[str, dict] = dict()
        if editor_h3 is None:
            return None
        editor_spans = editor_h3.find_all(attrs={"class": "CEURVOLEDITOR"})
        if editor_spans is not None and len(editor_spans) > 0:
            for editor_span in editor_spans:
                editor_name = editor_span.text
                editor = {"name": editor_name}
                if editor_span.parent.name == "a":
                    # the span is wrapped in a homepage link; the keys follow the link
                    homepage = editor_span.parent.attrs.get("href", None)
                    editor["homepage"] = homepage
                    if editor_span.parent.next_sibling is not None:
                        affiliation_keys = editor_span.parent.next_sibling.text.strip()
                    else:
                        affiliation_keys = None
                else:
                    if editor_span.next_sibling is not None:
                        affiliation_keys = editor_span.next_sibling.text.strip()
                    else:
                        affiliation_keys = None
                if affiliation_keys is None or affiliation_keys == "":
                    # no keys directly after the editor: try the next <sup>
                    sup = editor_span.find_next("sup")
                    if sup is not None:
                        affiliation_keys = sup.text.strip()
                editor["affiliation_keys"] = affiliation_keys
                editor_records[editor_name] = editor
        else:
            editor_elements = []
            group_elements: list[PageElement] = []
            if (
                editor_h3.next_sibling
                and editor_h3.next_sibling.next_sibling
                and editor_h3.next_sibling.next_sibling.name == "h3"
            ):
                # a run of h3 elements: one editor per h3; the walk stops
                # when the siblings run out instead of dereferencing None
                while (
                    editor_h3.next_sibling is not None
                    and editor_h3.next_sibling.next_sibling is not None
                    and editor_h3.next_sibling.next_sibling.name == "h3"
                    and editor_h3.text.strip() != ""
                ):
                    editor_elements.append(editor_h3.contents)
                    editor_h3 = editor_h3.next_sibling.next_sibling
            else:
                # a single h3: editors are separated by <br>
                for child in editor_h3.childGenerator():
                    if child.name == "br":
                        editor_elements.append(group_elements)
                        group_elements = []
                    else:
                        group_elements.append(child)
            for elements in editor_elements:
                text = "".join([e.text for e in elements]).strip()
                # the last blank-separated token is taken as the affiliation key
                affiliation_key = text.split(" ")[-1]
                editor_name = text[: -len(affiliation_key)]
                links = [e for e in elements if e.name == "a"]
                homepage = links[0].attrs.get("href", None) if len(links) > 0 else None
                editor = {
                    "name": editor_name,
                    "homepage": homepage,
                    "affiliation_key": affiliation_key,
                }
                editor_records[editor_name] = editor
        affiliation_map = self.parseAffiliationMap(editor_h3.next_sibling)
        for editor_record in editor_records.values():
            # span based records carry "affiliation_keys", the fallback
            # records carry "affiliation_key"; both are resolved
            editor_keys = editor_record.get("affiliation_keys")
            if editor_keys is None:
                editor_keys = editor_record.get("affiliation_key")
            if editor_keys is None:
                continue
            editor_affiliations = []
            for key in re.split("[, ]", editor_keys):
                # strip before the lookup so test and access use the same key
                key = key.strip()
                if key in affiliation_map:
                    editor_affiliations.append(affiliation_map[key])
            editor_record["affiliation"] = editor_affiliations
        return editor_records

    def parseAffiliationMap(self, start: PageElement) -> dict:
        """
        Parse out the affiliations and their reference key

        The elements from start up to the next <hr> are split into groups
        at every <br>/<hr>. The first element of a group supplies the key,
        the remaining text elements the affiliation name and the <a>
        elements the homepages.

        Args:
            start: element at which the affiliation section begins; may be None

        Returns:
            dict: affiliation key -> {"name": str, "homepage": list of hrefs};
                empty if start is None
        """
        if start is None:
            return dict()
        end = start.find_next("hr")
        affiliations_elements = []
        group_elements: list[PageElement] = []
        if isinstance(start.previous, Tag | NavigableString):
            # iterating from the element before start, so that start itself is visited
            for element in start.previous.nextGenerator():
                if isinstance(element, Tag | NavigableString) and element.name in ["br", "hr"]:
                    # a line break or rule closes the current group
                    affiliations_elements.append(group_elements)
                    group_elements = []
                elif isinstance(element, NavigableString) and element.text.strip() == "":
                    # whitespace-only text is ignored
                    pass
                elif isinstance(element, Tag | NavigableString) and element.name == "h3":
                    # elements inside the element are included through the nextGenerator
                    pass
                else:
                    group_elements.append(element)
                # NOTE(review): if no <hr> follows, end is None and the walk
                # presumably runs to the end of the document - confirm
                if element == end:
                    break
        affiliations_elements = [x for x in affiliations_elements if x != []]
        affiliation_map = dict()
        for elements in affiliations_elements:
            if isinstance(elements[0], NavigableString) and " " in elements[0].text.strip():
                # key and affiliation text share one string: split off the first token as key
                text_containing_key = elements[0].text.strip()
                key = text_containing_key.split(" ")[0]
                key_element = NavigableString(value=key)
                text_element = NavigableString(value=text_containing_key[len(key) :])
                elements = [key_element, text_element, *elements[1:]]
            key = elements[0].text.strip()
            text_elements = []
            link_elements = []
            for element in elements[1:]:
                if isinstance(element, NavigableString):
                    text_elements.append(element)
                elif isinstance(element, Tag | NavigableString) and element.name == "a":
                    link_elements.append(element)
            affiliation = "".join([elem.text for elem in text_elements])
            affiliation = affiliation.replace("\n", "").replace("\t", "").replace("\r", "")
            if affiliation.startswith(key):
                # drop a key that is repeated at the start of the affiliation text
                affiliation = affiliation[len(key) :]
            homepages = []
            for element in link_elements:
                if hasattr(element, "attrs") and element.attrs.get("href", None) is not None:
                    homepage = element.attrs.get("href", None)
                    homepages.append(homepage)
            if key is not None and key != "":
                # keys are stored without leading/trailing dots
                key = key.strip(".")
                affiliation_map[key] = {
                    "name": affiliation,
                    "homepage": homepages,
                }
        return affiliation_map

    def parseRDFa(self, soup: BeautifulSoup) -> dict:
        """
        Extract the annotated volume metadata from the given soup.

        One ScrapeDescription per result key is handed to the scraper;
        each one looks at a span whose class attribute has the listed value.

        Args:
            soup: html parser to extract the content from

        Returns:
            dict: dict with the extracted content
        """
        # result key -> value of the class attribute of the span to scrape
        span_classes = {
            "volume_number": "CEURVOLNR",
            "urn": "CEURURN",
            "year": "CEURPUBYEAR",
            "ceurpubdate": "CEURPUBDATE",
            "acronym": "CEURVOLACRONYM",
            "voltitle": "CEURVOLTITLE",
            "title": "CEURFULLTITLE",
            "loctime": "CEURLOCTIME",
            "colocated": "CEURCOLOCATED",
        }
        descriptions = [
            ScrapeDescription(key=key, tag="span", attribute="class", value=class_value)
            for key, class_value in span_classes.items()
        ]
        return self.scrape.parseWithScrapeDescription(soup, descriptions)

`__init__(baseurl='http://ceur-ws.org', timeout=3, showHtml=False, debug=False)`

Constructor

Parameters:

Name	Description	Default
`baseurl(str)`	the baseurl of the CEUR-WS website	`'http://ceur-ws.org'`
`timeout(float)`	the number of seconds to wait	`3`
`showHtml(bool)`	if True show the HTML code	`False`
`debug(bool)`	if True switch debugging on	`False`

Source code in ceurws/volumeparser.py

def __init__(
    self,
    baseurl: str = "http://ceur-ws.org",
    timeout: float = 3,
    showHtml: bool = False,
    debug: bool = False,
):
    """
    Set up the parser together with the web scraper it uses.

    Args:
        baseurl(str): the baseurl of the CEUR-WS website
        timeout(float): the number of seconds to wait; also handed to the WebScrape instance
        showHtml(bool): if True show the HTML code
        debug(bool): if True switch debugging on
    """
    Textparser.__init__(self, debug=debug)
    self.showHtml, self.baseurl, self.timeout = showHtml, baseurl, timeout
    self.scrape = WebScrape(timeout=timeout)

`check_parsed_dict(parsed_dict)`

check parsed_dict content e.g. urn check digit

Source code in ceurws/volumeparser.py

def check_parsed_dict(self, parsed_dict: dict):
    """
    Add urn check results to parsed_dict in place.

    When a non-empty "urn" entry is present, "urn_check_digit" (the
    checksum calculated for the urn without its last character) and
    "urn_ok" (the result of the checksum check of the full urn) are
    added. Otherwise the dict is left untouched.

    Args:
        parsed_dict: the dict to check and extend
    """
    urn = parsed_dict.get("urn")
    if not urn:
        return
    # everything but the last character is the input of the checksum
    parsed_dict["urn_check_digit"] = URN.calc_urn_checksum(urn[:-1])
    parsed_dict["urn_ok"] = URN.check_urn_checksum(urn)

`getSoup(url)`

Get the BeautifulSoup parser for the given url.

Parameters:

- `url`: url to parse

Returns:

Type	Description
`BeautifulSoup \| None`	parsed webpage

Source code in ceurws/volumeparser.py

def getSoup(self, url: str) -> BeautifulSoup | None:
    """
    Fetch and parse a web page via the scraper of this parser.

    The showHtml and debug settings of the parser are passed on.

    Args:
        url: url to parse

    Returns:
        parsed webpage as returned by the scraper
    """
    scraper = self.scrape
    soup = scraper.getSoup(url, showHtml=self.showHtml, debug=self.debug)
    return soup

`get_volume_page(number, recache=False)`

Get the html content of the given volume number. Retrieves the volume page from the cache or from ceur-ws.org and caches the volume page if it is not already cached.

Parameters:

- `number`: volume number
- `recache`: If True, update the cache with a new fetch from the web. Otherwise, the cache is used if present.

Returns:

Type	Description
`str \| bytes \| None`	html of volume page or None if the volume page is not found

Source code in ceurws/volumeparser.py

def get_volume_page(self, number: int, recache: bool = False) -> str | bytes | None:
    """
    Get the html content of the given volume number.
    Retrieves the volume page from cache or from ceur-ws.org
    Caches the volume page if not already cached
    Args:
        number: volume number
        recache: If True update the cache with a new fetch from the web. Otherwise, cache is used if present

    Returns:
        html of volume page or None if the volume page is not found
    """
    if not recache and VolumePageCache.is_cached(number):
        volume_page = VolumePageCache.get(number)
    else:
        url = self.volumeUrl(number)
        volume_page = self.scrape.get_html_from_url(url)
        if volume_page:
            VolumePageCache.cache(number, volume_page)
    return volume_page

`get_volume_soup(number, use_cache=True)`

Get the soup of the volume page for the given volume number.

Parameters:

- `number`: volume number of the volume to parse
- `use_cache`: If True, use the volume page from the cache if present; otherwise load it from the web and cache it.

Returns:

Name	Type	Description
`BeautifulSoup`	`BeautifulSoup \| None`	soup of the volume page
`None`	`BeautifulSoup \| None`	soup can not be loaded from cache or from web

Source code in ceurws/volumeparser.py

def get_volume_soup(self, number: int, use_cache: bool = True) -> BeautifulSoup | None:
    """
    Load the page of a volume and turn it into a soup.

    Args:
        number: volume number of the volume to parse
        use_cache: If True use volume page from cache if present otherwise load from web and cache

    Returns:
        BeautifulSoup: soup of the volume page
        None: if the volume page could not be retrieved
    """
    page = self.get_volume_page(number, recache=not use_cache)
    if page is not None:
        return self.scrape.get_soup_from_string(page, show_html=self.showHtml)
    # nothing to parse; only report it in debug mode
    if self.debug:
        print(f"Vol-{number} could not be retrieved")
    return None

`parse(url)`

Parse the given url.

Parameters:

- `url`: URL to parse the volume information from

Returns:

Name	Type	Description
`dict`	`dict`	extracted information

Source code in ceurws/volumeparser.py

def parse(self, url: str) -> dict:
    """
    Fetch the page at the given url and extract the volume information.

    Args:
         url: URL to parse the volume information from

    Returns:
        dict: extracted information; empty if no soup could be obtained
    """
    soup = self.getSoup(url)
    if not soup:
        return {}
    return self.parse_soup(soup=soup)

`parseAffiliationMap(start)`

Parse out the affiliations and their reference key Args: start:

Returns:

Type	Description
`dict`	dict

Source code in ceurws/volumeparser.py

def parseAffiliationMap(self, start: PageElement) -> dict:
    """
    Parse out the affiliations and their reference key.

    The elements delivered by ``start.previous.nextGenerator()`` are collected
    until the ``<hr>`` found via ``start.find_next("hr")`` is reached. Every
    ``<br>``/``<hr>`` closes a group and each non-empty group is turned into
    one affiliation entry.

    Args:
        start: element at which the affiliation section begins; None yields an empty dict

    Returns:
        dict: affiliation key (with surrounding "." removed) -> dict with the entries
            "name" (affiliation text) and "homepage" (list of href values of the links in the group)
    """
    if start is None:
        return dict()
    # the affiliation section ends at the next horizontal rule
    end = start.find_next("hr")
    affiliations_elements = []
    group_elements: list[PageElement] = []
    if isinstance(start.previous, Tag | NavigableString):
        for element in start.previous.nextGenerator():
            if isinstance(element, Tag | NavigableString) and element.name in ["br", "hr"]:
                # a line break / rule terminates the current group
                affiliations_elements.append(group_elements)
                group_elements = []
            elif isinstance(element, NavigableString) and element.text.strip() == "":
                # whitespace-only strings are ignored
                pass
            elif isinstance(element, Tag | NavigableString) and element.name == "h3":
                # elements inside the element are included through the nextGenerator
                pass
            else:
                group_elements.append(element)
            if element == end:
                # closing <hr> reached (its group was flushed above)
                break
    # drop groups that did not collect any element
    affiliations_elements = [x for x in affiliations_elements if x != []]
    affiliation_map = dict()
    for elements in affiliations_elements:
        if isinstance(elements[0], NavigableString) and " " in elements[0].text.strip():
            # key and text share one string: split it at the first blank into
            # a separate key element and a text element
            text_containing_key = elements[0].text.strip()
            key = text_containing_key.split(" ")[0]
            key_element = NavigableString(value=key)
            text_element = NavigableString(value=text_containing_key[len(key) :])
            elements = [key_element, text_element, *elements[1:]]
        # the first element of a group is the reference key
        key = elements[0].text.strip()
        text_elements = []
        link_elements = []
        # the remaining elements are either affiliation text or homepage links
        for element in elements[1:]:
            if isinstance(element, NavigableString):
                text_elements.append(element)
            elif isinstance(element, Tag | NavigableString) and element.name == "a":
                link_elements.append(element)
        affiliation = "".join([elem.text for elem in text_elements])
        # remove line breaks and tabs from the affiliation text
        affiliation = affiliation.replace("\n", "").replace("\t", "").replace("\r", "")
        if affiliation.startswith(key):
            # the text repeats the key at its beginning - cut it off
            affiliation = affiliation[len(key) :]
        homepages = []
        for element in link_elements:
            if hasattr(element, "attrs") and element.attrs.get("href", None) is not None:
                homepage = element.attrs.get("href", None)
                homepages.append(homepage)
        if key is not None and key != "":
            # keys are stored without leading/trailing dots
            key = key.strip(".")
            affiliation_map[key] = {
                "name": affiliation,
                "homepage": homepages,
            }
    return affiliation_map

`parseEditors(soup)`

Parse all editor information contained in the given soup, i.e. all information between `<b>Edited by</b>` and the following `<hr>`.

Args: soup: volume web page

Source code in ceurws/volumeparser.py

def parseEditors(self, soup: BeautifulSoup):
    """
    parse all editor information contained in the given soup
    parse all information between <b> Edited by </b> ... <hr>

    The editor section starts at the first <b> element whose text contains
    "edited by" or "program committee". The editors are read from the <h3>
    following it, the affiliations are parsed from the sibling following
    that <h3> and are assigned to the editors via their affiliation keys.

    Args:
        soup: volume web page

    Returns:
        dict: editor name -> editor record with "name", "homepage" (if found),
            the affiliation key(s) and the resolved "affiliation" list
        None: if soup is None or no editor section / editor <h3> was found
    """
    if soup is None:
        return None
    # find start
    start_tags = ("edited by", "program committee")
    edited_by = next(
        (e for e in soup.find_all("b") if any(tag in e.text.lower() for tag in start_tags)),
        None,
    )
    if edited_by is None:
        return None
    editor_h3 = edited_by.find_next("h3")
    if editor_h3 is None:
        return None
    editor_records: dict[str, dict] = {}
    editor_spans = editor_h3.find_all(attrs={"class": "CEURVOLEDITOR"})
    if editor_spans is not None and len(editor_spans) > 0:
        # semantically annotated editors
        for editor_span in editor_spans:
            editor_name = editor_span.text
            editor = {"name": editor_name}
            if editor_span.parent.name == "a":
                # the span is wrapped by the homepage link; the keys follow the link
                editor["homepage"] = editor_span.parent.attrs.get("href", None)
                key_source = editor_span.parent.next_sibling
            else:
                key_source = editor_span.next_sibling
            affiliation_keys = key_source.text.strip() if key_source is not None else None
            if affiliation_keys is None or affiliation_keys == "":
                # fall back to the next <sup> element
                sup = editor_span.find_next("sup")
                if sup is not None:
                    affiliation_keys = sup.text.strip()
            editor["affiliation_keys"] = affiliation_keys
            editor_records[editor_name] = editor
    else:
        # no annotated editors: derive them from the plain <h3> content
        editor_elements = []
        group_elements = []
        if (
            editor_h3.next_sibling
            and editor_h3.next_sibling.next_sibling
            and editor_h3.next_sibling.next_sibling.name == "h3"
        ):
            # one <h3> per editor: follow the chain of <h3> elements
            while True:
                sibling = editor_h3.next_sibling
                # guard against the end of the sibling chain instead of
                # failing with an AttributeError on None
                following = sibling.next_sibling if sibling is not None else None
                if following is None or following.name != "h3" or editor_h3.text.strip() == "":
                    break
                editor_elements.append(editor_h3.contents)
                editor_h3 = following
        else:
            # all editors in one <h3>, each terminated by a <br>
            for child in editor_h3.childGenerator():
                if child.name == "br":
                    editor_elements.append(group_elements)
                    group_elements = []
                else:
                    group_elements.append(child)
        for elements in editor_elements:
            text = "".join([e.text for e in elements]).strip()
            # the last blank separated token is taken as the affiliation key
            affiliation_key = text.split(" ")[-1]
            editor_name = text[: -len(affiliation_key)]
            links = [e for e in elements if e.name == "a"]
            homepage = links[0].attrs.get("href", None) if len(links) > 0 else None
            editor = {
                "name": editor_name,
                "homepage": homepage,
                "affiliation_key": affiliation_key,
            }
            editor_records[editor_name] = editor
    affiliation_map = self.parseAffiliationMap(editor_h3.next_sibling)
    for editor_record in editor_records.values():
        # records of annotated editors carry "affiliation_keys", records
        # derived from the plain <h3> content carry "affiliation_key" -
        # both have to be evaluated to resolve the affiliations
        if "affiliation_keys" in editor_record:
            editor_keys = editor_record["affiliation_keys"]
        else:
            editor_keys = editor_record.get("affiliation_key", "")
        if editor_keys is not None:
            editor_affiliations = []
            for key in re.split("[, ]", editor_keys):
                # test and lookup have to use the same (stripped) key
                key = key.strip()
                if key in affiliation_map:
                    editor_affiliations.append(affiliation_map.get(key))
            editor_record["affiliation"] = editor_affiliations
    return editor_records

`parseRDFa(soup)`

tries to parse rdfa content from the given soup Args: soup: html parser to extract the content from

Returns:

Name	Type	Description
`dict`	`dict`	dict with the extracted content

Source code in ceurws/volumeparser.py

def parseRDFa(self, soup: BeautifulSoup) -> dict:
    """
    Try to extract the RDFa annotated content from the given soup.

    Args:
        soup: html parser to extract the content from

    Returns:
        dict: dict with the extracted content
    """
    # result key -> value of the class attribute of the <span> to scrape
    span_class_by_key = {
        "volume_number": "CEURVOLNR",
        "urn": "CEURURN",
        "year": "CEURPUBYEAR",
        "ceurpubdate": "CEURPUBDATE",
        "acronym": "CEURVOLACRONYM",
        "voltitle": "CEURVOLTITLE",
        "title": "CEURFULLTITLE",
        "loctime": "CEURLOCTIME",
        "colocated": "CEURCOLOCATED",
    }
    scrapeDescr = [
        ScrapeDescription(key=key, tag="span", attribute="class", value=span_class)
        for key, span_class in span_class_by_key.items()
    ]
    return self.scrape.parseWithScrapeDescription(soup, scrapeDescr)

`parse_soup(soup, number=None)`

parse the volume page data from the given soup

Parameters:

Name	Type	Description	Default
`number(str)`		the volume number	required
`soup(BeautifulSoup)`		html parser to extract the content from	required

Returns:

Name	Type	Description
`dict`	`dict`	parsed content

Source code in ceurws/volumeparser.py

def parse_soup(self, soup: BeautifulSoup, number: str | None = None) -> dict:
    """
    Extract the volume page data from the given soup.

    The RDFa annotations are evaluated first. Afterwards the description
    meta tag ("desc"), the first <h1> ("h1", "acronym", "homepage") and the
    first <h3> ("h3") are evaluated and missing "acronym" / "title" values
    are filled from "desc" and "h1".

    Args:
        soup(BeautifulSoup): html parser to extract the content from
        number(str): the volume number - only used for the result if soup is None

    Returns:
        dict: parsed content
    """
    if soup is None:
        return {"vol_number": number}

    # first try RDFa annotations - every scraped value gets sanitized
    scrapedDict = self.parseRDFa(soup)
    for key, raw_value in scrapedDict.items():
        scrapedDict[key] = Textparser.sanitize(raw_value)

    # second part: description meta tag
    # "descripton" is a typo in the Volume index files not here!
    for meta_name in ("description", "descripton"):
        meta_tag = soup.find("meta", {"name": meta_name})
        if not isinstance(meta_tag, Tag):
            continue
        scrapedDict["desc"] = Textparser.sanitize(meta_tag["content"], ["CEUR Workshop Proceedings "])
        break

    # first H1 has title info
    heading = soup.find("h1")
    if heading is not None:
        h1 = Textparser.sanitize(heading.text, ['<TD bgcolor="#FFFFFF">'])
        scrapedDict["h1"] = h1
        link = heading.find("a")
        if link is not None and isinstance(link, Tag) and len(link.text) < 20:
            # a short link text in the heading is taken as the acronym
            acronym = link.text.strip()
            if not acronym:
                if len(h1) < 28:
                    acronym = h1
                else:
                    acronym = h1.split()[0]
            scrapedDict["acronym"] = acronym
            scrapedDict["homepage"] = link.attrs.get("href")

    # first h3 has loctime
    sub_heading = soup.find("h3")
    if sub_heading is not None:
        scrapedDict["h3"] = Textparser.sanitize(sub_heading.text)

    # fill gaps: acronym from desc, title from h1, acronym from h1
    has_value = self.hasValue
    if has_value(scrapedDict, "desc") and not has_value(scrapedDict, "acronym"):
        scrapedDict["acronym"] = scrapedDict["desc"]
    if has_value(scrapedDict, "h1"):
        if not has_value(scrapedDict, "title"):
            scrapedDict["title"] = scrapedDict["h1"]
        if has_value(scrapedDict, "title") and not has_value(scrapedDict, "acronym"):
            scrapedDict["acronym"] = scrapedDict["h1"]
    # editor records (parseEditors) are currently not added to the result
    return scrapedDict

`parse_volume(number, use_cache=True)`

Parse the given volume. The volume pages are cached at ~/.ceurws/volumes.

Parameters:

Name	Type	Description	Default
`number`	`int`	volume number of the volume to parse	required
`use_cache`	`bool`	If True use volume page from cache if present otherwise load from web and cache	`True`

Returns:

Name	Type	Description
`dict`	`tuple[dict, BeautifulSoup \| None]`	extracted information together with the soup of the volume page (`None` if the page could not be loaded)

Source code in ceurws/volumeparser.py

def parse_volume(self, number: int, use_cache: bool = True) -> tuple[dict, BeautifulSoup | None]:
    """
    Parse the volume with the given number.

    caches the volume pages at ~/.ceurws/volumes

    Args:
        number: volume number of the volume to parse
        use_cache: If True use volume page from cache if present otherwise load from web and cache

    Returns:
        tuple: the extracted information (empty dict if there is no soup) and
            the soup of the volume page as delivered by get_volume_soup
    """
    soup = self.get_volume_soup(number, use_cache=use_cache)
    if soup:
        parsed_dict = self.parse_soup(number=str(number), soup=soup)
    else:
        parsed_dict = {}
    # the check is applied to the empty result as well
    self.check_parsed_dict(parsed_dict)
    return parsed_dict, soup

`volumeUrl(volnumber)`

get the url for the given volume number

Parameters:

Name	Type	Description	Default
`volnumber(str)`		the volume number	required

Returns:

Name	Type	Description
`str`		url - the url of the volume

Source code in ceurws/volumeparser.py

def volumeUrl(self, volnumber: str | int):
    """
    Build the url of the volume with the given number from the base url.

    Args:
        volnumber(str): the volume number

    Returns:
        str: url - the url of the volume e.g. http://ceur-ws.org/Vol-2635/
    """
    return f"{self.baseurl}/Vol-{volnumber}"

`webserver`

Created on 2024-02-22

@author: wf

`CeurWsSolution`

Bases: InputWebSolution

CEUR-WS Volume browser solution

Source code in ceurws/webserver.py

class CeurWsSolution(InputWebSolution):
    """
    CEUR-WS Volume browser solution

    provides the home, volumes and wikidata sync pages
    """

    def __init__(self, webserver: CeurWsWebServer, client: Client):
        """
        Initialize the solution via the constructor of the base solution
        and take over the wdSync of the webserver

        Args:
            webserver (CeurWsWebServer): The webserver instance associated with this context.
            client (Client): The client instance this context is associated with.
        """
        super().__init__(webserver, client)
        self.wdSync = self.webserver.wdSync

    def configure_menu(self):
        """
        add the volumes and wikidata link buttons to the menu of the base solution
        """
        InputWebSolution.configure_menu(self)
        self.link_button(name="volumes", icon_name="table", target="/volumes", new_tab=False)
        self.link_button(name="wikidata", icon_name="cloud_sync", target="/wikidatasync", new_tab=False)

    def prepare_ui(self):
        """
        prepare the user interface
        """
        InputWebSolution.prepare_ui(self)
        # add_css() is not called here: it does not work as expected ...

    def add_css(self):
        """
        serve the css directory at the /css route and link all its .css files
        in the head of the HTML document
        """
        css_directory_path = Path(__file__).parent.parent / "css"
        # nothing to serve if the directory does not exist
        if not css_directory_path.is_dir():
            return
        app.add_static_files("/css", str(css_directory_path))
        for css_file in os.listdir(css_directory_path):
            if not css_file.endswith(".css"):
                continue
            ui.add_head_html(f'<link rel="stylesheet" type="text/css" href="/css/{css_file}">')

    async def wikidatasync(self):
        """
        show the wikidata sync table
        """

        def show_wikidata_view():
            self.wikidata_view = WikidataView(self, self.container)

        await self.setup_content_div(show_wikidata_view)

    async def volumes(self):
        """
        show the volumes table
        """

        def show_volume_list():
            self.volume_list_view = VolumeListView(self, self.container)

        await self.setup_content_div(show_volume_list)

    async def home(self):
        """
        home page: volume selection with the view of the selected volume
        """

        def show_home():
            try:
                with self.container:
                    with ui.row() as self.select_container:
                        volume_select = self.add_select(
                            "Volume",
                            selection=self.wdSync.volumeOptions,
                            with_input=True,
                            on_change=self.volume_selected,
                        )
                        self.volume_select = volume_select.props("size=120")
                    self.volume_view = VolumeView(self, self.container)
            except Exception as ex:
                self.handle_exception(ex)

        await self.setup_content_div(show_home)

    async def volume_selected(self, args: ValueChangeEventArguments):
        """
        when a volume is selected show the details in the Volume View

        Args:
            args: change event - its value is the selected volume number
        """
        selected_volume = self.wdSync.volumesByNumber[args.value]
        self.volume_view.showVolume(selected_volume)

`__init__(webserver, client)`

Initialize the solution

Calls the constructor of the base solution Args: webserver (CeurWsWebServer): The webserver instance associated with this context. client (Client): The client instance this context is associated with.

Source code in ceurws/webserver.py

def __init__(self, webserver: CeurWsWebServer, client: Client):
    """
    Initialize the solution

    Calls the constructor of the base solution and takes over the
    wdSync attribute of the webserver.

    Args:
        webserver (CeurWsWebServer): The webserver instance associated with this context.
        client (Client): The client instance this context is associated with.
    """
    super().__init__(webserver, client)  # Call to the superclass constructor
    # share the wdSync instance held by the webserver
    self.wdSync = self.webserver.wdSync

`home()` `async`

home page selection

Source code in ceurws/webserver.py

async def home(self):
    """
    home page: volume selection with the view of the selected volume
    """

    def show_home():
        # any error while building the page is handed to handle_exception
        try:
            with self.container:
                with ui.row() as self.select_container:
                    volume_select = self.add_select(
                        "Volume",
                        selection=self.wdSync.volumeOptions,
                        with_input=True,
                        on_change=self.volume_selected,
                    )
                    self.volume_select = volume_select.props("size=120")
                self.volume_view = VolumeView(self, self.container)
        except Exception as ex:
            self.handle_exception(ex)

    await self.setup_content_div(show_home)

`prepare_ui()`

prepare the user interface

Source code in ceurws/webserver.py

def prepare_ui(self):
    """
    prepare the user interface

    delegates to the prepare_ui implementation of InputWebSolution
    """
    InputWebSolution.prepare_ui(self)

`volume_selected(args)` `async`

when a volume is selected show the details in the Volume View

Source code in ceurws/webserver.py

async def volume_selected(self, args: ValueChangeEventArguments):
    """
    when a volume is selected show the details in the Volume View

    Args:
        args: change event - its value is the selected volume number
    """
    selected_volume = self.wdSync.volumesByNumber[args.value]
    self.volume_view.showVolume(selected_volume)

`volumes()` `async`

show the volumes table

Source code in ceurws/webserver.py

async def volumes(self):
    """
    show the volumes table
    """

    def show_volume_list():
        # the view is created by the content div setup
        self.volume_list_view = VolumeListView(self, self.container)

    await self.setup_content_div(show_volume_list)

`wikidatasync()` `async`

show the wikidata sync table

Source code in ceurws/webserver.py

async def wikidatasync(self):
    """
    show the wikidata sync table
    """

    def show_wikidata_view():
        # the view is created by the content div setup
        self.wikidata_view = WikidataView(self, self.container)

    await self.setup_content_div(show_wikidata_view)

`CeurWsWebServer`

Bases: InputWebserver

webserver

Source code in ceurws/webserver.py

class CeurWsWebServer(InputWebserver):
    """
    webserver of the CEUR-WS volume browser

    registers the ui pages and the fastapi JSON endpoints
    """

    @classmethod
    def get_config(cls) -> WebserverConfig:
        """
        get the webserver configuration with CeurWsSolution as solution class
        """
        copy_right = "(c)2023-2024 Wolfgang Fahl"
        config = WebserverConfig(
            copy_right=copy_right,
            version=Version(),
            default_port=9998,
            timeout=10.0,
            short_name="spf",
        )
        server_config = WebserverConfig.get(config)
        server_config.solution_class = CeurWsSolution
        return server_config

    def __init__(self):
        """
        constructor

        configures the webserver and registers all pages and endpoints
        """
        InputWebserver.__init__(self, config=CeurWsWebServer.get_config())

        @ui.page("/volumes")
        async def show_volumes(client: Client):
            return await self.page(client, CeurWsSolution.volumes)

        @ui.page("/volume/{volnumber}")
        async def show_volume_page(client: Client, volnumber):
            # the parameter name has to match the {volnumber} placeholder of
            # the route - otherwise it is not bound to the path segment
            # NOTE(review): confirm that CeurWsSolution.volumePage exists
            return await self.page(client, CeurWsSolution.volumePage, volnumber)

        @ui.page("/wikidatasync")
        async def wikidatasync(client: Client):
            return await self.page(client, CeurWsSolution.wikidatasync)

        @app.get("/volumes.json")
        async def volumes():
            """
            direct fastapi return of volumes
            """
            volumeList = self.wdSync.vm.getList()
            return volumeList

        @app.get("/proceedings.json")
        async def proceedings():
            """
            direct fastapi return of proceedings
            """
            proceedingsList = self.wdSync.loadProceedingsFromCache()
            return ORJSONResponse(proceedingsList)

        @app.get("/papers.json")
        async def papers():
            """
            direct fastapi return of papers
            """
            paperList = self.wdSync.pm.getList()
            return paperList

        @app.get(
            "/papers_dblp.json",
            tags=["dblp complete dataset"],
            # response_model= List[DblpPaper]
        )
        async def papers_dblp():
            """
            direct fastapi return of paper information from dblp
            """
            self.wdSync.dblpEndpoint.dblp_papers.load()
            papers = self.wdSync.dblpEndpoint.dblp_papers.papers
            # convert via the json representation of the papers
            lod = [orjson.loads(p.to_json()) for p in papers]
            return ORJSONResponse(lod)

        @app.get(
            "/authors_dblp.json",
            tags=["dblp complete dataset"],
            # response_model=List[DblpAuthor]
        )
        async def authors_papers_dblp():
            """
            direct fastapi return of author information from dblp
            """
            authors = self.wdSync.dblpEndpoint.get_all_ceur_authors()
            return ORJSONResponse(content=authors)

        @app.get("/dblp/papers", tags=["dblp complete dataset"])
        async def dblp_papers(limit: int = 100, offset: int = 0) -> list[DblpPaper]:
            """
            Get ceur-ws papers from dblp
            Args:
                limit: max number of returned papers
                offset: number of papers to skip

            Returns:
                the papers of the requested page
            """
            papers = self.wdSync.dblpEndpoint.get_all_ceur_papers()
            # limit is a count, not an end index
            return papers[offset : offset + limit]

        @app.get("/dblp/editors", tags=["dblp complete dataset"])
        async def dblp_editors(limit: int = 100, offset: int = 0) -> list[DblpScholar]:
            """
            Get ceur-ws volume editors from dblp
            Args:
                limit: max number of returned editors
                offset: number of editors to skip

            Returns:
                the editors of the requested page
            """
            editors = self.wdSync.dblpEndpoint.get_all_ceur_editors()
            # limit is a count, not an end index
            return editors[offset : offset + limit]

        @app.get("/dblp/volumes", tags=["dblp complete dataset"])
        async def dblp_volumes(limit: int = 100, offset: int = 0) -> list[DblpProceeding]:
            """
            Get ceur-ws volumes from dblp
            Args:
                limit: max number of returned volumes
                offset: number of volumes to skip

            Returns:
                the proceedings of the requested page
            """
            proceedings = self.wdSync.dblpEndpoint.get_all_ceur_proceedings()
            # limit is a count, not an end index
            return proceedings[offset : offset + limit]

        @app.get("/dblp/volume/{volume_number}", tags=["dblp"])
        async def dblp_volume(volume_number: int) -> DblpProceeding:
            """
            Get ceur-ws volume from dblp
            """
            try:
                proceeding = self.wdSync.dblpEndpoint.get_ceur_proceeding(volume_number)
            except Exception as e:
                raise HTTPException(status_code=404, detail=str(e)) from e
            if proceeding:
                return proceeding
            else:
                raise HTTPException(status_code=404, detail="Volume not found")

        @app.get("/dblp/volume/{volume_number}/editor", tags=["dblp"])
        async def dblp_volume_editors(volume_number: int) -> list[DblpScholar]:
            """
            Get ceur-ws volume editors from dblp
            """
            try:
                proceeding = self.wdSync.dblpEndpoint.get_ceur_proceeding(volume_number)
            except Exception as e:
                raise HTTPException(status_code=404, detail=str(e)) from e
            if proceeding:
                return proceeding.editors
            else:
                raise HTTPException(status_code=404, detail="Volume not found")

        @app.get("/dblp/volume/{volume_number}/paper", tags=["dblp"])
        async def dblp_volume_papers(volume_number: int) -> list[DblpPaper]:
            """
            Get ceur-ws volume papers from dblp
            Args:
                volume_number: number of the volume

            Returns:
                the papers of the volume
            """
            papers = self.wdSync.dblpEndpoint.get_ceur_volume_papers(volume_number)
            return papers

        @app.get("/dblp/volume/{volume_number}/paper/{paper_id}", tags=["dblp"])
        async def dblp_paper(volume_number: int, paper_id: str) -> DblpPaper:
            """
            Get ceur-ws volume paper from dblp
            """
            papers = self.wdSync.dblpEndpoint.get_ceur_volume_papers(volume_number)
            if papers:
                for paper in papers:
                    if paper.pdf_id == f"Vol-{volume_number}/{paper_id}":
                        return paper
                raise HTTPException(status_code=404, detail="Paper not found")
            else:
                raise HTTPException(status_code=404, detail="Volume not found")

        @app.get(
            "/dblp/volume/{volume_number}/paper/{paper_id}/author",
            tags=["dblp"],
        )
        async def dblp_paper_authors(volume_number: int, paper_id: str) -> list[DblpScholar]:
            """
            Get the authors of a ceur-ws volume paper from dblp
            """
            papers = self.wdSync.dblpEndpoint.get_ceur_volume_papers(volume_number)
            if papers:
                for paper in papers:
                    if paper.pdf_id == f"Vol-{volume_number}/{paper_id}":
                        return paper.authors
                raise HTTPException(status_code=404, detail="Paper not found")
            else:
                raise HTTPException(status_code=404, detail="Volume not found")

    def configure_run(self):
        """
        configure command line specific details

        creates the wdSync used by the endpoints from the command line args
        """
        InputWebserver.configure_run(self)
        self.wdSync = WikidataSync.from_args(self.args)

`__init__()`

constructor

Source code in ceurws/webserver.py

def __init__(self):
    """
    constructor

    configures the webserver and registers all ui pages and fastapi endpoints
    """
    InputWebserver.__init__(self, config=CeurWsWebServer.get_config())

    @ui.page("/volumes")
    async def show_volumes(client: Client):
        return await self.page(client, CeurWsSolution.volumes)

    @ui.page("/volume/{volnumber}")
    async def show_volume_page(client: Client, volnumber):
        # the parameter name has to match the {volnumber} placeholder of
        # the route - otherwise it is not bound to the path segment
        # NOTE(review): confirm that CeurWsSolution.volumePage exists
        return await self.page(client, CeurWsSolution.volumePage, volnumber)

    @ui.page("/wikidatasync")
    async def wikidatasync(client: Client):
        return await self.page(client, CeurWsSolution.wikidatasync)

    @app.get("/volumes.json")
    async def volumes():
        """
        direct fastapi return of volumes
        """
        volumeList = self.wdSync.vm.getList()
        return volumeList

    @app.get("/proceedings.json")
    async def proceedings():
        """
        direct fastapi return of proceedings
        """
        proceedingsList = self.wdSync.loadProceedingsFromCache()
        return ORJSONResponse(proceedingsList)

    @app.get("/papers.json")
    async def papers():
        """
        direct fastapi return of papers
        """
        paperList = self.wdSync.pm.getList()
        return paperList

    @app.get(
        "/papers_dblp.json",
        tags=["dblp complete dataset"],
        # response_model= List[DblpPaper]
    )
    async def papers_dblp():
        """
        direct fastapi return of paper information from dblp
        """
        self.wdSync.dblpEndpoint.dblp_papers.load()
        papers = self.wdSync.dblpEndpoint.dblp_papers.papers
        # convert via the json representation of the papers
        lod = [orjson.loads(p.to_json()) for p in papers]
        return ORJSONResponse(lod)

    @app.get(
        "/authors_dblp.json",
        tags=["dblp complete dataset"],
        # response_model=List[DblpAuthor]
    )
    async def authors_papers_dblp():
        """
        direct fastapi return of author information from dblp
        """
        authors = self.wdSync.dblpEndpoint.get_all_ceur_authors()
        return ORJSONResponse(content=authors)

    @app.get("/dblp/papers", tags=["dblp complete dataset"])
    async def dblp_papers(limit: int = 100, offset: int = 0) -> list[DblpPaper]:
        """
        Get ceur-ws papers from dblp
        Args:
            limit: max number of returned papers
            offset: number of papers to skip

        Returns:
            the papers of the requested page
        """
        papers = self.wdSync.dblpEndpoint.get_all_ceur_papers()
        # limit is a count, not an end index
        return papers[offset : offset + limit]

    @app.get("/dblp/editors", tags=["dblp complete dataset"])
    async def dblp_editors(limit: int = 100, offset: int = 0) -> list[DblpScholar]:
        """
        Get ceur-ws volume editors from dblp
        Args:
            limit: max number of returned editors
            offset: number of editors to skip

        Returns:
            the editors of the requested page
        """
        editors = self.wdSync.dblpEndpoint.get_all_ceur_editors()
        # limit is a count, not an end index
        return editors[offset : offset + limit]

    @app.get("/dblp/volumes", tags=["dblp complete dataset"])
    async def dblp_volumes(limit: int = 100, offset: int = 0) -> list[DblpProceeding]:
        """
        Get ceur-ws volumes from dblp
        Args:
            limit: max number of returned volumes
            offset: number of volumes to skip

        Returns:
            the proceedings of the requested page
        """
        proceedings = self.wdSync.dblpEndpoint.get_all_ceur_proceedings()
        # limit is a count, not an end index
        return proceedings[offset : offset + limit]

    @app.get("/dblp/volume/{volume_number}", tags=["dblp"])
    async def dblp_volume(volume_number: int) -> DblpProceeding:
        """
        Get ceur-ws volume from dblp
        """
        try:
            proceeding = self.wdSync.dblpEndpoint.get_ceur_proceeding(volume_number)
        except Exception as e:
            raise HTTPException(status_code=404, detail=str(e)) from e
        if proceeding:
            return proceeding
        else:
            raise HTTPException(status_code=404, detail="Volume not found")

    @app.get("/dblp/volume/{volume_number}/editor", tags=["dblp"])
    async def dblp_volume_editors(volume_number: int) -> list[DblpScholar]:
        """
        Get ceur-ws volume editors from dblp
        """
        try:
            proceeding = self.wdSync.dblpEndpoint.get_ceur_proceeding(volume_number)
        except Exception as e:
            raise HTTPException(status_code=404, detail=str(e)) from e
        if proceeding:
            return proceeding.editors
        else:
            raise HTTPException(status_code=404, detail="Volume not found")

    @app.get("/dblp/volume/{volume_number}/paper", tags=["dblp"])
    async def dblp_volume_papers(volume_number: int) -> list[DblpPaper]:
        """
        Get ceur-ws volume papers from dblp
        Args:
            volume_number: number of the volume

        Returns:
            the papers of the volume
        """
        papers = self.wdSync.dblpEndpoint.get_ceur_volume_papers(volume_number)
        return papers

    @app.get("/dblp/volume/{volume_number}/paper/{paper_id}", tags=["dblp"])
    async def dblp_paper(volume_number: int, paper_id: str) -> DblpPaper:
        """
        Get ceur-ws volume paper from dblp
        """
        papers = self.wdSync.dblpEndpoint.get_ceur_volume_papers(volume_number)
        if papers:
            for paper in papers:
                if paper.pdf_id == f"Vol-{volume_number}/{paper_id}":
                    return paper
            raise HTTPException(status_code=404, detail="Paper not found")
        else:
            raise HTTPException(status_code=404, detail="Volume not found")

    @app.get(
        "/dblp/volume/{volume_number}/paper/{paper_id}/author",
        tags=["dblp"],
    )
    async def dblp_paper_authors(volume_number: int, paper_id: str) -> list[DblpScholar]:
        """
        Get the authors of a ceur-ws volume paper from dblp
        """
        papers = self.wdSync.dblpEndpoint.get_ceur_volume_papers(volume_number)
        if papers:
            for paper in papers:
                if paper.pdf_id == f"Vol-{volume_number}/{paper_id}":
                    return paper.authors
            raise HTTPException(status_code=404, detail="Paper not found")
        else:
            raise HTTPException(status_code=404, detail="Volume not found")

`configure_run()`

configure command line specific details

Source code in ceurws/webserver.py

def configure_run(self):
    """
    configure command line specific details

    Runs the InputWebserver configuration first and then creates
    self.wdSync from the command line arguments held in self.args.
    """
    InputWebserver.configure_run(self)
    self.wdSync = WikidataSync.from_args(self.args)

`wikidata_view`

Created on 2024-02-23

@author: wf

`WikidataView`

Bases: View

Wikidata View

Source code in ceurws/wikidata_view.py

class WikidataView(View):
    """
    Wikidata View

    Shows the Wikidata proceedings records in a grid and offers a
    toolbar button to refresh them from Wikidata.
    """

    def __init__(self, solution, parent):
        """
        constructor

        Args:
            solution: the solution
            parent: the parent UI container

        """
        self.solution = solution
        self.parent = parent
        self.setup_ui()

    async def update_proceedings(self):
        """
        update the cached proceedings

        Loads the proceedings records from the cache of the solution's
        wdSync and shows them in the grid. Any exception is handed to the
        solution's handle_exception.
        """
        try:
            self.proceedings_records = self.solution.wdSync.loadProceedingsFromCache()
            with self.parent:
                ui.notify(f"found {len(self.proceedings_records)} cached wikidata proceedings records")
                self.reload_aggrid(self.proceedings_records)
        except Exception as ex:
            self.solution.handle_exception(ex)

    def reload_aggrid(self, olod: list):
        """
        reload my aggrid with the list of Volumes

        The records are shown by descending volume number. The volume number
        is read from "sVolume" with "Volume" as fallback.

        Args:
            olod: the proceedings records (list of dicts) to show
        """

        def volume_sort_key(row: dict) -> int:
            """numeric volume of the row - 0 if it is missing or not a number"""
            raw_volume = row.get("sVolume") or row.get("Volume") or 0
            try:
                return int(raw_volume)
            except (TypeError, ValueError):
                # a single malformed record must not prevent the grid from
                # loading - the loop below already tolerates such values
                return 0

        reverseLod = sorted(olod, key=volume_sort_key, reverse=True)
        lod = []
        for row in reverseLod:
            volume = self.getRowValue(row, "sVolume")
            if volume == self.noneValue:
                volume = self.getRowValue(row, "Volume")
            if volume != self.noneValue:
                try:
                    vol_no = int(volume)
                    volumeLink = self.createLink(
                        f"http://ceur-ws.org/Vol-{volume}",
                        f"Vol-{vol_no:04}",
                    )
                except Exception:
                    # best effort: show the record without a volume link
                    volumeLink = self.noneValue
            else:
                volumeLink = self.noneValue
            itemLink = self.createItemLink(row, "item")
            eventLink = self.createItemLink(row, "event", separator="|")
            eventSeriesLink = self.createItemLink(row, "eventSeries", separator="|")
            dblpLink = self.createExternalLink(row, "dblpProceedingsId", "dblp", DblpEndpoint.DBLP_REC_PREFIX)
            k10PlusLink = self.createExternalLink(
                row, "ppnId", "k10plus", "https://opac.k10plus.de/DB=2.299/PPNSET?PPN="
            )
            lod.append(
                {
                    "#": volume,
                    "item": itemLink,
                    "volume": volumeLink,
                    "acronym": self.getRowValue(row, "short_name"),
                    "dblp": dblpLink,
                    "k10plus": k10PlusLink,
                    "event": eventLink,
                    "series": eventSeriesLink,
                    "ordinal": self.getRowValue(row, "eventSeriesOrdinal"),
                    # "title":row.get("title","?"),
                }
            )
        self.lod_grid.load_lod(lod)
        # set max width of the item and event columns
        self.lod_grid.set_column_def("item", "maxWidth", 380)
        self.lod_grid.set_column_def("event", "maxWidth", 380)
        self.lod_grid.sizeColumnsToFit()

    async def on_refresh_button_click(self):
        """
        handle the refreshing of the proceedings from wikidata
        """
        await run.io_bound(self.refresh_wikidata)

    def refresh_wikidata(self):
        """
        refresh the proceedings records from wikidata and show them

        Called via run.io_bound by on_refresh_button_click. Any exception
        is handed to the solution's handle_exception.
        """
        try:
            with self.solution.container:
                ui.notify("wikidata refresh button clicked")
            wd_records = self.solution.wdSync.update()
            with self.solution.container:
                ui.notify(f"read {len(wd_records)} proceeding records from wikidata")
            with self.parent:
                self.reload_aggrid(wd_records)
        except Exception as ex:
            self.solution.handle_exception(ex)

    def setup_ui(self):
        """
        setup my User Interface elements

        Creates the toolbar with the refresh button and the query view,
        the grid for the records and a one-shot timer that triggers
        update_proceedings.
        """
        with self.parent:
            with ui.row() as self.tool_bar:
                self.refresh_button = (
                    ui.button(
                        icon="refresh",
                        on_click=self.on_refresh_button_click,
                    )
                    .classes("btn btn-primary btn-sm col-1")
                    .tooltip("Refresh from Wikidata SPARQL endpoint")
                )
                self.query_view = QueryView(
                    self.solution,
                    name="CEUR-WS wikidata sync",
                    sparql_endpoint=self.solution.wdSync.wikidata_endpoint,
                )
                self.query_view.show_query(self.solution.wdSync.wdQuery.query)

            # grid_config = GridConfig(
            #        key_col="Vol",
            #        multiselect=True)

            self.lod_grid = ListOfDictsGrid()
            ui.timer(0, self.update_proceedings, once=True)

`__init__(solution, parent)`

constructor

Parameters:

Name	Type	Description	Default
`solution`		the solution	required
`parent`		the parent UI container	required

Source code in ceurws/wikidata_view.py

def __init__(self, solution, parent):
    """
    constructor

    Stores the solution and the parent container and then builds the
    user interface by calling setup_ui.

    Args:
        solution: the solution
        parent: the parent UI container

    """
    self.solution = solution
    self.parent = parent
    self.setup_ui()

`on_refresh_button_click()` `async`

handle the refreshing of the proceedings from wikidata

Source code in ceurws/wikidata_view.py

async def on_refresh_button_click(self):
    """
    handle the refreshing of the proceedings from wikidata

    Delegates the work to refresh_wikidata via run.io_bound and
    awaits its completion.
    """
    await run.io_bound(self.refresh_wikidata)

`reload_aggrid(olod)`

reload my aggrid with the list of Volumes

Source code in ceurws/wikidata_view.py

def reload_aggrid(self, olod: list):
    """
    reload my aggrid with the list of Volumes

    The records are shown by descending volume number. The volume number
    is read from "sVolume" with "Volume" as fallback.

    Args:
        olod: the proceedings records (list of dicts) to show
    """

    def volume_sort_key(row: dict) -> int:
        """numeric volume of the row - 0 if it is missing or not a number"""
        raw_volume = row.get("sVolume") or row.get("Volume") or 0
        try:
            return int(raw_volume)
        except (TypeError, ValueError):
            # a single malformed record must not prevent the grid from
            # loading - the loop below already tolerates such values
            return 0

    reverseLod = sorted(olod, key=volume_sort_key, reverse=True)
    lod = []
    for row in reverseLod:
        volume = self.getRowValue(row, "sVolume")
        if volume == self.noneValue:
            volume = self.getRowValue(row, "Volume")
        if volume != self.noneValue:
            try:
                vol_no = int(volume)
                volumeLink = self.createLink(
                    f"http://ceur-ws.org/Vol-{volume}",
                    f"Vol-{vol_no:04}",
                )
            except Exception:
                # best effort: show the record without a volume link
                volumeLink = self.noneValue
        else:
            volumeLink = self.noneValue
        itemLink = self.createItemLink(row, "item")
        eventLink = self.createItemLink(row, "event", separator="|")
        eventSeriesLink = self.createItemLink(row, "eventSeries", separator="|")
        dblpLink = self.createExternalLink(row, "dblpProceedingsId", "dblp", DblpEndpoint.DBLP_REC_PREFIX)
        k10PlusLink = self.createExternalLink(
            row, "ppnId", "k10plus", "https://opac.k10plus.de/DB=2.299/PPNSET?PPN="
        )
        lod.append(
            {
                "#": volume,
                "item": itemLink,
                "volume": volumeLink,
                "acronym": self.getRowValue(row, "short_name"),
                "dblp": dblpLink,
                "k10plus": k10PlusLink,
                "event": eventLink,
                "series": eventSeriesLink,
                "ordinal": self.getRowValue(row, "eventSeriesOrdinal"),
                # "title":row.get("title","?"),
            }
        )
    self.lod_grid.load_lod(lod)
    # set max width of the item and event columns
    self.lod_grid.set_column_def("item", "maxWidth", 380)
    self.lod_grid.set_column_def("event", "maxWidth", 380)
    self.lod_grid.sizeColumnsToFit()

`setup_ui()`

setup my User Interface elements

Source code in ceurws/wikidata_view.py

def setup_ui(self):
    """
    setup my User Interface elements

    Builds the toolbar (refresh button and query view), the grid for the
    records and a one-shot timer that triggers update_proceedings.
    """
    with self.parent:
        with ui.row() as self.tool_bar:
            button = ui.button(icon="refresh", on_click=self.on_refresh_button_click)
            styled_button = button.classes("btn btn-primary btn-sm col-1")
            self.refresh_button = styled_button.tooltip("Refresh from Wikidata SPARQL endpoint")
            self.query_view = QueryView(
                self.solution,
                name="CEUR-WS wikidata sync",
                sparql_endpoint=self.solution.wdSync.wikidata_endpoint,
            )
            proceedings_query = self.solution.wdSync.wdQuery.query
            self.query_view.show_query(proceedings_query)

        # grid_config = GridConfig(
        #        key_col="Vol",
        #        multiselect=True)

        self.lod_grid = ListOfDictsGrid()
        ui.timer(0, self.update_proceedings, once=True)

`update_proceedings()` `async`

update the cached proceedings

Source code in ceurws/wikidata_view.py

async def update_proceedings(self):
    """
    update the cached proceedings

    Loads the proceedings records from the cache of the solution's wdSync,
    notifies about their number and shows them in the grid. Any exception
    is handed to the solution's handle_exception.
    """
    try:
        cached_records = self.solution.wdSync.loadProceedingsFromCache()
        self.proceedings_records = cached_records
        with self.parent:
            ui.notify(f"found {len(cached_records)} cached wikidata proceedings records")
            self.reload_aggrid(cached_records)
    except Exception as ex:
        self.solution.handle_exception(ex)

`wikidatasync`

Created on 2022-08-14

@author: wf

`WikidataSync`

synchronize with wikidata

Source code in ceurws/wikidatasync.py

class WikidataSync:
    """
    synchronize with wikidata
    """

    def __init__(
        self,
        baseurl: str = "https://www.wikidata.org",
        debug: bool = False,
        dblp_endpoint_url: str | None = None,
    ):
        """
        Constructor

        Prepares the volume manager, the paper manager and the SPARQL access
        and opens the SQL cache database.

        Args:
            baseurl(str): the baseurl of the wikidata endpoint
            debug(bool): if True switch on debugging
            dblp_endpoint_url: sparql endpoint url of dblp -
                if None the endpoint of DBLP_ENDPOINT is used
        """
        if dblp_endpoint_url is None:
            dblp_endpoint_url = DBLP_ENDPOINT.endpoint
        self.debug = debug
        self.prepareVolumeManager()
        self.preparePaperManager()
        # prepareRDF sets self.qm which is needed for the lookup of the Proceedings query below
        self.prepareRDF()
        self.wdQuery = self.qm.queriesByName["Proceedings"]
        self.baseurl = baseurl
        self.wd = Wikidata(debug=debug)
        self.sqldb = SQLDB(CEURWS.CACHE_FILE, check_same_thread=False)
        # both are filled lazily - see loadProceedingsFromCache and getProceedingsForVolume
        self.procRecords = None
        self.procsByVolnumber = None
        self.dblpEndpoint = DblpEndpoint(endpoint=dblp_endpoint_url)
        # stays None unless set from outside, e.g. by from_endpoint_names
        self.wikidata_endpoint: Endpoint | None = None

    @classmethod
    def from_args(cls, args) -> "WikidataSync":
        """
        create a WikidataSync object from the given command line arguments

        Args:
            args(Namespace): the command line arguments
        """
        wd_en = args.wikidata_endpoint_name
        dblp_en = args.dblp_endpoint_name
        wd_sync = cls.from_endpoint_names(wd_en, dblp_en, debug=args.debug)
        return wd_sync

    @classmethod
    def from_endpoint_names(cls, wd_en: str, dblp_en: str, debug: bool = False) -> "WikidataSync":
        """
        create a WikidataSync object from the given endpoint names

        Args:
            wd_en(str): wikidata endpoint name
            dblp_en(str): dblp endpoint name
            debug(bool): if True switch on debugging

        Returns:
            WikidataSync: the instance with its wikidata_endpoint set

        Raises:
            Exception: if one of the names is not a known endpoint name
        """
        known_endpoints = EndpointManager.getEndpoints()
        if wd_en not in known_endpoints:
            raise Exception(f"invalid wikidata endpoint name {wd_en}\nsee sparqlquery -le ")
        if dblp_en not in known_endpoints:
            raise Exception(f"invalid dblp endpoint name {dblp_en}\nsee sparqlquery -le ")
        wikidata_ep = known_endpoints[wd_en]
        instance = cls(
            baseurl=wikidata_ep.endpoint,
            dblp_endpoint_url=known_endpoints[dblp_en].endpoint,
            debug=debug,
        )
        instance.wikidata_endpoint = wikidata_ep
        return instance

    def login(self):
        """
        login to wikidata with the credentials of my Wikidata access object
        """
        self.wd.loginWithCredentials()

    def logout(self):
        """
        logout from wikidata
        """
        self.wd.logout()

    def itemUrl(self, qId):
        url = f"{self.baseurl}/wiki/{qId}"
        return url

    def prepareRDF(self):
        """
        prepare my SPARQL access and my named queries

        Sets self.endpoints, self.endpointConf and self.sparql for the
        endpoint registered as "wikidata". self.qm is only set if the query
        file resources/queries/ceurws.yaml exists next to this module.
        """
        # SPARQL setup
        self.endpoints = EndpointManager.getEndpoints(lang="sparql")
        # NOTE(review): presumably None if no "wikidata" endpoint is configured,
        # in which case the next line would fail - confirm
        self.endpointConf = self.endpoints.get("wikidata")
        self.sparql = SPARQL(self.endpointConf.endpoint)
        path = os.path.dirname(__file__)
        qYamlFile = f"{path}/resources/queries/ceurws.yaml"
        # without the query file self.qm stays unset
        if os.path.isfile(qYamlFile):
            self.qm = QueryManager(lang="sparql", queriesPath=qYamlFile)

    def preparePaperManager(self):
        """
        prepare my paper Manager

        Loads the papers from the cache file if the paper manager is cached,
        otherwise only prints a hint to stderr.
        """
        self.pm = PaperManager()
        if not self.pm.isCached():
            print(
                "PaperManager not cached you might want to run ceur-ws --recreate",
                file=sys.stderr,
            )
            return
        self.pm.fromStore(cacheFile=CEURWS.CACHE_FILE)

    def prepareVolumeManager(self):
        """
        prepare my volume manager

        Loads the volumes and derives the lookup by volume number, the volume
        list, the volume count and the volume options (number to
        "Vol-number:title") in descending order of the volume numbers.
        """
        self.vm = VolumeManager()
        self.vm.load()
        self.volumesByNumber, _duplicates = LOD.getLookup(self.vm.getList(), "number")
        self.volumeList = self.vm.getList()
        self.volumeCount = len(self.volumeList)
        self.volumeOptions = {}
        newest_first = sorted(self.volumesByNumber.keys(), reverse=True)
        for number in newest_first:
            entry = self.volumesByNumber[number]
            self.volumeOptions[entry.number] = f"Vol-{entry.number}:{entry.title}"

    def addVolume(self, volume: Volume):
        """
        add the given volume

        Appends the volume to my volume list, registers it in my lookup by
        volume number and increments my volume count. self.volumeOptions is
        not updated here.

        Args:
            volume(Volume): the volume to add
        """
        self.volumeList.append(volume)
        self.volumesByNumber[volume.number] = volume
        self.volumeCount += 1

    def getRecentlyAddedVolumeList(self) -> tuple[dict[int, dict], list[dict]]:
        """
        get the volumes that have recently been added
        we do not expect deletions

        Reloads my volume manager, loads a second volume manager from the
        index html with a forced download and compares the volume numbers.

        Returns:
            tuple: the lookup of the freshly loaded volumes by number and the
                list of the numbers that are not in my lookup yet

        """
        self.prepareVolumeManager()
        refreshVm = VolumeManager()
        forced_config = ParserConfig()
        forced_config.force_download = True
        self.vm.set_down_to_volume(forced_config)
        refreshVm.loadFromIndexHtml(parser_config=forced_config)
        refreshed_by_number, _duplicates = LOD.getLookup(refreshVm.getList(), "number")
        # set difference: numbers known after the refresh but not before
        refreshed_numbers = set(refreshed_by_number.keys())
        known_numbers = set(self.volumesByNumber.keys())
        added_numbers = list(refreshed_numbers - known_numbers)
        return refreshed_by_number, added_numbers

    def storeVolumes(self):
        """
        store my volumes by delegating to the store method of my volume manager
        """
        self.vm.store()

    def getWikidataProceedingsRecord(self, volume):
        """
        get the wikidata Record for the given volume

        Args:
            volume: the volume - missing attributes lead to None values

        Returns:
            dict: the proceedings record; a datetime pubDate is converted
                to its ISO format string
        """

        def attr(name: str):
            """value of the given volume attribute or None if it is missing"""
            return getattr(volume, name, None)

        record = {
            "title": attr("title"),
            "label": attr("title"),
            "description": f"Proceedings of {attr('acronym')} workshop",
            "urn": attr("urn"),
            "short name": attr("acronym"),
            "volume": attr("number"),
            "pubDate": attr("pubDate"),
            "ceurwsUrl": attr("url"),
            "language of work or name": "Q1860",
            "fullWorkUrl": attr("url"),
        }
        pub_date = record["pubDate"]
        if isinstance(pub_date, datetime.datetime):
            record["pubDate"] = pub_date.isoformat()
        return record

    def getWikidataEventRecord(self, volume: Volume):
        """
        get the wikidata Record for the given volume
        """
        volumeTitle = volume.title
        volumeNumber = volume.number
        dblpEntityIds = self.dblpEndpoint.getDblpIdByVolumeNumber(number=volumeNumber)
        title = label = instanceOf = description = None
        if volumeTitle:
            instanceOf, description = self.getEventTypeFromTitle(volumeTitle)
            title = label = self.getEventNameFromTitle(volumeTitle)
        start_time = volume.dateFrom
        end_time = volume.dateTo
        record = {
            "title": title,
            "label": label,
            "description": description,
            "instanceOf": instanceOf,
            "short name": volume.acronym,
            "locationWikidataId": volume.cityWikidataId,
            "countryWikidataId": volume.countryWikidataId,
            "start time": start_time.isoformat() if start_time is not None else start_time,
            "end time": end_time.isoformat() if end_time is not None else end_time,
            "referenceUrl": volume.getVolumeUrl(),
        }
        if dblpEntityIds is not None and len(dblpEntityIds) > 0:
            dblpEntityId = dblpEntityIds[0]
            record["describedAt"] = self.dblpEndpoint.toDblpUrl(dblpEntityId)
            record["language of work or name"] = "Q1860"
            record["dblpEventId"] = self.dblpEndpoint.convertEntityIdToUrlId(entityId=dblpEntityId)
        # the modeling of virtual events has changed in wikidata
        # virtual event (Q7935096) is discontinued for conferences
        # if volume.isVirtualEvent():
        #     record["instanceOf"] = [instanceOf, "Q7935096"]
        return record

    def update(self, withStore: bool = True):
        """
        update my table from the Wikidata Proceedings SPARQL query

        Queries the proceedings and the events by proceeding, merges the
        event data into the proceedings records with the same item and
        recreates the Proceedings table of my SQL cache.

        Args:
            withStore(bool): if True store the records in the recreated table.
                The table is dropped and created in any case.

        Returns:
            list[dict]: the proceedings records as queried (including the
                merged event data)
        """
        if self.debug:
            print(f"Querying proceedings from {self.baseurl} ...")
        # query proceedings
        wd_proceedings_records: list[dict] = self.sparql.queryAsListOfDicts(self.wdQuery.query)
        # query events
        event_query = self.qm.queriesByName["EventsByProceeding"]
        wd_event_records: list[dict] = self.sparql.queryAsListOfDicts(event_query.query)
        # add events to proceeding records
        proceedings_event_map, _duplicates = LOD.getLookup(wd_event_records, "item")
        for proceedings_record in wd_proceedings_records:
            item = proceedings_record.get("item")
            if item in proceedings_event_map:
                event_record = proceedings_event_map.get(item)
                proceedings_record.update(**event_record)
        primaryKey = "URN_NBN"
        withCreate = True
        withDrop = True
        # recreate the table based on the queried records
        entityInfo = self.sqldb.createTable(
            wd_proceedings_records,
            "Proceedings",
            primaryKey,
            withCreate,
            withDrop,
            sampleRecordCount=5000,
            failIfTooFew=False,
        )
        # only one record per URN is stored - the duplicates are reported below
        procsByURN, duplicates = LOD.getLookup(wd_proceedings_records, "URN_NBN")
        if withStore:
            self.sqldb.store(procsByURN.values(), entityInfo, executeMany=True, fixNone=True)
        # NOTE(review): this message is printed even if withStore is False
        if self.debug:
            print(f"stored {len(procsByURN.values())} proceedings records")
        if len(duplicates) > 0:
            print(f"found {len(duplicates)} duplicates URN entries")
            if len(duplicates) < 10:
                print(duplicates)
        return wd_proceedings_records

    def loadProceedingsFromCache(self):
        """
        load the proceedings records from the cache
        """
        sqlQuery = "SELECT * from Proceedings"
        self.procRecords = self.sqldb.query(sqlQuery)
        return self.procRecords

    def getProceedingsForVolume(self, searchVolnumber: int) -> dict | None:
        """
        get the proceedings record for the given searchVolnumber

        Args:
            searchVolnumber(int): the number of the volume to search

        Returns:
            dict: the record for the proceedings in wikidata
            None: if the proceeding record in not found for the given searchVolnumber
        """
        if self.procRecords is None:
            self.loadProceedingsFromCache()
        if self.procsByVolnumber is None:
            self.procsByVolnumber: dict[int, dict] = {}
            if isinstance(self.procRecords, list):
                for procRecord in self.procRecords:
                    volnumber = procRecord.get("sVolume", None)
                    if volnumber is None:
                        procRecord.get("Volume", None)
                    if volnumber is not None:
                        self.procsByVolnumber[int(volnumber)] = procRecord
        volProcRecord = self.procsByVolnumber.get(searchVolnumber, None)
        return volProcRecord

    def getProceedingWdItemsByUrn(self, urn: str) -> list[str]:
        """
        queries the wikidata items that have the given urn for the property P4109
        Args:
            urn: URN id to query for

        Returns:
            List of corresponding wikidata item ids or empty list of no matching item is found
        """
        query = f"""SELECT ?proceeding WHERE{{ ?proceeding wdt:P4109 "{urn}"}}"""
        qres = self.sparql.queryAsListOfDicts(query)
        wdItems = [record.get("proceeding") for record in qres]
        return wdItems

    def getEventWdItemsByUrn(self, urn: str) -> list[str]:
        """
        queries the wikidata proceedings that have the given urn assigned to P4109 and returns the assigned event
        Args:
            urn: URN id to query for

        Returns:
            List of corresponding wikidata item ids or empty list of no matching item is found
        """
        query = f"""SELECT ?event WHERE{{ ?proceeding wdt:P4109 "{urn}"; wdt:P4745 ?event .}}"""
        qres = self.sparql.queryAsListOfDicts(query)
        wdItems = [record.get("event") for record in qres]
        return wdItems

    def getEventsOfProceedings(self, itemId: str) -> list[str]:
        """
        get the item ids of the events the given proceedings ids is the proceedings from
        Args:
            itemId: Qid of the proceedings

        Returns:
            List of the events
        """
        query = f"""SELECT ?event WHERE {{ wd:{itemId} wdt:P4745 ?event.}}"""
        qres = self.sparql.queryAsListOfDicts(query)
        wdItems = [record.get("event")[len("http://www.wikidata.org/entity/") :] for record in qres]
        return wdItems

    def getEventsOfProceedingsByVolnumber(self, volnumber: int | str) -> list[str]:
        """
        get the item ids of the events the given proceedings ids is the proceedings from
        Args:
            volnumber: Volume number of the proceedings

        Returns:
            List of the events
        """
        query = f"""SELECT ?event 
                    WHERE {{
                    ?proceeding wdt:P31 wd:Q1143604; 
                                p:P179 [ps:P179 wd:Q27230297; pq:P478 "{volnumber}"]; 
                                wdt:P4745 ?event.}}
        """
        qres = self.sparql.queryAsListOfDicts(query)
        wdItems = [record.get("event")[len("http://www.wikidata.org/entity/") :] for record in qres]
        return wdItems

    def addProceedingsToWikidata(self, record: dict, write: bool = True, ignoreErrors: bool = False):
        """
        Creates a wikidata entry for the given record

        Args:
            record(dict): the data to add
            write(bool): if True actually write
            ignoreErrors(bool): if True ignore errors

        """
        if write:
            self.login()
        result = self.doAddProceedingsToWikidata(record, write, ignoreErrors)
        if write:
            self.logout()
        return result

    def doAddProceedingsToWikidata(
        self, record: dict, write: bool = True, ignoreErrors: bool = False
    ) -> WikidataResult:
        """
        Creates a wikidata proceedings entry for the given record

        The record columns are mapped to wikidata properties by the property
        mappings below. The "ceurwsUrl" of the record is used as the url
        reference of the added statements.

        Args:
            record(dict): the data to add
            write(bool): if True actually write
            ignoreErrors(bool): if True ignore errors
        Returns:
            WikidataResult: the result of the add operation
        """
        mappings = [
            # fixed value: instance of Q1143604
            PropertyMapping(
                column="instanceof",
                propertyName="instanceof",
                propertyId="P31",
                propertyType=WdDatatype.itemid,
                value="Q1143604",
            ),
            # fixed value: part of the series Q27230297
            PropertyMapping(
                column="part of the series",
                propertyName="part of the series",
                propertyId="P179",
                propertyType=WdDatatype.itemid,
                value="Q27230297",
            ),
            # the volume number is a qualifier of the series statement
            PropertyMapping(
                column="volume",
                propertyName="volume",
                propertyId="P478",
                propertyType=WdDatatype.string,
                qualifierOf="part of the series",
            ),  # ToDo: refactor qualifier of anchor column or property name?
            PropertyMapping(
                column="short name",
                propertyName="short name",
                propertyId="P1813",
                propertyType=WdDatatype.text,
            ),
            PropertyMapping(
                column="pubDate",
                propertyName="publication date",
                propertyId="P577",
                propertyType=WdDatatype.date,
            ),
            PropertyMapping(
                column="title",
                propertyName="title",
                propertyId="P1476",
                propertyType=WdDatatype.text,
            ),
            PropertyMapping(
                column="ceurwsUrl",
                propertyName="described at URL",
                propertyId="P973",
                propertyType=WdDatatype.url,
            ),
            # the language is a qualifier of the described at URL statement
            PropertyMapping(
                column="language of work or name",
                propertyName="language of work or name",
                propertyId="P407",
                propertyType=WdDatatype.itemid,
                qualifierOf="ceurwsUrl",
            ),
            PropertyMapping(
                column="fullWorkUrl",
                propertyName="full work available at URL",
                propertyId="P953",
                propertyType=WdDatatype.url,
            ),
            PropertyMapping(
                column="urn",
                propertyName="URN-NBN",
                propertyId="P4109",
                propertyType=WdDatatype.extid,
            ),
        ]
        # None if the record has no ceurwsUrl
        reference = UrlReference(url=record.get("ceurwsUrl"))
        result = self.wd.add_record(
            record=record,
            property_mappings=mappings,
            write=write,
            ignore_errors=ignoreErrors,
            reference=reference,
        )
        return result

    def askWikidata(self, askQuery: str) -> bool:
        try:
            qres = self.sparql.rawQuery(askQuery).convert()
            return qres.get("boolean", False)
        except Exception as ex:
            print(ex)
            return False

    def checkIfProceedingsFromExists(self, volumeNumber: int, eventItemQid: str | None) -> bool:
        """Returns True if the is proceedings from relation already exists between the given proceedings and event"""
        eventVar = "?event"
        if eventItemQid is not None:
            eventVar = f"wd:{eventItemQid}"
        proceedingsWikidataId = self.getWikidataIdByVolumeNumber(number=volumeNumber)
        query = f"""ASK{{ wd:{proceedingsWikidataId} wdt:P4745 {eventVar}.}}"""
        proceedingExists = self.askWikidata(query)
        return proceedingExists

    def hasItemPropertyValueFor(self, item, propertyId: str):
        """
        ask wikidata if the given item has a value for the given property
        Args:
            item: item Qid
            propertyId: property Pid
        Returns:
            True if the item has the property else False
        """
        query = f"""ASK{{ wd:{item} wdt:{propertyId} ?value.}}"""
        return self.askWikidata(query)

    def addLinkBetweenProceedingsAndEvent(
        self,
        eventItemQid: str,
        volumeNumber: int | None = None,
        proceedingsWikidataId: str | None = None,
        write: bool = True,
        ignoreErrors: bool = False,
    ) -> WikidataResult:
        """
        add the link between the wikidata proceedings item and the given event wikidata item
        Args:
            volumeNumber: ceurws volume number of the proceedings
            eventItemQid: wikidata Qid of the event
            proceedingsWikidataId: wikidata id of the proceedings item
            write(bool): if True actually write
            ignoreErrors(bool): if True ignore errors

        Returns:
            WikidataResult: the result of the add operation

        Raises:
            ValueError: if the volume number is not provided or the volume is not unique in Wikidata
        """
        if proceedingsWikidataId is None:
            proceedingsWikidataId = self.getWikidataIdByVolumeNumber(number=volumeNumber)
        if proceedingsWikidataId is None:
            raise ValueError("Volume is not unique → Proceedings item can not be determined")
        mappings = [
            PropertyMapping(
                column="isProceedingsFrom",
                propertyName="is proceedings from",
                propertyId="P4745",
                propertyType=WdDatatype.itemid,
            )
        ]
        reference = None
        if volumeNumber is not None:
            volume_url = Volume.getVolumeUrlOf(volumeNumber)
            reference = UrlReference(volume_url)
        record = {"isProceedingsFrom": eventItemQid}
        result = self.wd.add_record(
            item_id=proceedingsWikidataId,
            record=record,
            property_mappings=mappings,
            write=write,
            ignore_errors=ignoreErrors,
            reference=reference,
        )
        return result

    def doAddEventToWikidata(self, record: dict, write: bool = True, ignoreErrors: bool = False):
        """
        Creates a wikidata event entry for the given record

        The record columns are mapped to wikidata properties by the property
        mappings below. The "referenceUrl" entry is removed from the given
        record and used as the url reference of the added statements.

        Args:
            record(dict): the data to add - must contain "referenceUrl"
            write(bool): if True actually write
            ignoreErrors(bool): if True ignore errors

        Returns:
            WikidataResult: the result of the add operation

        Raises:
            KeyError: if the record has no "referenceUrl" entry
        """
        # the type of the event is taken from the record
        entityQid = record.get("instanceOf")
        # entity = record.get("description")
        mappings = [
            PropertyMapping(
                column="instanceof",
                propertyName="instanceof",
                propertyId="P31",
                propertyType=WdDatatype.itemid,
                value=entityQid,
            ),
            PropertyMapping(
                column="short name",
                propertyName="short name",
                propertyId="P1813",
                propertyType=WdDatatype.text,
            ),
            PropertyMapping(
                column="describedAt",
                propertyName="described at URL",
                propertyId="P973",
                propertyType=WdDatatype.url,
            ),
            # the language is a qualifier of the described at URL statement
            PropertyMapping(
                column="language of work or name",
                propertyName="language of work or name",
                propertyId="P407",
                propertyType=WdDatatype.itemid,
                qualifierOf="describedAt",
                value="Q1860",
            ),
            PropertyMapping(
                column="title",
                propertyName="title",
                propertyId="P1476",
                propertyType=WdDatatype.text,
            ),
            # NOTE(review): identical to the describedAt mapping above -
            # looks like an accidental duplicate, confirm before removing
            PropertyMapping(
                column="describedAt",
                propertyName="described at URL",
                propertyId="P973",
                propertyType=WdDatatype.url,
            ),
            PropertyMapping(
                column="dblpEventId",
                propertyName="DBLP event ID",
                propertyId="P10692",
                propertyType=WdDatatype.extid,
            ),
            PropertyMapping(
                column="start time",
                propertyName="start time",
                propertyId="P580",
                propertyType=WdDatatype.date,
            ),
            PropertyMapping(
                column="end time",
                propertyName="end time",
                propertyId="P582",
                propertyType=WdDatatype.date,
            ),
            PropertyMapping(
                column="locationWikidataId",
                propertyName="location",
                propertyId="P276",
                propertyType=WdDatatype.itemid,
            ),
            PropertyMapping(
                column="countryWikidataId",
                propertyName="country",
                propertyId="P17",
                propertyType=WdDatatype.itemid,
            ),
        ]
        # pop modifies the caller's dict: referenceUrl is not part of the record passed on
        reference_url = record.pop("referenceUrl")
        reference = UrlReference(url=reference_url)
        result = self.wd.add_record(
            record=record,
            property_mappings=mappings,
            write=write,
            ignore_errors=ignoreErrors,
            reference=reference,
        )
        return result

    def addDblpPublicationId(
        self,
        volumeNumber: int,
        dblpRecordId: str | None = None,
        write: bool = True,
        ignoreErrors: bool = False,
    ) -> WikidataResult:
        """
        Try to add the DBLP publication ID (P8978) to the proceedings item of a volume.

        Args:
            volumeNumber: ceurws volume number of the proceedings
            dblpRecordId: dblp record id to add to the proceedings item.
                If None, dblp is queried for the record id of the volume.
            write: if True actually write
            ignoreErrors(bool): if True ignore errors

        Returns:
            WikidataResult: the result of the add operation once the statement is submitted.
            NOTE(review): every early exit below returns a ``(False, message)`` tuple
            instead of a WikidataResult; that shape is kept for existing callers.
        """
        proceedingsWikidataId = self.getWikidataIdByVolumeNumber(number=volumeNumber)
        if proceedingsWikidataId is None:
            return False, "Proceedings item can not be determined"
        if self.hasItemPropertyValueFor(item=proceedingsWikidataId, propertyId="P8978"):
            return (
                False,
                "dblp publication id is already assigned to the proceedings item",
            )
        if dblpRecordId is None:
            # no id supplied: accept the dblp lookup only if it is unambiguous
            dblpRecordIds = self.dblpEndpoint.getDblpIdByVolumeNumber(volumeNumber)
            if len(dblpRecordIds) == 1:
                dblpRecordId = dblpRecordIds[0]
            elif len(dblpRecordIds) > 1:
                return (
                    False,
                    f"More than one proceedings record found ({dblpRecordIds})",
                )
            else:
                return (
                    False,
                    f"Proceedings of volume {volumeNumber} are not in dblp",
                )
        mappings = [
            PropertyMapping(
                column="DBLP publication ID",
                propertyName="DBLP publication ID",
                propertyId="P8978",
                propertyType=WdDatatype.extid,
            )
        ]
        # the CEUR-WS volume page serves as reference of the new statement
        volume_url = Volume.getVolumeUrlOf(volumeNumber)
        reference = UrlReference(volume_url)
        record = {"DBLP publication ID": dblpRecordId}
        result = self.wd.add_record(
            item_id=proceedingsWikidataId,
            record=record,
            property_mappings=mappings,
            write=write,
            ignore_errors=ignoreErrors,
            reference=reference,
        )
        return result

    def addAcronymToItem(
        self,
        itemId: str,
        acronym: str,
        desc: str | None = None,
        label: str | None = None,
        write: bool = True,
        ignoreErrors: bool = False,
    ):
        """
        Attach an acronym as "short name" (P1813) to the given item.

        Args:
            itemId: item to add the acronym to
            acronym(str): acronym of the item
            desc: value handed over as "description" of the row
            label: value handed over as "label" of the row
            write(bool): if True actually write
            ignoreErrors(bool): if True ignore errors

        Returns:
            (qid, errors) id of the created entry and occurred errors
        """
        row = {"short name": acronym, "description": desc, "label": label}
        shortNameProperty = {
            "Column": "short name",
            "PropertyName": "short name",
            "PropertyId": "P1813",
            "Type": "text",
            "Lookup": "",
        }
        # property metadata indexed by property id, as expected by addDict
        propertyLookup, _ = LOD.getLookup([shortNameProperty], "PropertyId")
        itemQid, failures = self.wd.addDict(
            itemId=itemId,
            row=row,
            mapDict=propertyLookup,
            write=write,
            ignoreErrors=ignoreErrors,
        )
        return itemQid, failures

    def addOfficialWebsiteToItem(
        self,
        itemId: str,
        officialWebsite: str,
        write: bool = True,
        ignoreErrors: bool = False,
    ):
        """
        Add the official website (P856) to the given item.

        Args:
            itemId: item to add the official website to
            officialWebsite(str): officialWebsite of the item
            write(bool): if True actually write
            ignoreErrors(bool): if True ignore errors

        Returns:
            WikidataResult: the result of the add operation
        """
        mappings = [
            PropertyMapping(
                column="official website",
                propertyName="official website",
                propertyId="P856",
                propertyType=WdDatatype.url,
            ),
            # NOTE(review): unlike the event mappings in doAddEventToWikidata this P407
            # mapping is not declared as qualifierOf the website statement — confirm
            # that a standalone language statement is intended
            PropertyMapping(
                column="language of work or name",
                propertyName="language of work or name",
                propertyId="P407",
                propertyType=WdDatatype.itemid,
            ),
        ]
        record = {
            "official website": officialWebsite,
            "language of work or name": "Q1860",
        }
        # add_record yields a single result object (see the other add* methods);
        # hand it back as-is instead of unpacking it into a (qid, errors) pair
        return self.wd.add_record(
            item_id=itemId,
            record=record,
            property_mappings=mappings,
            write=write,
            ignore_errors=ignoreErrors,
        )

    def getWikidataIdByVolumeNumber(self, number: int | None) -> str | None:
        """
        query wikidata for the qId of the proceedings of the given volume number
        Args:
            number: volume number

        Returns:
            str: wikidata id corresponding to the given volume number
            None: if the corresponding wikidata id was not found
        """
        if number is None:
            return None
        query = f"""SELECT * WHERE{{ ?proceeding p:P179 [ps:P179 wd:Q27230297; pq:P478 "{number}"].}}"""
        qres = self.sparql.queryAsListOfDicts(query)
        qid = None
        if qres is not None and qres != []:
            qids = [record.get("proceeding").split("/")[-1] for record in qres]
            if len(qids) > 1:
                print("CEUR-WS volume number is not unique")
            else:
                qid = qids[0]
        return qid

    def getWikidataIdByDblpEventId(self, entityId: str | None, volumeNumber: int | None = None) -> list[str]:
        """
        query wikidata for the qId of items that correspond to the given dblpEventId
        Args:
            entityId: id of a dblp event
            volumeNumber: volume number

        Returns:
            list of matching wikidata items
        """
        dblpEventId = self.dblpEndpoint.convertEntityIdToUrlId(entityId=entityId)
        dblpIds = [entityId, dblpEventId]
        dblpIdsStr = " ".join([f'"{dblpId}"' for dblpId in dblpIds])
        urls = ""
        if entityId is not None:
            urls = " ".join(
                [
                    f"<{self.dblpEndpoint.toDblpUrl(entityId)}>",
                    f"<{self.dblpEndpoint.toDblpUrl(entityId, True)}>",
                ]
            )
        volumeQuery = ""
        if volumeNumber is not None:
            volumeQuery = f"""
            UNION
                  {{
                  ?proceeding p:P179 [ps:P179 wd:Q27230297; pq:P478 "{volumeNumber}"].
                  ?proceeding wdt:P4745 ?qid.
                  }}
            """
        query = f"""SELECT DISTINCT ?qid
            WHERE{{
              VALUES ?url {{ {urls} }}
              VALUES ?dblpEventId {{ {dblpIdsStr} }}
              VALUES ?eventType {{wd:Q2020153 wd:Q40444998}}
              {{?qid wdt:P31 ?eventType; wdt:P973 ?url}}
              UNION
              {{?qid wdt:P31 ?eventType; wdt:P10692 ?dblpEventId}}
              {volumeQuery}
            }}
        """
        qres = self.sparql.queryAsListOfDicts(query)
        qIds = []
        if qres is not None and qres != []:
            qIds = [self.removeWdPrefix(record.get("qid")) for record in qres]
        return qIds

    @classmethod
    def getEventNameFromTitle(cls, title: str) -> str:
        """
        Derive the event name from a proceedings title.

        At most one known leading phrase (e.g. "Proceedings of the") and at most one
        known trailing phrase (e.g. "Workshop Proceedings") are removed; the
        comparison ignores case and longer phrases take precedence.

        Args:
            title: title of the proceeding

        Returns:
            name of the event
        """
        leadingPhrases = (
            "Proceedings of the",
            "Proceedings of",
            "Joint Proceedings of the",
            "Joint Proceedings of",
            "Joint Proceedings",
            "Joint Proceeding of the",
            "Joint Proceeding of",
            "Selected Papers of the",
            "Selected Contributions of the",
            "Workshops Proceedings for the",
            "Supplementary Proceedings of the",
            "Short Paper Proceedings of",
            "Short Paper Proceedings of the",
            "Working Notes Proceedings of the",
            "Working Notes of",
            "Working Notes for",
            "Joint Workshop Proceedings of the",
            "Joint Workshop Proceedings of",
            "Workshop Proceedings from",
            "Workshop and Poster Proceedings of the",
            "Workshops Proceedings and Tutorials of the",
            "Extended Papers of the",
            "Short Papers Proceedings of the",
            "Short Papers Proceedings of",
            "Proceedings of the Selected Papers of the",
            "Proceedings of the Working Notes of",
            "Proceedings of the Doctoral Consortium Papers Presented at the",
            "Selected Contributions to the",
            "Selected and Revised Papers of",
            "Selected Papers of",
            "Up-and-Coming and Short Papers of the",
            "Academic Papers at",
            "Poster Track of the",
            "Actes de la",
            "Post-proceedings of the",
            "Late Breaking Papers of the",
            "Anais do",
            "Proceedings del",
            "Proceedings",
            "Gemeinsamer Tagungsband der",
            "Local Proceedings of the",
            "Local Proceedings and Materials of",
        )
        trailingPhrases = (
            "Workshop Proceedings",
            "Proceedings",
            "Conference Proceedings",
            "Workshops Proceedings",
            "Adjunct Proceedings",
            "Poster and Demo Proceedings",
            "(full papers)",
        )
        if title is None:
            return title
        # longest phrase first so that e.g. "Proceedings of the" wins over "Proceedings"
        lowered = title.lower()
        for phrase in sorted(leadingPhrases, key=len, reverse=True):
            if lowered.startswith(phrase.lower()):
                title = title[len(phrase) :].strip()
                break
        lowered = title.lower()
        for phrase in sorted(trailingPhrases, key=len, reverse=True):
            if lowered.endswith(phrase.lower()):
                title = title[: -len(phrase)].strip(" .,")
                break
        return title

    @classmethod
    def getEventTypeFromTitle(cls, title: str) -> tuple[str | None, str | None]:
        """
        Extract the event type from the given title
        Assumption: lowest mentioned type is the correct one
        Args:
            title: title of the event

        Returns:
            wikidata id and label of the event type
        """
        if title is None or title == "":
            return None, None
        academicConference = ("Q2020153", "academic conference")
        academicWorkshop = ("Q40444998", "academic workshop")
        if "workshop" in title.lower():
            return academicWorkshop
        elif "conference" in title.lower() or "symposium" in title.lower():
            return academicConference
        else:
            return academicWorkshop

    def doCreateEventItemAndLinkProceedings(
        self,
        volume: Volume,
        proceedingsWikidataId: str | None = None,
        write: bool = False,
    ) -> dict[str, WikidataResult]:
        """
        Create the wikidata event item for the given volume and link the proceedings to it.

        Args:
            volume: volume to create the event for
            proceedingsWikidataId: wikidata id of the proceedings item of the volume
            write: If True actually write

        Returns:
            dict of WikidataResult entries by step: "dblp" and "Event" are always set,
            "Proceedings" only if a proceedings/event link was already found and
            "link" only if the event creation returned a qid
        """
        outcome = {}
        number = volume.number
        linkExists = (
            proceedingsWikidataId is None
            and number is not None
            and self.checkIfProceedingsFromExists(number, eventItemQid=None)
        )
        if linkExists:
            # the proceedings item already points to an event
            proceedingsWikidataId = self.getWikidataIdByVolumeNumber(number=number)
            outcome["Proceedings"] = WikidataResult(
                qid=proceedingsWikidataId,
                msg=f"Proceedings for Vol-{number} already exists",
            )

        # only an unambiguous dblp entry is used for the event lookup
        dblpIds = self.dblpEndpoint.getDblpIdByVolumeNumber(number)
        dblpId = dblpIds[0] if len(dblpIds) == 1 else None
        dblpMsg = None
        if len(dblpIds) > 1:
            dblpMsg = f"Multiple dblpEventIds found for Vol-{number}: {','.join(dblpIds)}"
        outcome["dblp"] = WikidataResult(qid=dblpId, msg=dblpMsg)

        knownEvents = self.getWikidataIdByDblpEventId(dblpId, number)
        createdEventQid = None
        if len(knownEvents) == 0:
            # no event item yet: create a new one from the volume
            volume.resolveLoctime()
            created = self.doAddEventToWikidata(record=self.getWikidataEventRecord(volume), write=write)
            createdEventQid = created.qid
            outcome["Event"] = created
        elif len(knownEvents) == 1:
            # the event item already exists
            outcome["Event"] = WikidataResult(
                qid=knownEvents[0],
                msg="Event item already exists;",
            )
        else:
            outcome["Event"] = WikidataResult(msg=f"Multiple event entries exist: {','.join(knownEvents)}")

        if createdEventQid is not None:
            # connect the proceedings item with the freshly created event item
            linked = self.addLinkBetweenProceedingsAndEvent(
                volumeNumber=number,
                eventItemQid=createdEventQid,
                proceedingsWikidataId=proceedingsWikidataId,
                write=write,
            )
            linked.msg = "Added Link between Proceedings and Event item;"
            outcome["link"] = linked
        return outcome

    @classmethod
    def removeWdPrefix(cls, value: str):
        """
        Strip the wikidata entity url prefix from the given value.

        Args:
            value: wikidata entity url

        Returns:
            the part after "http://www.wikidata.org/entity/"; values that are not
            strings or do not start with the prefix are returned unchanged
        """
        entityNamespace = "http://www.wikidata.org/entity/"
        if isinstance(value, str) and value.startswith(entityNamespace):
            return value[len(entityNamespace) :]
        return value

    def getAuthorByIds(self, identifiers: dict) -> dict[str, str]:
        """
        Based on the given identifiers get potential author items.

        The names of the identifiers must be according to DblpAuthorIdentifier;
        in addition "homepage" is matched against the official website (P856).
        Identifiers without a value and identifiers with an unknown name are ignored.

        Args:
            identifiers: known identifiers of the author

        Returns:
            dict mapping the wikidata id of each matching person to its English label;
            empty if no usable identifier is given
        """
        if not identifiers:
            return {}
        usable = {name: value for name, value in identifiers.items() if value is not None and value != ""}
        if not usable:
            return {}
        id_map = DblpAuthorIdentifier.getAllAsMap()
        optional_clauses = []
        for id_name, id_value in usable.items():
            id_query = None
            if id_name in id_map:
                id_query = DblpAuthorIdentifier.getWikidataIdQueryPart(id_name, id_value, "?person")
            elif id_name == "homepage":
                id_query = f"{{ ?person wdt:P856 <{id_value}>. }}"
            if id_query is not None:
                optional_clauses.append(id_query)
        if not optional_clauses:
            # without any identifier clause the pattern below would select
            # every item that has an English label
            return {}
        id_queries = "\nUNION\n".join(optional_clauses)
        query = f"""SELECT DISTINCT ?person ?personLabel
                    WHERE
                    {{
                        {id_queries}
                        ?person rdfs:label ?personLabel. FILTER(lang(?personLabel)="en").
                    }}"""
        qres = self.sparql.queryAsListOfDicts(query)
        res = {}
        # a missing result is treated like an empty one, as in the other lookups
        for record in qres or []:
            if not record:
                continue
            item_id = self.removeWdPrefix(record.get("person"))
            res[item_id] = record.get("personLabel")
        return res

`init(baseurl='https://www.wikidata.org', debug=False, dblp_endpoint_url=None)`

Constructor

Parameters:

Name	Type	Description	Default
`baseurl(str)`		the baseurl of the wikidata endpoint	`'https://www.wikidata.org'`
`debug(bool)`		if True switch on debugging	`False`
`dblp_endpoint_url`	`str \| None`	sparql endpoint url of dblp	`None`

Source code in ceurws/wikidatasync.py

def __init__(
    self,
    baseurl: str = "https://www.wikidata.org",
    debug: bool = False,
    dblp_endpoint_url: str | None = None,
):
    """
    Constructor

    Args:
        baseurl(str): the baseurl of the wikidata endpoint
        debug(bool): if True switch on debugging
        dblp_endpoint_url: sparql endpoint url of dblp;
            if None, DBLP_ENDPOINT.endpoint is used
    """
    if dblp_endpoint_url is None:
        dblp_endpoint_url = DBLP_ENDPOINT.endpoint
    self.debug = debug
    # NOTE(review): the prepare* helpers are defined outside this block; presumably
    # one of them (prepareRDF?) provides self.qm, which is read right below — confirm
    self.prepareVolumeManager()
    self.preparePaperManager()
    self.prepareRDF()
    self.wdQuery = self.qm.queriesByName["Proceedings"]
    self.baseurl = baseurl
    self.wd = Wikidata(debug=debug)
    self.sqldb = SQLDB(CEURWS.CACHE_FILE, check_same_thread=False)
    # not populated by the constructor
    self.procRecords = None
    self.procsByVolnumber = None
    self.dblpEndpoint = DblpEndpoint(endpoint=dblp_endpoint_url)
    self.wikidata_endpoint: Endpoint | None = None

`addAcronymToItem(itemId, acronym, desc=None, label=None, write=True, ignoreErrors=False)`

Add the acronym to the given item. Args: `itemId` – item to add the acronym to; `acronym` (str) – acronym of the item; `write` (bool) – if True actually write; `ignoreErrors` (bool) – if True ignore errors.

Returns:

Type	Description
	(qid, errors) id of the created entry and occurred errors

Source code in ceurws/wikidatasync.py

def addAcronymToItem(
    self,
    itemId: str,
    acronym: str,
    desc: str | None = None,
    label: str | None = None,
    write: bool = True,
    ignoreErrors: bool = False,
):
    """
    Attach an acronym as "short name" (P1813) to the given item.

    Args:
        itemId: item to add the acronym to
        acronym(str): acronym of the item
        desc: value handed over as "description" of the row
        label: value handed over as "label" of the row
        write(bool): if True actually write
        ignoreErrors(bool): if True ignore errors

    Returns:
        (qid, errors) id of the created entry and occurred errors
    """
    row = {"short name": acronym, "description": desc, "label": label}
    shortNameProperty = {
        "Column": "short name",
        "PropertyName": "short name",
        "PropertyId": "P1813",
        "Type": "text",
        "Lookup": "",
    }
    # property metadata indexed by property id, as expected by addDict
    propertyLookup, _ = LOD.getLookup([shortNameProperty], "PropertyId")
    itemQid, failures = self.wd.addDict(
        itemId=itemId,
        row=row,
        mapDict=propertyLookup,
        write=write,
        ignoreErrors=ignoreErrors,
    )
    return itemQid, failures

`addDblpPublicationId(volumeNumber, dblpRecordId=None, write=True, ignoreErrors=False)`

try to add the dblp publication id (P8978) to the proceedings record Args: volumeNumber: ceurws volumenumber of the proceedings dblpRecordId: dblp record id to add to the proceedings item. If None query dblp for the record id write: if True actually write ignoreErrors(bool): if True ignore errors

Returns:

Name	Type	Description
`WikidataResult`	`WikidataResult`	the result of the add operation

Source code in ceurws/wikidatasync.py

def addDblpPublicationId(
    self,
    volumeNumber: int,
    dblpRecordId: str | None = None,
    write: bool = True,
    ignoreErrors: bool = False,
) -> WikidataResult:
    """
    Try to add the DBLP publication ID (P8978) to the proceedings item of a volume.

    Args:
        volumeNumber: ceurws volume number of the proceedings
        dblpRecordId: dblp record id to add to the proceedings item.
            If None, dblp is queried for the record id of the volume.
        write: if True actually write
        ignoreErrors(bool): if True ignore errors

    Returns:
        WikidataResult: the result of the add operation once the statement is submitted.
        NOTE(review): every early exit below returns a ``(False, message)`` tuple
        instead of a WikidataResult; that shape is kept for existing callers.
    """
    proceedingsWikidataId = self.getWikidataIdByVolumeNumber(number=volumeNumber)
    if proceedingsWikidataId is None:
        return False, "Proceedings item can not be determined"
    if self.hasItemPropertyValueFor(item=proceedingsWikidataId, propertyId="P8978"):
        return (
            False,
            "dblp publication id is already assigned to the proceedings item",
        )
    if dblpRecordId is None:
        # no id supplied: accept the dblp lookup only if it is unambiguous
        dblpRecordIds = self.dblpEndpoint.getDblpIdByVolumeNumber(volumeNumber)
        if len(dblpRecordIds) == 1:
            dblpRecordId = dblpRecordIds[0]
        elif len(dblpRecordIds) > 1:
            return (
                False,
                f"More than one proceedings record found ({dblpRecordIds})",
            )
        else:
            return (
                False,
                f"Proceedings of volume {volumeNumber} are not in dblp",
            )
    mappings = [
        PropertyMapping(
            column="DBLP publication ID",
            propertyName="DBLP publication ID",
            propertyId="P8978",
            propertyType=WdDatatype.extid,
        )
    ]
    # the CEUR-WS volume page serves as reference of the new statement
    volume_url = Volume.getVolumeUrlOf(volumeNumber)
    reference = UrlReference(volume_url)
    record = {"DBLP publication ID": dblpRecordId}
    result = self.wd.add_record(
        item_id=proceedingsWikidataId,
        record=record,
        property_mappings=mappings,
        write=write,
        ignore_errors=ignoreErrors,
        reference=reference,
    )
    return result

`addLinkBetweenProceedingsAndEvent(eventItemQid, volumeNumber=None, proceedingsWikidataId=None, write=True, ignoreErrors=False)`

add the link between the wikidata proceedings item and the given event wikidata item Args: volumeNumber: ceurws volume number of the proceedings eventItemQid: wikidata Qid of the event proceedingsWikidataId: wikidata id of the proceedings item write(bool): if True actually write ignoreErrors(bool): if True ignore errors

Returns:

Name	Type	Description
`WikidataResult`	`WikidataResult`	the result of the add operation

Raises:

Type	Description
`ValueError`	if the volume number is not provided or the volume is not unique in Wikidata

Source code in ceurws/wikidatasync.py

def addLinkBetweenProceedingsAndEvent(
    self,
    eventItemQid: str,
    volumeNumber: int | None = None,
    proceedingsWikidataId: str | None = None,
    write: bool = True,
    ignoreErrors: bool = False,
) -> WikidataResult:
    """
    Link the wikidata proceedings item to the given event item via "is proceedings from" (P4745).

    Args:
        volumeNumber: ceurws volume number of the proceedings; used to look up the
            proceedings item if no id is given and as source of the reference url
        eventItemQid: wikidata Qid of the event
        proceedingsWikidataId: wikidata id of the proceedings item
        write(bool): if True actually write
        ignoreErrors(bool): if True ignore errors

    Returns:
        WikidataResult: the result of the add operation

    Raises:
        ValueError: if the proceedings item can not be determined
            (no id given and the lookup by volume number yields nothing)
    """
    proceedingsQid = proceedingsWikidataId
    if proceedingsQid is None:
        proceedingsQid = self.getWikidataIdByVolumeNumber(number=volumeNumber)
        if proceedingsQid is None:
            raise ValueError("Volume is not unique → Proceedings item can not be determined")
    linkMapping = PropertyMapping(
        column="isProceedingsFrom",
        propertyName="is proceedings from",
        propertyId="P4745",
        propertyType=WdDatatype.itemid,
    )
    # the statement is only referenced when the volume (and thus its url) is known
    source = None if volumeNumber is None else UrlReference(Volume.getVolumeUrlOf(volumeNumber))
    return self.wd.add_record(
        item_id=proceedingsQid,
        record={"isProceedingsFrom": eventItemQid},
        property_mappings=[linkMapping],
        write=write,
        ignore_errors=ignoreErrors,
        reference=source,
    )

`addOfficialWebsiteToItem(itemId, officialWebsite, write=True, ignoreErrors=False)`

Add the official website to the given item. Args: `itemId` – item to add the official website to; `officialWebsite` (str) – official website of the item; `write` (bool) – if True actually write; `ignoreErrors` (bool) – if True ignore errors.

Returns:

Name	Type	Description
`WikidataResult`		the result of the add operation

Source code in ceurws/wikidatasync.py

def addOfficialWebsiteToItem(
    self,
    itemId: str,
    officialWebsite: str,
    write: bool = True,
    ignoreErrors: bool = False,
):
    """
    Add the official website (P856) to the given item.

    Args:
        itemId: item to add the official website to
        officialWebsite(str): officialWebsite of the item
        write(bool): if True actually write
        ignoreErrors(bool): if True ignore errors

    Returns:
        WikidataResult: the result of the add operation
    """
    mappings = [
        PropertyMapping(
            column="official website",
            propertyName="official website",
            propertyId="P856",
            propertyType=WdDatatype.url,
        ),
        # NOTE(review): unlike the event mappings in doAddEventToWikidata this P407
        # mapping is not declared as qualifierOf the website statement — confirm
        # that a standalone language statement is intended
        PropertyMapping(
            column="language of work or name",
            propertyName="language of work or name",
            propertyId="P407",
            propertyType=WdDatatype.itemid,
        ),
    ]
    record = {
        "official website": officialWebsite,
        "language of work or name": "Q1860",
    }
    # add_record yields a single result object (see the other add* methods);
    # hand it back as-is instead of unpacking it into a (qid, errors) pair
    return self.wd.add_record(
        item_id=itemId,
        record=record,
        property_mappings=mappings,
        write=write,
        ignore_errors=ignoreErrors,
    )

`addProceedingsToWikidata(record, write=True, ignoreErrors=False)`

Creates a wikidata entry for the given record

Parameters:

Name	Description	Default
`record(dict)`	the data to add	required
`write(bool)`	if True actually write	`True`
`ignoreErrors(bool)`	if True ignore errors	`False`

Source code in ceurws/wikidatasync.py

def addProceedingsToWikidata(self, record: dict, write: bool = True, ignoreErrors: bool = False):
    """
    Creates a wikidata entry for the given record

    When writing, the call is wrapped in login()/logout(); the logout also
    happens if adding the record raises.

    Args:
        record(dict): the data to add
        write(bool): if True actually write
        ignoreErrors(bool): if True ignore errors

    Returns:
        the result of doAddProceedingsToWikidata
    """
    if not write:
        # dry run: no session needed
        return self.doAddProceedingsToWikidata(record, write, ignoreErrors)
    self.login()
    try:
        return self.doAddProceedingsToWikidata(record, write, ignoreErrors)
    finally:
        # do not leave the session open when the add operation fails
        self.logout()

`addVolume(volume)`

add the given volume

Parameters:

Name	Type	Description	Default
`volume(Volume)`		the volume to add	required

Source code in ceurws/wikidatasync.py

def addVolume(self, volume: Volume):
    """
    Register the given volume.

    The volume is appended to the volume list, indexed by its number
    and the volume count is increased by one.

    Args:
        volume(Volume): the volume to add
    """
    self.volumeList.append(volume)
    number = volume.number
    self.volumesByNumber[number] = volume
    self.volumeCount += 1

`checkIfProceedingsFromExists(volumeNumber, eventItemQid)`

Returns True if the "is proceedings from" relation (P4745) already exists between the given proceedings and event

Source code in ceurws/wikidatasync.py

def checkIfProceedingsFromExists(self, volumeNumber: int, eventItemQid: str | None) -> bool:
    """Returns True if the is proceedings from relation already exists between the given proceedings and event"""
    eventVar = "?event"
    if eventItemQid is not None:
        eventVar = f"wd:{eventItemQid}"
    proceedingsWikidataId = self.getWikidataIdByVolumeNumber(number=volumeNumber)
    query = f"""ASK{{ wd:{proceedingsWikidataId} wdt:P4745 {eventVar}.}}"""
    proceedingExists = self.askWikidata(query)
    return proceedingExists

`doAddEventToWikidata(record, write=True, ignoreErrors=False)`

Creates a wikidata event entry for the given record Args: record(dict): the data to add write(bool): if True actually write ignoreErrors(bool): if True ignore errors

Returns:

Name	Type	Description
`WikidataResult`		the result of the add operation

Source code in ceurws/wikidatasync.py

def doAddEventToWikidata(self, record: dict, write: bool = True, ignoreErrors: bool = False):
    """
    Creates a wikidata event entry for the given record

    Args:
        record(dict): the data to add; its "referenceUrl" entry is used as
            reference of the statements and is not passed on as data.
            The given dict is left unmodified.
        write(bool): if True actually write
        ignoreErrors(bool): if True ignore errors

    Returns:
        WikidataResult: the result of the add operation

    Raises:
        KeyError: if the record has no "referenceUrl" entry
    """
    entityQid = record.get("instanceOf")
    mappings = [
        PropertyMapping(
            column="instanceof",
            propertyName="instanceof",
            propertyId="P31",
            propertyType=WdDatatype.itemid,
            value=entityQid,
        ),
        PropertyMapping(
            column="short name",
            propertyName="short name",
            propertyId="P1813",
            propertyType=WdDatatype.text,
        ),
        # listed once only; the language mapping below qualifies this statement
        PropertyMapping(
            column="describedAt",
            propertyName="described at URL",
            propertyId="P973",
            propertyType=WdDatatype.url,
        ),
        PropertyMapping(
            column="language of work or name",
            propertyName="language of work or name",
            propertyId="P407",
            propertyType=WdDatatype.itemid,
            qualifierOf="describedAt",
            value="Q1860",
        ),
        PropertyMapping(
            column="title",
            propertyName="title",
            propertyId="P1476",
            propertyType=WdDatatype.text,
        ),
        PropertyMapping(
            column="dblpEventId",
            propertyName="DBLP event ID",
            propertyId="P10692",
            propertyType=WdDatatype.extid,
        ),
        PropertyMapping(
            column="start time",
            propertyName="start time",
            propertyId="P580",
            propertyType=WdDatatype.date,
        ),
        PropertyMapping(
            column="end time",
            propertyName="end time",
            propertyId="P582",
            propertyType=WdDatatype.date,
        ),
        PropertyMapping(
            column="locationWikidataId",
            propertyName="location",
            propertyId="P276",
            propertyType=WdDatatype.itemid,
        ),
        PropertyMapping(
            column="countryWikidataId",
            propertyName="country",
            propertyId="P17",
            propertyType=WdDatatype.itemid,
        ),
    ]
    # work on a copy so that the caller's record keeps its "referenceUrl" and can
    # be submitted again (e.g. a dry run followed by the actual write)
    payload = dict(record)
    reference_url = payload.pop("referenceUrl")
    reference = UrlReference(url=reference_url)
    result = self.wd.add_record(
        record=payload,
        property_mappings=mappings,
        write=write,
        ignore_errors=ignoreErrors,
        reference=reference,
    )
    return result

`doAddProceedingsToWikidata(record, write=True, ignoreErrors=False)`

Creates a wikidata proceedings entry for the given record

Parameters:

Name	Description	Default
`record(dict)`	the data to add	required
`write(bool)`	if True actually write	`True`
`ignoreErrors(bool)`	if True ignore errors	`False`

Returns: WikidataResult: the result of the add operation

Source code in ceurws/wikidatasync.py

def doAddProceedingsToWikidata(
    self, record: dict, write: bool = True, ignoreErrors: bool = False
) -> WikidataResult:
    """
    Add a proceedings entry to wikidata for the given record

    The columns of the record are described by a fixed list of property
    mappings. The value of the "ceurwsUrl" column is wrapped in a
    UrlReference and handed to the add operation as reference.

    Args:
        record(dict): the column values of the proceedings to add
        write(bool): if True actually write
        ignoreErrors(bool): if True ignore errors
    Returns:
        WikidataResult: the result of the add operation
    """
    url_reference = UrlReference(url=record.get("ceurwsUrl"))
    property_mappings = [
        # constant statements: instance of / part of the series
        PropertyMapping(
            column="instanceof",
            propertyName="instanceof",
            propertyId="P31",
            propertyType=WdDatatype.itemid,
            value="Q1143604",
        ),
        PropertyMapping(
            column="part of the series",
            propertyName="part of the series",
            propertyId="P179",
            propertyType=WdDatatype.itemid,
            value="Q27230297",
        ),
        PropertyMapping(
            column="volume",
            propertyName="volume",
            propertyId="P478",
            propertyType=WdDatatype.string,
            qualifierOf="part of the series",
        ),  # ToDo: refactor qualifier of anchor column or property name?
        # statements taken from the record columns
        PropertyMapping(
            column="short name",
            propertyName="short name",
            propertyId="P1813",
            propertyType=WdDatatype.text,
        ),
        PropertyMapping(
            column="pubDate",
            propertyName="publication date",
            propertyId="P577",
            propertyType=WdDatatype.date,
        ),
        PropertyMapping(
            column="title",
            propertyName="title",
            propertyId="P1476",
            propertyType=WdDatatype.text,
        ),
        PropertyMapping(
            column="ceurwsUrl",
            propertyName="described at URL",
            propertyId="P973",
            propertyType=WdDatatype.url,
        ),
        PropertyMapping(
            column="language of work or name",
            propertyName="language of work or name",
            propertyId="P407",
            propertyType=WdDatatype.itemid,
            qualifierOf="ceurwsUrl",
        ),
        PropertyMapping(
            column="fullWorkUrl",
            propertyName="full work available at URL",
            propertyId="P953",
            propertyType=WdDatatype.url,
        ),
        PropertyMapping(
            column="urn",
            propertyName="URN-NBN",
            propertyId="P4109",
            propertyType=WdDatatype.extid,
        ),
    ]
    return self.wd.add_record(
        record=record,
        property_mappings=property_mappings,
        write=write,
        ignore_errors=ignoreErrors,
        reference=url_reference,
    )

`doCreateEventItemAndLinkProceedings(volume, proceedingsWikidataId=None, write=False)`

Create an event Wikidata item for the given volume and link the proceedings with the event. Args: volume: volume to create the event for; proceedingsWikidataId: Wikidata id of the proceedings of the event; write: if True actually write.

Returns:

Type	Description
`dict[str, WikidataResult]`	results keyed by step: "dblp" and "Event" are always set, "Proceedings" and "link" only when that step applies

Source code in ceurws/wikidatasync.py

def doCreateEventItemAndLinkProceedings(
    self,
    volume: Volume,
    proceedingsWikidataId: str | None = None,
    write: bool = False,
) -> dict[str, WikidataResult]:
    """
    Create event  wikidata item for given volume and link the proceedings with the event

    The steps are reported in the returned dict under these keys:
      - "Proceedings": only set when no proceedingsWikidataId was given and
        checkIfProceedingsFromExists reports a hit for the volume number
      - "dblp": the dblp entity id found for the volume (qid is None when
        none or more than one id was found)
      - "Event": the created, already existing or ambiguous event item
      - "link": only set when a new event item was created in this call

    Args:
        volume: volume to create the event for
        proceedingsWikidataId: proceedings wikidata id of the event
        write: If True actually write

    Returns:
        dict of WikidataResult entries keyed by step name (see above)
    """
    results = {}
    vol_number = volume.number
    # look up the proceedings item ourselves if the caller did not supply one
    if (
        proceedingsWikidataId is None
        and vol_number is not None
        and self.checkIfProceedingsFromExists(vol_number, eventItemQid=None)
    ):
        # link between proceedings and event already exists
        proceedingsWikidataId = self.getWikidataIdByVolumeNumber(number=vol_number)
        results["Proceedings"] = WikidataResult(
            qid=proceedingsWikidataId,
            msg=f"Proceedings for Vol-{vol_number} already exists",
        )
    # resolve the dblp entity id; only an unambiguous single hit is used
    dblpEntityIds = self.dblpEndpoint.getDblpIdByVolumeNumber(vol_number)
    dblpEntityId = None
    msg = None
    if len(dblpEntityIds) > 1:
        msg = f"Multiple dblpEventIds found for Vol-{vol_number}: {','.join(dblpEntityIds)}"
    elif len(dblpEntityIds) == 1:
        dblpEntityId = dblpEntityIds[0]
    else:
        dblpEntityId = None
    results["dblp"] = WikidataResult(qid=dblpEntityId, msg=msg)
    # dblpEntityId may still be None here
    wdItems = self.getWikidataIdByDblpEventId(dblpEntityId, vol_number)
    msg = ""
    eventQid = None
    if len(wdItems) == 0:
        # event item does not exist → create a new one
        volume.resolveLoctime()
        eventRecord = self.getWikidataEventRecord(volume)
        event_result = self.doAddEventToWikidata(record=eventRecord, write=write)
        eventQid = event_result.qid
        results["Event"] = event_result
    elif len(wdItems) == 1:
        # NOTE(review): eventQid stays None in this branch, so no link is
        # added for an already existing event item - confirm this is intended
        results["Event"] = WikidataResult(
            # the event item already exists
            qid=wdItems[0],
            msg="Event item already exists;",
        )
    else:
        results["Event"] = WikidataResult(msg=f"Multiple event entries exist: {','.join(wdItems)}")
    if eventQid is not None:
        # add link between Proceedings and the event item
        link_result = self.addLinkBetweenProceedingsAndEvent(
            volumeNumber=vol_number,
            eventItemQid=eventQid,
            proceedingsWikidataId=proceedingsWikidataId,
            write=write,
        )
        # the message returned by the link operation is replaced here
        link_result.msg = "Added Link between Proceedings and Event item;"
        results["link"] = link_result
    return results

`from_args(args)` `classmethod`

create a WikidataSync object from the given command line arguments

Parameters:

Name	Type	Description	Default
`args(Namespace)`		the command line arguments	required

Source code in ceurws/wikidatasync.py

@classmethod
def from_args(cls, args) -> "WikidataSync":
    """
    build a WikidataSync object from parsed command line arguments

    The attributes wikidata_endpoint_name, dblp_endpoint_name and debug
    are read from args and forwarded to from_endpoint_names.

    Args:
        args(Namespace): the command line arguments
    """
    return cls.from_endpoint_names(
        args.wikidata_endpoint_name,
        args.dblp_endpoint_name,
        debug=args.debug,
    )

`from_endpoint_names(wd_en, dblp_en, debug=False)` `classmethod`

create a WikidataSync object from the given endpoint names

Parameters:

Name	Type	Description	Default
`wd_en(str)`		wikidata endpoint name	required
`dblp_en(str)`		dblp endpoint name	required
`debug(bool)`		debug flag handed to the created WikidataSync	`False`

Source code in ceurws/wikidatasync.py

@classmethod
def from_endpoint_names(cls, wd_en: str, dblp_en: str, debug: bool = False) -> "WikidataSync":
    """
    build a WikidataSync object for the endpoints with the given names

    Both names are looked up in the endpoints provided by the
    EndpointManager; an unknown name raises an Exception.

    Args:
        wd_en(str): wikidata endpoint name
        dblp_en(str): dblp endpoint name
        debug(bool): debug flag handed to the constructor
    """
    known_endpoints = EndpointManager.getEndpoints()
    # validate the wikidata name first, then the dblp name
    if wd_en not in known_endpoints:
        raise Exception(f"invalid wikidata endpoint name {wd_en}\nsee sparqlquery -le ")
    if dblp_en not in known_endpoints:
        raise Exception(f"invalid dblp endpoint name {dblp_en}\nsee sparqlquery -le ")
    wikidata_endpoint = known_endpoints[wd_en]
    sync = cls(
        baseurl=wikidata_endpoint.endpoint,
        dblp_endpoint_url=known_endpoints[dblp_en].endpoint,
        debug=debug,
    )
    # keep the full endpoint configuration on the created object
    sync.wikidata_endpoint = wikidata_endpoint
    return sync

`getAuthorByIds(identifiers)`

Based on the given identifiers get potential author items the names of the identifiers must be according to DblpAuthorIdentifier Args: identifiers: known identifiers of the author

Source code in ceurws/wikidatasync.py

def getAuthorByIds(self, identifiers: dict) -> dict[str, str]:
    """
    Based on the given identifiers get potential author items
    the names of the identifiers must be according to DblpAuthorIdentifier

    Identifiers with a None or empty value are ignored. If no identifier
    leads to a query clause an empty dict is returned without querying,
    since the query would otherwise not be restricted to any person.

    Args:
        identifiers: known identifiers of the author

    Returns:
        dict mapping the item id (wikidata entity prefix removed) to the
        english label of the matching persons
    """
    if not identifiers:
        return {}
    usable = {
        id_name: id_value
        for id_name, id_value in identifiers.items()
        if id_value is not None and id_value != ""
    }
    if not usable:
        return {}
    id_map = DblpAuthorIdentifier.getAllAsMap()
    clauses = []
    for id_name, id_value in usable.items():
        id_query = None
        if id_name in id_map:
            id_query = DblpAuthorIdentifier.getWikidataIdQueryPart(id_name, id_value, "?person")
        elif id_name == "homepage":
            id_query = f"{{ ?person wdt:P856 <{id_value}>. }}"
        if id_query is not None:
            clauses.append(id_query)
    if not clauses:
        # without any clause the query below would select every labeled item
        return {}
    id_queries = "\nUNION\n".join(clauses)
    query = f"""SELECT DISTINCT ?person ?personLabel
                WHERE
                {{
                    {id_queries}
                    ?person rdfs:label ?personLabel. FILTER(lang(?personLabel)="en").
                }}"""
    qres = self.sparql.queryAsListOfDicts(query)
    res = {}
    for record in qres:
        if record is None or len(record) == 0:
            continue
        item_id = self.removeWdPrefix(record.get("person"))
        res[item_id] = record.get("personLabel")
    return res

`getEventNameFromTitle(title)` `classmethod`

Get the event name from the given proceedings title Args: title: title of the proceeding

Returns:

Type	Description
`str`	name of the event

Source code in ceurws/wikidatasync.py

@classmethod
def getEventNameFromTitle(cls, title: str) -> str:
    """
    Derive the event name from a proceedings title

    At most one known prefix (e.g. "Proceedings of the") and at most one
    known postfix (e.g. "Workshop Proceedings") are removed. The comparison
    is case-insensitive and the longest matching candidate is used.

    Args:
        title: title of the proceeding

    Returns:
        name of the event (None is passed through unchanged)
    """
    prefixes = [
        "Proceedings of the",
        "Proceedings of",
        "Joint Proceedings of the",
        "Joint Proceedings of",
        "Joint Proceedings",
        "Joint Proceeding of the",
        "Joint Proceeding of",
        "Selected Papers of the",
        "Selected Contributions of the",
        "Workshops Proceedings for the",
        "Supplementary Proceedings of the",
        "Short Paper Proceedings of",
        "Short Paper Proceedings of the",
        "Working Notes Proceedings of the",
        "Working Notes of",
        "Working Notes for",
        "Joint Workshop Proceedings of the",
        "Joint Workshop Proceedings of",
        "Workshop Proceedings from",
        "Workshop and Poster Proceedings of the",
        "Workshops Proceedings and Tutorials of the",
        "Extended Papers of the",
        "Short Papers Proceedings of the",
        "Short Papers Proceedings of",
        "Proceedings of the Selected Papers of the",
        "Proceedings of the Working Notes of",
        "Proceedings of the Doctoral Consortium Papers Presented at the",
        "Selected Contributions to the",
        "Selected and Revised Papers of",
        "Selected Papers of",
        "Up-and-Coming and Short Papers of the",
        "Academic Papers at",
        "Poster Track of the",
        "Actes de la",
        "Post-proceedings of the",
        "Late Breaking Papers of the",
        "Anais do",
        "Proceedings del",
        "Proceedings",
        "Gemeinsamer Tagungsband der",
        "Local Proceedings of the",
        "Local Proceedings and Materials of",
    ]
    postfixes = [
        "Workshop Proceedings",
        "Proceedings",
        "Conference Proceedings",
        "Workshops Proceedings",
        "Adjunct Proceedings",
        "Poster and Demo Proceedings",
        "(full papers)",
    ]
    if title is None:
        return title
    # try the longest candidates first so the most specific one is removed
    lowered = title.lower()
    leading = next(
        (candidate for candidate in sorted(prefixes, key=len, reverse=True) if lowered.startswith(candidate.lower())),
        None,
    )
    if leading is not None:
        title = title[len(leading) :].strip()
    lowered = title.lower()
    trailing = next(
        (candidate for candidate in sorted(postfixes, key=len, reverse=True) if lowered.endswith(candidate.lower())),
        None,
    )
    if trailing is not None:
        title = title[: -len(trailing)].strip(" .,")
    return title

`getEventTypeFromTitle(title)` `classmethod`

Extract the event type from the given title Assumption: lowest mentioned type is the correct one Args: title: title of the event

Returns:

Type	Description
`tuple[str \| None, str \| None]`	wikidata id and label of the event type

Source code in ceurws/wikidatasync.py

@classmethod
def getEventTypeFromTitle(cls, title: str) -> tuple[str | None, str | None]:
    """
    Extract the event type from the given title
    Assumption: lowest mentioned type is the correct one
    Args:
        title: title of the event

    Returns:
        wikidata id and label of the event type
    """
    if title is None or title == "":
        return None, None
    academicConference = ("Q2020153", "academic conference")
    academicWorkshop = ("Q40444998", "academic workshop")
    if "workshop" in title.lower():
        return academicWorkshop
    elif "conference" in title.lower() or "symposium" in title.lower():
        return academicConference
    else:
        return academicWorkshop

`getEventWdItemsByUrn(urn)`

queries the wikidata proceedings that have the given urn assigned to P4109 and returns the assigned event Args: urn: URN id to query for

Returns:

Type	Description
`list[str]`	List of corresponding wikidata item ids or empty list if no matching item is found

Source code in ceurws/wikidatasync.py

def getEventWdItemsByUrn(self, urn: str) -> list[str]:
    """
    query the events (P4745) of the wikidata proceedings that have the given urn assigned to P4109

    Args:
        urn: URN id to query for

    Returns:
        the "event" values of the query result as returned by the
        endpoint, or an empty list if no matching item is found
    """
    rows = self.sparql.queryAsListOfDicts(
        f"""SELECT ?event WHERE{{ ?proceeding wdt:P4109 "{urn}"; wdt:P4745 ?event .}}"""
    )
    events = []
    for row in rows:
        events.append(row.get("event"))
    return events

`getEventsOfProceedings(itemId)`

get the item ids of the events the given proceedings ids is the proceedings from Args: itemId: Qid of the proceedings

Returns:

Type	Description
`list[str]`	List of the events

Source code in ceurws/wikidatasync.py

def getEventsOfProceedings(self, itemId: str) -> list[str]:
    """
    get the item ids of the events (P4745) of the given proceedings item

    The length of the wikidata entity prefix is cut off the front of each
    returned event value.

    Args:
        itemId: Qid of the proceedings

    Returns:
        List of the events
    """
    prefix_length = len("http://www.wikidata.org/entity/")
    rows = self.sparql.queryAsListOfDicts(f"""SELECT ?event WHERE {{ wd:{itemId} wdt:P4745 ?event.}}""")
    event_ids = []
    for row in rows:
        event_ids.append(row.get("event")[prefix_length:])
    return event_ids

`getEventsOfProceedingsByVolnumber(volnumber)`

get the item ids of the events the given proceedings ids is the proceedings from Args: volnumber: Volume number of the proceedings

Returns:

Type	Description
`list[str]`	List of the events

Source code in ceurws/wikidatasync.py

def getEventsOfProceedingsByVolnumber(self, volnumber: int | str) -> list[str]:
    """
    get the item ids of the events the given proceedings ids is the proceedings from
    Args:
        volnumber: Volume number of the proceedings

    Returns:
        List of the events
    """
    query = f"""SELECT ?event 
                WHERE {{
                ?proceeding wdt:P31 wd:Q1143604; 
                            p:P179 [ps:P179 wd:Q27230297; pq:P478 "{volnumber}"]; 
                            wdt:P4745 ?event.}}
    """
    qres = self.sparql.queryAsListOfDicts(query)
    wdItems = [record.get("event")[len("http://www.wikidata.org/entity/") :] for record in qres]
    return wdItems

`getProceedingWdItemsByUrn(urn)`

queries the wikidata items that have the given urn for the property P4109 Args: urn: URN id to query for

Returns:

Type	Description
`list[str]`	List of corresponding wikidata item ids or empty list if no matching item is found

Source code in ceurws/wikidatasync.py

def getProceedingWdItemsByUrn(self, urn: str) -> list[str]:
    """
    query the wikidata items that have the given urn as value of the property P4109

    Args:
        urn: URN id to query for

    Returns:
        the "proceeding" values of the query result as returned by the
        endpoint, or an empty list if no matching item is found
    """
    rows = self.sparql.queryAsListOfDicts(f"""SELECT ?proceeding WHERE{{ ?proceeding wdt:P4109 "{urn}"}}""")
    proceedings = []
    for row in rows:
        proceedings.append(row.get("proceeding"))
    return proceedings

`getProceedingsForVolume(searchVolnumber)`

get the proceedings record for the given searchVolnumber

Parameters:

Name	Type	Description	Default
`searchVolnumber(int)`		the number of the volume to search	required

Returns:

Name	Type	Description
`dict`	`dict \| None`	the record for the proceedings in wikidata
`None`	`dict \| None`	if the proceeding record is not found for the given searchVolnumber

Source code in ceurws/wikidatasync.py

def getProceedingsForVolume(self, searchVolnumber: int) -> dict | None:
    """
    get the proceedings record for the given searchVolnumber

    Args:
        searchVolnumber(int): the number of the volume to search

    Returns:
        dict: the record for the proceedings in wikidata
        None: if the proceeding record in not found for the given searchVolnumber
    """
    if self.procRecords is None:
        self.loadProceedingsFromCache()
    if self.procsByVolnumber is None:
        self.procsByVolnumber: dict[int, dict] = {}
        if isinstance(self.procRecords, list):
            for procRecord in self.procRecords:
                volnumber = procRecord.get("sVolume", None)
                if volnumber is None:
                    procRecord.get("Volume", None)
                if volnumber is not None:
                    self.procsByVolnumber[int(volnumber)] = procRecord
    volProcRecord = self.procsByVolnumber.get(searchVolnumber, None)
    return volProcRecord

`getRecentlyAddedVolumeList()`

get the list of volumes that have recently been added we do not expect deletions

Returns:

Type	Description
`tuple[dict[int, dict], list[dict]]`	the lookup of the freshly parsed volumes by volume number, and the list of volume numbers recently added

Source code in ceurws/wikidatasync.py

def getRecentlyAddedVolumeList(self) -> tuple[dict[int, dict], list[int]]:
    """
    get the list of volumes that have recently been added
    we do not expect deletions

    My volume manager is (re)prepared and a second volume manager is
    loaded from the index html with a forced download. The volume numbers
    that only occur in the second one are the recently added ones.

    Returns:
        tuple: the lookup of the freshly loaded volumes by number and the
        list of volume numbers recently added (in no particular order)
    """
    self.prepareVolumeManager()
    refreshVm = VolumeManager()
    parser_config = ParserConfig()
    parser_config.force_download = True
    # NOTE(review): presumably limits the parse range to the volumes not yet known - confirm
    self.vm.set_down_to_volume(parser_config)
    refreshVm.loadFromIndexHtml(parser_config=parser_config)
    refreshVolumesByNumber, _duplicates = LOD.getLookup(refreshVm.getList(), "number")
    # set difference of the lookup keys: numbers known after the refresh but not before
    newVolumes = list(set(refreshVolumesByNumber) - set(self.volumesByNumber))
    return refreshVolumesByNumber, newVolumes

`getWikidataEventRecord(volume)`

get the wikidata Record for the given volume

Source code in ceurws/wikidatasync.py

def getWikidataEventRecord(self, volume: Volume):
    """
    build the wikidata event record for the given volume

    Event name and event type are derived from the volume title. Start
    and end time are given in ISO format. If dblp entity ids are found for
    the volume number the first one contributes the "describedAt",
    "language of work or name" and "dblpEventId" entries.

    Args:
        volume: the volume to create the event record for

    Returns:
        dict: the event record
    """
    volume_title = volume.title
    dblp_ids = self.dblpEndpoint.getDblpIdByVolumeNumber(number=volume.number)
    event_name = event_type_qid = event_type_label = None
    if volume_title:
        event_type_qid, event_type_label = self.getEventTypeFromTitle(volume_title)
        event_name = self.getEventNameFromTitle(volume_title)
    date_from = volume.dateFrom
    date_to = volume.dateTo
    event_record = {
        "title": event_name,
        "label": event_name,
        "description": event_type_label,
        "instanceOf": event_type_qid,
        "short name": volume.acronym,
        "locationWikidataId": volume.cityWikidataId,
        "countryWikidataId": volume.countryWikidataId,
        "start time": None if date_from is None else date_from.isoformat(),
        "end time": None if date_to is None else date_to.isoformat(),
        "referenceUrl": volume.getVolumeUrl(),
    }
    if dblp_ids is not None and len(dblp_ids) > 0:
        first_dblp_id = dblp_ids[0]
        event_record["describedAt"] = self.dblpEndpoint.toDblpUrl(first_dblp_id)
        event_record["language of work or name"] = "Q1860"
        event_record["dblpEventId"] = self.dblpEndpoint.convertEntityIdToUrlId(entityId=first_dblp_id)
    # the modeling of virtual events has changed in wikidata
    # virtual event (Q7935096) is discontinued for conferences,
    # therefore virtual events are no longer marked in "instanceOf"
    return event_record

`getWikidataIdByDblpEventId(entityId, volumeNumber=None)`

query wikidata for the qId of items that correspond to the given dblpEventId Args: entityId: id of a dblp event volumeNumber: volume number

Returns:

Type	Description
`list[str]`	list of matching wikidata items

Source code in ceurws/wikidatasync.py

def getWikidataIdByDblpEventId(self, entityId: str | None, volumeNumber: int | None = None) -> list[str]:
    """
    query wikidata for the qId of items that correspond to the given dblpEventId

    Candidates are academic conferences (Q2020153) or academic workshops
    (Q40444998) that either are described at (P973) one of the dblp urls of
    the entity or carry one of the dblp ids as DBLP event ID (P10692).
    If a volume number is given, the events (P4745) of the proceedings with
    that volume number are added as a further alternative.

    Args:
        entityId: id of a dblp event
        volumeNumber: volume number

    Returns:
        list of matching wikidata items (wikidata entity prefix removed)
    """
    # both the given id and its url form are used as candidate values
    dblpEventId = self.dblpEndpoint.convertEntityIdToUrlId(entityId=entityId)
    dblpIds = [entityId, dblpEventId]
    dblpIdsStr = " ".join([f'"{dblpId}"' for dblpId in dblpIds])
    # NOTE(review): for entityId None the ids are rendered as quoted "None"
    # strings (unless the converter maps None to something else) and the
    # ?url VALUES block stays empty - verify that the volume number
    # alternative can still match in that case
    urls = ""
    if entityId is not None:
        urls = " ".join(
            [
                f"<{self.dblpEndpoint.toDblpUrl(entityId)}>",
                f"<{self.dblpEndpoint.toDblpUrl(entityId, True)}>",
            ]
        )
    # optional third alternative: events linked from the proceedings of the volume
    volumeQuery = ""
    if volumeNumber is not None:
        volumeQuery = f"""
        UNION
              {{
              ?proceeding p:P179 [ps:P179 wd:Q27230297; pq:P478 "{volumeNumber}"].
              ?proceeding wdt:P4745 ?qid.
              }}
        """
    query = f"""SELECT DISTINCT ?qid
        WHERE{{
          VALUES ?url {{ {urls} }}
          VALUES ?dblpEventId {{ {dblpIdsStr} }}
          VALUES ?eventType {{wd:Q2020153 wd:Q40444998}}
          {{?qid wdt:P31 ?eventType; wdt:P973 ?url}}
          UNION
          {{?qid wdt:P31 ?eventType; wdt:P10692 ?dblpEventId}}
          {volumeQuery}
        }}
    """
    qres = self.sparql.queryAsListOfDicts(query)
    qIds = []
    if qres is not None and qres != []:
        qIds = [self.removeWdPrefix(record.get("qid")) for record in qres]
    return qIds

`getWikidataIdByVolumeNumber(number)`

query wikidata for the qId of the proceedings of the given volume number Args: number: volume number

Returns:

Name	Type	Description
`str`	`str \| None`	wikidata id corresponding to the given volume number
`None`	`str \| None`	if the corresponding wikidata id was not found

Source code in ceurws/wikidatasync.py

def getWikidataIdByVolumeNumber(self, number: int | None) -> str | None:
    """
    query wikidata for the qId of the proceedings of the given volume number
    Args:
        number: volume number

    Returns:
        str: wikidata id corresponding to the given volume number
        None: if the corresponding wikidata id was not found
    """
    if number is None:
        return None
    query = f"""SELECT * WHERE{{ ?proceeding p:P179 [ps:P179 wd:Q27230297; pq:P478 "{number}"].}}"""
    qres = self.sparql.queryAsListOfDicts(query)
    qid = None
    if qres is not None and qres != []:
        qids = [record.get("proceeding").split("/")[-1] for record in qres]
        if len(qids) > 1:
            print("CEUR-WS volume number is not unique")
        else:
            qid = qids[0]
    return qid

`getWikidataProceedingsRecord(volume)`

get the wikidata Record for the given volume

Source code in ceurws/wikidatasync.py

def getWikidataProceedingsRecord(self, volume):
    """
    build the wikidata proceedings record for the given volume

    Missing volume attributes result in None values. The acronym is
    embedded as is into the description, so a missing acronym leads to
    "Proceedings of None workshop". A datetime publication date is
    converted to its ISO format.

    Args:
        volume: the volume to create the proceedings record for

    Returns:
        dict: the proceedings record
    """
    title = getattr(volume, "title", None)
    acronym = getattr(volume, "acronym", None)
    url = getattr(volume, "url", None)
    pub_date = getattr(volume, "pubDate", None)
    if isinstance(pub_date, datetime.datetime):
        pub_date = pub_date.isoformat()
    return {
        "title": title,
        "label": title,
        "description": f"Proceedings of {acronym} workshop",
        "urn": getattr(volume, "urn", None),
        "short name": acronym,
        "volume": getattr(volume, "number", None),
        "pubDate": pub_date,
        "ceurwsUrl": url,
        "language of work or name": "Q1860",
        "fullWorkUrl": url,
    }

`hasItemPropertyValueFor(item, propertyId)`

ask wikidata if the given item has a value for the given property Args: item: item Qid propertyId: property Pid Returns: True if the item has the property else False

Source code in ceurws/wikidatasync.py

def hasItemPropertyValueFor(self, item, propertyId: str):
    """
    check via an ASK query whether the given item has a value for the given property

    Args:
        item: item Qid
        propertyId: property Pid
    Returns:
        the answer of askWikidata for the ASK query
    """
    ask_query = f"""ASK{{ wd:{item} wdt:{propertyId} ?value.}}"""
    answer = self.askWikidata(ask_query)
    return answer

`loadProceedingsFromCache()`

load the proceedings records from the cache

Source code in ceurws/wikidatasync.py

def loadProceedingsFromCache(self):
    """
    read all records of the Proceedings table via my sql database

    The records are kept in procRecords and also returned.
    """
    records = self.sqldb.query("SELECT * from Proceedings")
    self.procRecords = records
    return records

`preparePaperManager()`

prepare my paper Manager

Source code in ceurws/wikidatasync.py

def preparePaperManager(self):
    """
    create my paper manager and fill it from the store if it is cached

    If the paper manager is not cached a hint is printed to stderr.
    """
    self.pm = PaperManager()
    if not self.pm.isCached():
        print(
            "PaperManager not cached you might want to run ceur-ws --recreate",
            file=sys.stderr,
        )
        return
    self.pm.fromStore(cacheFile=CEURWS.CACHE_FILE)

`prepareVolumeManager()`

prepare my volume manager

Source code in ceurws/wikidatasync.py

def prepareVolumeManager(self):
    """
    create and load my volume manager and derive the volume lookups

    Sets volumesByNumber, volumeList, volumeCount and volumeOptions. The
    latter maps volume numbers to "Vol-<number>:<title>" labels, ordered by
    descending volume number.
    """
    self.vm = VolumeManager()
    self.vm.load()
    self.volumesByNumber, _duplicates = LOD.getLookup(self.vm.getList(), "number")
    self.volumeList = self.vm.getList()
    self.volumeCount = len(self.volumeList)
    # highest volume number first
    descending_volumes = (self.volumesByNumber[number] for number in sorted(self.volumesByNumber, reverse=True))
    self.volumeOptions = {volume.number: f"Vol-{volume.number}:{volume.title}" for volume in descending_volumes}

`removeWdPrefix(value)` `classmethod`

removes the wikidata entity prefix Args: value: wikidata entity url

Source code in ceurws/wikidatasync.py

@classmethod
def removeWdPrefix(cls, value: str):
    """
    strip the wikidata entity prefix from the given value

    Values that are not strings or do not start with the prefix are
    returned unchanged.

    Args:
        value: wikidata entity url
    """
    wd_prefix = "http://www.wikidata.org/entity/"
    if isinstance(value, str) and value.startswith(wd_prefix):
        return value[len(wd_prefix) :]
    return value

`storeVolumes()`

store my volumes

Source code in ceurws/wikidatasync.py

def storeVolumes(self):
    """
    store my volumes by delegating to the store method of my volume manager
    """
    volume_manager = self.vm
    volume_manager.store()

`update(withStore=True)`

update my table from the Wikidata Proceedings SPARQL query

Source code in ceurws/wikidatasync.py

def update(self, withStore: bool = True):
    """
    refresh my Proceedings table from the Wikidata proceedings SPARQL query

    The proceedings records are enriched with the event records of the
    "EventsByProceeding" query (matched by "item"). The Proceedings table
    is then recreated with "URN_NBN" as primary key.

    Args:
        withStore(bool): if True store the records (one per URN_NBN) in the table

    Returns:
        list: the proceedings records as queried and enriched
    """
    if self.debug:
        print(f"Querying proceedings from {self.baseurl} ...")
    # proceedings first, then the events belonging to them
    proceedings = self.sparql.queryAsListOfDicts(self.wdQuery.query)
    events = self.sparql.queryAsListOfDicts(self.qm.queriesByName["EventsByProceeding"].query)
    events_by_item, _duplicates = LOD.getLookup(events, "item")
    for proceedings_record in proceedings:
        item = proceedings_record.get("item")
        if item in events_by_item:
            # merge the event columns into the proceedings record
            proceedings_record.update(**events_by_item[item])
    # drop and recreate the table based on the queried records
    entityInfo = self.sqldb.createTable(
        proceedings,
        "Proceedings",
        "URN_NBN",
        True,
        True,
        sampleRecordCount=5000,
        failIfTooFew=False,
    )
    procsByURN, duplicates = LOD.getLookup(proceedings, "URN_NBN")
    if withStore:
        self.sqldb.store(procsByURN.values(), entityInfo, executeMany=True, fixNone=True)
    if self.debug:
        print(f"stored {len(procsByURN.values())} proceedings records")
    duplicate_count = len(duplicates)
    if duplicate_count > 0:
        print(f"found {len(duplicates)} duplicates URN entries")
        # only show the duplicates if there are just a few of them
        if duplicate_count < 10:
            print(duplicates)
    return proceedings

`workshop`

Created on 2020-11-12

@author: wf

`Workshop`

a single Workshop

Source code in ceurws/workshop.py

class Workshop:
    """
    a single Workshop
    """

    def __init__(self):
        """
        Constructor
        """

    @staticmethod
    def ofURI(uri):
        """
        create a Workshop from the XML document at the given uri

        Args:
            uri: the location of the XML document

        Returns:
            Workshop: a workshop with the parsed document in its wsdict attribute
        """
        # close the response as soon as the document has been read
        with urlopen(uri) as response:
            xml = response.read().decode()
        ws = Workshop()
        ws.wsdict = xmltodict.parse(xml)
        return ws

`init()`

Constructor

Source code in ceurws/workshop.py

def __init__(self):
    """
    Constructor

    Takes no arguments and sets no attributes; the body consists of this
    docstring only.
    """

pyCEURmake API Documentation

ceur_ws

Conference

getSamples() staticmethod

ConferenceManager

Editor

getSamples() staticmethod

EditorManager

Paper

__str__()

getSamples() staticmethod

PaperManager

Session

__init__(id, title, position, papers)

getSamples() staticmethod

SessionManager

Volume

papers property

sessions property writable

extractAndSetLocation(locationStr)

extractDates(dateStr, durationThreshold=11)

extractValuesFromVolumePage(timeout=3)

getSubmittingEditor()

getVolumeNumber()

getVolumeUrl()

getVolumeUrlOf(number) staticmethod

get_loctime()

isVirtualEvent()

normalize()

rankLocations(locationStr, locations) staticmethod

removePartsMatching(value, pattern, separator=',') staticmethod

resolveLoctime()

VolumeManager

getIndexHtml(force=False)

load()

loadFromBackup()

loadFromIndexHtml(parser_config=None, vol_limit=None)

recreate(parser_config)

update(parser_config)

update_or_recreate(parser_config)

ceur_ws_web_cmd

CeurWsCmd

__init__()

getArgParser(description, version_msg)

handle_args()

main(argv=None)

config

CEURWS

get_home_path() staticmethod

dblp

DblpAuthorIdentifier dataclass

all() classmethod

getAllAsMap() classmethod

getWikidataIdQueryPart(id_name, value, var) classmethod

DblpAuthors

load(force_query=False)

DblpEditors

load(force_query=False)

DblpEndpoint

__init__(endpoint, debug=False)

convertEntityIdToUrlId(entityId)

getDblpIdByVolumeNumber(number)

getDblpUrlByDblpId(entityId=None)

getEditorsOfVolume(number)

get_ceur_proceeding(volume_number)

get_ceur_volume_papers(volume_number)

get_lod(cache_name, query_name, force_query=False)

load_all(force_query=False)

toDblpUrl(entityId, withPostfix=False)

DblpManager

__init__(endpoint, cache_name, query_name)

load(force_query=False)

DblpPapers

load(force_query=False)

DblpVolumes

load(force_query=False)

indexparser

IndexHtmlParser

__init__(htmlText, config=None)

find(startLine, compiledPattern, step=1)

`ceur_ws`

`Conference`

`getSamples()` `staticmethod`

`ConferenceManager`

`Editor`

`getSamples()` `staticmethod`

`EditorManager`

`Paper`

`__str__()`

`getSamples()` `staticmethod`

`PaperManager`

`Session`

`__init__(id, title, position, papers)`

`getSamples()` `staticmethod`

`SessionManager`

`Volume`

`papers` `property`

`sessions` `property` `writable`

`extractAndSetLocation(locationStr)`

`extractDates(dateStr, durationThreshold=11)`

`extractValuesFromVolumePage(timeout=3)`

`getSubmittingEditor()`

`getVolumeNumber()`

`getVolumeUrl()`

`getVolumeUrlOf(number)` `staticmethod`

`get_loctime()`

`isVirtualEvent()`

`normalize()`

`rankLocations(locationStr, locations)` `staticmethod`

`removePartsMatching(value, pattern, separator=',')` `staticmethod`

`resolveLoctime()`

`VolumeManager`

`getIndexHtml(force=False)`

`load()`

`loadFromBackup()`

`loadFromIndexHtml(parser_config=None, vol_limit=None)`

`recreate(parser_config)`

`update(parser_config)`

`update_or_recreate(parser_config)`

`ceur_ws_web_cmd`

`CeurWsCmd`

`__init__()`

`getArgParser(description, version_msg)`

`handle_args()`

`main(argv=None)`

`config`

`CEURWS`

`get_home_path()` `staticmethod`

`dblp`

`DblpAuthorIdentifier` `dataclass`

`all()` `classmethod`

`getAllAsMap()` `classmethod`

`getWikidataIdQueryPart(id_name, value, var)` `classmethod`

`DblpAuthors`

`load(force_query=False)`

`DblpEditors`

`load(force_query=False)`

`DblpEndpoint`

`__init__(endpoint, debug=False)`

`convertEntityIdToUrlId(entityId)`

`getDblpIdByVolumeNumber(number)`

`getDblpUrlByDblpId(entityId=None)`

`getEditorsOfVolume(number)`

`get_ceur_proceeding(volume_number)`

`get_ceur_volume_papers(volume_number)`

`get_lod(cache_name, query_name, force_query=False)`

`load_all(force_query=False)`

`toDblpUrl(entityId, withPostfix=False)`

`DblpManager`

`__init__(endpoint, cache_name, query_name)`

`load(force_query=False)`

`DblpPapers`

`load(force_query=False)`

`DblpVolumes`

`load(force_query=False)`

`indexparser`

`IndexHtmlParser`

`__init__(htmlText, config=None)`

`find(startLine, compiledPattern, step=1)`

`findVolume(volCount, startLine, expectedTr=3, progress=10)`