scan2wiki API Documentation

`amazon`

Created on 2023-11-16

@author: wf

`Amazon`

lookup products on amazon web site

Source code in scan/amazon.py

class Amazon:
    """
    lookup products on amazon web site

    The search result page of amazon.de is requested with a randomized
    User-Agent header and the product rows found in the returned HTML are
    converted to Product objects.
    """

    def __init__(self, debug: Optional[bool] = False):
        """
        constructor

        Args:
            debug (bool, optional): If set to True, pretty-prints the first product div for debugging.
        """
        self.debug = debug

    def extract_amazon_products(self, soup: BeautifulSoup) -> List[Product]:
        """
        Extracts product information from Amazon product listing HTML content.

        Only rows for which a title could be found are returned. Image url,
        price and ASIN are set to an empty string when they can not be found.

        Args:
            soup (BeautifulSoup): Soup object of HTML content of the Amazon product listing page.

        Returns:
            List[Product]: A list of extracted product information as Product objects.
        """
        products = []
        # Find all div elements that match the product listing structure
        for index, div in enumerate(soup.find_all("div", class_="puisg-row")):
            product_info = {}

            # Pretty-print the first product div if debug is True
            if self.debug and index == 0:
                print("Debug - First Product Div:")
                print(div.prettify())

            # Extracting product title
            title_div = div.find("h2", class_="a-size-mini")
            if title_div and title_div.a:
                product_info["title"] = title_div.a.get_text(strip=True)

            # Extracting product image URL and ASIN
            image_div = div.find("div", class_="s-product-image-container")
            if image_div and image_div.a:
                # the container may hold a link but no <img>; subscripting
                # None would abort the whole extraction with a TypeError
                img = image_div.img
                if img is not None:
                    product_info["image_url"] = img.get("src", "")
                link = image_div.a.get("href", "")
                # only links of the form .../dp/<ASIN>/... carry an ASIN;
                # a query string directly after the ASIN is cut off
                if "/dp/" in link:
                    asin = link.split("/dp/")[-1].split("/")[0].split("?")[0]
                    product_info["asin"] = asin

            # Extracting product price
            price_span = div.find("span", class_="a-price")
            offscreen = (
                price_span.find("span", class_="a-offscreen") if price_span else None
            )
            if offscreen:
                # Replace '\xa0€' with ' €' in price
                product_info["price"] = offscreen.get_text(strip=True).replace(
                    "\xa0", " "
                )

            # Create a Product instance if title is present
            if "title" in product_info:
                product = Product(
                    title=product_info["title"],
                    image_url=product_info.get("image_url", ""),
                    price=product_info.get("price", ""),
                    asin=product_info.get("asin", ""),
                )
                products.append(product)

        return products

    def get_headers(self) -> dict:
        """
        create the HTTP headers for a request with a randomly composed User-Agent

        Returns:
            dict: a dict with the single key "User-Agent"
        """
        # Possible components of a user agent string
        browsers = ["Chrome", "Firefox", "Safari", "Edge"]
        operating_systems = [
            "Windows NT 10.0; Win64; x64",
            "Macintosh; Intel Mac OS X 10_15_7",
            "X11; Linux x86_64",
        ]
        platforms = [
            "AppleWebKit/537.36 (KHTML, like Gecko)",
            "Gecko/20100101 Firefox/76.0",
            "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1 Safari/605.1.15",
        ]

        # Randomly select one component from each category
        # (os_name instead of os to avoid shadowing the standard os module name)
        browser = random.choice(browsers)
        os_name = random.choice(operating_systems)
        platform = random.choice(platforms)

        # Construct the user agent string - the version suffix is fixed
        user_agent = f"Mozilla/5.0 ({os_name}) {platform} {browser}/58.0.3029.110"

        return {"User-Agent": user_agent}

    def lookup_products(self, search_key: str, timeout: float = 30.0):
        """
        lookup the given search key e.g. ISBN or EAN

        Args:
            search_key (str): the term to search for on amazon.de
            timeout (float): seconds to wait for the server before giving up;
                without a timeout the request may block indefinitely

        Returns:
            List[Product]: the products extracted from the search result page

        Raises:
            Exception: if the response status code is not 200
        """
        headers = self.get_headers()
        # let requests build the query string so that blanks, "&", "#" etc.
        # in the search key are percent-encoded instead of corrupting the url
        response = requests.get(
            "https://www.amazon.de/s",
            params={"k": search_key},
            headers=headers,
            timeout=timeout,
        )
        if response.status_code != 200:
            msg = f"lookup for {search_key} failed with HTML status code {response.status_code}"
            raise Exception(msg)
        soup = BeautifulSoup(response.content, "html.parser")
        return self.extract_amazon_products(soup)

`__init__(debug=False)`

constructor

Parameters:

Name	Type	Description	Default
`debug`	`bool`	If set to True, pretty-prints the first product div for debugging.	`False`

Source code in scan/amazon.py

def __init__(self, debug: Optional[bool] = False):
    """
    constructor

    Args:
        debug (bool, optional): If set to True, pretty-prints the first product div for debugging.
            Defaults to False.
    """
    # the flag is only stored here
    self.debug = debug

`extract_amazon_products(soup)`

Extracts product information from Amazon product listing HTML content.

Parameters:

Name	Type	Description	Default
`soup`	`BeautifulSoup`	Soup object of HTML content of the Amazon product listing page.	required

Returns:

Type	Description
`List[Product]`	A list of extracted product information as Product objects.

Source code in scan/amazon.py

def extract_amazon_products(self, soup: BeautifulSoup) -> List[Product]:
    """
    Extracts product information from Amazon product listing HTML content.

    Only rows for which a title could be found are returned. Image url,
    price and ASIN are set to an empty string when they can not be found.

    Args:
        soup (BeautifulSoup): Soup object of HTML content of the Amazon product listing page.

    Returns:
        List[Product]: A list of extracted product information as Product objects.
    """
    products = []
    # Find all div elements that match the product listing structure
    for index, div in enumerate(soup.find_all("div", class_="puisg-row")):
        product_info = {}

        # Pretty-print the first product div if debug is True
        if self.debug and index == 0:
            print("Debug - First Product Div:")
            print(div.prettify())

        # Extracting product title
        title_div = div.find("h2", class_="a-size-mini")
        if title_div and title_div.a:
            product_info["title"] = title_div.a.get_text(strip=True)

        # Extracting product image URL and ASIN
        image_div = div.find("div", class_="s-product-image-container")
        if image_div and image_div.a:
            # the container may hold a link but no <img>; subscripting
            # None would abort the whole extraction with a TypeError
            img = image_div.img
            if img is not None:
                product_info["image_url"] = img.get("src", "")
            link = image_div.a.get("href", "")
            # only links of the form .../dp/<ASIN>/... carry an ASIN;
            # a query string directly after the ASIN is cut off
            if "/dp/" in link:
                asin = link.split("/dp/")[-1].split("/")[0].split("?")[0]
                product_info["asin"] = asin

        # Extracting product price
        price_span = div.find("span", class_="a-price")
        offscreen = (
            price_span.find("span", class_="a-offscreen") if price_span else None
        )
        if offscreen:
            # Replace '\xa0€' with ' €' in price
            product_info["price"] = offscreen.get_text(strip=True).replace(
                "\xa0", " "
            )

        # Create a Product instance if title is present
        if "title" in product_info:
            product = Product(
                title=product_info["title"],
                image_url=product_info.get("image_url", ""),
                price=product_info.get("price", ""),
                asin=product_info.get("asin", ""),
            )
            products.append(product)

    return products

`lookup_products(search_key)`

lookup the given search key e.g. ISBN or EAN

Source code in scan/amazon.py

def lookup_products(self, search_key: str, timeout: float = 30.0):
    """
    lookup the given search key e.g. ISBN or EAN

    Args:
        search_key (str): the term to search for on amazon.de
        timeout (float): seconds to wait for the server before giving up;
            without a timeout the request may block indefinitely

    Returns:
        the list of products returned by extract_amazon_products for the
        search result page

    Raises:
        Exception: if the response status code is not 200
    """
    headers = self.get_headers()
    # let requests build the query string so that blanks, "&", "#" etc.
    # in the search key are percent-encoded instead of corrupting the url
    response = requests.get(
        "https://www.amazon.de/s",
        params={"k": search_key},
        headers=headers,
        timeout=timeout,
    )
    if response.status_code != 200:
        msg = f"lookup for {search_key} failed with HTML status code {response.status_code}"
        raise Exception(msg)
    soup = BeautifulSoup(response.content, "html.parser")
    return self.extract_amazon_products(soup)

`barcode`

Created on 2023-11-16

@author: wf

`Barcode` `dataclass`

Barcode data structure with static methods, e.g. a pyzbar barcode decoder wrapper

Source code in scan/barcode.py

@dataclass
class Barcode:
    """
    Barcode data structure with
    static methods e.g. pyzbar barcode decoder wrapper
    """

    # decoded barcode content (the raw bytes decoded as utf-8)
    code: str
    # barcode type as reported by the decoder
    type: str
    # orientation as reported by the decoder
    orientation: str
    # bounding rectangle converted to a dict
    rect: Optional[dict] = None
    # polygon points, each converted to a dict
    polygon: Optional[List[dict]] = None
    # quality as reported by the decoder
    quality: Optional[int] = None

    @staticmethod
    def decode(image_file_path: str, debug: bool = False) -> List["Barcode"]:
        """
        Decodes barcodes from the image at the given file path.

        Args:
            image_file_path (str): The file path of the image to decode.
            debug (bool): If False, suppress debug information of the PIL library. Default is False.

        Returns:
            list[Barcode]: A list of Barcode objects, or an empty list if no barcodes are found.
        """
        if not debug:
            # Suppress debug messages
            logging.getLogger("PIL").setLevel(logging.INFO)
        # Open the saved image - the context manager makes sure the
        # underlying file is closed again once decoding is done
        with Image.open(image_file_path) as image:
            # inside this method "decode" is the module level decoder
            # function, not this static method
            barcodes = decode(image)
        return [
            Barcode(
                code=barcode.data.decode("utf-8"),
                type=barcode.type,
                rect=barcode.rect._asdict(),
                polygon=[point._asdict() for point in barcode.polygon],
                quality=barcode.quality,
                orientation=barcode.orientation,
            )
            for barcode in barcodes
        ]

`decode(image_file_path, debug=False)` `staticmethod`

Decodes barcodes from the image at the given file path.

Parameters:

Name	Type	Description	Default
`image_file_path`	`str`	The file path of the image to decode.	required
`debug`	`bool`	If False, suppress debug information of the PIL library. Default is False.	`False`

Returns:

Type	Description
`list[Barcode]`	A list of Barcode objects, or an empty list if no barcodes are found.

Source code in scan/barcode.py

@staticmethod
def decode(image_file_path: str, debug: bool = False) -> List["Barcode"]:
    """
    Decodes barcodes from the image at the given file path.

    Args:
        image_file_path (str): The file path of the image to decode.
        debug (bool): If False, suppress debug information of the PIL library. Default is False.

    Returns:
        list[Barcode]: A list of Barcode objects, or an empty list if no barcodes are found.
    """
    if not debug:
        # Suppress debug messages
        logging.getLogger("PIL").setLevel(logging.INFO)
    # Open the saved image - the context manager makes sure the
    # underlying file is closed again once decoding is done
    with Image.open(image_file_path) as image:
        # Decode barcodes
        barcodes = decode(image)
    return [
        Barcode(
            code=barcode.data.decode("utf-8"),
            type=barcode.type,
            rect=barcode.rect._asdict(),
            polygon=[point._asdict() for point in barcode.polygon],
            quality=barcode.quality,
            orientation=barcode.orientation,
        )
        for barcode in barcodes
    ]

`dms`

Created on 2021-10-21

@author: wf

see http://diagrams.bitplan.com/render/png/0xe1f1d160.png see http://diagrams.bitplan.com/render/txt/0xe1f1d160.txt

`Archive`

Bases: JSONAble

an Archive might be a filesystem on a server or a (semantic) mediawiki

Source code in scan/dms.py

class Archive(JSONAble):
    """
    an Archive might be a filesystem
    on a server or a (semantic) mediawiki

    The attributes read by the methods of this class are ``name``,
    ``server``, ``url`` and - for wiki based archives - ``wikiid``.
    """

    def __init__(self):
        """
        Constructor

        does not set any attributes itself
        """

    @classmethod
    def getSamples(cls):
        """
        get sample records for archives

        Returns:
            list: a list of two dicts with the keys server, name, url,
                wikiid, folderCount and documentCount
        """
        samplesLOD = [
            {
                "server": "wiki.bitplan.com",
                "name": "wiki",
                "url": "http://wiki.bitplan.com",
                "wikiid": "wiki",
                "folderCount": 0,
                "documentCount": 0,
            },
            {
                "server": "media.bitplan.com",
                "name": "media",
                "url": "http://media.bitplan.com",
                "wikiid": "media",
                "folderCount": 9,
                "documentCount": 551,
            },
        ]
        return samplesLOD

    def normalizePageTitle(self, pageTitle):
        """
        normalize the given pageTitle by replacing each blank with an underscore

        Args:
            pageTitle(str): the page title to normalize

        Returns:
            str: the page title with blanks replaced by underscores
        """
        nPageTitle = pageTitle.replace(" ", "_")
        return nPageTitle

    def getFoldersAndDocuments(self, withOcr=False):
        """
        get the folders and documents of this archive

        If this archive has a ``wikiid`` that is not None the documents are
        taken from the result of a semantic mediawiki ask query for
        ``Category:OCRDocument`` and grouped into folders by their first
        category. Otherwise the directory tree derived from ``self.url``
        is walked with os.walk.

        Args:
            withOcr(bool): passed on to Folder.getDocuments in the
                filesystem case; not used in the wiki case

        Return:
            tuple: the dict of folders by path and the list of documents
        """
        foldersByPath = {}
        documentList = []
        # this archive is pointing to a wiki
        if hasattr(self, "wikiid") and self.wikiid is not None:
            smw = Wiki.getSMW(self.wikiid)
            # the query is run twice: first with "|format=count" and then
            # without any extra option - only the second result is evaluated
            for option in ["|format=count", ""]:
                askQuery = (
                    """{{#ask: [[Category:OCRDocument]]  
| mainlabel=page
| ?Category
| ?Modification date=lastModified
| ?Creation date=created
|limit=1000
%s
}}"""
                    % option
                )
                # NOTE(review): the query is printed unconditionally
                print(askQuery)
                result = smw.query(askQuery)
                baseUrl = f"{smw.site.scheme}://{smw.site.host}{smw.site.path}index.php"
                if option == "":
                    # number of documents per folder
                    folderCounter = Counter()
                    # earliest creation / latest modification per folder
                    folderCreated = {}
                    folderLastModified = {}
                    for record in result.values():
                        page = record["page"]
                        # the category column is looked up as "Kategorie" first
                        # (presumably for German language wikis - TODO confirm)
                        if "Kategorie" in record:
                            catname = "Kategorie"
                            categories = record["Kategorie"]
                        else:
                            catname = "Category"
                            categories = record["Category"]
                        doc = Document()
                        doc.archiveName = self.name
                        # only the first category decides on the folder
                        if isinstance(categories, list):
                            firstCategory = categories[0]
                        else:
                            firstCategory = categories
                        # strip the namespace prefix e.g. "Category:"
                        doc.folderPath = firstCategory.replace(f"{catname}:", "")
                        # print(f"{firstCategory}->{doc.folderPath}")
                        doc.lastModified = record["lastModified"]
                        doc.created = record["created"]
                        folderCounter[doc.folderPath] += 1
                        # track the minimum creation date of the folder's documents
                        if doc.created:
                            if doc.folderPath in folderCreated:
                                folderCreated[doc.folderPath] = min(
                                    doc.created, folderCreated[doc.folderPath]
                                )
                            else:
                                folderCreated[doc.folderPath] = doc.created
                        # track the maximum modification date of the folder's documents
                        if doc.lastModified:
                            if doc.folderPath in folderLastModified:
                                folderLastModified[doc.folderPath] = max(
                                    doc.lastModified, folderLastModified[doc.folderPath]
                                )
                            else:
                                folderLastModified[doc.folderPath] = doc.lastModified

                        doc.name = page
                        doc.url = f"{baseUrl}/{self.normalizePageTitle(page)}"
                        documentList.append(doc)
                    # collect folders - one per category, most frequent first
                    for folderName, count in folderCounter.most_common():
                        folder = Folder()
                        folder.archiveName = self.name
                        folder.name = folderName
                        folder.path = folderName
                        if folderName in folderLastModified:
                            folder.lastModified = folderLastModified[folderName]
                        if folderName in folderCreated:
                            folder.created = folderCreated[folderName]
                        folder.url = f"{baseUrl}/Category:{folderName}"
                        folder.fileCount = count
                        foldersByPath[folderName] = folder
                        pass
        else:
            # this archive is pointing to a folder
            # remove the "http://<server>" prefix from the url to get the folder path
            # NOTE(review): self.server is used as a regular expression
            # without escaping, so "." matches any character
            pattern = rf"http://{self.server}"
            folderPath = re.sub(pattern, "", self.url)
            basePath = Folder.getFullpath(folderPath)
            for root, dirs, files in os.walk(basePath):
                relbase = Folder.getRelpath(root)
                # loop over all directories
                for dirname in dirs:
                    # directories with a leading dot are ignored
                    if not dirname.startswith("."):
                        folder = Folder()
                        folder.archive = self
                        fullpath = os.path.join(root, dirname)
                        folder.path = os.path.join(relbase, dirname)
                        folder.archiveName = self.name
                        folder.url = f"http://{self.server}{folder.path}"
                        folder.name = dirname
                        # files in folder ...
                        pdfFiles = folder.getFiles()
                        folder.fileCount = len(pdfFiles)
                        folder.lastModified = DMSStorage.getDatetime(fullpath)
                        # the modification time is also used as the creation time
                        folder.created = folder.lastModified
                        folderDocuments = folder.getDocuments(pdfFiles, withOcr=withOcr)
                        # add the results
                        documentList.extend(folderDocuments)
                        foldersByPath[folder.path] = folder
            pass
        return foldersByPath, documentList

`__init__()`

Constructor

Source code in scan/dms.py

def __init__(self):
    """
    Constructor

    does not set any attributes itself
    """

`getFoldersAndDocuments(withOcr=False)`

get the folders of this archive

Returns:

a tuple consisting of the dict of folders by path and the list of documents

Source code in scan/dms.py

    def getFoldersAndDocuments(self, withOcr=False):
        """
        get the folders and documents of this archive

        If this archive has a ``wikiid`` that is not None the documents are
        taken from the result of a semantic mediawiki ask query for
        ``Category:OCRDocument`` and grouped into folders by their first
        category. Otherwise the directory tree derived from ``self.url``
        is walked with os.walk.

        Args:
            withOcr(bool): passed on to Folder.getDocuments in the
                filesystem case; not used in the wiki case

        Return:
            tuple: the dict of folders by path and the list of documents
        """
        foldersByPath = {}
        documentList = []
        # this archive is pointing to a wiki
        if hasattr(self, "wikiid") and self.wikiid is not None:
            smw = Wiki.getSMW(self.wikiid)
            # the query is run twice: first with "|format=count" and then
            # without any extra option - only the second result is evaluated
            for option in ["|format=count", ""]:
                askQuery = (
                    """{{#ask: [[Category:OCRDocument]]  
| mainlabel=page
| ?Category
| ?Modification date=lastModified
| ?Creation date=created
|limit=1000
%s
}}"""
                    % option
                )
                # NOTE(review): the query is printed unconditionally
                print(askQuery)
                result = smw.query(askQuery)
                baseUrl = f"{smw.site.scheme}://{smw.site.host}{smw.site.path}index.php"
                if option == "":
                    # number of documents per folder
                    folderCounter = Counter()
                    # earliest creation / latest modification per folder
                    folderCreated = {}
                    folderLastModified = {}
                    for record in result.values():
                        page = record["page"]
                        # the category column is looked up as "Kategorie" first
                        # (presumably for German language wikis - TODO confirm)
                        if "Kategorie" in record:
                            catname = "Kategorie"
                            categories = record["Kategorie"]
                        else:
                            catname = "Category"
                            categories = record["Category"]
                        doc = Document()
                        doc.archiveName = self.name
                        # only the first category decides on the folder
                        if isinstance(categories, list):
                            firstCategory = categories[0]
                        else:
                            firstCategory = categories
                        # strip the namespace prefix e.g. "Category:"
                        doc.folderPath = firstCategory.replace(f"{catname}:", "")
                        # print(f"{firstCategory}->{doc.folderPath}")
                        doc.lastModified = record["lastModified"]
                        doc.created = record["created"]
                        folderCounter[doc.folderPath] += 1
                        # track the minimum creation date of the folder's documents
                        if doc.created:
                            if doc.folderPath in folderCreated:
                                folderCreated[doc.folderPath] = min(
                                    doc.created, folderCreated[doc.folderPath]
                                )
                            else:
                                folderCreated[doc.folderPath] = doc.created
                        # track the maximum modification date of the folder's documents
                        if doc.lastModified:
                            if doc.folderPath in folderLastModified:
                                folderLastModified[doc.folderPath] = max(
                                    doc.lastModified, folderLastModified[doc.folderPath]
                                )
                            else:
                                folderLastModified[doc.folderPath] = doc.lastModified

                        doc.name = page
                        doc.url = f"{baseUrl}/{self.normalizePageTitle(page)}"
                        documentList.append(doc)
                    # collect folders - one per category, most frequent first
                    for folderName, count in folderCounter.most_common():
                        folder = Folder()
                        folder.archiveName = self.name
                        folder.name = folderName
                        folder.path = folderName
                        if folderName in folderLastModified:
                            folder.lastModified = folderLastModified[folderName]
                        if folderName in folderCreated:
                            folder.created = folderCreated[folderName]
                        folder.url = f"{baseUrl}/Category:{folderName}"
                        folder.fileCount = count
                        foldersByPath[folderName] = folder
                        pass
        else:
            # this archive is pointing to a folder
            # remove the "http://<server>" prefix from the url to get the folder path
            # NOTE(review): self.server is used as a regular expression
            # without escaping, so "." matches any character
            pattern = rf"http://{self.server}"
            folderPath = re.sub(pattern, "", self.url)
            basePath = Folder.getFullpath(folderPath)
            for root, dirs, files in os.walk(basePath):
                relbase = Folder.getRelpath(root)
                # loop over all directories
                for dirname in dirs:
                    # directories with a leading dot are ignored
                    if not dirname.startswith("."):
                        folder = Folder()
                        folder.archive = self
                        fullpath = os.path.join(root, dirname)
                        folder.path = os.path.join(relbase, dirname)
                        folder.archiveName = self.name
                        folder.url = f"http://{self.server}{folder.path}"
                        folder.name = dirname
                        # files in folder ...
                        pdfFiles = folder.getFiles()
                        folder.fileCount = len(pdfFiles)
                        folder.lastModified = DMSStorage.getDatetime(fullpath)
                        # the modification time is also used as the creation time
                        folder.created = folder.lastModified
                        folderDocuments = folder.getDocuments(pdfFiles, withOcr=withOcr)
                        # add the results
                        documentList.extend(folderDocuments)
                        foldersByPath[folder.path] = folder
            pass
        return foldersByPath, documentList

`normalizePageTitle(pageTitle)`

normalize the given pageTitle

Source code in scan/dms.py

def normalizePageTitle(self, pageTitle):
    """
    normalize the given pageTitle

    Every blank of the title is exchanged for an underscore.

    Args:
        pageTitle(str): the title to normalize

    Returns:
        str: the title with underscores in place of blanks
    """
    # cutting at each single blank and gluing the pieces back together
    # with "_" yields one underscore per blank
    return "_".join(pageTitle.split(" "))

`ArchiveManager`

Bases: EntityManager

manager for Archives

Source code in scan/dms.py

class ArchiveManager(EntityManager):
    """
    manager for Archives
    """

    def __init__(self, mode="sql", debug=False):
        """
        constructor

        Args:
            mode(str): the storage mode handed to DMSStorage.getStorageConfig
            debug(bool): True if debugging should be switched on
        """
        pluralName = "archives"
        storageConfig = DMSStorage.getStorageConfig(mode=mode, debug=debug)
        # the arguments are passed by position - the trailing comments
        # name the role of each value
        super().__init__(
            "archive",  # name
            "Archive",  # entity name
            pluralName,  # entity plural name
            pluralName,  # list name
            Archive,  # entity class
            "archive",  # table name
            "url",  # primary key
            storageConfig,
            True,  # handle invalid list types
            True,  # filter invalid list types
            debug,
        )

    @staticmethod
    def getInstance(mode=None):
        """
        get an ArchiveManager

        Args:
            mode(str): if given a manager for just this mode is created;
                if None an sql based manager is created which is filled
                and stored from the json cache when it is not cached yet

        Returns:
            ArchiveManager: the archive manager
        """
        if mode is not None:
            return ArchiveManager(mode)
        sqlManager = ArchiveManager(mode="sql")
        if not sqlManager.isCached():
            # first use: take over the archives from the json cache
            jsonManager = ArchiveManager(mode="json")
            jsonManager.fromCache()
            sqlManager.archives = jsonManager.archives
            sqlManager.store()
        DMSStorage.fromCache(sqlManager)
        return sqlManager

    @staticmethod
    def addFilesAndFoldersForArchive(
        archive=None, withOcr=False, store=False, debug=True
    ):
        """
        add Files and folder for the given Archive

        Args:
            archive(Archive): the archive to add files and folder for
            withOcr(bool): passed on to the archive's getFoldersAndDocuments call
            store(bool): True if the result should be stored in the storage
            debug(bool): True if debugging messages should be displayed
        """
        if archive is None:
            return
        startMsg = f"getting folders for {archive.name}"
        if debug:
            print(startMsg)
        folderMap, documents = archive.getFoldersAndDocuments(withOcr=withOcr)
        foundMsg = f"found {len(folderMap)} folders in {archive.name}"
        folderList = list(folderMap.values())
        if debug:
            print(foundMsg)
        if not store:
            return
        if folderList:
            folderManager = FolderManager(mode="sql")
            folderManager.folders = folderList
            folderManager.store(append=True, replace=True)
        if len(documents) > 0:
            documentManager = DocumentManager(mode="sql")
            documentManager.documents = documents
            documentManager.store(append=True, replace=True)

`__init__(mode='sql', debug=False)`

constructor

Source code in scan/dms.py

def __init__(self, mode="sql", debug=False):
    """
    constructor

    Args:
        mode(str): the storage mode handed to DMSStorage.getStorageConfig
        debug(bool): True if debugging should be switched on
    """
    pluralName = "archives"
    storageConfig = DMSStorage.getStorageConfig(mode=mode, debug=debug)
    # the arguments are passed by position - the trailing comments
    # name the role of each value
    super().__init__(
        "archive",  # name
        "Archive",  # entity name
        pluralName,  # entity plural name
        pluralName,  # list name
        Archive,  # entity class
        "archive",  # table name
        "url",  # primary key
        storageConfig,
        True,  # handle invalid list types
        True,  # filter invalid list types
        debug,
    )

`addFilesAndFoldersForArchive(archive=None, withOcr=False, store=False, debug=True)` `staticmethod`

add Files and folder for the given Archive

Parameters:

Name	Type	Description	Default
`archive`	`Archive`	the archive to add files and folder for	`None`
`withOcr`	`bool`	passed on to the archive's getFoldersAndDocuments call	`False`
`store`	`bool`	True if the result should be stored in the storage	`False`
`debug`	`bool`	True if debugging messages should be displayed	`True`

Source code in scan/dms.py

@staticmethod
def addFilesAndFoldersForArchive(
    archive=None, withOcr=False, store=False, debug=True
):
    """
    add Files and folder for the given Archive

    Args:
        archive(Archive): the archive to add files and folder for
        store(bool): True if the result should be stored in the storage
        debug(bool): True if debugging messages should be displayed
    """
    if archive is None:
        return
    folders = []
    msg = f"getting folders for {archive.name}"
    if debug:
        print(msg)
    afoldersByPath, documentList = archive.getFoldersAndDocuments(withOcr=withOcr)
    folderCount = len(afoldersByPath)
    msg = f"found {folderCount} folders in {archive.name}"
    folders.extend(afoldersByPath.values())
    if debug:
        print(msg)
    if store:
        if len(folders) > 0:
            fms = FolderManager(mode="sql")
            fms.folders = folders
            fms.store(append=True, replace=True)
        if len(documentList) > 0:
            dms = DocumentManager(mode="sql")
            dms.documents = documentList
            dms.store(append=True, replace=True)

`DMSStorage`

Document management system storage configuration

Source code in scan/dms.py

class DMSStorage:
    """
    Document management system storage configuration
    """

    # copied to every storage configuration created by getStorageConfig
    profile = True
    withShowProgress = True

    @staticmethod
    def getStorageConfig(debug: bool = False, mode="sql") -> StorageConfig:
        """
        get the storageConfiguration

        Args:
            debug(bool): if True show debug information
            mode(str): sql, json or jsonpickle

        Return:
            StorageConfig: the storage configuration to be used

        Raises:
            Exception: if the mode is not one of the three modes
        """
        if mode not in ("sql", "json", "jsonpickle"):
            raise Exception(f"invalid mode {mode}")
        useSql = mode == "sql"
        if useSql:
            storageConfig = StorageConfig.getSQL(debug=debug)
        elif mode == "json":
            storageConfig = StorageConfig.getJSON()
        else:
            storageConfig = StorageConfig.getJsonPickle(debug=debug)
        # the cache directory name has to be set before asking for the path
        storageConfig.cacheDirName = "dms"
        cachePath = storageConfig.getCachePath()
        storageConfig.profile = DMSStorage.profile
        storageConfig.withShowProgress = DMSStorage.withShowProgress
        if useSql:
            storageConfig.cacheFile = f"{cachePath}/dms.db"
        return storageConfig

    @staticmethod
    def getScanDir():
        """
        get the scan/watch directory to be used

        The directory is created if it does not exist yet.

        Returns:
            str: the path to the scan directory
        """
        scandir = str(Path.home()) + "/Pictures/scans"
        Path(scandir).mkdir(parents=True, exist_ok=True)
        return scandir

    @staticmethod
    def getSqlDB():
        """
        get the SQLite database for the cache file of the sql storage configuration
        """
        sqlConfig = DMSStorage.getStorageConfig(mode="sql")
        # https://stackoverflow.com/a/48234567/1497139
        return SQLDB(sqlConfig.cacheFile, check_same_thread=False)

    @staticmethod
    def getDatetime(fullpath: str):
        """
        get the last modification time

        Args:
            fullpath(str): the path to get the datetime for

        Returns:
            datetime: the modification time of the given path
        """
        return datetime.fromtimestamp(os.path.getmtime(fullpath))

    @staticmethod
    def getTimeStr(fullpath: str):
        """
        get the last modification time as a string

        Args:
            fullpath(str): the path to get the time string for

        Returns:
            str: the modification time in the format YYYY-mm-dd HH:MM:SS
        """
        return DMSStorage.getDatetime(fullpath).strftime("%Y-%m-%d %H:%M:%S")

    @staticmethod
    def fromCache(em: EntityManager):
        """
        initialize the given entity manager from its cache

        Args:
            em(EntityManager): the entity manager to initialize
        """
        if em.isCached():
            em.fromCache()
            return
        if em.config.mode is not StoreMode.SQL:
            return
        # nothing cached yet - set up the SQL database for the manager
        database = DMSStorage.getSqlDB()
        em.initSQLDB(database)

`fromCache(em)` `staticmethod`

initialize the given entity manager from its cache

Parameters:

Name	Type	Description	Default
`em`	`EntityManager`	the entity manager to initialize	required

Source code in scan/dms.py

@staticmethod
def fromCache(em: EntityManager):
    """
    initialize the given entity manager from it's cache

    Args:
        em(EntityManager): the entity manager to initialize
    """
    if em.isCached():
        em.fromCache()
    else:
        if em.config.mode is StoreMode.SQL:
            sqlDB = DMSStorage.getSqlDB()
            em.initSQLDB(sqlDB)

`getDatetime(fullpath)` `staticmethod`

get the last modification time

Parameters:

Name	Type	Description	Default
`fullpath`	`str`	the path to get the datetime for	required

Source code in scan/dms.py

@staticmethod
def getDatetime(fullpath: str):
    """
    look up when the given file was last modified

    Args:
        fullpath(str): the path of the file to inspect

    Returns:
        datetime: the modification time as returned by datetime.fromtimestamp
    """
    return datetime.fromtimestamp(os.path.getmtime(fullpath))

`getScanDir()` `staticmethod`

get the scan/watch directory to be used

Returns:

Name	Type	Description
`str`		the path to the scan directory

Source code in scan/dms.py

@staticmethod
def getScanDir():
    """
    get the scan/watch directory to be used

    The directory is Pictures/scans below the user's home directory; it is
    created (including parents) if it does not exist yet.

    Returns:
        str: the path to the scan directory
    """
    scanPath = f"{Path.home()}/Pictures/scans"
    os.makedirs(scanPath, exist_ok=True)
    return scanPath

`getSqlDB()` `staticmethod`

get the SQLite database connection

Source code in scan/dms.py

@staticmethod
def getSqlDB():
    """
    get the SQLite database connection

    Returns:
        SQLDB: a database object opened on the cache file of the "sql"
            storage configuration
    """
    cacheFile = DMSStorage.getStorageConfig(mode="sql").cacheFile
    # for check_same_thread=False see
    # https://stackoverflow.com/a/48234567/1497139
    return SQLDB(cacheFile, check_same_thread=False)

`getStorageConfig(debug=False, mode='sql')` `staticmethod`

get the storageConfiguration

Parameters:

Name	Type	Description	Default
`debug(bool)`		if True show debug information	`False`
`mode(str)`		sql, json or jsonpickle	`'sql'`

Return

StorageConfig: the storage configuration to be used

Source code in scan/dms.py

@staticmethod
def getStorageConfig(debug: bool = False, mode="sql") -> StorageConfig:
    """
    get the storage configuration for the given mode

    Args:
        debug(bool): if True show debug information
        mode(str): sql, json or jsonpickle

    Return:
        StorageConfig: the storage configuration to be used

    Raises:
        Exception: if the mode is not one of the supported values
    """
    # reject unknown modes before any configuration object is created
    if mode not in ("sql", "json", "jsonpickle"):
        raise Exception(f"invalid mode {mode}")
    usesSql = mode == "sql"
    if usesSql:
        storageConfig = StorageConfig.getSQL(debug=debug)
    elif mode == "json":
        storageConfig = StorageConfig.getJSON()
    else:
        storageConfig = StorageConfig.getJsonPickle(debug=debug)
    # the cache directory name must be set before the cache path is asked for
    storageConfig.cacheDirName = "dms"
    cachePath = storageConfig.getCachePath()
    storageConfig.profile = DMSStorage.profile
    storageConfig.withShowProgress = DMSStorage.withShowProgress
    if usesSql:
        storageConfig.cacheFile = f"{cachePath}/dms.db"
    return storageConfig

`getTimeStr(fullpath)` `staticmethod`

get the last modification time as a formatted string

Parameters:

Name	Type	Description	Default
`fullpath(str)`		the path to get the time string for	required

Source code in scan/dms.py

@staticmethod
def getTimeStr(fullpath: str):
    """
    get the last modification time of the given file as text

    Args:
        fullpath(str): the path to get the time string for

    Returns:
        str: the modification time formatted as "%Y-%m-%d %H:%M:%S"
    """
    modified = DMSStorage.getDatetime(fullpath)
    return modified.strftime("%Y-%m-%d %H:%M:%S")

`Document`

Bases: JSONAble

a document consists of one or more files in the filesystem or a wikipage - the name is the pagetitle or the filename without extension

types then has the list of available file types e.g. "pdf,txt" for single page Documents the document is somewhat redundant to the Page concept

Source code in scan/dms.py

class Document(JSONAble):
    """
    a document consists of one or more files in the filesystem
    or a wikipage - the name is the pagetitle
    or the filename without extension

    types then has the list of available file types e.g. "pdf,txt"
    for single page Documents the document is somewhat redundant to the Page concept
    """

    @classmethod
    def getSamples(cls):
        """
        get a sample record showing the fields of a Document

        Return:
            list: a list of dicts containing one sample record
        """
        samplesLOD = [
            {
                "archiveName": "bitplan-scan",
                "folderPath": "",
                # TODO: fullpath, filename, basename and timestampStr not needed
                "fullpath": "",
                "fileName": "",
                "basename": "",
                "timestampStr": "",
                "pageTitle": "",
                "categories": "",
                "topic": "",
                "url": "http://capri.bitplan.com/bitplan/scan/2019/",
                "created": datetime(2019, 2, 27, 10, 7, 56),
                "size": 15,
                "lastModified": datetime(2019, 2, 27, 10, 7, 56),
                "name": "2019",
                "types": "pdf",
                "ocrText": "",
            }
        ]
        return samplesLOD

    def __init__(self):
        """
        construct me
        """
        pass

    def fromDict(self, record):
        """
        overwrite the from Dict

        Args:
            record(dict): the record to initialize me from
        """
        super().fromDict(record)

    def fromFile(self, folderPath, file, local=False, withOcr=False):
        """
        initialize me from the given file in the given folder

        Args:
            folderPath(str): the directory
            file(str): the file
            local(bool): passed on to Folder.getFullpath - True if the
                folderPath is for a local folder
            withOcr(bool): if true get the OCRText
        """
        self.folderPath = folderPath
        self.name = file
        self.fullpath = f"{Folder.getFullpath(self.folderPath,local)}/{file}"
        self.size = os.path.getsize(self.fullpath)
        self.lastModified = DMSStorage.getDatetime(self.fullpath)
        # the modification time doubles as the creation time
        self.created = self.lastModified
        self.timestampStr = DMSStorage.getTimeStr(self.fullpath)
        self.fileName = Path(self.fullpath).name
        # "basename" is the spelling used by getSamples and read by
        # getOcrText; "baseName" is kept as an alias so that code relying on
        # the previous spelling keeps working
        self.basename = Path(self.fullpath).stem
        self.baseName = self.basename
        self.pageTitle = f"{self.basename}"

        self.categories = f"{datetime.now().year}"
        self.topic = "OCRDocument"
        if withOcr:
            self.getOcrText()

    def __str__(self):
        """
        get a short text representation showing my fileName and ocrText

        Fields that have not been set yet are shown as None.
        """
        text = "Upload:"
        # NOTE(review): assigning self.fields mutates the instance from within
        # __str__; kept as is in case other code reads it - confirm and remove
        self.fields = ["fileName", "ocrText"]
        delim = ""
        for fieldname in self.fields:
            # getattr avoids a KeyError for fields that are not set yet
            text += "%s%s=%s" % (delim, fieldname, getattr(self, fieldname, None))
            delim = ","
        return text

    def getPDFText(self):
        """
        get my PDF Text

        Return:
            str: the text PDFMiner extracts from my file or None if my
                fullpath does not end with .pdf (case insensitive)
        """
        pdfText = None
        if self.fullpath.lower().endswith(".pdf"):
            pdfText = PDFMiner.getPDFText(self.fullpath)
        return pdfText

    def readTextFromFile(self, fileName: str) -> str:
        """
        read text from the given fileName

        The file is first read in text mode with the default encoding. If
        that fails with a UnicodeDecodeError the raw bytes are decoded with
        the encoding suggested by UnicodeDammit, or utf-8 if there is no
        suggestion.

        Args:
            fileName(str): the path of the file to read

        Return:
            str: the text content of the file
        """
        try:
            with open(fileName, "r") as textFile:
                return textFile.read()
        except UnicodeDecodeError as _ude:
            with open(fileName, "rb") as file:
                content = file.read()
            suggestion = UnicodeDammit(content)
            encoding = suggestion.original_encoding
            if encoding is None:
                encoding = "utf-8"
            return content.decode(encoding)

    def getOcrText(self):
        """
        get the OCR text for my file and remember it in self.ocrText

        The text is looked up in the ".ocr" directory next to my file:
        either as a single "<basename>.txt" file or as page files
        "<basename>_p001.txt", "<basename>_p002.txt", ... which are
        concatenated until the first missing page. If no OCR text is found
        the PDF text (see getPDFText) is used instead.

        Return:
            str: the text found or None
        """
        parent = Path(self.fullpath).parent.absolute()
        ocrPath = f"{parent}/.ocr"
        self.ocrText = None
        if os.path.isdir(ocrPath):
            ocrFileName = f"{ocrPath}/{self.basename}.txt"
            if os.path.isfile(ocrFileName):
                self.ocrText = self.readTextFromFile(ocrFileName)
            else:
                page = 1
                maxPages = 1000
                pageFileName = f"{ocrPath}/{self.basename}_p{page:03d}.txt"
                if os.path.isfile(pageFileName):
                    pageText = self.readTextFromFile(pageFileName)
                    if pageText is not None:
                        self.ocrText = pageText
                        for page in range(2, maxPages):
                            pageFileName = f"{ocrPath}/{self.basename}_p{page:03d}.txt"
                            if not os.path.isfile(pageFileName):
                                break
                            nextPage = self.readTextFromFile(pageFileName)
                            if nextPage is not None:
                                self.ocrText += nextPage
        if self.ocrText is None:
            self.ocrText = self.getPDFText()
        return self.ocrText

    def uploadFile(self, wikiId):
        """
        call back that uploads my file to the wiki with the given id and
        edits the wiki page with my pageTitle to hold my content

        Args:
            wikiId(str): the id of the target wiki
        """
        pageContent = self.getContent()
        # existing files and pages are always overwritten
        ignoreExists = True
        wikipush = WikiPush(fromWikiId=None, toWikiId=wikiId, login=True)
        description = f"scanned at {self.timestampStr}"
        msg = f"uploading {self.pageTitle} ({self.fileName}) to {wikiId} ... "
        files = [self.fullpath]
        wikipush.upload(files, force=ignoreExists)
        pageToBeEdited = wikipush.toWiki.getPage(self.pageTitle)
        if (not pageToBeEdited.exists) or ignoreExists:
            pageToBeEdited.edit(pageContent, description)
            wikipush.log(msg + "✅")

    def getContent(self):
        """
        get my content

        For files ending with .pdf the page embeds the pdf, the OCR text and
        the file link; for all other files only the file link is shown. In
        both cases one category link per entry of my comma separated
        categories is appended.

        Return:
            str: the content of the wikipage
        """
        wikicats = ""
        delim = ""
        for category in self.categories.split(","):
            wikicats += "%s[[Category:%s]]" % (delim, category)
            delim = "\n"
        if self.fileName.endswith(".pdf"):
            template = """= pdf pages =
<pdf>%s</pdf>
= text =
<pre>%s</pre>
= pdf =
[[File:%s]]
%s
<headertabs/>
"""
            pageContent = template % (
                self.fileName,
                self.ocrText,
                self.fileName,
                wikicats,
            )
        else:
            template = """[[File:%s]]
%s
<headertabs/>"""
            pageContent = template % (self.fileName, wikicats)

        return pageContent

`init()`

construct me

Source code in scan/dms.py

def __init__(self):
    """
    create me without setting any fields
    """
    return None

`fromDict(record)`

overwrite the from Dict

Source code in scan/dms.py

def fromDict(self, record):
    """
    overwrite the from Dict - delegates to the superclass implementation

    Args:
        record: the record to initialize me from
    """
    super().fromDict(record)

`fromFile(folderPath, file, local=False, withOcr=False)`

Parameters:

Name	Description	Default
`folderPath(str)`	the directory	required
`file(str)`	the file	required
`withOcr(bool)`	if true get the OCRText	`False`

Source code in scan/dms.py

def fromFile(self, folderPath, file, local=False, withOcr=False):
    """
    initialize me from the given file in the given folder

    Args:
        folderPath(str): the directory
        file(str): the file
        local(bool): passed on to Folder.getFullpath - True if the
            folderPath is for a local folder
        withOcr(bool): if true get the OCRText
    """
    self.folderPath = folderPath
    self.name = file
    self.fullpath = f"{Folder.getFullpath(self.folderPath,local)}/{file}"
    self.size = os.path.getsize(self.fullpath)
    self.lastModified = DMSStorage.getDatetime(self.fullpath)
    # the modification time doubles as the creation time
    self.created = self.lastModified
    self.timestampStr = DMSStorage.getTimeStr(self.fullpath)
    self.fileName = Path(self.fullpath).name
    # getOcrText reads self.basename, so the stem must be stored under that
    # name; "baseName" is kept as an alias for code relying on the previous
    # spelling
    self.basename = Path(self.fullpath).stem
    self.baseName = self.basename
    self.pageTitle = f"{self.basename}"

    self.categories = f"{datetime.now().year}"
    self.topic = "OCRDocument"
    if withOcr:
        self.getOcrText()

`getContent()`

get my content

Return

str: the content of the wikipage

Source code in scan/dms.py

    def getContent(self):
        """
        get my content

        Return:
            str: the content of the wikipage
        """
        wikicats = ""
        delim = ""
        for category in self.categories.split(","):
            wikicats += "%s[[Category:%s]]" % (delim, category)
            delim = "\n"
        if self.fileName.endswith(".pdf"):
            template = """= pdf pages =
<pdf>%s</pdf>
= text =
<pre>%s</pre>
= pdf =
[[File:%s]]
%s
<headertabs/>
"""
            pageContent = template % (
                self.fileName,
                self.ocrText,
                self.fileName,
                wikicats,
            )
        else:
            template = """[[File:%s]]
%s
<headertabs/>"""
            pageContent = template % (self.fileName, wikicats)

        return pageContent

`getOcrText()`

get the OCR

Source code in scan/dms.py

def getOcrText(self):
    """
    get the OCR text for my file and remember it in self.ocrText

    The text is looked up in the ".ocr" directory next to my file: either
    as a single "<basename>.txt" file or as page files "<basename>_p001.txt",
    "<basename>_p002.txt", ... which are concatenated until the first
    missing page. If no OCR text is found the result of getPDFText is used.

    Return:
        str: the text found or None
    """
    ocrDir = f"{Path(self.fullpath).parent.absolute()}/.ocr"
    self.ocrText = None
    if os.path.isdir(ocrDir):
        wholeFile = f"{ocrDir}/{self.basename}.txt"
        if os.path.isfile(wholeFile):
            self.ocrText = self.readTextFromFile(wholeFile)
        else:
            pageLimit = 1000
            for pageNo in range(1, pageLimit):
                pageFile = f"{ocrDir}/{self.basename}_p{pageNo:03d}.txt"
                if not os.path.isfile(pageFile):
                    break
                pageText = self.readTextFromFile(pageFile)
                if pageNo == 1:
                    # without a usable first page no further pages are read
                    if pageText is None:
                        break
                    self.ocrText = pageText
                elif pageText is not None:
                    self.ocrText += pageText
    if self.ocrText is None:
        self.ocrText = self.getPDFText()
    return self.ocrText

`getPDFText()`

get my PDF Text

Source code in scan/dms.py

def getPDFText(self):
    """
    get my PDF Text

    Return:
        str: the text PDFMiner extracts from my file or None if my fullpath
            does not end with .pdf (case insensitive)
    """
    if not self.fullpath.lower().endswith(".pdf"):
        return None
    return PDFMiner.getPDFText(self.fullpath)

`readTextFromFile(fileName)`

read text from the given fileName

Source code in scan/dms.py

def readTextFromFile(self, fileName: str) -> str:
    """
    read text from the given fileName

    The file is first read in text mode with the default encoding. If that
    fails with a UnicodeDecodeError the raw bytes are decoded with the
    encoding suggested by UnicodeDammit, or utf-8 if there is no suggestion.

    Args:
        fileName(str): the path of the file to read

    Return:
        str: the text content of the file
    """
    try:
        with open(fileName, "r") as textFile:
            return textFile.read()
    except UnicodeDecodeError:
        with open(fileName, "rb") as rawFile:
            rawBytes = rawFile.read()
        detected = UnicodeDammit(rawBytes).original_encoding
        return rawBytes.decode(detected if detected is not None else "utf-8")

`uploadFile(wikiId)`

call back

Source code in scan/dms.py

def uploadFile(self, wikiId):
    """
    call back that uploads my file to the wiki with the given id and edits
    the wiki page with my pageTitle to hold my content

    Args:
        wikiId: the id of the target wiki
    """
    content = self.getContent()
    # existing files and pages are always overwritten
    ignoreExists = True
    pusher = WikiPush(fromWikiId=None, toWikiId=wikiId, login=True)
    editSummary = f"scanned at {self.timestampStr}"
    progress = f"uploading {self.pageTitle} ({self.fileName}) to {wikiId} ... "
    pusher.upload([self.fullpath], force=ignoreExists)
    page = pusher.toWiki.getPage(self.pageTitle)
    keepExisting = page.exists and not ignoreExists
    if not keepExisting:
        page.edit(content, editSummary)
        pusher.log(progress + "✅")

`DocumentManager`

Bases: EntityManager

manager for Documents

Source code in scan/dms.py

class DocumentManager(EntityManager):
    """
    manager for Documents
    """

    def __init__(self, mode="sql", debug=False):
        """
        constructor

        Args:
            mode(str): the storage mode handed to DMSStorage.getStorageConfig
            debug(bool): if True show debug information
        """
        storageConfig = DMSStorage.getStorageConfig(mode=mode, debug=debug)
        super().__init__(
            "document",  # name
            "Document",  # entityName
            "documents",  # entityPluralName
            "documents",  # listName - same as the plural name
            Document,  # clazz
            "document",  # tableName - same as the name
            "url",  # primaryKey
            storageConfig,  # config
            True,  # handleInvalidListTypes
            True,  # filterInvalidListTypes
            debug,
        )

    @staticmethod
    def getInstance(mode="sql"):
        """
        create a DocumentManager and initialize it via DMSStorage.fromCache

        Args:
            mode(str): the storage mode to use

        Return:
            DocumentManager: the initialized manager
        """
        manager = DocumentManager(mode=mode)
        DMSStorage.fromCache(manager)
        return manager

`init(mode='sql', debug=False)`

constructor

Source code in scan/dms.py

def __init__(self, mode="sql", debug=False):
    """
    constructor

    Args:
        mode(str): the storage mode handed to DMSStorage.getStorageConfig
        debug(bool): if True show debug information
    """
    storageConfig = DMSStorage.getStorageConfig(mode=mode, debug=debug)
    super().__init__(
        "document",  # name
        "Document",  # entityName
        "documents",  # entityPluralName
        "documents",  # listName - same as the plural name
        Document,  # clazz
        "document",  # tableName - same as the name
        "url",  # primaryKey
        storageConfig,  # config
        True,  # handleInvalidListTypes
        True,  # filterInvalidListTypes
        debug,
    )

`Folder`

Bases: JSONAble

a Folder might be a filesystem folder or a category in a wiki

Source code in scan/dms.py

class Folder(JSONAble):
    """
    a Folder might be a filesystem folder or a category in a wiki
    """

    def __init__(self):
        """
        Constructor
        """

    @classmethod
    def getSamples(cls):
        """
        get a sample record showing the fields of a Folder

        Return:
            list: a list of dicts containing one sample record
        """
        samplesLOD = [
            {
                "archiveName": "bitplan-scan",
                "url": "http://capri.bitplan.com/bitplan/scan/2019/",
                "fileCount": 15,
                "lastModified": datetime(2019, 2, 27, 10, 7, 56),
                "created": datetime(2019, 2, 27, 10, 7, 56),
                "name": "2019",
                "path": "/bitplan/scan/2019",
            }
        ]
        return samplesLOD

    @classmethod
    def getPrefix(cls):
        """
        get the path prefix for this platform (if any)

        Return:
            str: the prefix e.g. /Volumes on Darwin
        """
        if sys.platform == "darwin":
            prefix = "/Volumes"
        else:
            prefix = ""
        return prefix

    @staticmethod
    def getFullpath(folderPath: str, local: bool = False):
        """
        get the full path as accessible on my platform

        Args:
           folderPath(str): the path of the folder
           local(bool): True if the path is for a local folder

        Return:
            str: the full path of the folder - the folderPath itself for
                local folders, otherwise the folderPath with the platform
                prefix in front
        """
        if local:
            fullPath = folderPath
        else:
            fullPath = f"{Folder.getPrefix()}{folderPath}"
        return fullPath

    @classmethod
    def getRelpath(cls, folderPath: str) -> str:
        """
        get the relative path as accessible on my platform

        Args:
           folderPath(str): the path of the folder

        Return:
            str: the folderPath without the leading platform prefix; paths
                that do not start with the prefix are returned unchanged
        """
        prefix = Folder.getPrefix()
        if prefix and folderPath.startswith(prefix):
            # strip only the leading prefix - str.replace would also remove
            # later occurrences, e.g. in /Volumes/backup/Volumes/x
            relbase = folderPath[len(prefix) :]
        else:
            relbase = folderPath
        return relbase

    def getFiles(self, extension=".pdf"):
        """
        get all files with the given extension

        Files whose name starts with "._" are skipped.

        Args:
            extension(str): the extension to search for

        Return:
            list: the files with the given extension
        """
        fullPath = Folder.getFullpath(self.path)
        files = [
            file
            for file in os.listdir(fullPath)
            if file.endswith(extension) and not file.startswith("._")
        ]
        return files

    def getFileDocuments(self):
        """
        get all documents for the OCRDocument files in this folder

        Return:
            list: the list of documents
        """
        files = self.getFiles()
        documents = self.getDocuments(files)
        return documents

    def getDocuments(self, files, withOcr=False):
        """
        get the documents for this folder based on the files from my listdir

        Only files ending with .pdf are considered. A file that can not be
        converted to a Document is logged and skipped.

        Args:
            files(list): the names of the files in this folder
            withOcr(bool): if True get the OCR text for each document

        Return:
            list: the list of documents
        """
        documentList = []
        msg = f"getting {len(files)} documents for {self.path}"
        Logger.log(msg)
        for file in files:
            try:
                if file.endswith(".pdf"):
                    doc = Document()
                    doc.archiveName = self.archiveName
                    doc.url = f"http://{self.archive.server}{self.path}/{file}"
                    doc.fromFile(self.path, file, withOcr=withOcr)
                    documentList.append(doc)
            except Exception as e:
                Logger.logException(e)
        return documentList

    def refreshDocuments(self):
        """
        refresh the documents in this folder by getting the OCR text
        of each of my file documents
        """
        doclist = self.getFileDocuments()
        for doc in doclist:
            doc.getOcrText()

`init()`

Constructor

Source code in scan/dms.py

def __init__(self):
    """
    Constructor - sets no fields
    """
    return None

`getDocuments(files, withOcr=False)`

get the documents for this folder based on the files from my listdir

Source code in scan/dms.py

def getDocuments(self, files, withOcr=False):
    """
    get the documents for this folder based on the files from my listdir

    Only files ending with .pdf are considered. A file that can not be
    converted to a Document is logged and skipped.

    Args:
        files(list): the names of the files in this folder
        withOcr(bool): if True get the OCR text for each document

    Return:
        list: the list of documents
    """
    Logger.log(f"getting {len(files)} documents for {self.path}")
    documents = []
    for fileName in files:
        try:
            if not fileName.endswith(".pdf"):
                continue
            document = Document()
            document.archiveName = self.archiveName
            document.url = f"http://{self.archive.server}{self.path}/{fileName}"
            document.fromFile(self.path, fileName, withOcr=withOcr)
            documents.append(document)
        except Exception as ex:
            Logger.logException(ex)
    return documents

`getFileDocuments()`

get all documents for the OCRDocument files in this folder

Return

list: the list of documents

Source code in scan/dms.py

def getFileDocuments(self):
    """
    get all documents for the OCRDocument files in this folder

    The files are taken from getFiles with its default extension and
    converted by getDocuments.

    Return:
        list: the list of documents
    """
    return self.getDocuments(self.getFiles())

`getFiles(extension='.pdf')`

get all files with the given extension

Parameters:

Name	Type	Description	Default
`extension(str)`		the extension to search for	`'.pdf'`

Return

list: the files with the given extension

Source code in scan/dms.py

def getFiles(self, extension=".pdf"):
    """
    get all files with the given extension

    Files whose name starts with "._" are skipped.

    Args:
        extension(str): the extension to search for

    Return:
        list: the files with the given extension
    """
    directory = Folder.getFullpath(self.path)
    return [
        entry
        for entry in os.listdir(directory)
        if entry.endswith(extension) and not entry.startswith("._")
    ]

`getFullpath(folderPath, local=False)` `staticmethod`

get the full path as accessible on my platform

Parameters:

Name	Type	Description	Default
`folderPath(str)`		the path of the folder	required
`local(bool)`		True if the path is for a local folder	`False`

Return

str: the full path of the folder

Source code in scan/dms.py

@staticmethod
def getFullpath(folderPath: str, local: bool = False):
    """
    get the full path as accessible on my platform

    Args:
       folderPath(str): the path of the folder
       local(bool): True if the path is for a local folder

    Return:
        str: the full path of the folder - the folderPath itself for local
            folders, otherwise the folderPath with the platform prefix in front
    """
    return folderPath if local else f"{Folder.getPrefix()}{folderPath}"

`getPrefix()` `classmethod`

get the path prefix for this platform (if any)

Return

str: the prefix e.g. /Volumes on Darwin

Source code in scan/dms.py

@classmethod
def getPrefix(cls):
    """
    get the path prefix for this platform (if any)

    Return:
        str: the prefix e.g. /Volumes on Darwin, an empty string elsewhere
    """
    return "/Volumes" if sys.platform == "darwin" else ""

`getRelpath(folderPath)` `classmethod`

get the relative path as accessible on my platform

Parameters:

Name	Type	Description	Default
`folderPath(str)`		the path of the folder	required

Return

str: the relative path of the folder

Source code in scan/dms.py

@classmethod
def getRelpath(cls, folderPath: str) -> str:
    """
    get the relative path as accessible on my platform

    Args:
       folderPath(str): the path of the folder

    Return:
        str: the folderPath without the leading platform prefix; paths that
            do not start with the prefix are returned unchanged
    """
    prefix = Folder.getPrefix()
    if prefix and folderPath.startswith(prefix):
        # strip only the leading prefix - str.replace would also remove
        # later occurrences, e.g. in /Volumes/backup/Volumes/x
        relbase = folderPath[len(prefix) :]
    else:
        relbase = folderPath
    return relbase

`refreshDocuments()`

refresh the documents in this folder

Source code in scan/dms.py

def refreshDocuments(self):
    """
    refresh the documents in this folder by getting the OCR text of each
    of my file documents
    """
    for document in self.getFileDocuments():
        document.getOcrText()

`FolderManager`

Bases: EntityManager

manager for Folders

Source code in scan/dms.py

class FolderManager(EntityManager):
    """
    manager for Folders
    """

    def __init__(self, mode="sql", debug=False):
        """
        constructor

        Args:
            mode(str): the storage mode handed to DMSStorage.getStorageConfig
            debug(bool): if True show debug information
        """
        name = "folder"
        entityName = "Folder"
        entityPluralName = "folders"
        listName = entityPluralName
        clazz = Folder
        tableName = name
        config = DMSStorage.getStorageConfig(mode=mode, debug=debug)
        handleInvalidListTypes = True
        filterInvalidListTypes = True
        primaryKey = None
        super().__init__(
            name,
            entityName,
            entityPluralName,
            listName,
            clazz,
            tableName,
            primaryKey,
            config,
            handleInvalidListTypes,
            filterInvalidListTypes,
            debug,
        )

    @staticmethod
    def getInstance(mode="sql"):
        """
        create a FolderManager and initialize it via DMSStorage.fromCache

        Args:
            mode(str): the storage mode to use

        Return:
            FolderManager: the initialized manager
        """
        fm = FolderManager(mode=mode)
        DMSStorage.fromCache(fm)
        return fm

    def getDocumentRecords(self, archiveName, folderPath):
        """
        get the document records

        Args:
            archiveName: the name of the archive the documents belong to
            folderPath: the path of the folder the documents belong to

        Return:
            the result of querying the document table of my cache file
            for the given archiveName and folderPath
        """
        sqlDB = SQLDB(self.getCacheFile())
        sqlQuery = "SELECT * FROM document WHERE archiveName=(?) AND folderPath=(?)"
        params = (
            archiveName,
            folderPath,
        )
        dictList = sqlDB.query(sqlQuery, params)
        return dictList

    def getFolder(self, archive, folderPath: str):
        """
        get the folder for the given archive and folderPath

        Args:
            archive: the archive - its name is used for the lookup
            folderPath: the path of the folder

        Return:
            Folder: the folder with its archive attribute set or None if
                there is no record for the given archive and folderPath

        Raises:
            Exception: if more than one folder record is found
        """
        sqlDB = SQLDB(self.getCacheFile())
        sqlQuery = "SELECT * FROM folder WHERE archiveName=(?) AND path=(?)"
        archiveName = archive.name
        params = (
            archiveName,
            folderPath,
        )
        records = sqlDB.query(sqlQuery, params)
        folder = None
        if len(records) > 1:
            msg = f"{len(records)} folders found for {archiveName}:{folderPath} - there should be only one"
            raise Exception(msg)
        elif len(records) == 1:
            folder = Folder()
            folder.fromDict(records[0])
            # only an existing folder can be linked to its archive
            folder.archive = archive
        return folder

    def refreshFolder(self, archive, folderPath):
        """
        refresh the documents of the folder for the given archive and folderPath

        Nothing happens if no such folder is known.

        Args:
            archive: the archive
            folderPath: the path of the folder
        """
        folder = self.getFolder(archive, folderPath)
        if folder is not None:
            folder.refreshDocuments()

`init(mode='sql', debug=False)`

constructor

Source code in scan/dms.py

def __init__(self, mode="sql", debug=False):
    """
    constructor

    Args:
        mode(str): the storage mode handed to DMSStorage.getStorageConfig
        debug(bool): if True show debug information
    """
    storageConfig = DMSStorage.getStorageConfig(mode=mode, debug=debug)
    super().__init__(
        "folder",  # name
        "Folder",  # entityName
        "folders",  # entityPluralName
        "folders",  # listName - same as the plural name
        Folder,  # clazz
        "folder",  # tableName - same as the name
        None,  # primaryKey
        storageConfig,  # config
        True,  # handleInvalidListTypes
        True,  # filterInvalidListTypes
        debug,
    )

`getDocumentRecords(archiveName, folderPath)`

get the document records

Source code in scan/dms.py

def getDocumentRecords(self, archiveName, folderPath):
    """
    get the document records

    Args:
        archiveName: the name of the archive the documents belong to
        folderPath: the path of the folder the documents belong to

    Return:
        the result of querying the document table of my cache file for the
        given archiveName and folderPath
    """
    database = SQLDB(self.getCacheFile())
    query = "SELECT * FROM document WHERE archiveName=(?) AND folderPath=(?)"
    return database.query(query, (archiveName, folderPath))

`getFolder(archive, folderPath)`

get the folder for the given archive and folderPath

Parameters:

Name	Type	Description	Default
`archive`		the archive	required
`folderPath`	`str`	the path of the folder	required

Source code in scan/dms.py

def getFolder(self, archive, folderPath: str):
    """
    get the folder for the given archive and folderPath

    Args:
        archive: the archive - its name is used for the lookup
        folderPath: the path of the folder

    Return:
        Folder: the folder with its archive attribute set or None if there
            is no record for the given archive and folderPath

    Raises:
        Exception: if more than one folder record is found
    """
    sqlDB = SQLDB(self.getCacheFile())
    sqlQuery = "SELECT * FROM folder WHERE archiveName=(?) AND path=(?)"
    archiveName = archive.name
    params = (
        archiveName,
        folderPath,
    )
    records = sqlDB.query(sqlQuery, params)
    folder = None
    if len(records) > 1:
        msg = f"{len(records)} folders found for {archiveName}:{folderPath} - there should be only one"
        raise Exception(msg)
    elif len(records) == 1:
        folder = Folder()
        folder.fromDict(records[0])
        # only an existing folder can be linked to its archive
        folder.archive = archive
    return folder

`refreshFolder(archive, folderPath)`

for the given archive and folderPath

Parameters:

Name	Type	Description	Default
`archive`		the archive	required
`folderPath`		the path of the folder	required

Source code in scan/dms.py

def refreshFolder(self, archive, folderPath):
    """
    refresh the documents of the folder for the given archive and folderPath

    Nothing happens if getFolder does not return a folder.

    Args:
        archive: the archive (passed on to getFolder)
        folderPath: the path of the folder
    """
    folder = self.getFolder(archive, folderPath)
    if folder is not None:
        folder.refreshDocuments()

`Wiki`

Bases: object

Semantic Mediawiki access proxy

Source code in scan/dms.py

class Wiki(object):
    """
    Semantic Mediawiki access proxy
    """

    @staticmethod
    def getSMW(wikiId: str):
        """
        get the semantic mediawiki client with the given wikiId

        Args:
            wikiId: the wiki id of the client

        Return:
            SMWClient: the SMWClient with the given id
        """
        wikiClient = Wiki.get(wikiId)
        smw = SMWClient(wikiClient.getSite())
        return smw

    @staticmethod
    def get(wikiId: str):
        """
        get the Wiki Client with the given wikiId

        The ini file for the wikiId is checked first and the client is
        logged in before it is returned.

        Args:
            wikiId: the wiki id of the client

        Return:
            WikiClient: the WikiClient with the given id
        """
        Wiki.checkIniFile(wikiId)
        wikiClient = WikiClient.ofWikiId(wikiId)
        wikiClient.login()
        return wikiClient

    @staticmethod
    def inPublicCI():
        """
        are we running in a public Continuous Integration Environment?

        Return:
            bool: True if the current user is "travis" or "runner"
        """
        return getpass.getuser() in ["travis", "runner"]

    @staticmethod
    def checkIniFile(wikiId: str, save=None):
        """
        check the ini file for the given wikiId

        If the ini file is missing a default configuration is created for
        the wikiId "wiki"; any other wikiId without an ini file is an error.

        Args:
            wikiId(str): the wiki id of the wiki to check
            save(bool): True if a new ini file should be created e.g. for test purposes
                        if not set save is True if we are running in a public continuous integration environment

        Raises:
            Exception: if there is no ini file and no default configuration
                for the given wikiId
        """
        if save is None:
            save = Wiki.inPublicCI()
        iniFile = WikiUser.iniFilePath(wikiId)
        if not os.path.isfile(iniFile):
            wikiDict = None
            if wikiId == "wiki":
                wikiDict = {
                    "wikiId": wikiId,
                    "email": "noreply@nouser.com",
                    "url": "https://wiki.bitplan.com",
                    "scriptPath": "/",
                    "version": "MediaWiki 1.35.1",
                }
            if wikiDict is None:
                # the message names the directory as $HOME/.mediawiki-japi
                # (the path separator was missing before)
                raise Exception(
                    f"wikiId {wikiId} is not configured in $HOME/.mediawiki-japi"
                )
            else:
                wikiUser = WikiUser.ofDict(wikiDict, lenient=True)
                if save:
                    wikiUser.save()

`checkIniFile(wikiId, save=None)` `staticmethod`

check the ini file for the given wikiId

Parameters:

Name	Type	Description	Default
`wikiId(str)`		the wiki id of the wiki to check	required
`save(bool)`		True if a new ini file should be created e.g. for test purposes; if not set, save is True if we are running in a public continuous integration environment	`None`

Source code in scan/dms.py

@staticmethod
def checkIniFile(wikiId: str, save=None):
    """
    make sure a configuration is available for the given wikiId

    If the ini file of the wikiId already exists nothing is done.
    Otherwise a default configuration is only known for the wikiId "wiki".

    Args:
        wikiId(str): the wiki id of the wiki to check
        save(bool): True if a new ini file should be created e.g. for test purposes
                    if not set save is True if we are running in a public continuous integration environment

    Raises:
        Exception: if the ini file is missing and no default configuration
                   is known for the wikiId
    """
    if save is None:
        save = Wiki.inPublicCI()
    iniFile = WikiUser.iniFilePath(wikiId)
    if os.path.isfile(iniFile):
        # already configured - nothing to do
        return
    if wikiId != "wiki":
        raise Exception(
            f"wikiId {wikiId} is not configured in $HOME/.mediawiki-japi"
        )
    wikiDict = {
        "wikiId": wikiId,
        "email": "noreply@nouser.com",
        "url": "https://wiki.bitplan.com",
        "scriptPath": "/",
        "version": "MediaWiki 1.35.1",
    }
    wikiUser = WikiUser.ofDict(wikiDict, lenient=True)
    if save:
        wikiUser.save()

`get(wikiId)` `staticmethod`

get the Wiki Client with the given wikiId

Parameters:

Name	Type	Description	Default
`wikiId`	`str`	the wiki id of the client	required

Return

WikiClient: the WikiClient with the given id

Source code in scan/dms.py

@staticmethod
def get(wikiId: str):
    """
    create a logged-in WikiClient for the given wikiId

    The ini file of the wikiId is checked before the client is created.

    Args:
        wikiId: the wiki id of the client

    Return:
        WikiClient: the WikiClient with the given id
    """
    Wiki.checkIniFile(wikiId)
    client = WikiClient.ofWikiId(wikiId)
    client.login()
    return client

`getSMW(wikiId)` `staticmethod`

get the semantic mediawiki client with the given wikiId

Parameters:

Name	Type	Description	Default
`wikiId`	`str`	the wiki id of the client	required

Return

SMWClient: the SMWClient with the given id

Source code in scan/dms.py

@staticmethod
def getSMW(wikiId: str):
    """
    create a semantic mediawiki client for the given wikiId

    Args:
        wikiId: the wiki id of the client

    Return:
        SMWClient: the SMWClient created for the site of the wiki client
    """
    site = Wiki.get(wikiId).getSite()
    return SMWClient(site)

`inPublicCI()` `staticmethod`

are we running in a public Continuous Integration Environment?

Source code in scan/dms.py

@staticmethod
def inPublicCI():
    """
    check whether the current user name is one of the known public
    Continuous Integration accounts

    Returns:
        bool: True if getpass.getuser() is "travis" or "runner"
    """
    ciUsers = ["travis", "runner"]
    currentUser = getpass.getuser()
    return currentUser in ciUsers

`entity_view`

Created on 2023-11-17

@author: wf

`EntityManagerView`

a view for a given entity manager

Source code in scan/entity_view.py

class EntityManagerView:
    """
    a view for a given entity manager

    The records of the entity manager are loaded into a ListOfDictsGrid.
    """

    def __init__(self, em: EntityManager):
        """
        construct me for the given entity manager

        Args:
            em(EntityManager): the entity manager to show
        """
        self.em = em
        self.setup_view()

    def setup_view(self):
        """
        set up my view elements
        """
        self.lod_grid = ListOfDictsGrid()

    def linkColumn(self, name, record, formatWith=None, formatTitleWith=None):
        """
        replace the column with the given name with a link

        Args:
            name: the key of the column in the record
            record(dict): the record to modify in place
            formatWith: optional %-format string applied to the value to get the link url
            formatTitleWith: optional %-format string applied to the value to get the link title
        """
        if name not in record:
            return
        value = record[name]
        if value is None:
            # show an empty cell instead of a link for missing values
            record[name] = ""
            return
        lurl = value if formatWith is None else formatWith % value
        title = value if formatTitleWith is None else formatTitleWith % value
        record[name] = Link.create(lurl, title)

    def defaultRowHandler(self, row):
        """
        default handling of a row: convert the "url" column to a link

        Args:
            row(dict): the row to modify in place
        """
        self.linkColumn("url", row, formatWith="%s")

    def show(self, rowHandler=None, lodKeyHandler=None):
        """
        show my given entity manager

        Args:
            rowHandler: optional callback called with each row dict before display;
                        defaults to defaultRowHandler
            lodKeyHandler: optional callback called with the list of column keys
        """
        records = self.em.getList()
        if records:
            lodKeys = list(records[0].getJsonTypeSamples()[0].keys())
        else:
            lodKeys = ["url"]
        if lodKeyHandler is not None:
            lodKeyHandler(lodKeys)
        if rowHandler is None:
            rowHandler = self.defaultRowHandler
        # work on copies so that the row handler does not modify the entities
        dictList = [vars(d).copy() for d in records]
        for row in dictList:
            rowHandler(row)
        self.lod_grid.load_lod(dictList)

`linkColumn(name, record, formatWith=None, formatTitleWith=None)`

replace the column with the given name with a link

Source code in scan/entity_view.py

def linkColumn(self, name, record, formatWith=None, formatTitleWith=None):
    """
    turn the value of the named column of the record into a link

    Records without the column are left untouched and a None value
    is replaced by an empty string.
    """
    if name not in record:
        return
    cell = record[name]
    if cell is None:
        record[name] = ""
        return
    target = cell if formatWith is None else formatWith % cell
    caption = cell if formatTitleWith is None else formatTitleWith % cell
    record[name] = Link.create(target, caption)

`setup_view()`

set up my view elements

Source code in scan/entity_view.py

def setup_view(self):
    """
    create the grid that is used to display the records
    """
    grid = ListOfDictsGrid()
    self.lod_grid = grid

`show(rowHandler=None, lodKeyHandler=None)`

show my given entity manager

Source code in scan/entity_view.py

def show(self, rowHandler=None, lodKeyHandler=None):
    """
    show my given entity manager

    Args:
        rowHandler: optional callback called with each row dict before display;
                    defaults to defaultRowHandler
        lodKeyHandler: optional callback called with the list of column keys
    """
    records = self.em.getList()
    if records:
        lodKeys = list(records[0].getJsonTypeSamples()[0].keys())
    else:
        lodKeys = ["url"]
    if lodKeyHandler is not None:
        lodKeyHandler(lodKeys)
    if rowHandler is None:
        rowHandler = self.defaultRowHandler
    # work on copies so that the row handler does not modify the entities
    dictList = [vars(d).copy() for d in records]
    for row in dictList:
        rowHandler(row)
    self.lod_grid.load_lod(dictList)

`EntityView`

Source code in scan/entity_view.py

class EntityView:
    """
    a view keeping a reference to a single entity
    """

    def __init__(self, entity: JSONAble):
        """
        construct me for the given entity

        Args:
            entity(JSONAble): the entity this view refers to
        """
        self.entity = entity

`__init__(entity)`

Source code in scan/entity_view.py

def __init__(self, entity: JSONAble):
    """
    construct me for the given entity

    Args:
        entity(JSONAble): the entity this view refers to
    """
    self.entity = entity

`folderwatcher`

Created on 2021-04-21

see https://stackoverflow.com/a/66110795/1497139

`Handler`

Bases: PatternMatchingEventHandler

handle changes for a given wildcard pattern

Source code in scan/folderwatcher.py

class Handler(PatternMatchingEventHandler):
    """
    event handler that reports modifications of files matching
    the given wildcard patterns to a callback
    """

    def __init__(self, callback, patterns, debug=False):
        """
        construct me

        Args:
            callback: the function to call with the path of a modified file
            patterns: the patterns to trigger on
            debug(bool): if True print debug output
        """
        # directories are ignored and patterns are matched case insensitively
        super().__init__(
            patterns=patterns,
            ignore_directories=True,
            case_sensitive=False,
        )
        self.callback = callback
        self.debug = debug

    def on_any_event(self, event):
        """
        log the event if debugging is on and trigger the callback
        for "modified" events

        Args:
            event: the file system event to handle
        """
        if self.debug:
            timestamp = time.asctime()
            print(
                f"[{timestamp}] noticed: [{event.event_type}] on: [{event.src_path}] "
            )
        if event.event_type == "modified":
            self.callback(event.src_path)

`__init__(callback, patterns, debug=False)`

construct me

Parameters:

Name	Description	Default
`callback`	the function to call	required
`patterns`	the patterns to trigger on	required
`debug(bool)`	if True print debug output	`False`

Source code in scan/folderwatcher.py

def __init__(self, callback, patterns, debug=False):
    """
    construct me

    Args:
        callback: the function to call with the path of a modified file
        patterns: the patterns to trigger on
        debug(bool): if True print debug output
    """
    # directories are ignored and patterns are matched case insensitively
    handlerOptions = {
        "patterns": patterns,
        "ignore_directories": True,
        "case_sensitive": False,
    }
    PatternMatchingEventHandler.__init__(self, **handlerOptions)
    self.callback = callback
    self.debug = debug

`Watcher`

watch the given path with the given callback

Source code in scan/folderwatcher.py

class Watcher:
    """
    watch the given path with the given callback
    """

    def __init__(self, path, patterns=["*.pdf", "*.jpg"], debug=False):
        """
        construct me for the given path

        Args:
            path(str): the directory to observe
            patterns(list): a list of wildcard patterns
            debug(bool): True if debugging should be switched on
        """
        self.observer = Observer()
        self.path = path
        # copy the list so that instances never share the mutable default argument
        self.patterns = list(patterns) if patterns is not None else None
        self.debug = debug

    def run(self, callback, sleepTime=1, limit=sys.maxsize):
        """
        run me

        The observer is always stopped when this method ends - no matter
        whether the time limit was reached or the loop was interrupted.

        Args:
            callback(func): the function to trigger when a file appears
            sleepTime(float): how often to check for incoming files - default: 1.0 secs
            limit(float): the maximum time to run the server default: unlimited
        """
        event_handler = Handler(callback, patterns=self.patterns, debug=self.debug)
        self.observer.schedule(event_handler, self.path, recursive=True)
        self.observer.start()
        runTime = 0
        try:
            while runTime < limit:
                time.sleep(sleepTime)
                runTime += sleepTime
        except Exception as ex:
            # best effort: errors end the watch loop and are only reported in debug mode
            if self.debug:
                print("Error %s " % str(ex))
        finally:
            # release the observer thread also when the limit is reached
            # or on KeyboardInterrupt
            self.observer.stop()
            self.observer.join()

`__init__(path, patterns=['*.pdf', '*.jpg'], debug=False)`

construct me for the given path. Args: path(str): the directory to observe; patterns(list): a list of wildcard patterns; debug(bool): True if debugging should be switched on

Source code in scan/folderwatcher.py

def __init__(self, path, patterns=["*.pdf", "*.jpg"], debug=False):
    """
    construct me for the given path

    Args:
        path(str): the directory to observe
        patterns(list): a list of wildcard patterns
        debug(bool): True if debugging should be switched on
    """
    self.observer = Observer()
    self.path = path
    # copy the list so that instances never share the mutable default argument
    self.patterns = list(patterns) if patterns is not None else None
    self.debug = debug

`run(callback, sleepTime=1, limit=sys.maxsize)`

run me

Parameters:

Name	Description	Default
`callback(func)`	the function to trigger when a file appears	required
`sleepTime(float)`	how often to check for incoming files - default: 1.0 secs	`1`
`limit(float)`	the maximum time to run the server - default: unlimited	`sys.maxsize`

Source code in scan/folderwatcher.py

def run(self, callback, sleepTime=1, limit=sys.maxsize):
    """
    run me

    The observer is always stopped when this method ends - no matter
    whether the time limit was reached or the loop was interrupted.

    Args:
        callback(func): the function to trigger when a file appears
        sleepTime(float): how often to check for incoming files - default: 1.0 secs
        limit(float): the maximum time to run the server default: unlimited
    """
    event_handler = Handler(callback, patterns=self.patterns, debug=self.debug)
    self.observer.schedule(event_handler, self.path, recursive=True)
    self.observer.start()
    runTime = 0
    try:
        while runTime < limit:
            time.sleep(sleepTime)
            runTime += sleepTime
    except Exception as ex:
        # best effort: errors end the watch loop and are only reported in debug mode
        if self.debug:
            print("Error %s " % str(ex))
    finally:
        # release the observer thread also when the limit is reached
        # or on KeyboardInterrupt
        self.observer.stop()
        self.observer.join()

`logger`

Created on 2021-11-02

@author: wf

`Logger`

Bases: object

a logger module

Source code in scan/logger.py

class Logger(object):
    """
    a logger module

    Wraps a module level logger behind static convenience methods.
    """

    # executed once when the class body is evaluated
    logging.basicConfig(level=logging.DEBUG)
    logger = logging.getLogger(__name__)

    @staticmethod
    def log(msg: str):
        """
        log the given message with level INFO

        Args:
            msg(str): the message to log
        """
        Logger.logger.log(logging.INFO, msg)

    @staticmethod
    def logException(ex):
        """
        log the given exception with level ERROR including exception information

        Args:
            ex: the exception to log
        """
        Logger.logger.error(ex, exc_info=True)

`pdf`

`PDFMiner`

PDFMiner.six wrapper to get PDF Text

Source code in scan/pdf.py

class PDFMiner:
    """
    PDFMiner.six wrapper to get PDF Text
    """

    @classmethod
    def getPDFText(cls, pdfFilenamePath, throwError: bool = True):
        """
        extract the text of the given PDF file

        Args:
            pdfFilenamePath: the path of the PDF file to read
            throwError(bool): if True re-raise errors that occur when the
                              PDF document is parsed, otherwise return ""

        Returns:
            str: the extracted text or "" if the text could not be extracted
        """
        retstr = StringIO()
        # the context manager makes sure the file handle is closed on every path
        with open(pdfFilenamePath, "rb") as pdfFile:
            parser = PDFParser(pdfFile)
            try:
                document = PDFDocument(parser)
            except Exception as e:
                errMsg = f"error {pdfFilenamePath}:{str(e)}"
                print(errMsg)
                if throwError:
                    raise
                return ""
            if not document.is_extractable:
                print(pdfFilenamePath, "Warning: could not extract text from pdf file.")
                return ""
            rsrcmgr = PDFResourceManager()
            device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
        return retstr.getvalue()

`product`

Created on 2023-11-16

@author: wf

`Product` `dataclass`

Data class representing a product.

Attributes:

Name	Type	Description
`title`	`str`	The title of the product.
`image_url`	`str`	The URL of the product image.
`price`	`str`	The price of the product.
`asin`	`Optional[str]`	The Amazon Standard Identification Number (ASIN) of the product, which is a unique identifier on Amazon's platform.

Source code in scan/product.py

@dataclass
class Product:
    """
    Data class representing a product.

    Attributes:
        title (str): The title of the product.
        image_url (str): The URL of the product image.
        price (str): The price of the product.
        asin (Optional[str]): The Amazon Standard Identification Number (ASIN) of the product,
                              which is a unique identifier on Amazon's platform.
        gtin (Optional[str]): An optional gtin code of the product.
    """

    title: str
    image_url: str
    price: str
    asin: Optional[str] = None
    gtin: Optional[str] = None

    @property
    def amazon_url(self) -> Optional[str]:
        """
        The amazon.com product page URL derived from the ASIN or None if there is no ASIN.
        """
        return f"https://www.amazon.com/dp/{self.asin}" if self.asin else None

    def as_html(self, img_size: int = 128) -> str:
        """
        Returns an HTML representation of the product with an image thumbnail and a link to the product page.

        All product values are HTML-escaped so that characters such as quotes or
        angle brackets in a title can not break the generated markup.

        Parameters:
            img_size (int): Size of the image thumbnail.

        Returns:
            str: HTML string representation of the product.
        """
        # local import: keeps the block self-contained
        from html import escape

        title = escape(str(self.title))
        image_url = escape(str(self.image_url))
        parts = [
            "<div>",
            f'<img src="{image_url}" alt="{title}" width="{img_size}" height="{img_size}"/>',
        ]
        url = self.amazon_url
        if url:
            parts.append(f' <a href="{escape(url)}">{title}</a>')
        else:
            parts.append(f" {title}")
        if self.gtin:
            # leading blank separates the code from the title
            parts.append(f" Code: {escape(str(self.gtin))}")
        parts.append(f" - {escape(str(self.price))}")
        parts.append("</div>")
        return "".join(parts)

`as_html(img_size=128)`

Returns an HTML representation of the product with an image thumbnail and a link to the product page.

Parameters:

Name	Type	Description	Default
`img_size`	`int`	Size of the image thumbnail.	`128`

Returns:

Name	Type	Description
`str`	`str`	HTML string representation of the product.

Source code in scan/product.py

def as_html(self, img_size: int = 128) -> str:
    """
    Returns an HTML representation of the product with an image thumbnail and a link to the product page.

    All product values are HTML-escaped so that characters such as quotes or
    angle brackets in a title can not break the generated markup.

    Parameters:
        img_size (int): Size of the image thumbnail.

    Returns:
        str: HTML string representation of the product.
    """
    # local import: keeps the block self-contained
    from html import escape

    title = escape(str(self.title))
    image_url = escape(str(self.image_url))
    parts = [
        "<div>",
        f'<img src="{image_url}" alt="{title}" width="{img_size}" height="{img_size}"/>',
    ]
    url = self.amazon_url
    if url:
        parts.append(f' <a href="{escape(url)}">{title}</a>')
    else:
        parts.append(f" {title}")
    if self.gtin:
        # leading blank separates the code from the title
        parts.append(f" Code: {escape(str(self.gtin))}")
    parts.append(f" - {escape(str(self.price))}")
    parts.append("</div>")
    return "".join(parts)

`Products`

Class to handle/manage product instances and make those persistent.

Attributes:

Name	Type	Description
`store_path`	`str`	The file path where products are stored as JSON.
`products`	`List[Product]`	List of product instances.
`products_by_asin`	`Dict[str, Product]`	Dictionary mapping ASIN to products.
`products_by_gtin`	`Dict[str, Product]`	Dictionary mapping gtin to products.

Source code in scan/product.py

class Products:
    """
    Class to handle/manage product instances and make those persistent.

    Attributes:
        store_path (str): The file path where products are stored as JSON.
        products (List[Product]): List of product instances.
        products_by_asin (Dict[str, Product]): Dictionary mapping ASIN to products.
        products_by_gtin (Dict[str, Product]): Dictionary mapping gtin to products.
    """

    def __init__(self, store_path: str = None):
        """
        Initialize the Products instance.

        Args:
            store_path (str, optional): The file path where products are stored as JSON.
                                       Defaults to ~/.scan2wiki/products.json.
        """
        self.store_path = store_path or expanduser("~/.scan2wiki/products.json")
        self.clear()

    def clear(self):
        """
        Clears the current product list and the associated mappings.
        """
        self.products = []
        self.products_by_asin = {}
        self.products_by_gtin = {}

    def add_product(self, product: Product):
        """
        Adds a product to the product list and updates the mappings.
        If a product with the same ASIN already exists, it updates the existing record.

        Args:
            product (Product): The product instance to add.
        """
        existing_product = (
            self.products_by_asin.get(product.asin) if product.asin else None
        )
        if existing_product is not None:
            # drop a stale gtin mapping before the gtin of the record changes
            old_gtin = existing_product.gtin
            if (
                old_gtin
                and old_gtin != product.gtin
                and self.products_by_gtin.get(old_gtin) is existing_product
            ):
                del self.products_by_gtin[old_gtin]
            existing_product.title = product.title
            existing_product.image_url = product.image_url
            existing_product.price = product.price
            existing_product.gtin = product.gtin
            if existing_product.gtin:
                self.products_by_gtin[existing_product.gtin] = existing_product
        else:
            # Add the product to the list and mappings
            self.products.append(product)
            if product.asin:
                self.products_by_asin[product.asin] = product
            if product.gtin:
                self.products_by_gtin[product.gtin] = product

        # Sort the products list by ASIN - products without ASIN come first
        self.products.sort(key=lambda p: p.asin if p.asin else "")

    def delete_product(self, asin: str):
        """
        Delete a product with the given ASIN and save the updated product list.
        Nothing happens if there is no product with the given ASIN.

        Args:
            asin (str): The ASIN of the product to delete.
        """
        product = self.products_by_asin.pop(asin, None)
        if product is None:
            return
        self.products.remove(product)
        if product.gtin:
            self.products_by_gtin.pop(product.gtin, None)
        self.save_to_json()  # Save the updated product list

    def get_aggrid_lod(self) -> List[Dict[str, str]]:
        """
        Generates a list of dictionaries for ag-Grid representation of the products.

        Returns:
            List[Dict[str, str]]: List of product information formatted for ag-Grid.
        """
        lod = []
        for index, product in enumerate(self.products, start=1):
            product_dict = {
                "#": str(index),
                "Product": product.as_html(),
                "ASIN": Link.create(product.amazon_url, product.asin)
                if product.asin
                else "",
                "Title": product.title,
                "gtin": product.gtin if product.gtin else "",
                "Price": product.price,
            }
            lod.append(product_dict)
        return lod

    def save_to_json(self, filename: str = None):
        """
        Saves the current list of products to a JSON file.

        Args:
            filename (str, optional): The filename where to save the JSON data.
                                      Defaults to the instance's store_path attribute.
        """
        filename = filename or self.store_path
        # Ensure the directory exists - a bare filename has no directory part
        directory = dirname(filename)
        if directory:
            os.makedirs(directory, exist_ok=True)

        product_data = [product.__dict__ for product in self.products]
        with open(filename, "w") as file:
            json.dump(product_data, file, indent=2)

    def load_from_json(self, filepath: str = None):
        """
        Loads products from a JSON file and updates the current list and mappings.
        Nothing is loaded if the file does not exist.

        Args:
            filepath (str, optional): The filepath from which to load the JSON data.
                                      Defaults to the instance's store_path attribute.
        """
        filename = filepath or self.store_path
        if os.path.exists(filename):
            with open(filename, "r") as file:
                product_records = json.load(file)
            for product_record in product_records:
                self.add_product(Product(**product_record))

`__init__(store_path=None)`

Initialize the Products instance.

Parameters:

Name	Type	Description	Default
`store_path`	`str`	The file path where products are stored as JSON. Defaults to ~/.scan2wiki/products.json.	`None`

Source code in scan/product.py

def __init__(self, store_path: str = None):
    """
    Set up the storage location and start with an empty product collection.

    Args:
        store_path (str, optional): The file path where products are stored as JSON.
                                   Defaults to ~/.scan2wiki/products.json.
    """
    if not store_path:
        store_path = expanduser("~/.scan2wiki/products.json")
    self.store_path = store_path
    self.clear()

`add_product(product)`

Adds a product to the product list and updates the mappings. If a product with the same ASIN already exists, it updates the existing record.

Parameters:

Name	Type	Description	Default
`product`	`Product`	The product instance to add.	required

Source code in scan/product.py

def add_product(self, product: Product):
    """
    Adds a product to the product list and updates the mappings.
    If a product with the same ASIN already exists, it updates the existing record.

    Args:
        product (Product): The product instance to add.
    """
    existing_product = (
        self.products_by_asin.get(product.asin) if product.asin else None
    )
    if existing_product is not None:
        # drop a stale gtin mapping before the gtin of the record changes
        old_gtin = existing_product.gtin
        if (
            old_gtin
            and old_gtin != product.gtin
            and self.products_by_gtin.get(old_gtin) is existing_product
        ):
            del self.products_by_gtin[old_gtin]
        existing_product.title = product.title
        existing_product.image_url = product.image_url
        existing_product.price = product.price
        existing_product.gtin = product.gtin
        if existing_product.gtin:
            self.products_by_gtin[existing_product.gtin] = existing_product
    else:
        # Add the product to the list and mappings
        self.products.append(product)
        if product.asin:
            self.products_by_asin[product.asin] = product
        if product.gtin:
            self.products_by_gtin[product.gtin] = product

    # Sort the products list by ASIN - products without ASIN come first
    self.products.sort(key=lambda p: p.asin if p.asin else "")

`clear()`

Clears the current product list and the associated mappings.

Source code in scan/product.py

def clear(self):
    """
    Reset the product list and both lookup dictionaries to empty containers.
    """
    self.products, self.products_by_asin, self.products_by_gtin = [], {}, {}

`delete_product(asin)`

Delete a product with the given ASIN.

Parameters:

Name	Type	Description	Default
`asin`	`str`	The ASIN of the product to delete.	required

Source code in scan/product.py

def delete_product(self, asin: str):
    """
    Delete a product with the given ASIN and save the updated product list.
    Nothing happens if there is no product with the given ASIN.

    Args:
        asin (str): The ASIN of the product to delete.
    """
    # the mappings and the list are attributes of this instance itself
    product = self.products_by_asin.pop(asin, None)
    if product is None:
        return
    self.products.remove(product)
    if product.gtin:
        self.products_by_gtin.pop(product.gtin, None)
    self.save_to_json()  # Save the updated product list

`get_aggrid_lod()`

Generates a list of dictionaries for ag-Grid representation of the products.

Returns:

Type	Description
`List[Dict[str, str]]`	List of product information formatted for ag-Grid.

Source code in scan/product.py

def get_aggrid_lod(self) -> List[Dict[str, str]]:
    """
    Build one dictionary per product for display in an ag-Grid.

    Rows are numbered starting with 1; a missing ASIN or gtin is shown
    as an empty string.

    Returns:
        List[Dict[str, str]]: List of product information formatted for ag-Grid.
    """
    rows = []
    for position, item in enumerate(self.products, start=1):
        row = {"#": str(position), "Product": item.as_html()}
        if item.asin:
            row["ASIN"] = Link.create(item.amazon_url, item.asin)
        else:
            row["ASIN"] = ""
        row["Title"] = item.title
        row["gtin"] = item.gtin or ""
        row["Price"] = item.price
        rows.append(row)
    return rows

`load_from_json(filepath=None)`

Loads products from a JSON file and updates the current list and mappings.

Parameters:

Name	Type	Description	Default
`filepath`	`str`	The filepath from which to load the JSON data. Defaults to the instance's store_path attribute.	`None`

Source code in scan/product.py

def load_from_json(self, filepath: str = None):
    """
    Read product records from a JSON file and add each of them via add_product.
    A missing file is silently ignored.

    Args:
        filepath (str, optional): The filepath from which to load the JSON data.
                                  Defaults to the instance's store_path attribute.
    """
    source = filepath or self.store_path
    if not os.path.exists(source):
        return
    with open(source, "r") as handle:
        records = json.load(handle)
    for record in records:
        self.add_product(Product(**record))

`save_to_json(filename=None)`

Saves the current list of products to a JSON file.

Parameters:

Name	Type	Description	Default
`filename`	`str`	The filename where to save the JSON data. Defaults to the instance's store_path attribute.	`None`

Source code in scan/product.py

def save_to_json(self, filename: str = None):
    """
    Saves the current list of products to a JSON file.

    Missing parent directories of the file are created.

    Args:
        filename (str, optional): The filename where to save the JSON data.
                                  Defaults to the instance's store_path attribute.
    """
    filename = filename or self.store_path
    # Ensure the directory exists - a bare filename has no directory part
    # and os.makedirs("") would raise FileNotFoundError
    directory = dirname(filename)
    if directory:
        os.makedirs(directory, exist_ok=True)

    product_data = [product.__dict__ for product in self.products]
    with open(filename, "w") as file:
        json.dump(product_data, file, indent=2)

`profiler`

Created on 2021-10-26

@author: wf

`Profiler`

simple profiler

Source code in scan/profiler.py

class Profiler:
    """
    simple profiler measuring the wall clock time between start() and time()
    """

    def __init__(self, msg:str, profile=True):
        """
        remember the message and whether output is wanted

        Args:
            msg (str): the message to show if profiling is active
            profile (bool): True if messages should be shown
        """
        self.msg, self.profile = msg, profile

    def start(self) -> str:
        """
        remember the start time and print the start message if profile is active

        Return:
            str: start message
        """
        startMessage = f"Starting {self.msg} ..."
        self.starttime = time.time()
        if self.profile:
            print(startMessage)
        return startMessage

    def time(self, extraMsg=""):
        """
        compute the time elapsed since start() and print it if profile is active

        Args:
            extraMsg (str): text appended to the message of this profiler

        Return:
            (float,str): time and message for time
        """
        now = time.time()
        elapsed = now - self.starttime
        report = f"{self.msg}{extraMsg} took {elapsed:5.3f} s"
        if self.profile:
            print(report)
        return elapsed, report

`__init__(msg, profile=True)`

construct me with the given msg and profile active flag

Parameters:

Name	Type	Description	Default
`msg`	`str`	the message to show if profiling is active	required
`profile`	`bool`	True if messages should be shown	`True`

Source code in scan/profiler.py

def __init__(self, msg:str, profile=True):
    """
    remember the message and whether output is wanted

    Args:
        msg (str): the message to show if profiling is active
        profile (bool): True if messages should be shown
    """
    self.msg, self.profile = msg, profile

`start()`

start profiling

Return

str: start message

Source code in scan/profiler.py

def start(self) -> str:
    """
    remember the start time and print the start message if profile is active

    Return:
        str: start message
    """
    startMessage = f"Starting {self.msg} ..."
    self.starttime = time.time()
    if self.profile:
        print(startMessage)
    return startMessage

`time(extraMsg='')`

time the action and print if profile is active

Return

(float,str): time and message for time

Source code in scan/profiler.py

def time(self, extraMsg=""):
    """
    compute the time elapsed since the start time and print it if profile is active

    Args:
        extraMsg (str): text appended to the message of this profiler

    Return:
        (float,str): time and message for time
    """
    now = time.time()
    elapsed = now - self.starttime
    report = f"{self.msg}{extraMsg} took {elapsed:5.3f} s"
    if self.profile:
        print(report)
    return elapsed, report

`scan_cmd`

Created on 2023-11-14

@author: wf

`ScanCmd`

Bases: WebserverCmd

Command line for scan2wiki web server

Source code in scan/scan_cmd.py

class ScanCmd(WebserverCmd):
    """
    Command line for scan2wiki web server
    """

    def getArgParser(self, description: str, version_msg) -> ArgumentParser:
        """
        extend the argument parser of the super class with the
        options --verbose, --root_path and --webcam

        Args:
            description (str): passed on to the super class
            version_msg: passed on to the super class

        Returns:
            ArgumentParser: the parser with the additional options
        """
        argParser = super().getArgParser(description, version_msg)
        argParser.add_argument(
            "-v",
            "--verbose",
            help="show verbose output [default: %(default)s]",
            action="store_true",
        )
        examplesPath = ScanSolution.examples_path()
        argParser.add_argument(
            "-rp",
            "--root_path",
            help="path to example pdf files [default: %(default)s]",
            default=examplesPath,
        )
        argParser.add_argument(
            "-wc",
            "--webcam",
            help="url of webcam for scans [default: %(default)s]",
        )
        return argParser

`getArgParser(description, version_msg)`

override the default argparser call

Source code in scan/scan_cmd.py

def getArgParser(self, description: str, version_msg) -> ArgumentParser:
    """
    extend the argument parser of the super class with the
    options --verbose, --root_path and --webcam

    Args:
        description (str): passed on to the super class
        version_msg: passed on to the super class

    Returns:
        ArgumentParser: the parser with the additional options
    """
    argParser = super().getArgParser(description, version_msg)
    argParser.add_argument(
        "-v",
        "--verbose",
        help="show verbose output [default: %(default)s]",
        action="store_true",
    )
    examplesPath = ScanSolution.examples_path()
    argParser.add_argument(
        "-rp",
        "--root_path",
        help="path to example pdf files [default: %(default)s]",
        default=examplesPath,
    )
    argParser.add_argument(
        "-wc",
        "--webcam",
        help="url of webcam for scans [default: %(default)s]",
    )
    return argParser

`main(argv=None)`

main call

Source code in scan/scan_cmd.py

def main(argv: list = None):
    """
    run the scan2wiki command line with the given arguments

    Args:
        argv (list): the command line arguments passed on to cmd_main

    Returns:
        the exit code returned by cmd_main
    """
    config = ScanWebServer.get_config()
    cmd = ScanCmd(config=config, webserver_cls=ScanWebServer)
    return cmd.cmd_main(argv)

`scan_webserver`

Created on 2023-11-14

@author: wf