Month: May 2024
Image service
This is a description of how to use our internal image service. First you need to initialize it with an api_key, which is stored on the Dataplatform backend and associated with the user. Note that we don't yet have a UX to create/show/find it.
from dphelper import DPHelper

dphelper = DPHelper(api_key='ADD_YOUR_KEY_TO_TEST')
After initializing DPHelper with the key, you can use the image service. To upload a single image:
dphelper.upload_image_from_url('https://www.google.com/images/branding/googlelogo/2x/googlelogo_color_92x30dp.png')
To upload multiple images by URL:
dphelper = DPHelper(api_key='ADD_YOUR_KEY_TO_TEST')
image_urls = [
    'https://google.com/image1.png',
    'https://google.com/image2.png',
]
results = dphelper.upload_all_images(image_urls, max_concurrent=10)
Note: the API is permissive, i.e. it does not throw if some of the images have already been downloaded.
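For example, re-running an upload with an overlapping list of URLs is safe. A minimal sketch (the URLs are placeholders):
from dphelper import DPHelper

dphelper = DPHelper(api_key='ADD_YOUR_KEY_TO_TEST')

# first batch
dphelper.upload_all_images([
    'https://google.com/image1.png',
    'https://google.com/image2.png',
], max_concurrent=10)

# the second batch overlaps with the first; the duplicate URL does not raise an error
dphelper.upload_all_images([
    'https://google.com/image2.png',
    'https://google.com/image3.png',
], max_concurrent=10)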
Streaming JSON
Sometimes the data accumulated in memory can become very large, as in the pagination pattern below, where every page's results are collected into a single list before being dumped as JSON:
# ssva.lt energy performance certificate register
# from pprint import pprint
import re
import json
from dphelper import DPHelper
helper = DPHelper(is_verbose=True)
headers = helper.create_headers(authority='ssva.lt')
def generate_url(page_number, PAGE_SIZE=10000):
    return f'https://www.ssva.lt/registrai/pensreg/pensert_list.php?goto={page_number}&pagesize={PAGE_SIZE}'
PAGINATION_DATA_PATTERN = ('Toliau</a></li>.*?pageNum="(.*?)" >Pabaiga</a>')
DATA_PATTERN = (
    '<tr id="gridRow.*?class="r-gridrow">'
    '<td data-record-id=".*?".*?SertifikatoNr".*?val="(.*?)" >'  # certificate number
    '.*?"IsdavimoD".*?val="(.*?)" >'  # issue date
    '.*?GaliojimoD".*?val="(.*?)" >'  # validity date
    '.*?UnikalusNr".*?val="(.*?)" >'  # unique number
    '.*?Adresas".*?val="(.*?)" >'  # address
    '.*?Paskirtis.*?val="(.*?)" >'  # purpose
    '.*?PEN".*?val="(.*?)" >'  # PEN
    '.*?Ap" >.*?val="(.*?)" >'  # heated area
    '.*?Q".*?val="(.*?)" >'  # energy consumption
    '.*?Hsrc".*?val="(.*?)" >'  # heat source
    '.*?Pastaba".*?val="(.*?)" ></span></span></td></tr>'  # note
)
def get_page_count():
    FIRST_PAGE_URL = generate_url(1)
    raw_content = helper.from_url(FIRST_PAGE_URL, headers=headers)
    rg = re.compile(PAGINATION_DATA_PATTERN)
    last_page_nr = rg.findall(raw_content)
    return int(last_page_nr[0])
if __name__ == "__main__":
    last_page_nr = get_page_count()
    # Compile is slow, so we do it once
    rg = re.compile(DATA_PATTERN)
    data = []
    for page_number in range(1, last_page_nr + 1):
        url_for_read = generate_url(page_number)
        # print(url_for_read)
        page_content = helper.from_url(url_for_read, headers=headers)
        results = rg.findall(page_content)
        data.extend(results)
        print(json.dumps(results, indent=2))
    print(json.dumps(data, indent=2))
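To get a rough feel for how big that data list grows, here is a small standard-library sketch; the approx_size_bytes helper is only for illustration (it assumes data is the list of tuples of strings produced above, and sys.getsizeof is shallow, so the figure is approximate):
import sys

def approx_size_bytes(rows):
    # sum the list, each tuple, and each string separately,
    # because sys.getsizeof() does not follow references
    total = sys.getsizeof(rows)
    for row in rows:
        total += sys.getsizeof(row)
        total += sum(sys.getsizeof(field) for field in row)
    return total

print(f'approx. {approx_size_bytes(data) / 1024 / 1024:.1f} MiB held in memory')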
In such scenarios it can be useful to produce the output as a streaming response instead, using the json_stream pip package. Basic example:
import sys
import json
from json_stream import streamable_list
def test_stream():
    for i in range(20):
        yield i
# wrap existing iterable
data = streamable_list(test_stream())
# consume iterable with standard json.dump()
json.dump(data, sys.stdout)
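The same wrapper works when writing to a file instead of stdout; json.dump serializes items as the generator is consumed, so the full list never has to be built in memory (numbers.json is an arbitrary example path):
import json
from json_stream import streamable_list

def numbers():
    for i in range(1_000_000):
        yield i

# items are written one by one as the generator is consumed
with open('numbers.json', 'w') as f:
    json.dump(streamable_list(numbers()), f)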
Applying this to the scraper above:
# ssva.lt energy performance certificate register
# from pprint import pprint
import re
import json
import sys
from dphelper import DPHelper
from json_stream import streamable_list
helper = DPHelper(is_verbose=True)
headers = helper.create_headers(authority='ssva.lt')
def generate_url(page_number, PAGE_SIZE=100):
    return f'https://www.ssva.lt/registrai/pensreg/pensert_list.php?goto={page_number}&pagesize={PAGE_SIZE}'
PAGINATION_DATA_PATTERN = ('Toliau</a></li>.*?pageNum="(.*?)" >Pabaiga</a>')
DATA_PATTERN = (
    '<tr id="gridRow.*?class="r-gridrow">'
    '<td data-record-id=".*?".*?SertifikatoNr".*?val="(.*?)" >'  # certificate number
    '.*?"IsdavimoD".*?val="(.*?)" >'  # issue date
    '.*?GaliojimoD".*?val="(.*?)" >'  # validity date
    '.*?UnikalusNr".*?val="(.*?)" >'  # unique number
    '.*?Adresas".*?val="(.*?)" >'  # address
    '.*?Paskirtis.*?val="(.*?)" >'  # purpose
    '.*?PEN".*?val="(.*?)" >'  # PEN
    '.*?Ap" >.*?val="(.*?)" >'  # heated area
    '.*?Q".*?val="(.*?)" >'  # energy consumption
    '.*?Hsrc".*?val="(.*?)" >'  # heat source
    '.*?Pastaba".*?val="(.*?)" ></span></span></td></tr>'  # note
)
def get_page_count():
    FIRST_PAGE_URL = generate_url(1)
    raw_content = helper.from_url(FIRST_PAGE_URL, headers=headers)
    rg = re.compile(PAGINATION_DATA_PATTERN)
    last_page_nr = rg.findall(raw_content)
    return int(last_page_nr[0])
def yield_data():
    last_page_nr = get_page_count()
    rg = re.compile(DATA_PATTERN)
    last_page_nr = 5  # NOTE: overrides get_page_count(); presumably a leftover cap for testing
    for page_number in range(1, last_page_nr + 1):
        url_for_read = generate_url(page_number)
        page_content = helper.from_url(url_for_read, headers=headers)
        results = rg.findall(page_content)
        for result in results:
            yield result
if __name__ == "__main__":
    # wrap existing iterable
    data = streamable_list(yield_data())
    # consume iterable with standard json.dump()
    json.dump(data, sys.stdout)
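The streamed output can be redirected to a file (e.g. python scrape_pensreg.py > pensreg.json; the script name is just an example) and read back lazily with the same package, so neither the producer nor the consumer ever holds the whole dataset in memory. A minimal sketch of the consuming side:
import json_stream

with open('pensreg.json') as f:
    # json_stream.load() yields rows one at a time instead of parsing the whole file
    for row in json_stream.load(f):
        fields = list(row)  # each row is a small list of strings, safe to materialize
        print(fields[0])    # e.g. the certificate number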