Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/package/loader.py
Show First 20 Lines • Show All 462 Lines • ▼ Show 20 Lines | ) -> Dict[str, Any]: | ||||
prefix_url = "Failed branches: " if i == 0 else "" | prefix_url = "Failed branches: " if i == 0 else "" | ||||
logger.warning("%s%s", prefix_url, urls) | logger.warning("%s%s", prefix_url, urls) | ||||
return result | return result | ||||
def load(self) -> Dict: | def load(self) -> Dict: | ||||
"""Load for a specific origin the associated contents. | """Load for a specific origin the associated contents. | ||||
for each package version of the origin | 1. Get the list of versions in an origin. | ||||
1. Fetch the files for one package version By default, this can be | 2. Get the snapshot from the previous run of the loader, | ||||
and filter out versions that were already loaded, if their | |||||
:term:`extids <extid>` match | |||||
Then, for each remaining version in the origin | |||||
3. Fetch the files for one package version. By default, this can be | |||||
implemented as a simple HTTP request. Loaders with more specific | implemented as a simple HTTP request. Loaders with more specific | ||||
requirements can override this, e.g.: the PyPI loader checks the | requirements can override this, e.g.: the PyPI loader checks the | ||||
integrity of the downloaded files; the Debian loader has to download | integrity of the downloaded files; the Debian loader has to download | ||||
and check several files for one package version. | and check several files for one package version. | ||||
2. Extract the downloaded files By default, this would be a universal | 4. Extract the downloaded files. By default, this would be a universal | ||||
archive/tarball extraction. | archive/tarball extraction. | ||||
Loaders for specific formats can override this method (for instance, | Loaders for specific formats can override this method (for instance, | ||||
the Debian loader uses dpkg-source -x). | the Debian loader uses dpkg-source -x). | ||||
3. Convert the extracted directory to a set of Software Heritage | 5. Convert the extracted directory to a set of Software Heritage | ||||
objects using swh.model.from_disk. | objects using swh.model.from_disk. | ||||
4. Extract the metadata from the unpacked directories. This would only | 6. Extract the metadata from the unpacked directories. This would only | ||||
be applicable for "smart" loaders like npm (parsing the | be applicable for "smart" loaders like npm (parsing the | ||||
package.json), PyPI (parsing the PKG-INFO file) or Debian (parsing | package.json), PyPI (parsing the PKG-INFO file) or Debian (parsing | ||||
debian/changelog and debian/control). | debian/changelog and debian/control). | ||||
On "minimal-metadata" sources such as the GNU archive, the lister | On "minimal-metadata" sources such as the GNU archive, the lister | ||||
should provide the minimal set of metadata needed to populate the | should provide the minimal set of metadata needed to populate the | ||||
revision/release objects (authors, dates) as an argument to the | revision/release objects (authors, dates) as an argument to the | ||||
task. | task. | ||||
5. Generate the revision/release objects for the given version. From | 7. Generate the revision/release objects for the given version. From | ||||
the data generated at steps 3 and 4. | the data generated at steps 5 and 6. | ||||
end for each | end for each | ||||
6. Generate and load the snapshot for the visit | 8. Generate and load the snapshot for the visit | ||||
Using the revisions/releases collected at step 5., and the branch | Using the revisions/releases collected at step 7., and the branch | ||||
information from step 0., generate a snapshot and load it into the | information from step 2., generate a snapshot and load it into the | ||||
Software Heritage archive | Software Heritage archive | ||||
""" | """ | ||||
status_load = "uneventful" # either: eventful, uneventful, failed | status_load = "uneventful" # either: eventful, uneventful, failed | ||||
status_visit = "full" # see swh.model.model.OriginVisitStatus | status_visit = "full" # see swh.model.model.OriginVisitStatus | ||||
snapshot = None | snapshot = None | ||||
failed_branches: List[str] = [] | failed_branches: List[str] = [] | ||||
▲ Show 20 Lines • Show All 514 Lines • Show Last 20 Lines |